## Hledání skupin podobných měst na základě dostupnosti školských zařízení a věkového složení obyvatel.

| mesto | MS | ZS | SS | U/JS | jedalne | druziny | 0-14r | 15-64r | >64r |
|:-----:|:--:|:--:|:--:|:----:|:-------:|:-------:|:-----:|:------:|:----:|

Načítanie dát z DB a uloženie do CSV.

In [97]:
import pandas as pd
import numpy as np
from pymongo import MongoClient

# Connection string of the local MongoDB instance.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from the environment before this notebook is shared.
MONGO_URI = "mongodb://root:password@localhost:27017"

mongo_client = MongoClient(MONGO_URI)
database = mongo_client["schools"]
# `collection` is the handle the aggregation calls below run against;
# it starts out pointing at the population statistics.
collection = database["population"]

# Territory codes of the 50 okres cities that are compared; used to filter
# both the population rows (`vuzemi_kod`) and the schools (`adresa.okres_kod`).
OKRES_CITY_CODES = [
    'CZ0615', 'CZ0214', 'CZ0216', 'CZ0316', 'CZ0321',
    'CZ0811', 'CZ0412', 'CZ0213', 'CZ0625', 'CZ0317',
    'CZ0411', 'CZ0524', 'CZ0423', 'CZ0626', 'CZ0612',
    'CZ0514', 'CZ0313', 'CZ0622', 'CZ0816', 'CZ0812',
    'CZ0813', 'CZ0521', 'CZ0211', 'CZ0621', 'CZ0219',
    'CZ0713', 'CZ0513', 'CZ021B', 'CZ0312', 'CZ0323',
    'CZ0712', 'CZ0613', 'CZ0715', 'CZ021A', 'CZ0531',
    'CZ0814', 'CZ0523', 'CZ0217', 'CZ0534', 'CZ0327',
    'CZ0611', 'CZ0324', 'CZ0212', 'CZ0422', 'CZ0424',
    'CZ0426', 'CZ0627', 'CZ0215', 'CZ0325', 'CZ0532',
]

# `vek_kod` values that are summed into the '0-14' column.
YOUNG_AGE = [
    400000600005000,
    400005610010000,
    410010610015000,
]
# `vek_kod` values that are summed into the '15-64' column.
MIDLE_AGE = [
    410015610020000,
    410020610025000,
    410025610030000,
    410030610035000,
    410035610040000,
    410040610045000,
    410045610050000,
    410050610055000,
    410055610060000,
    410060610065000,
]
# `vek_kod` values that are summed into the '>64' column.
OLD_AGE = [
    410065610070000,
    410070610075000,
    410075610080000,
    410080610085000,
    410085610090000,
    410090610095000,
    410095799999000,
]

# School type codes (matched against `zariadenia.typ`).
# Kindergartens.
MS = ["A00", "A10", "A13", "A14", "A15", "A16"]
# Primary schools.
ZS = ["B00", "B10", "B13", "B14", "B16", "B31"]
# Secondary schools.
# NOTE(review): "B16" and "B31" also appear in ZS above - confirm that
# counting them in both categories is intended.
SS = ["C00", "C10", "C16", "C93", "B16", "B31"]
# Art schools and language schools.
US_JS = ["F10", "F20", "F29", "D00", "D10", "D16"]
# Canteens.
CANTEENS = ["L11", "L12", "L13", "L15", "L19"]
# School clubs and after-school care.
SCHOOL_CLUB = ["G21", "G22"]

def select_age(category):
    """Build a ``$match`` stage keeping documents whose ``vek_kod`` is in *category*.

    Args:
        category: List of age-interval codes to accept. It is embedded in
            the returned stage as-is (not copied).

    Returns:
        A dict usable as one stage of a MongoDB aggregation pipeline.
    """
    age_filter = {'$in': category}
    return {'$match': {'vek_kod': age_filter}}

def select_school_type(type):
    """Build a ``$match`` stage keeping documents whose ``zariadenia.typ`` is in *type*.

    Args:
        type: List of facility type codes to accept. It is embedded in the
            returned stage as-is (not copied). The name shadows the builtin
            ``type``; it is kept so existing callers are unaffected.

    Returns:
        A dict usable as one stage of a MongoDB aggregation pipeline.
    """
    type_filter = {'$in': type}
    return {'$match': {'zariadenia.typ': type_filter}}

##### SELECT POPULATION BY AGE IN OKRES CITY
# Stage: keep only the population rows of the selected okres cities.
select_50_in_population = {'$match': {'vuzemi_kod': {'$in': OKRES_CITY_CODES}}}

# Stages: one age-bracket filter per output column.
select_young = select_age(YOUNG_AGE)
select_middle = select_age(MIDLE_AGE)
select_old = select_age(OLD_AGE)

# Stage: sum `hodnota` per territory name and remember the territory code,
# which is the key the frames are joined on later.
group_by_okres_sum_population = {
    '$group': {
        '_id': '$vuzemi_txt',
        'kod': {'$first': '$vuzemi_kod'},
        'sum': {'$sum': '$hodnota'},
    },
}

# The three pipelines differ only in their age-bracket stage.
pipeline_young, pipeline_middle, pipeline_old = [
    [select_50_in_population, age_stage, group_by_okres_sum_population]
    for age_stage in (select_young, select_middle, select_old)
]

# Only the first frame keeps the territory name (as 'City'); the other two
# carry just the join key and their own age-bracket total.
young = (
    pd.DataFrame(list(collection.aggregate(pipeline_young)))
    .rename(columns={'_id': 'City', 'sum': '0-14'})
)
middle = (
    pd.DataFrame(list(collection.aggregate(pipeline_middle)))
    .drop(columns=['_id'])
    .rename(columns={'sum': '15-64'})
)
old = (
    pd.DataFrame(list(collection.aggregate(pipeline_old)))
    .drop(columns=['_id'])
    .rename(columns={'sum': '>64'})
)

##### SELECT SCHOOLS CAPACITIES
# From here on `collection` points at the school register.
collection = database["all_schools"]

# One facility-type filter per output column.
select_ms = select_school_type(MS)
select_zs = select_school_type(ZS)
select_ss = select_school_type(SS)
select_us_js = select_school_type(US_JS)
select_canteens = select_school_type(CANTEENS)
select_school_club = select_school_type(SCHOOL_CLUB)

# Stage: emit one document per element of the `zariadenia` array.
unwind_zariadenia = {"$unwind": "$zariadenia"}

# Stage: keep only schools located in the selected okres cities.
select_50_in_schools = {'$match': {'adresa.okres_kod': {'$in': OKRES_CITY_CODES}}}

# Stage: total facility capacity per okres code.
group_by_okres_sum_capacity = {
    '$group': {
        '_id': '$adresa.okres_kod',
        'sum': {'$sum': '$zariadenia.kapacita'},
    },
}
# Historical name of this stage (it re-binds the population stage's name);
# kept so any later code that refers to it keeps working.
group_by_okres_sum_population = group_by_okres_sum_capacity


def _capacity_pipeline(select_type):
    """Return the capacity-per-okres pipeline for one facility-type filter.

    The okres filter does not depend on the unwound array, so it runs first:
    only schools in the selected cities are unwound. The facility-type
    filter has to stay after ``$unwind`` so it tests each facility on its own.
    """
    return [
        select_50_in_schools,
        unwind_zariadenia,
        select_type,
        group_by_okres_sum_capacity,
    ]


def _capacity_frame(source, pipeline, column):
    """Run *pipeline* on *source*; return a frame with columns 'kod' and *column*."""
    frame = pd.DataFrame(list(source.aggregate(pipeline)))
    return frame.rename(columns={'sum': column, '_id': 'kod'})


pipline_ms = _capacity_pipeline(select_ms)
pipline_zs = _capacity_pipeline(select_zs)
pipline_ss = _capacity_pipeline(select_ss)
pipline_us_js = _capacity_pipeline(select_us_js)
pipline_canteens = _capacity_pipeline(select_canteens)
pipline_school_clubs = _capacity_pipeline(select_school_club)

ms = _capacity_frame(collection, pipline_ms, 'ms')
zs = _capacity_frame(collection, pipline_zs, 'zs')
ss = _capacity_frame(collection, pipline_ss, 'ss')
us_js = _capacity_frame(collection, pipline_us_js, 'us_js')
canteens = _capacity_frame(collection, pipline_canteens, 'canteens')
school_clubs = _capacity_frame(collection, pipline_school_clubs, 'school_clubs')

# Join the age brackets first. These stay inner joins: a city without all
# three population figures cannot be scaled per 1000 inhabitants later.
result = young
for age_frame in (middle, old):
    result = result.merge(age_frame, on='kod')

# Attach the capacity tables with left joins so that a city with no facility
# in some category is kept (capacity 0) instead of silently disappearing
# from the output, which is what a chain of inner joins would do.
capacity_names = ["ms", "zs", "ss", "us_js", "canteens", "school_clubs"]
for capacity_frame in (ms, zs, ss, us_js, canteens, school_clubs):
    result = result.merge(capacity_frame, on='kod', how='left')
result[capacity_names] = result[capacity_names].fillna(0)

# Fix the column order; this also leaves out the 'kod' join key.
result = result[["City", "ms", "zs", "ss", "us_js", "canteens", "school_clubs", "0-14", "15-64", ">64"]]


# Save to a CSV file; `header` renames the columns to the names that the
# next step reads back.
result.to_csv('output/csv/task_c_1.csv', header=["mesto", "ms", "zs", "ss", "us_js", "jedalne", "druziny", "0-14", "15-64", ">64"], index=False)




1. Normalizácia: Z-score (aplikovaná na kapacity zariadení po prepočte na 1000 obyvateľov).
2. Diskretizácia: Kvantitatívny atribút populácia je rozdelený na 3 vekové intervaly: <0, 14>, <15, 64>, <65, ...>.
3. Odľahlé hodnoty: Rozdiely medzi hodnotami kapacít jednotlivých zariadení boli zredukované prepočtom na 1000 obyvateľov.

Pracovné CSV: [task_c_1.csv](./output/csv/task_c_1.csv)

In [123]:
# Load the per-city table written by the previous step.
df = pd.read_csv('./output/csv/task_c_1.csv')

# Capacity columns that are scaled and standardised below.
scaled_columns = ['ms', 'zs', 'ss', 'us_js', 'jedalne', 'druziny']

# Total inhabitants per city (helper column, removed again before saving).
df['population'] = df['0-14'] + df['15-64'] + df['>64']

for column in scaled_columns:
    # Convert the capacity to capacity per 1000 inhabitants.
    df[column] = df[column] / (df['population'] / 1000)
    # Z-score normalisation using the population standard deviation
    # (numpy's default, ddof=0).
    values = df[column].to_numpy()
    df[column] = (df[column] - np.mean(values)) / np.std(values)

# NOTE(review): the age columns '0-14', '15-64' and '>64' are written out
# unchanged (raw counts) - confirm whether they should be normalised too.
df = df.drop(columns='population')

# Save to a CSV file.
df.to_csv('output/csv/task_c_2.csv', header=["mesto", "ms", "zs", "ss", "us_js", "jedalne", "druziny", "0-14", "15-64", ">64"], index=False)