In [6]:
import pandas as pd
import numpy as np
import json
import io
import re

In [7]:
def reset_index(df):
    """Renumber the rows of ``df`` as 1..len(df), in place, and return ``df``.

    The previous index is discarded, never turned into a column, and every
    existing column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to renumber. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Assign the new index directly instead of calling df.reset_index() and
    # then dropping the temporary 'index' column: that round trip raised
    # KeyError when the old index had a name (the temporary column is then
    # named after it) and deleted a genuine data column called 'index'.
    df.index = np.arange(1, len(df) + 1)
    return df

def open_csv(fname):
    """Read a statistics CSV and normalise it for the rest of the notebook.

    Steps, in order:

    1. Drop the ``type_name`` and ``difference`` columns when present; a
       message is printed for each one that is missing.
    2. For the file named exactly ``'statistics - age.csv'``, fill missing
       values with 0.
    3. Parse ``date_stamp`` as ``%Y-%m-%d`` dates; a message is printed if the
       column is missing or cannot be parsed.
    4. Cast every column except ``name`` and ``date_stamp`` to ``int`` where
       possible; columns that cannot be cast are left as they are.
    5. Renumber the rows from 1 via ``reset_index``.

    Parameters
    ----------
    fname : str
        Path of the CSV file. It is also used as the prefix of the printed
        diagnostics.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    df = pd.read_csv(fname)

    # Not every sheet has these columns. Catch only the "label not found"
    # case so real problems (typos, interrupts) are not silently swallowed.
    for optional_col in ('type_name', 'difference'):
        try:
            df.drop(optional_col, axis=1, inplace=True)
        except KeyError:
            print(fname + '   ***   ' + 'Error with dropping ' + optional_col + ' column')

    if fname == 'statistics - age.csv':
        df.fillna(value=0, inplace=True)

    try:
        df['date_stamp'] = pd.to_datetime(df['date_stamp'], format='%Y-%m-%d')
    except (KeyError, ValueError, TypeError, OverflowError):
        # KeyError: no date_stamp column; the others: values that do not parse.
        print(fname + '   ***   ' + 'Error with datetime')

    for col in list(df):
        if col not in ['name', 'date_stamp']:
            try:
                df[col] = df[col].astype(int)
            except (ValueError, TypeError, OverflowError):
                # Best effort: non-numeric text or NaN-bearing columns keep
                # their current dtype.
                continue
    return reset_index(df)

def json_to_df(fname):
    """Flatten a GeoJSON-style feature file into one row per feature.

    The file must hold a top-level ``features`` array whose items have
    ``geometry`` and ``properties`` objects. The keys of both objects become
    columns, side by side. Any resulting column named ``type`` is dropped, and
    the placeholder strings ``'nan'``, ``'NaN'``, ``'Nan'``, ``'None'`` and
    ``'NULL'`` are replaced with ``numpy.nan``.

    Parameters
    ----------
    fname : str
        Path of the JSON file. It is read as UTF-8, tolerating a BOM.

    Returns
    -------
    pandas.DataFrame
        One row per feature.
    """
    # Context manager so the handle is closed even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8-sig') as f:
        df_json = pd.read_json(f, orient='columns')

    # Each nested level is re-serialised and re-parsed so pandas expands the
    # dicts into columns. The text is wrapped in StringIO because passing a
    # literal JSON string to read_json is deprecated in recent pandas.
    features = pd.read_json(io.StringIO(df_json['features'].to_json()), orient='index')
    geometry = pd.read_json(io.StringIO(features['geometry'].to_json()), orient='index')
    properties = pd.read_json(io.StringIO(features['properties'].to_json()), orient='index')
    df = pd.concat([geometry, properties], axis=1)

    try:
        df.drop('type', axis=1, inplace=True)
    except KeyError:
        # Only "no such column" is expected here; anything else should surface.
        print('Couldnt drop type column')
    df.replace(['nan', 'NaN', 'Nan', 'None', 'NULL'], np.nan, inplace=True)
    return df

def age_check(df):
    """Report regions whose age buckets do not sum to ``age_general``.

    Every column whose name starts with ``'age'``, other than ``age_general``
    itself, counts as a bucket. The row-wise bucket total is compared with
    ``age_general``; the ``name`` values of rows where they differ are printed,
    or a success message if there are none. Nothing is returned.
    """
    bucket_cols = [c for c in list(df) if c.startswith('age') and c != 'age_general']
    by_region = df.set_index('name').copy()
    mismatch = by_region[bucket_cols].sum(axis=1) - by_region['age_general']
    offenders = mismatch[mismatch != 0]
    if len(offenders) == 0:
        print('All good with age pyramid data!')
        return
    print('These regions still dont add up:    ' + str(offenders.index.values.tolist()))

In [8]:
# Load the three statistics sheets through open_csv, which prints a line for
# each optional column or date_stamp step it could not apply.
empl = open_csv('statistics - empl.csv')
empl_cat = open_csv('statistics - empl_category.csv')
age = open_csv('statistics - age.csv')
# Disabled alternative that also folded admin.json into the lookup table:
#admin = pd.concat([json_to_df('admin.json'), pd.read_csv('admin_df.csv')]).reset_index()
# Lookup table; later cells read its 'name' and 'admin_id' columns.
admin = pd.read_csv('admin_df.csv')
# Print whether the age buckets add up to age_general for every region.
age_check(age)

statistics - empl.csv   ***   Error with dropping difference column
statistics - empl_category.csv   ***   Error with dropping type_name column
statistics - empl_category.csv   ***   Error with dropping difference column
statistics - empl_category.csv   ***   Error with datetime
statistics - age.csv   ***   Error with dropping type_name column
All good with age pyramid data!


In [23]:
# Swap each region name for the admin_id listed against it in `admin`;
# values with no match in the lookup are left unchanged.
age['name'] = age['name'].replace(pd.Series(admin['admin_id'].values, index=admin['name']).to_dict())
# errors='ignore' keeps the cell re-runnable: without it a second run, or a
# frame that never had the stray 'Unnamed: 0' column, raises KeyError.
age.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

In [25]:
# Write the age table to age.csv in the working directory. to_csv's defaults
# are used, so the row index is written as the first, unnamed column.
age.to_csv('age.csv')

In [4]:
# Long format: one row per (name, date_stamp, category). Rows with any missing
# value are dropped, 'value' is cast to int, and the rows are renumbered from 1.
empl_melted = reset_index(
    empl.melt(id_vars=['name', 'date_stamp'], var_name='category', value_name='value')
        .dropna()
        .astype({'value': int})
)
# Replace industry category names with their codes, and region names with admin IDs.
empl_melted['category'] = empl_melted['category'].replace(empl_cat.set_index('name_eng')['code'].to_dict())
empl_melted['name'] = empl_melted['name'].replace(admin.set_index('name')['admin_id'].to_dict())

In [5]:
# Write the long-format employment table to empl.csv in the working directory.
# to_csv's defaults are used, so the row index is written as the first column.
empl_melted.to_csv('empl.csv')

In [5]:
# Load the dynamics sheet, then replace region names with the admin_id listed
# against them in `admin` (unmatched names stay as they are).
dyn = open_csv("statistics - dynamics.csv")
dyn['name'] = dyn['name'].replace(admin.set_index('name')['admin_id'].to_dict())

statistics - dynamics.csv   ***   Error with dropping type_name column
statistics - dynamics.csv   ***   Error with dropping difference column


In [7]:
# Write the dynamics table to dyn.csv in the working directory. to_csv's
# defaults are used, so the row index is written as the first column.
dyn.to_csv('dyn.csv')

In [15]:
# Try to cast the count columns to int; errors='ignore' leaves a column
# unchanged when the cast fails instead of raising.
for col in ['population', 'born', 'died', 'move_in', 'move_out']:
    dyn[col] = dyn[col].astype(int, errors='ignore')
# Replace every cell equal to 1000000, in any column, with NaN. Integer columns
# that receive a NaN become float again.
# NOTE(review): 1000000 looks like a missing-data placeholder — confirm. This
# cell also sits below the dyn.to_csv cell, so a top-to-bottom run writes
# dyn.csv before this cleanup is applied — confirm that is intended.
dyn.replace(1000000, np.nan, inplace=True)

In [12]:
# Flatten the two OSM GeoJSON exports into DataFrames, one row per feature.
# NOTE(review): judging by the file names, pn holds point features and pl
# polygon features — confirm against the data.
pn = json_to_df('osm/soc_point.geojson')
pl = json_to_df('osm/soc_poly.geojson')

In [13]:
# Display the frame loaded from osm/soc_poly.geojson (pandas truncates the view).
pl

Unnamed: 0,coordinates,addr:city,addr:housenumber,addr:postcode,addr:street,after_school,amenity,brand,building,building:levels,...,power,religion,ruins,shop,smoothness,sport,supervised,surface,tourism,townhall:type
0,"[[[33.5465382, 49.6435805], [33.5466911, 49.64...",,,,,,post_office,,yes,,...,,,,,,,,,,
1,"[[[33.5480673, 49.6432058], [33.5484926, 49.64...",,,,,,kindergarten,,yes,,...,,,,,,,,,,
10,"[[[33.5426131, 49.6447864], [33.5434599, 49.64...",,,,,,school,,,,...,,,,,,,,,,
100,"[[[33.7300589, 49.7858664], [33.7302395, 49.78...",,,,,,,,shop,,...,,,,supermarket,,,,,,
1000,"[[[33.5491098, 49.6465692], [33.5492363, 49.64...",,,,,,,,yes,,...,,,,,,,,,,
1001,"[[[33.5484717, 49.6464818], [33.5486092, 49.64...",,,,,,,,yes,,...,,,,,,,,,,
1002,"[[[33.5485255, 49.6462773], [33.548589, 49.646...",,,,,,,,yes,,...,,,,,,,,,,
1003,"[[[33.548467, 49.6468892], [33.5486084, 49.646...",,,,,,,,yes,,...,,,,,,,,,,
1004,"[[[33.5494121, 49.6467718], [33.5495371, 49.64...",,,,,,,,yes,,...,,,,,,,,,,
1005,"[[[33.5493218, 49.6467076], [33.5494304, 49.64...",,,,,,,,yes,,...,,,,,,,,,,
