# Data Wrangling

for the project 'Demography behind household sizes in Switzerland and its change in time'

In [124]:
%matplotlib inline
import os
import pandas as pd
import json
import folium
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [125]:
# Root folder of the raw input files; the wage workbooks sit in a sub-folder of it.
data_folder = './data/'
wage_folder = '{}Gross_monthly_wage_GMW/'.format(data_folder)

## 1) Load and Prepare Data

In [409]:
# Load the raw inputs. With sheet_name=None, read_excel returns a dict that maps each
# sheet name to its DataFrame; the per-sheet helper functions of this notebook
# operate on such dicts.
hh_sizes = pd.read_excel(data_folder + 'HH_sizes_commune.xlsx', skiprows=3, skipfooter=10,
                         sheet_name=None, usecols=list(range(9)))
hh_size_agglo = pd.read_excel(data_folder + 'HH_aglo_rural .xlsx', skiprows=7, skipfooter=10,
                              sheet_name=None)
incomes = pd.read_csv(data_folder + 'Incomes.csv')
cantonal_parties = pd.read_excel(data_folder + 'Kantonale Parlamentswahlen.xls', skiprows=3,
                                 skipfooter=10, sheet_name=None)
foreign_population = pd.read_excel(data_folder + 'population_regions.xlsx', skiprows=4,
                                   skipfooter=10, sheet_name=None)

# The regional gross-monthly-wage workbooks are all read with identical options.
# usecols must be an explicit list of positions: the bare-int shorthand (usecols=10,
# meaning "columns 0..10 inclusive") was deprecated in pandas 0.24 and raises a
# ValueError in current pandas releases. list(range(11)) selects the same columns.
wage_read_options = dict(skiprows=42, skipfooter=10, sheet_name=None, usecols=list(range(11)))
wage_swiss_foreign_ch = pd.read_excel(wage_folder + 'GMW_switzerland.xlsx', **wage_read_options)
wage_swiss_foreign_lake_ge = pd.read_excel(wage_folder + 'GMW_lake_geneva.xlsx', **wage_read_options)
wage_swiss_foreign_mittelland = pd.read_excel(wage_folder + 'GMW_mittelland.xlsx', **wage_read_options)
wage_swiss_foreign_north = pd.read_excel(wage_folder + 'GMW_north.xlsx', **wage_read_options)
wage_swiss_foreign_zh = pd.read_excel(wage_folder + 'GMW_zurich.xlsx', **wage_read_options)
wage_swiss_foreign_east = pd.read_excel(wage_folder + 'GMW_east.xlsx', **wage_read_options)
wage_swiss_foreign_central = pd.read_excel(wage_folder + 'GMW_central.xlsx', **wage_read_options)
wage_swiss_foreign_ti = pd.read_excel(wage_folder + 'GMW_ticino.xlsx', **wage_read_options)

# Rental-price workbooks: no sheet_name is given, so read_excel returns a single
# DataFrame (the first sheet) for each file rather than a dict of sheets.
rental_read_options = dict(skiprows=6, skipfooter=10)
rental_11_13_rooms = pd.read_excel(data_folder + 'rental_prices_age_11-13.xls', **rental_read_options)
rental_13_16_all = pd.read_excel(data_folder + 'rental_prices_age.xlsx', **rental_read_options)
rental_age = pd.read_excel(data_folder + 'rental_agel_regions.xlsx', **rental_read_options)
rental_foreign_rooms = pd.read_excel(data_folder + 'rental_prices_nationality.xlsx', **rental_read_options)
rental_rooms_cities_11_13 = pd.read_excel(data_folder + 'rental_prices_room_size_11-13.xls', **rental_read_options)
rental_rooms_cities_12_14 = pd.read_excel(data_folder + 'rental_prices_room_size_12-14.xls', **rental_read_options)
rental_sqm_demographics = pd.read_excel(data_folder + 'rental_prices_size.xlsx', **rental_read_options)


In [408]:
#definitions to apply arguments to all sheets of the excel
#rename headers
def rename_header(df, header):
    """Give every sheet in ``df`` the same column labels.

    Parameters
    ----------
    df : dict
        Mapping of sheet name -> DataFrame (the structure produced by
        ``pd.read_excel(..., sheet_name=None)``).
    header : list
        New column labels. pandas raises ``ValueError`` when the number of
        labels does not match a sheet's number of columns.

    Returns
    -------
    dict
        The same ``df`` object; the columns of each DataFrame are replaced
        in place.
    """
    # Assigning to .columns mutates the frame itself, so there is nothing to
    # write back into the dict.
    for sheet in df.values():
        sheet.columns = header
    return df

def drop_NA(df):
    """Remove completely empty rows and columns from every sheet in ``df``.

    Cells that are empty or contain only whitespace are first turned into NaN,
    so that they count as missing. Rows that are entirely NaN are dropped,
    followed by columns that are entirely NaN.

    Parameters
    ----------
    df : dict
        Mapping of sheet name -> DataFrame.

    Returns
    -------
    dict
        The same ``df`` object with each sheet replaced by its cleaned version.
    """
    blank_cell = r'^\s*$'
    for sheet_name, sheet in df.items():
        cleaned = sheet.replace(blank_cell, np.nan, regex=True)
        # Rows first, then columns - the same order as the two separate steps.
        cleaned = cleaned.dropna(axis=0, how='all').dropna(axis=1, how='all')
        df[sheet_name] = cleaned
    return df

def select_X_rows(df,num_rows):
    """Truncate every sheet in ``df`` to its first ``num_rows`` rows.

    Parameters
    ----------
    df : dict
        Mapping of sheet name -> DataFrame.
    num_rows : int
        Number of leading rows to keep (positional slice ``[:num_rows]``).

    Returns
    -------
    dict
        The same ``df`` object with each sheet replaced by its truncated version.
    """
    leading_rows = slice(None, num_rows)
    for sheet_name, sheet in df.items():
        df[sheet_name] = sheet.iloc[leading_rows]
    return df

def rename_columns(df, new_names):
    """Rename columns of every sheet in ``df`` and convert index labels to strings.

    Parameters
    ----------
    df : dict
        Mapping of sheet name -> DataFrame.
    new_names : dict
        Mapping of old column name -> new column name, passed to
        ``DataFrame.rename(columns=...)``.

    Returns
    -------
    dict
        The same ``df`` object with each sheet replaced by the renamed frame.
    """
    for sheet_name, sheet in df.items():
        # index=str maps every row label through str(), so e.g. 0 becomes '0'.
        renamed = sheet.rename(index=str, columns=new_names)
        df[sheet_name] = renamed
    return df

def drop_index(df, ind):
    """Drop the rows labelled ``ind`` from every sheet and renumber the rows.

    Parameters
    ----------
    df : dict
        Mapping of sheet name -> DataFrame.
    ind : label or list of labels
        Row labels to remove. pandas raises ``KeyError`` if a label is missing
        from a sheet.

    Returns
    -------
    dict
        The same ``df`` object; each sheet is replaced by a frame without the
        given rows and with a fresh 0..n-1 integer index.
    """
    for sheet_name, sheet in df.items():
        # reset_index(drop=True) discards the old index directly. The previous
        # reset_index().drop('index', 1) passed `axis` positionally, which
        # pandas 2.x rejects, and it also relied on the old index being
        # materialised as a column literally named 'index'.
        df[sheet_name] = sheet.drop(index=ind).reset_index(drop=True)
    return df

def clean_wage_region_data(df):
    """Clean one regional wage workbook (dict of sheet name -> DataFrame).

    Every sheet has its empty rows/columns removed, is cut to its first
    8 rows and then receives the ten column labels listed below.

    Returns the same dict that was passed in.
    """
    # NOTE(review): 'woman_quartile_to' / 'man_quartile_to' are spelled
    # differently from the other 'women_' / 'men_' labels. They are kept
    # unchanged here because other cells may select columns by these names.
    header = [
        "Residents",
        "total_median", "quartile_from", "quartile_to",
        "women_median", "women_quartile_from", "woman_quartile_to",
        "men_median", "men_quartile_from", "man_quartile_to",
    ]
    without_empty = drop_NA(df)
    first_rows = select_X_rows(without_empty, 8)
    return rename_header(first_rows, header)

In [354]:
# Household sizes: assign the nine column labels to every sheet, then preview 2017.
col_header = (
    ['Commune', 'Total']
    + ['{}-person households'.format(size) for size in range(1, 6)]
    + ['6-person or larger households', 'not plausible hh']
)
hh_sizes = rename_header(hh_sizes, col_header)
hh_sizes['2017'].head()

Unnamed: 0,Commune,Total,1-person households,2-person households,3-person households,4-person households,5-person households,6-person or larger households,not plausible hh
0,Switzerland,8317056,1320230,2434588,1477575,1892056,792445,400162,< 0.5
1,0001 Aeugst am Albis,1931,233,566,351,516,170,95,< 0.5
2,0002 Affoltern am Albis,11741,1771,3498,1995,2628,1205,644,< 0.5
3,0003 Bonstetten,5491,646,1606,960,1620,500,159,0
4,0004 Hausen am Albis,3641,432,1084,609,980,365,171,0


In [304]:
# Cantonal parliament elections: strip empty rows/columns, keep the first 26 rows
# of each sheet and label the first column, then preview the most recent period.
rename_cols = {"Unnamed: 0": "Canton"}
cantonal_parties = rename_columns(
    select_X_rows(drop_NA(cantonal_parties), 26),
    rename_cols,
)
cantonal_parties['aktuell (2015-2018)'].head()

Unnamed: 0,Canton,Wahljahr 2),Wahlbeteiligung,FDP 6),CVP 7),SP,SVP,LPS 6),EVP,CSP,...,PSA,GPS,FGA 8),Sol.,SD,EDU,Lega,MCR,Übrige 11),Total
2,Zürich,2015.0,32.652497,17.32782,4.878712,19.716444,30.023215,,4.271767,,...,,7.218776,2.977664,,,2.662277,,,0.669707,100
3,Bern,2018.0,30.516313,11.71791,0.671415,22.328775,26.760869,,6.172897,,...,0.681873,10.104467,0.495841,,0.179432,3.710624,,,0.912781,100
4,Luzern,2015.0,38.741267,21.039516,30.862513,11.848915,24.115646,,0.199143,,...,,6.700011,,,,,,,0.036129,100
5,Uri 1),2016.0,61.989056,26.856698,31.298814,12.984966,24.053191,,,,...,,2.378061,,,,,,,2.42827,100
6,Schwyz,2016.0,37.747088,21.628996,27.167707,12.925363,33.11509,,0.304428,,...,,0.800215,,,,,,,1.513578,100


In [312]:
# Population by region: strip empty rows/columns, give the columns readable names,
# drop the first two remaining label rows and preview 2017.
rename_cols = dict(zip(
    ["Unnamed: 0", "Unnamed: 1", "Urban core", "Area influenced", "Area beyond"],
    ["Region", "Population", "Urban core area", "influenced by urban cores",
     "influenced beyond urban cores"],
))
# The labels are strings because rename_columns converts the row index to str.
drop_indices = ['0', '1']
foreign_population = drop_index(
    rename_columns(drop_NA(foreign_population), rename_cols),
    drop_indices,
)
foreign_population['2017'].head()

Unnamed: 0,Region,Population,0–19,20–64,65 and over,Male,Female,Swiss,Foreigner,Single,Married,Widowed,Divorced,Unmarried,In a registered partnership,Partnership dissolved,Urban core area,influenced by urban cores,influenced beyond urban cores
0,Total,8484130.0,1700494.0,5233271.0,1550365.0,4206434.0,4277696.0,6357738.0,2126392.0,3755372.0,3592368.0,406117.0,711025.0,588.0,16435.0,2001.0,5339043,1852223,1292864
1,Lake Geneva region,1629841.0,346875.0,1005903.0,277063.0,798780.0,831061.0,1086874.0,542967.0,747544.0,661155.0,73063.0,143847.0,163.0,3558.0,458.0,1129428,344955,155458
2,Vaud,793129.0,174446.0,488905.0,129778.0,389504.0,403625.0,527117.0,266012.0,368315.0,319792.0,34382.0,68554.0,57.0,1786.0,216.0,491022,219482,82625
3,Valais,341463.0,67825.0,207639.0,65999.0,169162.0,172301.0,263082.0,78381.0,145510.0,149095.0,18326.0,28051.0,35.0,378.0,52.0,187341,81289,72833
4,Geneva,495249.0,104604.0,309359.0,81286.0,240114.0,255135.0,296675.0,198574.0,233719.0,192268.0,20355.0,47242.0,71.0,1394.0,190.0,451065,44184,0


In [410]:
# Run the shared wage clean-up on each regional workbook, in the same order as before.
(
    wage_swiss_foreign_ch,
    wage_swiss_foreign_lake_ge,
    wage_swiss_foreign_mittelland,
    wage_swiss_foreign_north,
    wage_swiss_foreign_zh,
    wage_swiss_foreign_east,
    wage_swiss_foreign_central,
    wage_swiss_foreign_ti,
) = map(clean_wage_region_data, (
    wage_swiss_foreign_ch,
    wage_swiss_foreign_lake_ge,
    wage_swiss_foreign_mittelland,
    wage_swiss_foreign_north,
    wage_swiss_foreign_zh,
    wage_swiss_foreign_east,
    wage_swiss_foreign_central,
    wage_swiss_foreign_ti,
))

**Note:** Only the wage figures for the position level "No management function" were chosen.

In [420]:
# Display the cleaned '2016' sheet of the Zurich wage workbook (GMW_zurich.xlsx).
wage_swiss_foreign_zh['2016']

Unnamed: 0,Residents,total_median,quartile_from,quartile_to,women_median,women_quartile_from,woman_quartile_to,men_median,men_quartile_from,man_quartile_to
1,TOTAL,5921,4772.0,7379.0,5374,4459,6864,6282.0,5166.0,7794.0
3,Swiss,6136,4951.0,7559.0,5620,4633,7000,6584.0,5398.0,8113.0
5,Foreigners,5534,4498.0,6955.0,4826,4135,6468,5849.0,4860.0,7175.0
7,Short-term permit (L),5471,4444.0,6734.0,[4 473],[3 650],[6 588],5568.0,4882.0,6746.0
8,Temporary residence permit (B),5284,4294.0,7059.0,4862,4034,6795,5514.0,4509.0,7221.0
9,Permanent residence permit (C),5595,4584.0,6872.0,4782,4170,6189,5967.0,5060.0,7136.0
10,Cross-border worker (G),5860,4808.0,7263.0,5035,4385,6469,6157.0,5178.0,7579.0
11,Other,4234,3784.0,5709.0,4329,3796,5773,4162.0,3774.0,5590.0
