In [39]:
import pandas as pd

In [40]:
# Shared parser settings for all four raw files:
#   * dtype=str keeps every column as text, so identifiers such as PLR_ID
#     do not lose their leading zeros;
#   * the value -9999 is parsed as NA.
read_opts = {"dtype": str, "na_values": -9999}

df_index = pd.read_csv("raw/mss_2021_index_indikatoren.csv", **read_opts)
df_indizes = pd.read_csv("raw/mss_2021_indizes.csv", **read_opts)
df_kontext1 = pd.read_csv("raw/mss_2021_kontext_indikatoren.csv", **read_opts)
df_kontext2 = pd.read_csv("raw/mss_2021_kontext_indikatoren_k08_k14_k15.csv", **read_opts)

In [41]:
# PLR_NAME, PLR_ID and EW are the same in every frame, so they are kept once
# here and PLR_ID serves as the merge key later on. df_kontext2 is the
# exception: it follows the old PLR convention.
# NOTE(review): this snapshot is taken before the validity filter below, so it
# still contains the rows whose KOM is not "gültig" -- confirm that is intended.
df_plr = df_index[["PLR_NAME", "PLR_ID", "EW"]].copy()


def _drop_empty_cols_and_invalid_rows(frame):
    """Return `frame` without all-NA columns, keeping only rows where KOM == "gültig"."""
    without_empty_cols = frame.dropna(axis="columns", how="all")
    is_valid = without_empty_cols["KOM"] == "gültig"
    return without_empty_cols[is_valid]


# Apply the same clean-up to each of the four raw frames.
df_index, df_indizes, df_kontext1, df_kontext2 = (
    _drop_empty_cols_and_invalid_rows(frame)
    for frame in (df_index, df_indizes, df_kontext1, df_kontext2)
)

In [42]:
# Show the column names of every frame, each preceded by a label, so the
# relevant features can be picked by hand.
labelled_frames = (
    ("index", df_index),
    ("indizes", df_indizes),
    ("kontext1", df_kontext1),
    ("kontext2", df_kontext2),
)
tuple(
    item
    for label, frame in labelled_frames
    for item in (label, frame.columns)
)

('index',
 Index(['ZEIT', 'PLR_ID', 'PLR_NAME', 'EW', 'BEZ_ID', 'S1', 'S1_SD_GR',
        'S1_AG_GR', 'S3', 'S3_SD_GR', 'S3_AG_GR', 'S4', 'S4_SD_GR', 'S4_AG_GR',
        'D1', 'D1_SD_GR', 'D1_AG_GR', 'D3', 'D3_SD_GR', 'D3_AG_GR', 'D4',
        'D4_SD_GR', 'D4_AG_GR', 'KOM'],
       dtype='object'),
 'indizes',
 Index(['ZEIT', 'PLR_ID', 'PLR_NAME', 'EW', 'BEZ_ID', 'SI_N', 'SI_V', 'DI_N',
        'DI_V', 'SDI', 'SDI_N', 'SDI_V', 'KOM'],
       dtype='object'),
 'kontext1',
 Index(['ZEIT', 'PLR_ID', 'PLR_NAME', 'EW', 'BEZ_ID', 'KI_01', 'KI_01_SD_GR',
        'KI_01_AG_GR', 'KI_02', 'KI_02_SD_GR', 'KI_02_AG_GR', 'KI_03',
        'KI_03_SD_GR', 'KI_03_AG_GR', 'KI_04', 'KI_04_SD_GR', 'KI_04_AG_GR',
        'KI_05', 'KI_05_SD_GR', 'KI_05_AG_GR', 'KI_16', 'KI_16_SD_GR',
        'KI_16_AG_GR', 'KI_06', 'KI_06_SD_GR', 'KI_06_AG_GR', 'KI_17',
        'KI_17_SD_GR', 'KI_17_AG_GR', 'KI_07', 'KI_07_SD_GR', 'KI_07_AG_GR',
        'KI_09', 'KI_09_AG_GR', 'KI_10', 'KI_10_SD_GR', 'KI_10_AG_GR', 'KI_11',

## Select relevant features

In [43]:
# Reduce each frame to its hand-picked feature columns.
# PLR_ID is kept in every frame.

features_index = ["PLR_ID", "S1", "S3", "S4", "D1", "D3", "D4"]
df_index = df_index[features_index]

features_indizes = ["PLR_ID", "SI_N", "SI_V", "DI_N", "DI_V", "SDI_V"]
df_indizes = df_indizes[features_indizes]

features_kontext1 = [
    "PLR_ID",
    "KI_01", "KI_02", "KI_03", "KI_04", "KI_05",
    "KI_16", "KI_06", "KI_17", "KI_07", "KI_09",
    "KI_10", "KI_11", "KI_12", "KI_13",
]
df_kontext1 = df_kontext1[features_kontext1]

# df_kontext2 additionally keeps its own PLR_NAME and EW columns.
features_kontext2 = ["PLR_ID", "PLR_NAME", "EW", "KI_08", "KI_14", "KI_15"]
df_kontext2 = df_kontext2[features_kontext2]

In [51]:
# Sequential outer merge on PLR_ID, starting from the PLR master columns
# (plain chained calls instead of functools.reduce).
# validate="one_to_one" makes pandas raise a MergeError if PLR_ID is duplicated
# on either side of a merge, instead of silently multiplying rows.
merge_opts = {"on": "PLR_ID", "how": "outer", "validate": "one_to_one"}
df = (
    df_plr
    .merge(df_index, **merge_opts)
    .merge(df_indizes, **merge_opts)
    .merge(df_kontext1, **merge_opts)
)

display(df.shape)
display(df)

(542, 28)

Unnamed: 0,PLR_NAME,PLR_ID,EW,S1,S3,S4,D1,D3,D4,SI_N,...,KI_05,KI_16,KI_06,KI_17,KI_07,KI_09,KI_10,KI_11,KI_12,KI_13
0,Stülerstraße,01100101,3419,4.22594,8.04329,19.39799,1.46443,0.38963,3.96175,2,...,59.93,40.57,-1.31,70.44,8.55,0,55.92,38.36,0.51,-4.83
1,Großer Tiergarten,01100102,1791,1.15858,3.46175,8.33333,0.24344,0.92619,4.92424,1,...,66.95,50.75,6.17,71.29,3.08,0.61,40.15,49.56,5.18,1.51
2,Lützowstraße,01100103,5211,5.31544,15.52485,32.63158,0.93486,-2.16074,-4.86842,2,...,58.36,35.56,0.8,65.84,21.04,6.04,57.49,29.78,2.41,0.3
3,Körnerstraße,01100104,4636,6.18102,18.01122,39.93994,1.249,-0.7112,-2.65548,2,...,64.58,38.55,-1.4,75.94,25.29,8.24,59.03,32.69,-2.09,-2.65
4,Wilhelmstraße,01100205,2573,2.8826,6.33502,19.75806,0.50703,-1.64835,0.72821,2,...,61.91,44.07,3.08,74.25,7.05,0,56.44,42.22,3.04,-5.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,Rollbergesiedlung,12601032,5922,10.6202,32.53968,65.22076,1.62241,-2.36978,-0.99239,4,...,49.95,31.29,1.48,72.64,56.03,99.75,65.45,16.24,-0.45,-1.64
538,Treuenbrietzener Straße,12601133,11610,10.84252,28.29457,54.34873,2.83575,-4.17485,-5.44593,4,...,52.7,26.33,0.97,71.21,43.32,100,71.7,13.31,-0.26,-0.64
539,Märkisches Zentrum,12601134,14722,9.34148,23.14224,51.52757,1.91236,-3.22177,-5.82263,4,...,45.65,22.81,2.67,75.91,41.73,74.01,70.65,13.97,1.04,2.89
540,Dannenwalder Weg,12601235,10656,9.55309,27.41179,52.52393,1.76839,-2.17966,-3.94666,4,...,57.78,30.47,1.97,77.95,46.87,84.04,72.83,14.93,-0.08,1.55


In [55]:
# List the merged frame's current column names; these are the keys that the
# rename dictionaries defined below have to cover.
df.columns

Index(['PLR_NAME', 'PLR_ID', 'EW', 'S1', 'S3', 'S4', 'D1', 'D3', 'D4', 'SI_N',
       'SI_V', 'DI_N', 'DI_V', 'SDI_V', 'KI_01', 'KI_02', 'KI_03', 'KI_04',
       'KI_05', 'KI_16', 'KI_06', 'KI_17', 'KI_07', 'KI_09', 'KI_10', 'KI_11',
       'KI_12', 'KI_13'],
      dtype='object')

In [60]:
# Column code -> English label in Snake_Case (mapping drafted with ChatGPT).
# NOTE(review): the visible cells only apply rename_dict2; this variant is not
# used there -- confirm whether it is still needed.
rename_dict1 = dict(
    # Columns carried over from df_plr
    PLR_NAME="PLR_NAME",
    PLR_ID="PLR_ID",
    EW="Resident_Count",
    # Columns selected from df_index
    S1="Unemployment_Percent",
    S3="Social_Benefits_Receivers_Percent",
    S4="Child_Poverty_Percent",
    D1="Change_in_Unemployment",
    D3="Change_in_Social_Benefits_Receivers",
    D4="Change_in_Child_Poverty",
    # Columns selected from df_indizes
    SI_N="Status_Index_num",
    SI_V="Status_Index_verb",
    DI_N="Dynamics_Index_num",
    DI_V="Dynamics_Index_verb",
    # NOTE(review): SI_V / DI_V are labelled "..._verb" but SDI_V is labelled
    # "..._Value" -- confirm which meaning of the _V suffix is correct.
    SDI_V="Status_Dynamics_Index_Value",
    # Columns selected from df_kontext1
    KI_01="Youth_Unemployment",
    KI_02="Single_Parents",
    KI_03="Old_Age_Poverty",
    KI_04="Children_Adolescents_Migration_Background",
    KI_05="Residents_with_Migration_Background",
    KI_16="Foreigners",
    KI_06="Change_in_Foreigners_Over_Two_Years",
    KI_17="Non_EU_Foreigners",
    KI_07="Foreign_Transfer_Recipients_SGB_II",
    KI_09="Residents_in_Simple_Living_Conditions",
    KI_10="Residence_Duration_Over_Five_Years",
    KI_11="Migration_Volume",
    KI_12="Total_Migration_Balance",
    KI_13="Migration_Balance_Children_Under_6_Years",
)

In [59]:
# Column code -> English label with spaces; this is the mapping applied to
# build df_simple.
rename_dict2 = dict([
    # Columns carried over from df_plr
    ("PLR_NAME", "PLR_NAME"),
    ("PLR_ID", "PLR_ID"),
    ("EW", "Resident Count"),
    # Columns selected from df_index
    ("S1", "Unemployment Percent"),
    ("S3", "Social Benefits Receivers Percent"),
    ("S4", "Child Poverty Percent"),
    ("D1", "Change in Unemployment"),
    ("D3", "Change in Social Benefits Receivers"),
    ("D4", "Change in Child Poverty"),
    # Columns selected from df_indizes
    ("SI_N", "Status Index num"),
    ("SI_V", "Status Index verb"),
    ("DI_N", "Dynamics Index num"),
    ("DI_V", "Dynamics Index verb"),
    # NOTE(review): SI_V / DI_V are labelled "... verb" but SDI_V is labelled
    # "... Value" -- confirm which meaning of the _V suffix is correct.
    ("SDI_V", "Status Dynamics Index Value"),
    # Columns selected from df_kontext1
    ("KI_01", "Youth Unemployment"),
    ("KI_02", "Single Parents"),
    ("KI_03", "Old Age Poverty"),
    ("KI_04", "Children Adolescents Migration Background"),
    ("KI_05", "Residents with Migration Background"),
    ("KI_16", "Foreigners"),
    ("KI_06", "Change in Foreigners Over Two Years"),
    ("KI_17", "Non EU Foreigners"),
    ("KI_07", "Foreign Transfer Recipients SGB II"),
    ("KI_09", "Residents in Simple Living Conditions"),
    ("KI_10", "Residence Duration Over Five Years"),
    ("KI_11", "Migration Volume"),
    ("KI_12", "Total Migration Balance"),
    ("KI_13", "Migration Balance Children Under 6 Years"),
])


In [61]:
# Build a copy of the merged frame whose columns carry the readable labels
# from rename_dict2; df itself keeps the original column codes.
df_simple = df.rename(rename_dict2, axis="columns")

In [62]:
# Display the renamed frame to check the new column labels.
df_simple

Unnamed: 0,PLR_NAME,PLR_ID,Resident Count,Unemployment Percent,Social Benefits Receivers Percent,Child Poverty Percent,Change in Unemployment,Change in Social Benefits Receivers,Change in Child Poverty,Status Index num,...,Residents with Migration Background,Foreigners,Change in Foreigners Over Two Years,Non EU Foreigners,Foreign Transfer Recipients SGB II,Residents in Simple Living Conditions,Residence Duration Over Five Years,Migration Volume,Total Migration Balance,Migration Balance Children Under 6 Years
0,Stülerstraße,01100101,3419,4.22594,8.04329,19.39799,1.46443,0.38963,3.96175,2,...,59.93,40.57,-1.31,70.44,8.55,0,55.92,38.36,0.51,-4.83
1,Großer Tiergarten,01100102,1791,1.15858,3.46175,8.33333,0.24344,0.92619,4.92424,1,...,66.95,50.75,6.17,71.29,3.08,0.61,40.15,49.56,5.18,1.51
2,Lützowstraße,01100103,5211,5.31544,15.52485,32.63158,0.93486,-2.16074,-4.86842,2,...,58.36,35.56,0.8,65.84,21.04,6.04,57.49,29.78,2.41,0.3
3,Körnerstraße,01100104,4636,6.18102,18.01122,39.93994,1.249,-0.7112,-2.65548,2,...,64.58,38.55,-1.4,75.94,25.29,8.24,59.03,32.69,-2.09,-2.65
4,Wilhelmstraße,01100205,2573,2.8826,6.33502,19.75806,0.50703,-1.64835,0.72821,2,...,61.91,44.07,3.08,74.25,7.05,0,56.44,42.22,3.04,-5.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,Rollbergesiedlung,12601032,5922,10.6202,32.53968,65.22076,1.62241,-2.36978,-0.99239,4,...,49.95,31.29,1.48,72.64,56.03,99.75,65.45,16.24,-0.45,-1.64
538,Treuenbrietzener Straße,12601133,11610,10.84252,28.29457,54.34873,2.83575,-4.17485,-5.44593,4,...,52.7,26.33,0.97,71.21,43.32,100,71.7,13.31,-0.26,-0.64
539,Märkisches Zentrum,12601134,14722,9.34148,23.14224,51.52757,1.91236,-3.22177,-5.82263,4,...,45.65,22.81,2.67,75.91,41.73,74.01,70.65,13.97,1.04,2.89
540,Dannenwalder Weg,12601235,10656,9.55309,27.41179,52.52393,1.76839,-2.17966,-3.94666,4,...,57.78,30.47,1.97,77.95,46.87,84.04,72.83,14.93,-0.08,1.55


In [64]:
# Write the results without the pandas index column:
#   * the merged frame with the original column codes,
#   * the same frame with the readable column labels,
#   * df_kontext2 on its own, since it follows the old PLR convention.
exports = (
    ("mss_2021.csv", df),
    ("mss_2021_easy.csv", df_simple),
    ("mss_2021_k_08_14_15_old_plr.csv", df_kontext2),
)
for target, frame in exports:
    frame.to_csv(target, index=False)