In [1]:
import pandas as pd

In [2]:
# Read every column as text so that PLR_ID keeps its leading zeros
df_overall, df_fam_part, df_youth = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        'raw/gesamt.csv',
        'raw/innerfam_partner.csv',
        'raw/jugend.csv',
    )
)

In [3]:
# We'll merge on PLR_ID later: keep a name/id lookup taken from the overall
# frame, with the two columns renamed to upper case
df_plr = (
    df_overall[['plr_name', 'plr_id']]
    .rename(columns={'plr_name': 'PLR_NAME', 'plr_id': 'PLR_ID'})
)

In [4]:
# Columns of the raw overall frame. The other two raw frames are expected to
# have the same columns (this is not verified here).
df_overall.columns

Index(['gisid', 'kategorie', 'bez_name', 'bez_id', 'pgr_name', 'pgr_id',
       'bzr_name', 'bzr_id', 'plr_name', 'plr_id'],
      dtype='object')

In [5]:
def preprocess_inplace(df, prefix):
    """Split the category column and reduce ``df`` to the merge key, in place.

    Entries of the 'kategorie' column look like '1 - Niedrig'. The part before
    the first ' - ' is written to the column ``prefix + 'n'`` and everything
    after it to ``prefix + 'v'``. Afterwards all columns listed in
    ``drop_list`` are removed and 'plr_id' is renamed to 'PLR_ID'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding 'kategorie', 'plr_id' and the other columns listed in
        ``drop_list``. It is modified in place.
    prefix : str
        Prefix for the two new column names.

    Returns
    -------
    None
        The result is the mutated ``df``.

    Raises
    ------
    KeyError
        If 'kategorie' or any column in ``drop_list`` is missing, which is
        also the case when the function is called twice on the same frame.
    """
    # n=1 splits at the first separator only, so a label that itself contains
    # ' - ' is kept whole instead of being truncated. reindex guarantees that
    # both result columns exist even when no entry contains the separator
    # (column 1 is then entirely missing values instead of raising KeyError).
    entries = (
        df['kategorie']
        .str.split(' - ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df[prefix + 'n'] = entries[0]
    df[prefix + 'v'] = entries[1]
    # Only keep relevant columns: the id plus the two new category columns
    drop_list = [
        'gisid', 'kategorie', 'bez_name', 'bez_id', 'pgr_name', 'pgr_id',
        'bzr_name', 'bzr_id', 'plr_name']
    df.drop(drop_list, axis=1, inplace=True)
    df.rename(columns={'plr_id': 'PLR_ID'}, inplace=True)

# Each call mutates its frame in place and drops the 'kategorie' column, so
# this cell cannot be run twice without reloading the raw frames first.
preprocess_inplace(df=df_overall, prefix='overall_')
preprocess_inplace(df=df_fam_part, prefix='innerfam_partner_')
preprocess_inplace(df=df_youth, prefix='youth_')

In [6]:
# Outer-join the three preprocessed frames onto the name/id lookup via PLR_ID
df = (
    df_plr
    .merge(df_overall, how='outer', on='PLR_ID')
    .merge(df_fam_part, how='outer', on='PLR_ID')
    .merge(df_youth, how='outer', on='PLR_ID')
)

In [7]:
# Show the merged frame: PLR name and id plus the three pairs of category columns
df

Unnamed: 0,PLR_NAME,PLR_ID,overall_n,overall_v,innerfam_partner_n,innerfam_partner_v,youth_n,youth_v
0,Stülerstraße,01100101,2,Mittel,2,Mittel,1,Niedrig
1,Großer Tiergarten,01100102,4,Stark erhöht,3,Erhöht,4,Stark erhöht
2,Lützowstraße,01100103,3,Erhöht,3,Erhöht,2,Mittel
3,Körnerstraße,01100104,2,Mittel,2,Mittel,2,Mittel
4,Wilhelmstraße,01100205,3,Erhöht,2,Mittel,3,Erhöht
...,...,...,...,...,...,...,...,...
537,Rollbergesiedlung,12601032,2,Mittel,3,Erhöht,2,Mittel
538,Treuenbrietzener Straße,12601133,2,Mittel,4,Stark erhöht,2,Mittel
539,Märkisches Zentrum,12601134,2,Mittel,3,Erhöht,3,Erhöht
540,Dannenwalder Weg,12601235,2,Mittel,3,Erhöht,2,Mittel


In [8]:
# Column names of the merged frame
df.columns

Index(['PLR_NAME', 'PLR_ID', 'overall_n', 'overall_v', 'innerfam_partner_n',
       'innerfam_partner_v', 'youth_n', 'youth_v'],
      dtype='object')

In [9]:
# Build the "easy" version for export: identifiers plus the textual category
# labels only, with human-readable column headers
easy_columns = [
    'PLR_NAME',
    'PLR_ID',
    'overall_v',
    'innerfam_partner_v',
    'youth_v',
]
rename_dict = {
    'overall_v': 'Violent Delinquency Overall',
    'innerfam_partner_v': 'Domestic and Partner Violence',
    'youth_v': 'Youth Violence',
}
df_easy = df.loc[:, easy_columns].rename(columns=rename_dict)

In [10]:
# Show the reduced frame with the renamed label columns
df_easy

Unnamed: 0,PLR_NAME,PLR_ID,Violent Delinquency Overall,Domestic and Partner Violence,Youth Violence
0,Stülerstraße,01100101,Mittel,Mittel,Niedrig
1,Großer Tiergarten,01100102,Stark erhöht,Erhöht,Stark erhöht
2,Lützowstraße,01100103,Erhöht,Erhöht,Mittel
3,Körnerstraße,01100104,Mittel,Mittel,Mittel
4,Wilhelmstraße,01100205,Erhöht,Mittel,Erhöht
...,...,...,...,...,...
537,Rollbergesiedlung,12601032,Mittel,Erhöht,Mittel
538,Treuenbrietzener Straße,12601133,Mittel,Stark erhöht,Mittel
539,Märkisches Zentrum,12601134,Mittel,Erhöht,Erhöht
540,Dannenwalder Weg,12601235,Mittel,Erhöht,Mittel


In [11]:
# Write both versions to disk; index=False leaves the row index out of the files
df.to_csv(path_or_buf='violent_delinquency.csv', index=False)
df_easy.to_csv(path_or_buf='violent_delinquency_easy.csv', index=False)