In [113]:
import pandas as pd
import numpy as np

## Hospital details

In [124]:
# read the staged hospital details export into a DataFrame
details_csv_path = '../data/in/staging/atlas_details.csv'
df_hospital_details = pd.read_csv(details_csv_path)

In [125]:
# Clean up the emergency_service column: keep only the text before the first
# line break. The vectorised .str accessor is used instead of a Python lambda so
# that missing values (NaN) pass through as NaN instead of raising
# AttributeError ('float' object has no attribute 'split').
df_hospital_details['emergency_service'] = (
    df_hospital_details['emergency_service'].str.split('\n').str[0]
)

# Create new column has_emergency_service: True for every label that reports an
# emergency service (levels 1-3, or "yes, level not yet agreed"), False for
# "no participation" and "no information available".
has_emergency_service_mapping = {
    'Stufe 1 - Basisnotfallversorgung': True,
    'Stufe 2 - Erweiterte Notfallversorgung': True,
    'Stufe 3 - Umfassende Notfallversorgung': True,
    'Ja, die Notfallstufe ist noch nicht vereinbart.': True,
    'Keine Teilnahme an einer Notfallstufe.': False,
    'Keine Information verfügbar.': False
}
# Series.map leaves NaN for missing values and for labels not listed above
df_hospital_details['has_emergency_service'] = df_hospital_details['emergency_service'].map(has_emergency_service_mapping)

# Rename column emergency_service to emergency_service_level.
# NOTE: after this rename the cell cannot be re-run on the same frame, because
# the 'emergency_service' column no longer exists; reload the data first.
df_hospital_details = df_hospital_details.rename(columns={'emergency_service': 'emergency_service_level'})

In [126]:
# Numeric encoding of the emergency service labels:
#   1, 2, 3 -> agreed level ("Stufe 1/2/3")
#   0       -> emergency service exists, level not yet agreed
#  -1       -> no participation in an emergency level
#  -2       -> no information available
emergency_service_level_mapping = dict([
    ('Stufe 1 - Basisnotfallversorgung', 1),
    ('Stufe 2 - Erweiterte Notfallversorgung', 2),
    ('Stufe 3 - Umfassende Notfallversorgung', 3),
    ('Ja, die Notfallstufe ist noch nicht vereinbart.', 0),
    ('Keine Teilnahme an einer Notfallstufe.', -1),
    ('Keine Information verfügbar.', -2),
])

# translate the text labels in place; labels without an entry become NaN
level_labels = df_hospital_details['emergency_service_level']
df_hospital_details['emergency_service_level'] = level_labels.map(emergency_service_level_mapping)

In [127]:
# write the cleaned details table to the db import folder (row index omitted)
details_out_path = '../data/db_csv/hospital_details.csv'
df_hospital_details.to_csv(details_out_path, index=False)

## Hospital certificates

In [44]:
# read the staged hospital certificates export into a DataFrame
certificates_csv_path = '../data/in/staging/atlas_certificates.csv'
df_hospital_certificates = pd.read_csv(certificates_csv_path)

In [47]:
# strip a leading 'Zertifizierte ' / 'Zertifiziertes ' prefix (and any
# whitespace directly after it) from the certificate names
certificate_prefix_pattern = r'^(Zertifizierte |Zertifiziertes )\s*'
df_hospital_certificates['certificate'] = df_hospital_certificates['certificate'].replace(
    to_replace=certificate_prefix_pattern,
    value='',
    regex=True,
)

In [49]:
# write the cleaned certificates table to the db import folder (row index omitted)
certificates_out_path = '../data/db_csv/hospital_certificates.csv'
df_hospital_certificates.to_csv(certificates_out_path, index=False)

## Hospital departments

### Data In

In [98]:
# Both label dictionaries are read with department_id kept as text.
label_read_options = {'dtype': {'department_id': str}}
df_department_labels = pd.read_csv('../data/in/staging/departments_general_dict.csv', **label_read_options)
df_department_focus_labels = pd.read_csv('../data/in/staging/departments_focus_dict.csv', **label_read_options)

# stack general and focus labels into one lookup table (general labels first)
df_department_labels_concat = pd.concat(
    [df_department_labels, df_department_focus_labels]
)

# hospital department data that the label dictionary is applied to
df_hospital_departments = pd.read_csv('../data/in/staging/atlas_departments.csv')

### Processing

#### Prepare for dictionary application

In [99]:
# Normalise department names so they can be looked up in the label dictionary.
# All steps use the vectorised .str accessor instead of Python lambdas, so a
# missing department name (NaN) is passed through instead of raising a TypeError.
department_names = df_hospital_departments['department_name']

# Names containing 'ohne Differenzierung' (= "no focus specified") keep only the
# part before the first '/'. The result is stripped so that 'X / ohne ...'
# becomes 'X' rather than 'X ' (presumably a trailing blank would not match any
# dictionary label - verify against the label files).
no_focus_mask = department_names.str.contains('ohne Differenzierung', regex=False, na=False)
department_names = department_names.where(
    ~no_focus_mask,
    department_names.str.split('/').str[0].str.strip(),
)

department_names = (
    department_names
    # streamline separators: '/ ' and ' /' both become a plain '/'
    .str.replace('/ ', '/', regex=False)
    .str.replace(' /', '/', regex=False)
    # correct ' (Intensivabteilung)' to '/Intensivmedizin'
    .str.replace(' (Intensivabteilung)', '/Intensivmedizin', regex=False)
)

# Whole-value renames: only names that equal a key exactly are replaced
# (stroke-unit labels lose their legal reference, the rheumatology label gains
# the 'Schwerpunkt' prefix).
department_names = department_names.replace({
    'Neurologie/Schwerpunkt Schlaganfallpatienten (stroke units, Artikel 7 § 1 Abs. 3 GKV-SolG)': 'Neurologie/Schwerpunkt Schlaganfallpatienten',
    'Innere Medizin/Schwerpunkt Schlaganfallpatienten (stroke units, Artikel 7 § 1 Abs. 3 GKV-SolG)': 'Innere Medizin/Schwerpunkt Schlaganfallpatienten',
    'Orthopädie/Rheumatologie': 'Orthopädie/Schwerpunkt Rheumatologie',
})

df_hospital_departments['department_name'] = department_names

#### Apply dictionary

In [100]:
# Create dictionary to translate department labels to department ids.
# Built in a single pass instead of re-filtering the whole label table once per
# label; when a label occurs more than once, the id of its first row is used.
unique_department_labels = df_department_labels_concat.drop_duplicates(subset='department_label', keep='first')
departments_dict = dict(zip(
    unique_department_labels['department_label'],
    unique_department_labels['department_id'],
))


def map_values(value):
    """Return the department id for a department label.

    Labels that are not present in ``departments_dict`` are returned unchanged,
    so an unmapped label ends up as-is in the ``department_id`` column.
    """
    return departments_dict.get(value, value)


# translate every (cleaned) department name into its department id
df_hospital_departments['department_id'] = df_hospital_departments['department_name'].apply(map_values)

# remove department name column
df_hospital_departments = df_hospital_departments.drop(columns=['department_name'])

# rename columns for the database: department_count -> treatment_count and
# department_label -> department_name (rename ignores a column that is absent)
df_hospital_departments = df_hospital_departments.rename(columns={
    'department_count': 'treatment_count',
    'department_label': 'department_name',
})

### Data Out

In [101]:
# export both department tables as UTF-8 CSV without the row index
csv_export_options = {'index': False, 'encoding': 'utf-8'}
df_hospital_departments.to_csv('../data/db_csv/hospital_departments.csv', **csv_export_options)
df_department_labels_concat.to_csv('../data/db_csv/departments_dict.csv', **csv_export_options)

## Hospital treatments

In [129]:
# Load treatments data.
# The staged treatments are split into chunk files whose names carry an
# inclusive index range: 0-49, 50-99, ..., 1650-1699.
treatments_chunk_size = 50
treatments_chunk_count = 34

# Read every chunk first and concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on every iteration, and seeding the result
# with an empty DataFrame is unnecessary.
treatments_chunks = [
    pd.read_csv(
        f'../data/in/staging/treatments_chunks/atlas_treatments_sample_{start}-{start + treatments_chunk_size - 1}.csv',
        dtype={'hospital_id': str},  # keep hospital ids as text
    )
    for start in range(0, treatments_chunk_count * treatments_chunk_size, treatments_chunk_size)
]
df_hospital_treatments = pd.concat(treatments_chunks)

In [132]:
# column names expected by the database
treatment_column_names = {
    'count_number': 'treatment_count',
    'count_label': 'treatment_count_label',
}
df_hospital_treatments = df_hospital_treatments.rename(columns=treatment_column_names)

In [133]:
# export the treatments table as UTF-8 CSV without the row index
treatments_out_path = '../data/db_csv/hospital_treatments.csv'
df_hospital_treatments.to_csv(treatments_out_path, index=False, encoding='utf-8')

## Federal states

In [99]:
# Read the ';'-separated zip code reference file, keep only the zip ('Name', read
# as text) and the federal state name ('Land name'), and give both columns
# their internal names.
zip_source_columns = {'Name': 'zip', 'Land name': 'federal_state_name'}
df_zip_dict = pd.read_csv('../data/in/raw/zip_dict/georef-germany-postleitzahl.csv', dtype={'Name': str}, sep=';')
df_zip_dict = df_zip_dict[list(zip_source_columns)].rename(columns=zip_source_columns)

In [104]:
# --- population per federal state ---
# 'Gesamt' is read as text, its blanks are removed, then it is cast to int
population_raw = pd.read_csv('../data/in/raw/federal_states/Bevölkerung - Bundesländer.csv', dtype={'Gesamt': str})
df_population = population_raw[['Bundesland', 'Gesamt']].rename(
    columns={'Bundesland': 'federal_state_name', 'Gesamt': 'population'}
)
population_text = df_population['population'].map(lambda raw_value: raw_value.replace(' ', ''))
df_population['population'] = population_text.astype(int)

# --- area per federal state ---
# 'Gebietsfläche' is read as text, ',' is swapped for '.', then it is cast to float
area_raw = pd.read_csv('../data/in/raw/federal_states/Gebietsfläche.csv', dtype={'Gebietsfläche': str})
df_area = area_raw[['Bundesland', 'Gebietsfläche']].rename(
    columns={'Bundesland': 'federal_state_name', 'Gebietsfläche': 'area'}
)
area_text = df_area['area'].replace(to_replace=',', value='.', regex=True)
df_area['area'] = area_text.astype(float)

In [100]:
# Federal state name -> two-letter state code. The insertion order is kept
# because later cells build the federal state table from this dictionary.
federal_state_codes = dict([
    ('Baden-Württemberg', 'BW'),
    ('Bayern', 'BY'),
    ('Berlin', 'BE'),
    ('Brandenburg', 'BB'),
    ('Bremen', 'HB'),
    ('Hamburg', 'HH'),
    ('Hessen', 'HE'),
    ('Niedersachsen', 'NI'),
    ('Mecklenburg-Vorpommern', 'MV'),
    ('Nordrhein-Westfalen', 'NW'),
    ('Rheinland-Pfalz', 'RP'),
    ('Saarland', 'SL'),
    ('Sachsen', 'SN'),
    ('Sachsen-Anhalt', 'ST'),
    ('Schleswig-Holstein', 'SH'),
    ('Thüringen', 'TH'),
])

# attach the state code to every zip row; names without an entry become NaN
zip_state_names = df_zip_dict['federal_state_name']
df_zip_dict['federal_state_code'] = zip_state_names.map(federal_state_codes)

In [101]:
# one row per federal state: (code, name), in the order of federal_state_codes
df_federal_states = pd.DataFrame(
    [(state_code, state_name) for state_name, state_code in federal_state_codes.items()],
    columns=['federal_state_code', 'federal_state_name'],
)

In [105]:
# Enrich the federal state table with area and then population, joining on the
# state name. Both joins are inner joins, so a state only stays in the result
# when its name is present in both source tables.
df_federal_states = (
    df_federal_states
    .merge(df_area, how='inner', on='federal_state_name')
    .merge(df_population, how='inner', on='federal_state_name')
)

In [108]:
# export the federal state table as UTF-8 CSV without the row index
federal_states_out_path = '../data/db_csv/federal_states.csv'
df_federal_states.to_csv(federal_states_out_path, index=False, encoding='utf-8')

## Places and Hospital locations

### Data In

In [61]:
# Places: read the JSON export with 'p' kept as text, give the short keys
# descriptive column names and drop the 'ct' column.
places_column_names = {
    'p': 'zip',
    'c': 'city_district',
    'm': 'name',
    'd': 'rural_district',
    'lt': 'latitude',
    'ln': 'longitude',
}
df_places = (
    pd.read_json('../data/in/raw/atlas/german-places.json', dtype={'p': str})
    .rename(columns=places_column_names)
    .drop(columns=['ct'])
)

# Hospital locations: hospital_id and zip are kept as text.
locations_text_columns = {'hospital_id': str, 'zip': str}
df_hospital_locations = pd.read_csv('../data/in/staging/hospital_locations.csv', dtype=locations_text_columns)

### Processing

In [62]:
# Get federal state code for each hospital location.
# A zip -> federal_state_code lookup is built once and applied with Series.map,
# instead of filtering all of df_zip_dict (twice) for every single row.
# Semantics are unchanged: the first df_zip_dict row for a zip wins, and zips
# that are not in df_zip_dict yield NaN.
zip_to_state_code = (
    df_zip_dict
    .dropna(subset=['zip'])  # a missing zip must never act as a lookup key
    .drop_duplicates(subset='zip', keep='first')
    .set_index('zip')['federal_state_code']
)
df_hospital_locations['federal_state_code'] = df_hospital_locations['zip'].map(zip_to_state_code)

# manual state codes for hospital zips that df_zip_dict could not resolve
missing_zips_dict_locations = {
    '99437': 'TH',
    '19049': 'MV',
}

# only rows that are still NaN are filled from the manual mapping
df_hospital_locations['federal_state_code'] = df_hospital_locations['federal_state_code'].fillna(df_hospital_locations['zip'].map(missing_zips_dict_locations))

In [63]:
# Get federal state code for all places.
# A zip -> federal_state_code lookup is built once and applied with Series.map,
# instead of filtering all of df_zip_dict (twice) for every single row.
# Semantics are unchanged: the first df_zip_dict row for a zip wins, and zips
# that are not in df_zip_dict yield NaN.
zip_to_state_code = (
    df_zip_dict
    .dropna(subset=['zip'])  # a missing zip must never act as a lookup key
    .drop_duplicates(subset='zip', keep='first')
    .set_index('zip')['federal_state_code']
)
df_places['federal_state_code'] = df_places['zip'].map(zip_to_state_code)

# manual state codes for place zips that df_zip_dict could not resolve
missing_zips_dict_places = {
    '60312': 'HE',
    '60315': 'HE',
    '64743': 'HE',
    '81248': 'BY',
    '99331': 'TH'
}

# only rows that are still NaN are filled from the manual mapping
df_places['federal_state_code'] = df_places['federal_state_code'].fillna(df_places['zip'].map(missing_zips_dict_places))

### Data Out

In [64]:
# export hospital locations and places as UTF-8 CSV without the row index
location_export_options = {'index': False, 'encoding': 'utf-8'}
df_hospital_locations.to_csv('../data/db_csv/hospital_locations.csv', **location_export_options)
df_places.to_csv('../data/db_csv/places.csv', **location_export_options)