In [264]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

Adding geo-data to df

In [265]:
# Load Your Data

# Path to the CSV file we are enriching
csv_path = Path('../clean/doctors_clean.csv')
df = pd.read_csv(csv_path)

# Path to the GeoJSON file (in the same 'scripts' folder)
geojson_path = 'lor_ortsteile.geojson'
gdf_polygons = gpd.read_file(geojson_path)


# Create GeoDataFrames

# Convert DataFrame into a GeoDataFrame
gdf_points = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
)

# Ensure both GeoDataFrames use the same Coordinate Reference System (CRS)
gdf_polygons = gdf_polygons.to_crs(gdf_points.crs)


# Perform the Spatial Join

# This finds which polygon (neighborhood) each point is in
gdf_joined = gpd.sjoin(gdf_points, gdf_polygons, how="left", predicate='within')


# Add the New Columns to DataFrame

# We use the final column names we identified
district_col_name = 'BEZIRK'
neighborhood_col_name = 'OTEIL'

# Add the new columns from the joined data back to DataFrame
df['district'] = gdf_joined[district_col_name].reset_index(drop=True)
df['neighborhood'] = gdf_joined[neighborhood_col_name].reset_index(drop=True)


# Check the Result 
print("New columns have been added successfully! ✅")
# The redundant 'neighborhood_id' column has been removed from the check
print(df[['city', 'district', 'neighborhood']].head())

New columns have been added successfully! ✅
     city             district neighborhood
0  Berlin  Marzahn-Hellersdorf    Kaulsdorf
1  Berlin                Mitte      Wedding
2  Berlin              Spandau      Staaken
3  Berlin  Marzahn-Hellersdorf    Kaulsdorf
4  Berlin  Marzahn-Hellersdorf     Biesdorf


In [266]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1512 non-null   object 
 1   housenumber             1335 non-null   object 
 2   postcode                1571 non-null   float64
 3   street                  1639 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  country                 755 non-null    object 
 13  suburb                  1105 non-null   object 
 14  wheelchair              582 non-null    

Now I need to check that all doctors are within Berlin.

In [267]:
from shapely.ops import unary_union

# Create a single "Berlin" polygon
berlin_boundary = gpd.GeoSeries(unary_union(gdf_polygons.geometry), crs=gdf_polygons.crs)

# Perform the check
# .within() checks if each point is inside the berlin_boundary
is_inside_berlin = gdf_points.within(berlin_boundary.geometry[0])

# Report the results
num_outside = (~is_inside_berlin).sum() # ~ inverts True/False, counting the Falses (those outside)

if num_outside == 0:
    print(" All points are correctly located within the Berlin boundaries.")
else:
    print(f" Warning: Found {num_outside} point(s) outside the Berlin boundaries.")
    
    # Show the rows that are outside
    print("\nClubs located outside Berlin:")
    print(df[~is_inside_berlin][['club_name', 'street', 'city', 'latitude', 'longitude']])

 All points are correctly located within the Berlin boundaries.


As all doctors are within Berlin, I'll fill missing values in the 'city' column and drop 'country' column.

In [268]:
df['city'].unique()

array(['Berlin', nan], dtype=object)

In [269]:
df['city'] = df['city'].fillna('Berlin')
df.drop(columns='country', inplace=True)

In [270]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1643 non-null   object 
 1   housenumber             1335 non-null   object 
 2   postcode                1571 non-null   float64
 3   street                  1639 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  suburb                  1105 non-null   object 
 13  wheelchair              582 non-null    object 
 14  description             104 non-null    

Now we need to add a column district_id and neighborhood_id from database tables districts.csv and neighborhoods.csv 

In [271]:
# Load the new lookup tables

districts_path = Path('../source/districts.csv')
neighborhoods_path = Path('../source/neighborhoods.csv')

# Read the CSV files into DataFrames
districts_df = pd.read_csv(districts_path)
neighborhoods_df = pd.read_csv(neighborhoods_path)

# Inspect the DataFrames

print(" Districts Table Info ")
districts_df.info()
print("\n Districts Table Head ")
print(districts_df.head())

print("\n" + "="*50 + "\n")

print(" Neighborhoods Table Info ")
neighborhoods_df.info()
print("\n Neighborhoods Table Head ")
print(neighborhoods_df.head())

 Districts Table Info 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   district_id  12 non-null     int64 
 1   district     12 non-null     object
 2   geometry     12 non-null     object
dtypes: int64(1), object(2)
memory usage: 420.0+ bytes

 Districts Table Head 
   district_id                    district  \
0     11012012               Reinickendorf   
1     11004004  Charlottenburg-Wilmersdorf   
2     11009009            Treptow-Köpenick   
3     11003003                      Pankow   
4     11008008                    Neukölln   

                                                                                                                                                                                                                                                                                                                          

In [272]:
# Merge with Districts Table to add 'district_id'

# We only need the ID and the key from the districts table
districts_lookup = districts_df[['district_id', 'district']]

# Perform the merge ('how="left"' keeps all rows from original 'df')
df = pd.merge(df, districts_lookup, on='district', how='left')

# Merge with Neighborhoods Table to add 'neighborhood_id'

# We only need the ID and the key from the neighborhoods table
neighborhoods_lookup = neighborhoods_df[['neighborhood_id', 'neighborhood']]

# Perform the second merge
df = pd.merge(df, neighborhoods_lookup, on='neighborhood', how='left')

print("IDs have been added successfully! ")
print(df[['district', 'district_id', 'neighborhood', 'neighborhood_id', 'suburb']].head())

IDs have been added successfully! 
              district  district_id neighborhood  neighborhood_id     suburb
0  Marzahn-Hellersdorf     11010010    Kaulsdorf             1003        NaN
1                Mitte     11001001      Wedding              105    Wedding
2              Spandau     11005005      Staaken              504        NaN
3  Marzahn-Hellersdorf     11010010    Kaulsdorf             1003  Kaulsdorf
4  Marzahn-Hellersdorf     11010010     Biesdorf             1002   Biesdorf


Next, I'm removing the 'district', 'neighborhood', and 'suburb' columns, as they are superfluous.

In [273]:
df.drop(columns=['district', 'neighborhood', 'suburb'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1643 non-null   object 
 1   housenumber             1335 non-null   object 
 2   postcode                1571 non-null   float64
 3   street                  1639 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   int64  
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  wheelchair              582 non-null    object 
 13  description             104 non-null    object 
 14  email                   189 non-null    

We need to change some columns types.

In [274]:
# This handles NaNs and removes the .0 from floats
df['postcode'] = df['postcode'].astype(pd.Int64Dtype()).astype(str).replace('<NA>', None)

# Convert IDs to string
df['id'] = df['id'].astype(str)
df['district_id'] = df['district_id'].astype(str)
df['neighborhood_id'] = df['neighborhood_id'].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1643 non-null   object 
 1   housenumber             1335 non-null   object 
 2   postcode                1571 non-null   object 
 3   street                  1639 non-null   object 
 4   amenity                 1643 non-null   object 
 5   speciality              1433 non-null   object 
 6   name                    1611 non-null   object 
 7   opening_hours           1130 non-null   object 
 8   website                 910 non-null    object 
 9   id                      1643 non-null   object 
 10  longitude               1643 non-null   float64
 11  latitude                1643 non-null   float64
 12  wheelchair              582 non-null    object 
 13  description             104 non-null    object 
 14  email                   189 non-null    

In [275]:
# Find the index of the row(s) where 'name' is null 
index_to_drop = df[df['name'].isnull()].index
print(index_to_drop)

Index([   3,   51,   76,  140,  387,  388,  389,  463,  518,  558,  629,  693,
        708,  871,  883,  937, 1021, 1185, 1186, 1244, 1268, 1319, 1320, 1321,
       1322, 1344, 1406, 1543, 1608, 1628, 1630, 1631],
      dtype='int64')


I would like to inspect the rows where the 'name' column is not filled, and fill them if possible.

In [276]:
# Create a mask for the rows where 'name' is null

is_name_missing = df['name'].isnull()
columns_to_show = ['name', 'website','speciality']

print(f"--- {is_name_missing.sum()} rows where 'name' is missing ---")
print("--- Showing only 'name' and 'website' ---")

# .to_string() forces pandas to print ALL the rows
print(df[is_name_missing][columns_to_show].to_string())

--- 32 rows where 'name' is missing ---
--- Showing only 'name' and 'website' ---
     name                                                                     website                          speciality
3     NaN                                         https://www.drhenriettefriedrich.de                                 NaN
51    NaN                                                                         NaN                         gynaecology
76    NaN                                          https://www.berliner-augenarzt.de/                       ophthalmology
140   NaN                                                                         NaN                                 NaN
387   NaN                                            https://www.mvz-berlin-rudow.de/                            internal
388   NaN                                            https://www.mvz-berlin-rudow.de/                        orthopaedics
389   NaN                                            https://www

I'll fill names from websites manuly where possible.

In [277]:
# Create a filter for the rows we need to fix (where 'name' IS null AND 'website' IS NOT null)
condition_to_fix = df['name'].isnull() & df['website'].notna()

# Create a small DataFrame with just these rows
df_to_fix = df[condition_to_fix]

# Define columns to show I added 'amenity' and 'speciality' as context
columns_to_show = ['name', 'website', 'amenity', 'speciality']
existing_cols = [col for col in columns_to_show if col in df.columns]

print(f" Found {len(df_to_fix)} rows to fix manually")
print("These have a 'website' but no 'name':\n")

# Print the list of websites to check
print(df_to_fix[existing_cols].to_string())

 Found 13 rows to fix manually
These have a 'website' but no 'name':

     name                                                                     website  amenity                          speciality
3     NaN                                         https://www.drhenriettefriedrich.de  doctors                                 NaN
76    NaN                                          https://www.berliner-augenarzt.de/  doctors                       ophthalmology
387   NaN                                            https://www.mvz-berlin-rudow.de/  doctors                            internal
388   NaN                                            https://www.mvz-berlin-rudow.de/  doctors                        orthopaedics
389   NaN                                            https://www.mvz-berlin-rudow.de/  doctors                             general
518   NaN                                      https://www.gastroenterologie-horn.de/  doctors           internal;gastroenterology
693   NaN    

In [278]:
# We use .loc[index, column_name] = value

df.loc[3, 'name'] = 'Dr. Henriette Friedrich'
df.loc[3, 'speciality'] = 'general' # 'general' is a common OSM tag for this

df.loc[76, 'name'] = 'Christoph J. Huber'

df.loc[387, 'name'] = 'MVZ Berlin Rudow'
df.loc[388, 'name'] = 'MVZ Berlin Rudow'
df.loc[389, 'name'] = 'MVZ Berlin Rudow'

df.loc[518, 'name'] = 'Dr. med. Andreas Horn'

df.loc[693, 'name'] = 'Arztpraxis Berlin Friedenau'

df.loc[708, 'name'] = 'Dr. Strunz'

df.loc[1021, 'name'] = 'Kinderarzt Zimmermann'

df.loc[1244, 'name'] = 'Dr. med. Thorsten Löbbert'

df.loc[1321, 'name'] = 'Dr. Nicolai Sedlaczek, Dr. Moritz Knies, Prof. Dr. Saskia Rohrbach, Dr. Tobias Maier'

df.loc[1406, 'name'] = 'Dr. med. Kathrin Irrgang'

df.loc[1608, 'name'] = 'Medicum'
df.loc[1608, 'speciality'] = 'surgery, dermatology, gynaecology, ear_nose_throat, internal, cardiology, dentistry'

In [279]:
# Create a filter for the rows we need to check
# (where 'name' IS null AND 'website' IS ALSO null)
condition_to_check = df['name'].isnull() & df['website'].isnull()

# Create a small DataFrame with just these rows
df_to_check = df[condition_to_check]

# Define columns to show We now want to see the address and context columns
columns_to_show = [
    'name',        
    'website',    
    'street',      
    'housenumber', 
    'postcode',    
    'amenity',     
    'speciality'   # (Context: Does it have a speciality?)
]
existing_cols = [col for col in columns_to_show if col in df.columns]

print(f" Found {len(df_to_check)} rows with NO 'name' AND NO 'website'")
print("Checking if they have address or speciality data:\n")

# .to_string() ensures all rows are printed
print(df_to_check[existing_cols].to_string())

 Found 19 rows with NO 'name' AND NO 'website'
Checking if they have address or speciality data:

     name website                  street housenumber postcode  amenity            speciality
51    NaN     NaN          Skarbinastraße          79    12309  doctors           gynaecology
140   NaN     NaN          Berliner Allee          97    13088  doctors                   NaN
463   NaN     NaN             Kormoranweg          31     None  doctors          radiotherapy
558   NaN     NaN    Wilmersdorfer Straße          45     None  doctors         ophthalmology
629   NaN     NaN           Dunckerstraße         70A    10437  doctors      child_psychiatry
871   NaN     NaN    Martin-Luther-Straße         124    10825  doctors       plastic_surgery
883   NaN     NaN     Wilhelm-Blos-Straße          61    12623  doctors               general
937   NaN     NaN     Hugo-Distler-Straße          24    12619  doctors   general;paediatrics
1185  NaN     NaN             Hauptstraße          19   

These rows are being dropped, as I was unable to source the required information for them.

Additionally, I've observed that the 'amenity' column is unreliable due to inconsistent data entry. Some clinics have a separate record for each specialty, while others list all specialties in a single 'doctors' record. I want to fix it.

In [280]:
df['name'].value_counts()

name
Ärztehaus                               25
Praxis für Allgemeinmedizin              6
MVZ Berlin Rudow                         4
Praxis für Kinder- und Jugendmedizin     4
Hausarztpraxis                           4
                                        ..
Ärztehaus Nordflügel                     1
Ärztehaus Südflügel                      1
Helios Arthropädicum Kaulsdorf           1
Medical Park Berlin Humboldtmühle        1
Kinderarztpraxis am Schlachtensee        1
Name: count, Length: 1560, dtype: int64

In [281]:
print(f"Original size before aggregation: {len(df)} rows")

# Custom Aggregation Function
def aggregate_row(rows):
    
    # Take the *first* value for most columns
    data = rows.iloc[0].to_dict()
    
    # Get all unique, non-empty specialities from all duplicate rows
    all_specialities = rows['speciality'].dropna().unique()
    
    # Join them with a comma
    data['speciality'] = ', '.join(all_specialities)
      
    # We ONLY set amenity to 'clinic' IF we ACTUALLY found more than one speciality for this *exact* object.
    if len(all_specialities) > 1:
        data['amenity'] = 'clinic'
       
    return pd.Series(data)

# Group by 'name' AND 'ADDRESS' and apply our function
df = df.groupby(
    ['name', 'street', 'housenumber'], # Group by unique object
    as_index=False,
    dropna=False # Keep rows where name/street might be NaN
).apply(aggregate_row, include_groups=False) # 'include_groups=False' silences the warning

print(f"New size after aggregation: {len(df)} rows")

# Check 'MVZ Berlin Rudow' example
# This should now be 1 row, with 'amenity=clinic' and a combined 'speciality' list.
print("\n Checking 'MVZ Berlin Rudow' ")
print(df[df['name'] == 'MVZ Berlin Rudow'][['name', 'amenity', 'speciality', 'street']])

# Check 'Ärztehaus' example
# This should still be 25 rows, and their 'amenity' will *not* be changed (unless one of them # had duplicates at the *same address*). This is correct.
print("\n Checking 'Ärztehaus' ")
print(f"Total 'Ärztehaus' entries: {len(df[df['name'] == 'Ärztehaus'])}")

Original size before aggregation: 1643 rows
New size after aggregation: 1636 rows

 Checking 'MVZ Berlin Rudow' 
                  name amenity                       speciality  \
1001  MVZ Berlin Rudow  clinic  internal, orthopaedics, general   

                       street  
1001  Waßmannsdorfer Chaussee  

 Checking 'Ärztehaus' 
Total 'Ärztehaus' entries: 25


In [282]:
# Get a count of the rows we are about to drop
# These are the "ghost rows" that we couldn't save (where 'name' is still NaN).
ghost_rows_count = df['name'].isnull().sum()

print(f"Found {ghost_rows_count} rows that still have no 'name'.")

# Drop all rows where 'name' is still NaN
df.dropna(subset=['name'], inplace=True)

print(f"Final clean size: {len(df)} rows.")

Found 17 rows that still have no 'name'.
Final clean size: 1619 rows.


In [283]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1619 entries, 0 to 1618
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1619 non-null   object 
 1   street                  1615 non-null   object 
 2   housenumber             1311 non-null   object 
 3   city                    1619 non-null   object 
 4   postcode                1549 non-null   object 
 5   amenity                 1619 non-null   object 
 6   speciality              1619 non-null   object 
 7   opening_hours           1118 non-null   object 
 8   website                 906 non-null    object 
 9   id                      1619 non-null   object 
 10  longitude               1619 non-null   float64
 11  latitude                1619 non-null   float64
 12  wheelchair              576 non-null    object 
 13  description             102 non-null    object 
 14  email                   188 non-null    objec

I have noticed that the 'amenity' column, which is critical for creating the labels, contains more incorrect values. Also, it lacks a third category: besides clinics and individual doctors, there are also joint practices with two or more doctors. I will create a third category for them: 'group_practice'.

In [284]:
import numpy as np

# FIX: Temporarily tell pandas to show full text
pd.set_option('display.max_colwidth', None)

# CREATE 'amenity_clean' (Infrastructure - 3-Tier Logic)

# Define Regex pattern for LARGE clinics (Tier 3)
clinic_name_pattern = (
    r'MVZ|Klinik|Zentrum|Institut|Ärztehaus|Ärztezentrum|Poliklinik|Medicum'
    r'|Tagesklinik|Versorgungszentrum|Diagnostikzentrum|Center|Polikum|Medico|Kontor'
    r'|Vivantes|Helios|Sana|Charité|medneo|KEH|Einheit|Arthropädicum'
    r'|Medical Office|Policum|Polimedica'
)

# Define pattern for MEDIUM group practices (Tier 2)
group_practice_pattern = (
    r'Gemeinschaftspraxis|Praxisgemeinschaft|Ärztinnen|Dres\.' # Keywords
    r'|&|;|/' # Separators: '&' OR ';' OR '/'
    r'|Dr\..*, ?Dr\.' # List: "Dr. Name, Dr. Name"
)

# Create boolean flags
df['is_already_clinic'] = (df['amenity'] == 'clinic')

df['is_large_clinic_name'] = df['name'].str.contains(
    clinic_name_pattern,
    case=False, 
    na=False
)

df['is_group_practice_name_or_list'] = df['name'].str.contains(
    group_practice_pattern,
    case=False,
    na=False
)

# Apply the NEW 3-Tier logic
conditions_infra = [
    # Priority 1: Is it a LARGE clinic (by name or original tag)?
    (df['is_already_clinic'] == True) | 
    (df['is_large_clinic_name'] == True),
    
    # Priority 2: (If not large) Is it a MEDIUM group practice?
    (df['is_group_practice_name_or_list'] == True)
]
choices_infra = [
    'clinic',       # Tier 3 (Large)
    'group_practice'  # Tier 2 (Medium)
]

# The default is 'practice' (Tier 1)
df['amenity_clean'] = np.select(
    conditions_infra, 
    choices_infra, 
    default='practice' 
)

# VERIFICATION 

print("New 'amenity_clean' counts:")
# This should now show the correct distribution
print(df['amenity_clean'].value_counts().to_string())


print("\n Checking 10 random samples marked as 'clinic' ")
print(df[df['amenity_clean'] == 'clinic']
      [['name', 'amenity', 'amenity_clean']]
      .sample(10, random_state=1, replace=True))

print("\n Checking 10 random samples marked as 'group_practice' ")
print(df[df['amenity_clean'] == 'group_practice']
      [['name', 'amenity', 'amenity_clean']]
      .sample(10, random_state=1, replace=True))

print("\n Checking 10 random samples marked as 'practice' ")
print(df[df['amenity_clean'] == 'practice']
      [['name', 'amenity', 'amenity_clean']]
      .sample(10, random_state=1, replace=True))

New 'amenity_clean' counts:
amenity_clean
practice          1159
clinic             287
group_practice     173

 Checking 10 random samples marked as 'clinic' 
                                                                                       name  \
713                                          Gesundheitszentrum Marzahn "Ernst-Ludwig Heim"   
1563                                                                              Ärztehaus   
966   Klinik für Psychiatrie, Psychosomatik und Psychotherapie des Kindes- und Jugendalters   
1583                                                             Ärztehaus Hohenzollerndamm   
1513                                                                Zentrum für Haarausfall   
1075                        Medizinisches Versorgungszentrum für Neurologie und Psychiatrie   
1132                                                                        Orthomed Berlin   
1071                                Medizinisches Versorgungszentrum Hausärzte a

Now we can drop temporary column and update 'amenity'

In [285]:
# Define all columns to be dropped
columns_to_drop = [
    'amenity', # The original 'dirty' column
    'is_already_clinic',
    'is_large_clinic_name',
    'is_group_practice_name_or_list'
]

# Drop these columns AND rename 'amenity_clean'
df = df.drop(
    columns=columns_to_drop
).rename(
    columns={'amenity_clean': 'amenity'}
)

In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1619 entries, 0 to 1618
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1619 non-null   object 
 1   street                  1615 non-null   object 
 2   housenumber             1311 non-null   object 
 3   city                    1619 non-null   object 
 4   postcode                1549 non-null   object 
 5   speciality              1619 non-null   object 
 6   opening_hours           1118 non-null   object 
 7   website                 906 non-null    object 
 8   id                      1619 non-null   object 
 9   longitude               1619 non-null   float64
 10  latitude                1619 non-null   float64
 11  wheelchair              576 non-null    object 
 12  description             102 non-null    object 
 13  email                   188 non-null    object 
 14  toilets_wheelchair      37 non-null     objec

Now I want to fill in the missing values in the 'speciality' column, as many doctors' specialties are already indicated in the practice name.

In [287]:
#  Define the mapping (Dictionary)
speciality_map = {
    #  Primary Care  
    'Hausarzt': 'general',
    'Allgemeinmedizin': 'general',
    'Familienpraxis': 'general',
    'Internist': 'internal',
    'Innere Medizin': 'internal',
    
    # Paediatrics
    'Kinderarzt': 'paediatrics',
    
    # Gynaecology
    'Gynäkologie': 'gynaecology',
    'Frauenarzt': 'gynaecology',
    'Brustzentrum': 'gynaecology',  # (Breast center)
    'Kinderwunsch': 'gynaecology',  # (Fertility)
    
    # Orthopaedics 
    'Orthopädie': 'orthopaedics',
    'Arthropädicum': 'orthopaedics',
    'Rückenzentrum': 'orthopaedics',  # (Back center)
    
    # Surgery
    'Chirurgie': 'surgery',
    'Oralchirurgie': 'dental_oral_maxillo_facial_surgery',
    
    # Trauma 
    'Durchgangsarzt': 'trauma',
    'Unfall': 'trauma',
    
    # Ophthalmology
    'Augen': 'ophthalmology',
    
    # HNO
    'HNO': 'ear_nose_throat',
    'Cochlear': 'ear_nose_throat',
    
    # Dermatology
    'Dermatologie': 'dermatology',
    'Hautarzt': 'dermatology',
    
    # Other Specialists 
    'Kardiologie': 'cardiology',
    'Urologie': 'urology',
    'Radiologie': 'radiology',
    'Neurologie': 'neurology',
    'Pneumologie': 'pulmonology',
    'Lungenarzt': 'pulmonology',
    'Gastroenterologie': 'gastroenterology',
    'Bauchzentrum': 'gastroenterology',  # (Stomach center)
    'Cancer': 'oncology',
    'Onkologie': 'oncology',
    'Gerinnungszentrum': 'haematology', # (Coagulation)
    'Tropenmedizin': 'tropical_medicine',
    'Psychiatrie': 'psychiatry',
    'Endocrinology': 'endocrinology',
    'Infektiologie': 'infectious_diseases',
    'Nierenzentrum': 'nephrology',     # (Kidney center)
    'Rehabilitation': 'rehabilitation',
    'Physiotherapie': 'physiotherapy',
    'Rheumatologische': 'rheumatology',
    'Schlaf': 'somnology',            # (Sleep)
    'Schmerz': 'anaesthetics',         # (Pain)
    'Zahnarzt': 'dentistry',
    'Zahnklinik': 'dentistry'
}

# Get the filter for rows that need filling 
is_speciality_nan = df['speciality'].isna() | (df['speciality'] == '')
nan_count_before = is_speciality_nan.sum()
print(f"Found {nan_count_before} rows with empty 'speciality' to check.")

# Loop through our map and fill the 'speciality' column 
for keyword, osm_tag in speciality_map.items():
    
    name_contains_keyword = df['name'].str.contains(keyword, case=False, na=False)
    
    # It will ONLY fill rows WHERE:
    # 'speciality' is *still* NaN (is_speciality_nan)
    # AND 'name' contains our keyword
    df.loc[is_speciality_nan & name_contains_keyword, 'speciality'] = osm_tag
    
    # We must update our 'is_speciality_nan' filter *inside* the loop
    is_speciality_nan = df['speciality'].isna() | (df['speciality'] == '')

# VERIFICATION 
new_nan_count = (df['speciality'].isna() | (df['speciality'] == '')).sum()
print(f"\n Verification ")
print(f"Rows with empty 'speciality' *before*: {nan_count_before}")
print(f"Rows with empty 'speciality' *after*: {new_nan_count}")
print(f"We successfully enriched {nan_count_before - new_nan_count} rows.")

Found 200 rows with empty 'speciality' to check.

 Verification 
Rows with empty 'speciality' *before*: 200
Rows with empty 'speciality' *after*: 173
We successfully enriched 27 rows.


In [288]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1619 entries, 0 to 1618
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1619 non-null   object 
 1   street                  1615 non-null   object 
 2   housenumber             1311 non-null   object 
 3   city                    1619 non-null   object 
 4   postcode                1549 non-null   object 
 5   speciality              1619 non-null   object 
 6   opening_hours           1118 non-null   object 
 7   website                 906 non-null    object 
 8   id                      1619 non-null   object 
 9   longitude               1619 non-null   float64
 10  latitude                1619 non-null   float64
 11  wheelchair              576 non-null    object 
 12  description             102 non-null    object 
 13  email                   188 non-null    object 
 14  toilets_wheelchair      37 non-null     objec

As we already have dentist data, I would like to inspect this specialty in doctors data and remove rows that we already have in dental_offices table.

In [289]:
count_dentistry = df['speciality'].str.contains('dentistry', na=False).sum()
print(count_dentistry)

5


In [290]:
filtered_dentistry_rows = df[
    df['speciality'].str.contains('dentistry', na=False, case=False)
]

result_df = filtered_dentistry_rows[['name', 'speciality']]

print({len(result_df)})
print(result_df)

{5}
                                         name  \
1053            Matthias Dahms Zahnarztpraxis   
1063                                  Medicum   
1508                      Zahnarztpraxis Hein   
1510                           Zahnklinik Ost   
1511  Zahnklinik/Zahnärztlicher Notfalldienst   

                                                                               speciality  
1053                                                                            dentistry  
1063  surgery, dermatology, gynaecology, ear_nose_throat, internal, cardiology, dentistry  
1508                                                                            dentistry  
1510                                                                            dentistry  
1511                                                                            dentistry  


In [291]:
# The goal is to remove 'dentistry' from the speciality string (to keep other specialities).
df.loc[1063, 'speciality'] = df.loc[1063, 'speciality'].replace(', dentistry', '')

# Print the corrected row for immediate verification
print(df[df.index.isin([1063])][['name', 'speciality']])

# Drop 4 specified rows
indices_to_drop = [1053, 1508, 1510, 1511]
df.drop(indices_to_drop, inplace=True) 

         name  \
1063  Medicum   

                                                                    speciality  
1063  surgery, dermatology, gynaecology, ear_nose_throat, internal, cardiology  


In [292]:
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1615 non-null   object 
 1   street                  1611 non-null   object 
 2   housenumber             1307 non-null   object 
 3   city                    1615 non-null   object 
 4   postcode                1545 non-null   object 
 5   speciality              1615 non-null   object 
 6   opening_hours           1116 non-null   object 
 7   website                 904 non-null    object 
 8   id                      1615 non-null   object 
 9   longitude               1615 non-null   float64
 10  latitude                1615 non-null   float64
 11  wheelchair              573 non-null    object 
 12  description             102 non-null    object 
 13  email                   188 non-null    object 
 14  toilets_wheelchair      36 non-null     

In [293]:
#Save the Final Enriched File
save_path = Path('../clean/doctors_clean_with_distr.csv')
df.to_csv(save_path, index=False, encoding='utf-8-sig')
print(f"\nDataFrame successfully saved to '{save_path}'")


DataFrame successfully saved to '..\clean\doctors_clean_with_distr.csv'


I am now creating a features to represent the weighted capacity of each facility. A hybrid, 3-tiered logic is used for this calculation:

For 'practice' and 'group_practice' facilities: A weighted score is applied based on official German KBV/Zi statistics (2023/24):

practice: 1.0 (Einzelpraxis)

group_practice: 2.7 (Gemeinschaftspraxis)

For 'clinic' facilities (with speciality = Not-NaN): For clinics where specialities are listed, we "explode" the speciality list. We apply the group_practice weight of 2.7 to each listed speciality.

Formula: Capacity Score = Count(specialties) × 2.7

For 'clinic' facilities (with speciality = NaN): For "blind" clinics where specialities are not listed, a statistical average is used as the capacity score.

Formula: Capacity Score = 6.3 (Based on the 2023/24 KBV average for all MVZs)

In [None]:
# CREATE 'capacity_score' COLUMN

# Define the weights based on official KBV/Zi statistics
weight_map = {
    'practice': 1.0, # Tier 1 (Einzelpraxis)
    'group_practice': 2.7, # Tier 2 (Gemeinschaftspraxis)
    'clinic': 6.3 # Tier 3 (MVZ Overall Average)
}

# Apply the weights to the clean 'amenity' column
df['capacity_score'] = df['amenity'].map(weight_map)

# Define Regex patterns

paediatric_pattern = r'paediatrics'
primary_adult_pattern = r'general|internal'
primary_name_pattern = r'Hausarzt|Allgemeinmedizin|Familienpraxis'
specialist_spec_pattern = (
    r'cardiology|ophthalmology|ear_nose_throat|orthopaedics|surgery'
    r'|gynaecology|urology|radiology|dermatology|neurology|pulmonology'
    r'|gastroenterology|oncology|haematology|tropical_medicine|psychiatry'
    r'|endocrinology|infectious_diseases|nephrology|rehabilitation'
    r'|physiotherapy|rheumatology|somnology|anaesthetics'
    r'|dental_oral_maxillo_facial_surgery|trauma'
)


# SPLIT, PROCESS, AND AGGREGATE

# Table A: 'practice' and 'group_practice'
df_practices_groups = df[df['amenity'] != 'clinic'].copy()

# Table B: 'clinic'
df_clinics = df[df['amenity'] == 'clinic'].copy()

# Process Table A (Practices & Groups) 

# 2Create the 3 independent boolean flags

# Create the two specialty flags needed for the primary logic
df_practices_groups['is_pediatric'] = df_practices_groups['speciality'].str.contains(paediatric_pattern, case=False, na=False)
is_speciality_nan = df_practices_groups['speciality'].isna() | (df_practices_groups['speciality'] == '')
df_practices_groups['is_specialist'] = (
    df_practices_groups['speciality'].str.contains(specialist_spec_pattern, case=False, na=False) |
    (is_speciality_nan & ~df_practices_groups['name'].str.contains(primary_name_pattern, case=False, na=False))
)

# Logic for is_primary_adult is CONDITIONAL (Hierarchy)
# This logic implements that the 'Specialist' status overrides 'Internal' status.
df_practices_groups['is_primary_adult'] = (
    # Part A: Check for GENERAL practitioners (These are ALWAYS Primary Care)
    # This includes 'general' spec OR 'Hausarzt' name
    (df_practices_groups['speciality'].str.contains('general', case=False, na=False) |
     df_practices_groups['name'].str.contains(primary_name_pattern, case=False, na=False))
    
    # Check for INTERNAL Medicine (This is the conditional part)
    # This is TRUE ONLY if 'internal' is present AND the object is NOT already a specialist.
    | (df_practices_groups['speciality'].str.contains('internal', case=False, na=False) & 
       ~df_practices_groups['is_specialist'])
)


# Create the 3 Final "Weighted Score" Columns
df_practices_groups['primary_adult_score'] = df_practices_groups['capacity_score'] * df_practices_groups['is_primary_adult']
df_practices_groups['pediatric_score'] = df_practices_groups['capacity_score'] * df_practices_groups['is_pediatric']
df_practices_groups['specialist_score'] = df_practices_groups['capacity_score'] * df_practices_groups['is_specialist']

# Aggregate Table A
columns_to_sum_A = ['primary_adult_score', 'pediatric_score', 'specialist_score']
agg_table_A = df_practices_groups.groupby('district_id')[columns_to_sum_A].sum()


# Process Table B (Clinics - The Explode Logic)

# Infrastructure Count (Count the number of clinics)
agg_clinics_infra = df_clinics.groupby('district_id').size().to_frame(name='clinic_infra_count')

# Score from Blind Clinics (capacity_score = 6.3)
df_clinics_nan = df_clinics[df_clinics['speciality'].isna() | (df_clinics['speciality'] == '')].copy()
agg_clinics_nan = df_clinics_nan.groupby('district_id')['capacity_score'].sum().to_frame(name='specialist_services_from_nan')

# Service Count (We only explode clinics *with* speciality data)
df_clinics_defined = df_clinics.dropna(subset=['speciality'])
df_exploded = df_clinics_defined[['district_id', 'speciality']].copy()

df_exploded['speciality_list'] = df_exploded['speciality'].str.split(r'[^a-zA-Z_]+')
df_exploded = df_exploded.explode('speciality_list')
df_exploded.dropna(subset=['speciality_list'], inplace=True)
df_exploded['speciality_single'] = df_exploded['speciality_list'].str.strip().str.lower()
df_exploded = df_exploded[df_exploded['speciality_single'] != '']

# Categorize the *exploded* specialities (Simple count, as each row is a service)
conditions_exploded = [
    df_exploded['speciality_single'].str.contains(paediatric_pattern, case=False, na=False),
    df_exploded['speciality_single'].str.contains(primary_adult_pattern, case=False, na=False)
]
choices_exploded = [
    'primary_care_pediatric_services',
    'primary_care_adult_services'
]
df_exploded['service_category'] = np.select(
    conditions_exploded,
    choices_exploded,
    default='specialist_services'
)

# Aggregate Table B (Services)
agg_table_B_services = df_exploded.groupby('district_id')['service_category'].value_counts().unstack(fill_value=0)



# MERGE all tables into ONE final table

district_final_features = agg_table_A.join(agg_clinics_infra, how='outer')
district_final_features = district_final_features.join(agg_table_B_services, how='outer')
district_final_features = district_final_features.join(agg_clinics_nan, how='outer')

# Final cleanup
district_final_features['specialist_score_total'] = district_final_features['specialist_score'] + district_final_features['specialist_services_from_nan'] + district_final_features['specialist_services']

district_final_features = district_final_features.fillna(0)

# Final dtype cleanup
for col in district_final_features.columns:
    if 'score' in col:
        district_final_features[col] = district_final_features[col].astype(float)
    else:
        district_final_features[col] = district_final_features[col].astype(int)

print("\n FINAL District Healthcare Features Table")
print(district_final_features.to_string())


 FINAL District Healthcare Features Table
             primary_adult_score  pediatric_score  specialist_score  clinic_infra_count  primary_care_adult_services  primary_care_pediatric_services  specialist_services  specialist_services_from_nan  specialist_score_total
district_id                                                                                                                                                                                                                     
11001001                    49.8             20.2              81.4                  55                            6                                2                   39                           138                   259.0
11002002                    49.5              7.0              63.0                  18                            4                                1                   13                            50                   126.4
11003003                    86.3             16.4        

In [295]:
# FINAL SCORE 1: TOTAL Primary Adult Score (Combines Practices/Groups + Clinics)
district_final_features['total_primary_adult_score'] = (
    district_final_features['primary_adult_score'] + 
    district_final_features['primary_care_adult_services']
).astype(float)

# FINAL SCORE 2: TOTAL Pediatric Score (Combines Practices/Groups + Clinics)
district_final_features['total_pediatric_score'] = (
    district_final_features['pediatric_score'] + 
    district_final_features['primary_care_pediatric_services']
).astype(float)

# FINAL SCORE 3: TOTAL Specialist Score (Already created and final)
# district_final_features['specialist_score_total'] is already the final score.


print("\n FINAL Feature Table with Combined Scores ")
print(district_final_features[['total_primary_adult_score', 
                               'total_pediatric_score', 
                               'specialist_score_total']].to_string())


 FINAL Feature Table with Combined Scores 
             total_primary_adult_score  total_pediatric_score  specialist_score_total
district_id                                                                          
11001001                          55.8                   22.2                   259.0
11002002                          53.5                    8.0                   126.4
11003003                          93.3                   17.4                   187.2
11004004                          77.3                   15.8                   241.2
11005005                          13.7                    5.7                    67.6
11006006                          54.8                   15.4                   186.1
11007007                          84.6                   19.8                   172.9
11008008                          72.3                   13.0                    86.8
11009009                          54.8                    8.0                   113.0
11010010  

In [296]:
#Save healthcare_features
save_path = Path('../clean/healthcare_features.csv')
district_final_features.to_csv(save_path, index=True, encoding='utf-8-sig')
print(f"\n Final Feature Table successfully saved to '{save_path}'")


 Final Feature Table successfully saved to '..\clean\healthcare_features.csv'


In [297]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1615 non-null   object 
 1   street                  1611 non-null   object 
 2   housenumber             1307 non-null   object 
 3   city                    1615 non-null   object 
 4   postcode                1545 non-null   object 
 5   speciality              1615 non-null   object 
 6   opening_hours           1116 non-null   object 
 7   website                 904 non-null    object 
 8   id                      1615 non-null   object 
 9   longitude               1615 non-null   float64
 10  latitude                1615 non-null   float64
 11  wheelchair              573 non-null    object 
 12  description             102 non-null    object 
 13  email                   188 non-null    object 
 14  toilets_wheelchair      36 non-null     

In [298]:
df = df.drop(columns=['capacity_score'], errors='ignore')

Validation & Quality Checks:
  - Check for duplicate rows. 
  - Check final row count.

In [299]:
df.duplicated().sum()

np.int64(0)

In [300]:
df['district_id'].isnull().sum()

np.int64(0)

In [301]:
df['neighborhood_id'].isnull().sum()

np.int64(0)

In [302]:
df['id'].nunique()

1615

In [303]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1615 non-null   object 
 1   street                  1611 non-null   object 
 2   housenumber             1307 non-null   object 
 3   city                    1615 non-null   object 
 4   postcode                1545 non-null   object 
 5   speciality              1615 non-null   object 
 6   opening_hours           1116 non-null   object 
 7   website                 904 non-null    object 
 8   id                      1615 non-null   object 
 9   longitude               1615 non-null   float64
 10  latitude                1615 non-null   float64
 11  wheelchair              573 non-null    object 
 12  description             102 non-null    object 
 13  email                   188 non-null    object 
 14  toilets_wheelchair      36 non-null     