Imports & Constants

In [1]:
import requests
import csv
import json
from typing import Any, Dict, List, Tuple

# Constants
DEFAULT_ENDPOINT = "https://overpass-api.de/api/interpreter"
USER_AGENT = "BerlinDentistsFetcher/1.0 (jaywindie1@gmail.com)" 
OVERPASS_QUERY = """
[out:json][timeout:60];
area["name"="Berlin"]["admin_level"="4"]->.berlin;
(
  node["healthcare"="dentist"](area.berlin);
  way["healthcare"="dentist"](area.berlin);
  relation["healthcare"="dentist"](area.berlin);
);
out center;
"""


Fetch Data from Overpass API

In [2]:
def fetch_overpass(query: str, endpoint: str = DEFAULT_ENDPOINT) -> Dict[str, Any]:
    headers = {"User-Agent": USER_AGENT}
    response = requests.post(endpoint, data={"data": query}, headers=headers)
    response.raise_for_status()
    return response.json()

# Test fetch (small run)
data = fetch_overpass(OVERPASS_QUERY)
print(f"Retrieved {len(data.get('elements', []))} elements.")

Retrieved 781 elements.


Normalize Elements (Flatten Into Rows)

In [3]:
def normalize_element(e: Dict[str, Any]) -> Dict[str, Any]:
    tags = e.get("tags", {})
    # Some elements are 'node', some are 'way' or 'relation' with a 'center'
    lat = e.get("lat") or (e.get("center", {}).get("lat") if "center" in e else None)
    lon = e.get("lon") or (e.get("center", {}).get("lon") if "center" in e else None)
    
    return {
        "osm_type": e.get("type"),
        "osm_id": e.get("id"),
        "name": tags.get("name"),
        "addr_street": tags.get("addr:street"),
        "addr_housenumber": tags.get("addr:housenumber"),
        "addr_postcode": tags.get("addr:postcode"),
        "addr_city": tags.get("addr:city"),
        "opening_hours": tags.get("opening_hours"),
        "wheelchair": tags.get("wheelchair"),
        "phone": tags.get("phone"),
        "email": tags.get("email"),
        "website": tags.get("website"),
        "lat": lat,
        "lon": lon,
    }

# Normalize some elements
elements = data.get("elements", [])
rows = [normalize_element(e) for e in elements]
print(f"Normalized {len(rows)} elements. Example:\n", rows[0] if rows else "No data")

Normalized 781 elements. Example:
 {'osm_type': 'node', 'osm_id': 304183504, 'name': None, 'addr_street': 'Hönower Straße', 'addr_housenumber': '75', 'addr_postcode': '12623', 'addr_city': 'Berlin', 'opening_hours': None, 'wheelchair': None, 'phone': None, 'email': None, 'website': None, 'lat': 52.5114112, 'lon': 13.612096}


Convert to GeoJSON

In [4]:
def elements_to_geojson(elements: List[Dict[str, Any]]) -> Dict[str, Any]:
    features = []
    for e in elements:
        tags = e.get("tags", {})
        lat = e.get("lat") or (e.get("center", {}).get("lat") if "center" in e else None)
        lon = e.get("lon") or (e.get("center", {}).get("lon") if "center" in e else None)
        if lat is None or lon is None:
            continue
        features.append({
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [lon, lat]},
            "properties": {
                "osm_type": e.get("type"),
                "osm_id": e.get("id"),
                **tags
            }
        })
    return {"type": "FeatureCollection", "features": features}

geojson = elements_to_geojson(elements)
print(f"GeoJSON with {len(geojson['features'])} features created.")


GeoJSON with 781 features created.


Save to CSV & GeoJSON

In [5]:
def write_csv(rows: List[Dict[str, Any]], path: str) -> None:
    fieldnames = [
        "osm_type", "osm_id", "name", "addr_street", "addr_housenumber",
        "addr_postcode", "addr_city", "opening_hours", "wheelchair",
        "phone", "email", "website", "lat", "lon"
    ]
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for el in rows:
            writer.writerow({k: el.get(k) for k in fieldnames})

def write_geojson(geo: Dict[str, Any], path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(geo, f, ensure_ascii=False, indent=2)

# Save both files
csv_path = "berlin_dentists.csv"
geojson_path = "berlin_dentists.geojson"

write_csv(rows, csv_path)
write_geojson(geojson, geojson_path)

print(f"Saved CSV to {csv_path}")
print(f"Saved GeoJSON to {geojson_path}")

Saved CSV to berlin_dentists.csv
Saved GeoJSON to berlin_dentists.geojson


Quick Data Preview

In [6]:
import pandas as pd

df = pd.DataFrame(rows)
df.head()  # show first 5 dentists

Unnamed: 0,osm_type,osm_id,name,addr_street,addr_housenumber,addr_postcode,addr_city,opening_hours,wheelchair,phone,email,website,lat,lon
0,node,304183504,,Hönower Straße,75,12623.0,Berlin,,,,,,52.511411,13.612096
1,node,313539258,Zahnzentrum Wedding,Müllerstraße,34a,13353.0,Berlin,Mo 09:00-19:00; Tu 09:00-18:00; We 09:00-17:00...,yes,,,,52.548838,13.355305
2,node,325161442,A. Nejad,,,,,Mo-Tu 09:00-19:00; We 09:00-14:00; Th 09:00-19...,yes,+49 30 361 91 06,,,52.508843,13.180477
3,node,345236220,Dr. Beate Lengert,Kurfürstendamm,218,10719.0,Berlin,,,,,http://www.dr-beate-lengert.de/,52.502722,13.328137
4,node,391394177,Serpil Hartfiel,Kollwitzstraße,77,10435.0,Berlin,"Mo,Tu,Th 08:00-19:00; We 18:00-18:00; Fr 08:00...",no,,,,52.537547,13.418994


Imports & File Setup

In [7]:
import pandas as pd
import os

# Path to the raw CSV produced from Overpass
input_file = "/Users/jamie/berlin_dentists.csv"

# Load the CSV
df = pd.read_csv(input_file)
df.head()

Unnamed: 0,osm_type,osm_id,name,addr_street,addr_housenumber,addr_postcode,addr_city,opening_hours,wheelchair,phone,email,website,lat,lon
0,node,304183504,,Hönower Straße,75,12623.0,Berlin,,,,,,52.511411,13.612096
1,node,313539258,Zahnzentrum Wedding,Müllerstraße,34a,13353.0,Berlin,Mo 09:00-19:00; Tu 09:00-18:00; We 09:00-17:00...,yes,,,,52.548838,13.355305
2,node,325161442,A. Nejad,,,,,Mo-Tu 09:00-19:00; We 09:00-14:00; Th 09:00-19...,yes,+49 30 361 91 06,,,52.508843,13.180477
3,node,345236220,Dr. Beate Lengert,Kurfürstendamm,218,10719.0,Berlin,,,,,http://www.dr-beate-lengert.de/,52.502722,13.328137
4,node,391394177,Serpil Hartfiel,Kollwitzstraße,77,10435.0,Berlin,"Mo,Tu,Th 08:00-19:00; We 18:00-18:00; Fr 08:00...",no,,,,52.537547,13.418994


Inspect Raw Data

In [8]:
print("Initial Data Info:")
print(df.info())
df.head()  # preview first 5 rows

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   osm_type          781 non-null    object 
 1   osm_id            781 non-null    int64  
 2   name              755 non-null    object 
 3   addr_street       571 non-null    object 
 4   addr_housenumber  571 non-null    object 
 5   addr_postcode     531 non-null    float64
 6   addr_city         523 non-null    object 
 7   opening_hours     589 non-null    object 
 8   wheelchair        288 non-null    object 
 9   phone             287 non-null    object 
 10  email             82 non-null     object 
 11  website           299 non-null    object 
 12  lat               781 non-null    float64
 13  lon               781 non-null    float64
dtypes: float64(3), int64(1), object(10)
memory usage: 85.6+ KB
None


Unnamed: 0,osm_type,osm_id,name,addr_street,addr_housenumber,addr_postcode,addr_city,opening_hours,wheelchair,phone,email,website,lat,lon
0,node,304183504,,Hönower Straße,75,12623.0,Berlin,,,,,,52.511411,13.612096
1,node,313539258,Zahnzentrum Wedding,Müllerstraße,34a,13353.0,Berlin,Mo 09:00-19:00; Tu 09:00-18:00; We 09:00-17:00...,yes,,,,52.548838,13.355305
2,node,325161442,A. Nejad,,,,,Mo-Tu 09:00-19:00; We 09:00-14:00; Th 09:00-19...,yes,+49 30 361 91 06,,,52.508843,13.180477
3,node,345236220,Dr. Beate Lengert,Kurfürstendamm,218,10719.0,Berlin,,,,,http://www.dr-beate-lengert.de/,52.502722,13.328137
4,node,391394177,Serpil Hartfiel,Kollwitzstraße,77,10435.0,Berlin,"Mo,Tu,Th 08:00-19:00; We 18:00-18:00; Fr 08:00...",no,,,,52.537547,13.418994


Normalize Text Fields

In [9]:
import pandas as pd
df['name'] = df['name'].str.strip().str.title()
df['addr_street'] = df['addr_street'].str.strip().str.title()
df['addr_housenumber'] = df['addr_housenumber'].astype('string')
df['addr_city'] = df['addr_city'].fillna('Berlin').str.strip().str.title()
df['addr_postcode'] = df['addr_postcode'].astype('string')
# 6. Build full_address with custom function
def build_address(row):
    parts = row.dropna().astype(str)
    if parts.empty:
        return pd.NA   # leave as missing if all fields are NaN
    return ', '.join(parts)

address_cols = ['addr_street', 'addr_housenumber', 'addr_postcode', 'addr_city']
df['full_address'] = df[address_cols].agg(build_address, axis=1)
print(df.head())

  osm_type     osm_id                 name     addr_street addr_housenumber  \
0     node  304183504                  NaN  Hönower Straße               75   
1     node  313539258  Zahnzentrum Wedding    Müllerstraße              34a   
2     node  325161442             A. Nejad             NaN             <NA>   
3     node  345236220    Dr. Beate Lengert  Kurfürstendamm              218   
4     node  391394177      Serpil Hartfiel  Kollwitzstraße               77   

  addr_postcode addr_city                                      opening_hours  \
0       12623.0    Berlin                                                NaN   
1       13353.0    Berlin  Mo 09:00-19:00; Tu 09:00-18:00; We 09:00-17:00...   
2          <NA>    Berlin  Mo-Tu 09:00-19:00; We 09:00-14:00; Th 09:00-19...   
3       10719.0    Berlin                                                NaN   
4       10435.0    Berlin  Mo,Tu,Th 08:00-19:00; We 18:00-18:00; Fr 08:00...   

  wheelchair             phone email        

Drop duplicates only if ALL these fields match (to avoid dropping distinct locations sharing the same name):

In [10]:
fields_for_duplicates = [
    "name",
    "addr_street",
    "addr_housenumber",
    "addr_postcode",
    "addr_city",
    "lat",
    "lon"
]

before = len(df)
df = df.drop_duplicates(subset=fields_for_duplicates)
after = len(df)

print(f"Removed {before - after} duplicates based on name + full location. {after} rows remain.")

Removed 0 duplicates based on name + full location. 781 rows remain.


Normalize Phone Numbers

In [11]:
df['phone'] = (
    df['phone']
    .fillna('')
    .astype(str)
    .str.replace(r"\s+", "", regex=True)   # remove spaces
    .str.replace("-", "")                  # remove hyphens
)

df[['name', 'phone']].head()

Unnamed: 0,name,phone
0,,
1,Zahnzentrum Wedding,
2,A. Nejad,49303619106.0
3,Dr. Beate Lengert,
4,Serpil Hartfiel,


Standardize Wheelchair Field

In [12]:
df['wheelchair'] = df['wheelchair'].fillna('unknown').str.lower()
df['wheelchair'].value_counts()

wheelchair
unknown    493
no         154
yes         93
limited     41
Name: count, dtype: int64

Ensure Coordinates Are Valid

Ensure Coordinates Are Valid

In [13]:
for col in ['lat', 'lon']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows without valid coordinates
before = len(df)
df = df.dropna(subset=['lat', 'lon'])
after = len(df)

print(f"Removed {before - after} rows due to missing/invalid coordinates.")

Removed 0 rows due to missing/invalid coordinates.


Save Clean Data

In [14]:
output_file = os.path.join(os.path.dirname(input_file), "berlin_dentists_clean.csv")
df.to_csv(output_file, index=False)
print(f"✅ Transformed data saved to {output_file}")

✅ Transformed data saved to /Users/jamie/berlin_dentists_clean.csv


Preview Clean Dataset

In [15]:
df.sample(5, random_state=42)  # preview 5 random dentists

Unnamed: 0,osm_type,osm_id,name,addr_street,addr_housenumber,addr_postcode,addr_city,opening_hours,wheelchair,phone,email,website,lat,lon,full_address
595,node,9786588756,Praxis Für Zahnmedizin,Wildenbruchstraße,14,12045.0,Berlin,"Mo 09:00-16:00; Tu,Th 09:00-18:30; We 09:00-14...",no,,,,52.483908,13.443282,"Wildenbruchstraße, 14, 12045.0, Berlin"
587,node,9631262555,,Gasteiner Straße,9,10717.0,Berlin,,unknown,,,http://www.zahnarztpraxis-wilmersdorf.de/,52.487791,13.322032,"Gasteiner Straße, 9, 10717.0, Berlin"
543,node,9434234316,Za Dembinski/Reinnagel,Kissingenstraße,45,13189.0,Berlin,Mo-Th 08:00-18:00; Fr 08:00-11:00,no,,,http://www.pankow-zahnarzt.de/,52.56683,13.416816,"Kissingenstraße, 45, 13189.0, Berlin"
645,node,10608880228,Zenker & Partner,Ferdinandstraße,21,12209.0,Berlin,"Mo-Fr 08:30-12:30,13:30-18:30",unknown,,,,52.425322,13.330399,"Ferdinandstraße, 21, 12209.0, Berlin"
487,node,8422865841,Ladewig & Ladewig,Berliner Straße,9,10715.0,Berlin,"Mo,Tu,Th 08:00-12:00,14:30-18:00; We 08:00-14:...",unknown,,,https://www.zahnarzt-ladewig.de/,52.487699,13.332421,"Berliner Straße, 9, 10715.0, Berlin"


download and install geopy so it can be used for geocoding etc

In [16]:
!pip install geopy



In [17]:
!pip install geopandas shapely pyproj fiona rtree



downloading the GeoJSON file with Berlin district boundaries

In [18]:
import requests

url = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
resp = requests.get(url)
with open("berlin_bezirksgeo.json", "wb") as f:
    f.write(resp.content)
print("GeoJSON downloaded as berlin_bezirksgeo.json")

GeoJSON downloaded as berlin_bezirksgeo.json


Imports and Read Files

In [19]:
import geopandas as gpd
from shapely.geometry import Point

# Load the Berlin districts GeoJSON
districts = gpd.read_file("berlin_bezirksgeo.json")

# Load your dentists cleaned CSV with lat/lon
dentists_df = gpd.read_file("berlin_dentists_clean.csv")  # We'll convert to GeoDataFrame later

print(f"Districts count: {len(districts)}")
print(f"Dentists count: {len(dentists_df)}")

Districts count: 12
Dentists count: 781


Prepare Dentists GeoDataFrame

In [20]:
# Create Point geometry from lat/lon
geometry = [Point(xy) for xy in zip(dentists_df['lon'], dentists_df['lat'])]

# Create GeoDataFrame of dentists with proper CRS (Coordinate Reference System)
dentists = gpd.GeoDataFrame(dentists_df, geometry=geometry, crs="EPSG:4326")

print(dentists.head())


  osm_type     osm_id                 name     addr_street addr_housenumber  \
0     node  304183504                       Hönower Straße               75   
1     node  313539258  Zahnzentrum Wedding    Müllerstraße              34a   
2     node  325161442             A. Nejad                                    
3     node  345236220    Dr. Beate Lengert  Kurfürstendamm              218   
4     node  391394177      Serpil Hartfiel  Kollwitzstraße               77   

  addr_postcode addr_city                                      opening_hours  \
0       12623.0    Berlin                                                      
1       13353.0    Berlin  Mo 09:00-19:00; Tu 09:00-18:00; We 09:00-17:00...   
2                  Berlin  Mo-Tu 09:00-19:00; We 09:00-14:00; Th 09:00-19...   
3       10719.0    Berlin                                                      
4       10435.0    Berlin  Mo,Tu,Th 08:00-19:00; We 18:00-18:00; Fr 08:00...   

  wheelchair         phone email            

Ensuring Both GeoDataFrames Use Same CRS

In [21]:
# Check CRS of districts and dentists
print("Districts CRS:", districts.crs)
print("Dentists CRS:", dentists.crs)

# Reproject dentists to district CRS if needed
if dentists.crs != districts.crs:
    dentists = dentists.to_crs(districts.crs)
    print("Dentists reprojected to match districts CRS")

print("After reprojection, dentists CRS:", dentists.crs)

Districts CRS: EPSG:4326
Dentists CRS: EPSG:4326
After reprojection, dentists CRS: EPSG:4326


Performing Spatial Join to Add District Info to Dentists

In [22]:
# Spatial join: assign each dentist a district polygon it falls within
dentists_with_districts = gpd.sjoin(dentists, districts, how="left", predicate="within")

# Inspect new columns from districts GeoDataFrame, e.g., 'Gemeinde_name' for district name
print(dentists_with_districts.columns)
print(dentists_with_districts[['name', 'Gemeinde_name']].head())

Index(['osm_type', 'osm_id', 'name', 'addr_street', 'addr_housenumber',
       'addr_postcode', 'addr_city', 'opening_hours', 'wheelchair', 'phone',
       'email', 'website', 'lat', 'lon', 'full_address', 'geometry',
       'index_right', 'gml_id', 'Gemeinde_name', 'Gemeinde_schluessel',
       'Land_name', 'Land_schluessel', 'Schluessel_gesamt'],
      dtype='object')
                  name               Gemeinde_name
0                              Marzahn-Hellersdorf
1  Zahnzentrum Wedding                       Mitte
2             A. Nejad                     Spandau
3    Dr. Beate Lengert  Charlottenburg-Wilmersdorf
4      Serpil Hartfiel                      Pankow


Dropping columns

In [23]:
dentists_with_districts.drop(
    columns=['index_right', 'gml_id', 'Schluessel_gesamt', 'Gemeinde_schluessel','Land_name'],
    inplace=True
)

Renaming columns with German Heads to English

In [24]:
dentists_with_districts.rename(columns={
    'Gemeinde_name': 'district_name',
    'Land_schluessel': 'addr_city_code'
}, inplace=True)

Mapping district names to official district IDs using hardcoded dictionary:

In [25]:
# District mapping (official codes as strings)
district_mapping = {
    'Mitte': '11001001',
    'Friedrichshain-Kreuzberg': '11002002',
    'Pankow': '11003003',
    'Charlottenburg-Wilmersdorf': '11004004',
    'Spandau': '11005005',
    'Steglitz-Zehlendorf': '11006006',
    'Tempelhof-Schöneberg': '11007007',
    'Neukölln': '11008008',
    'Treptow-Köpenick': '11009009',
    'Marzahn-Hellersdorf': '11010010',
    'Lichtenberg': '11011011',
    'Reinickendorf': '11012012'
}

# Apply mapping to create district_id column
dentists_with_districts['district_id'] = (
    dentists_with_districts['district_name']
    .map(district_mapping)
    .astype(str)
)

# Optional: check unmapped districts
unmapped = dentists_with_districts[
    ~dentists_with_districts['district_name'].isin(district_mapping.keys())
]['district_name'].unique()

if len(unmapped) > 0:
    print("⚠️ Unmapped districts found:", unmapped)

print(dentists_with_districts[['name', 'district_name', 'district_id']].head())

                  name               district_name district_id
0                              Marzahn-Hellersdorf    11010010
1  Zahnzentrum Wedding                       Mitte    11001001
2             A. Nejad                     Spandau    11005005
3    Dr. Beate Lengert  Charlottenburg-Wilmersdorf    11004004
4      Serpil Hartfiel                      Pankow    11003003


Save Joined Result to CSV

In [26]:
# Save final enriched dataframe
output_file = "berlin_dentists_with_districts.csv"
dentists_with_districts.to_csv(output_file, index=False)
print(f"✅ Saved dentists with districts info to {output_file}")

✅ Saved dentists with districts info to berlin_dentists_with_districts.csv
