In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
from time import sleep

# -------- Input file --------
kg_path = "kindergartens_berlin_final (1).csv"

# -------- Read the kindergarten table --------
kg = pd.read_csv(kg_path)

# 1) Expose the key column as `kindergarten_id`: a plain `id` column is
#    renamed, unless a `kindergarten_id` column already exists.
if "kindergarten_id" not in kg.columns and "id" in kg.columns:
    kg = kg.rename(columns={"id": "kindergarten_id"})
    kg = kg.copy()

# 2) Official Berlin district (Bezirk) -> official district_id map.
#    The Bezirke are listed in numbering order (1..12); each id is "11"
#    followed by the three-digit Bezirk number written twice,
#    e.g. Mitte (1) -> "11001001", Reinickendorf (12) -> "11012012".
DISTRICT_ID_MAP = {
    bezirk: f"11{number:03d}{number:03d}"
    for number, bezirk in enumerate(
        (
            "Mitte",
            "Friedrichshain-Kreuzberg",
            "Pankow",
            "Charlottenburg-Wilmersdorf",
            "Spandau",
            "Steglitz-Zehlendorf",
            "Tempelhof-Schöneberg",
            "Neukölln",
            "Treptow-Köpenick",
            "Marzahn-Hellersdorf",
            "Lichtenberg",
            "Reinickendorf",
        ),
        start=1,
    )
}
# Set of the 12 Bezirk names, used for fast membership checks.
OFFICIAL_BEZIRKE = set(DISTRICT_ID_MAP)

# 3) Ortsteil (locality) -> Bezirk mapping (no postcodes).
#    Ortsteile whose name equals their Bezirk (Mitte, Spandau, Neukölln,
#    Lichtenberg, Reinickendorf) need no entry here: those names already match
#    the Bezirk names directly. "Pankow" is listed anyway, which is harmless.
#    Compared with the earlier version this adds the official Ortsteile that
#    were missing (Blankenburg, Blankenfelde, Stadtrandsiedlung Malchow,
#    Lübars, Müggelheim), so rows carrying those names can be resolved.
ortsteil_to_bezirk = {
    # Mitte
    "Moabit": "Mitte",
    "Tiergarten": "Mitte",
    "Gesundbrunnen": "Mitte",
    "Wedding": "Mitte",
    "Hansaviertel": "Mitte",
    # Reinickendorf
    "Wittenau": "Reinickendorf",
    "Tegel": "Reinickendorf",
    "Hermsdorf": "Reinickendorf",
    "Frohnau": "Reinickendorf",
    "Konradshöhe": "Reinickendorf",
    "Heiligensee": "Reinickendorf",
    "Waidmannslust": "Reinickendorf",
    "Märkisches Viertel": "Reinickendorf",
    "Borsigwalde": "Reinickendorf",
    "Lübars": "Reinickendorf",
    # Charlottenburg-Wilmersdorf
    "Charlottenburg": "Charlottenburg-Wilmersdorf",
    "Charlottenburg-Nord": "Charlottenburg-Wilmersdorf",
    "Wilmersdorf": "Charlottenburg-Wilmersdorf",
    "Schmargendorf": "Charlottenburg-Wilmersdorf",
    "Grunewald": "Charlottenburg-Wilmersdorf",
    "Halensee": "Charlottenburg-Wilmersdorf",
    "Westend": "Charlottenburg-Wilmersdorf",
    # Tempelhof-Schöneberg
    "Tempelhof": "Tempelhof-Schöneberg",
    "Schöneberg": "Tempelhof-Schöneberg",
    "Friedenau": "Tempelhof-Schöneberg",
    "Mariendorf": "Tempelhof-Schöneberg",
    "Marienfelde": "Tempelhof-Schöneberg",
    "Lichtenrade": "Tempelhof-Schöneberg",
    # Steglitz-Zehlendorf
    "Steglitz": "Steglitz-Zehlendorf",
    "Lichterfelde": "Steglitz-Zehlendorf",
    "Zehlendorf": "Steglitz-Zehlendorf",
    "Dahlem": "Steglitz-Zehlendorf",
    "Nikolassee": "Steglitz-Zehlendorf",
    "Wannsee": "Steglitz-Zehlendorf",
    "Schlachtensee": "Steglitz-Zehlendorf",
    "Lankwitz": "Steglitz-Zehlendorf",
    # Pankow
    "Prenzlauer Berg": "Pankow",
    "Weißensee": "Pankow",
    "Niederschönhausen": "Pankow",
    "Pankow": "Pankow",
    "Heinersdorf": "Pankow",
    "Karow": "Pankow",
    "Buch": "Pankow",
    "Französisch Buchholz": "Pankow",
    "Wilhelmsruh": "Pankow",
    "Rosenthal": "Pankow",
    "Blankenburg": "Pankow",
    "Blankenfelde": "Pankow",
    "Stadtrandsiedlung Malchow": "Pankow",
    # Neukölln
    "Britz": "Neukölln",
    "Buckow": "Neukölln",
    "Rudow": "Neukölln",
    "Gropiusstadt": "Neukölln",
    # Treptow-Köpenick
    "Alt-Treptow": "Treptow-Köpenick",
    "Plänterwald": "Treptow-Köpenick",
    "Baumschulenweg": "Treptow-Köpenick",
    "Niederschöneweide": "Treptow-Köpenick",
    "Johannisthal": "Treptow-Köpenick",
    "Adlershof": "Treptow-Köpenick",
    "Oberschöneweide": "Treptow-Köpenick",
    "Spindlersfeld": "Treptow-Köpenick",
    "Altglienicke": "Treptow-Köpenick",
    "Bohnsdorf": "Treptow-Köpenick",
    "Grünau": "Treptow-Köpenick",
    "Köpenick": "Treptow-Köpenick",
    "Friedrichshagen": "Treptow-Köpenick",
    "Rahnsdorf": "Treptow-Köpenick",
    "Schmöckwitz": "Treptow-Köpenick",
    "Müggelheim": "Treptow-Köpenick",
    # Marzahn-Hellersdorf
    "Marzahn": "Marzahn-Hellersdorf",
    "Hellersdorf": "Marzahn-Hellersdorf",
    "Biesdorf": "Marzahn-Hellersdorf",
    "Kaulsdorf": "Marzahn-Hellersdorf",
    "Mahlsdorf": "Marzahn-Hellersdorf",
    "Hönow": "Marzahn-Hellersdorf",
    # Lichtenberg
    "Fennpfuhl": "Lichtenberg",
    "Rummelsburg": "Lichtenberg",
    "Friedrichsfelde": "Lichtenberg",
    "Karlshorst": "Lichtenberg",
    "Alt-Hohenschönhausen": "Lichtenberg",
    "Neu-Hohenschönhausen": "Lichtenberg",
    "Malchow": "Lichtenberg",
    "Wartenberg": "Lichtenberg",
    "Falkenberg": "Lichtenberg",
    # Friedrichshain-Kreuzberg
    "Friedrichshain": "Friedrichshain-Kreuzberg",
    "Kreuzberg": "Friedrichshain-Kreuzberg",
    # Spandau
    "Haselhorst": "Spandau",
    "Siemensstadt": "Spandau",
    "Staaken": "Spandau",
    "Wilhelmstadt": "Spandau",
    "Hakenfelde": "Spandau",
    "Falkenhagener Feld": "Spandau",
    "Kladow": "Spandau",
    "Gatow": "Spandau",
}

def _norm(s):
    if pd.isna(s):
        return None
    return str(s).strip()

# Keep the incoming values under *_raw, then derive stripped *_norm copies.
# `kg.get` yields None for an absent column, so the *_raw column is then all None.
for field in ("district", "neighborhood"):
    kg[f"{field}_raw"] = kg.get(field)
for field in ("district", "neighborhood"):
    kg[f"{field}_norm"] = kg[f"{field}_raw"].apply(_norm)

# 4) Resolve official Bezirk (district = 12 only)
def resolve_bezirk(row):
    """Return the Bezirk name for *row*, or ``None`` if it cannot be resolved.

    Reads ``row["district_norm"]`` and ``row["neighborhood_norm"]``.  A value
    that is itself a Bezirk name wins (district checked before neighborhood);
    otherwise the values are looked up as Ortsteil names, again district first.
    """
    candidates = (row["district_norm"], row["neighborhood_norm"])

    # Pass 1: a field already holding a Bezirk name.
    for value in candidates:
        if value in OFFICIAL_BEZIRKE:
            return value

    # Pass 2: a field holding an Ortsteil name that maps to a Bezirk.
    for value in candidates:
        if value in ortsteil_to_bezirk:
            return ortsteil_to_bezirk[value]

    return None

# Resolve every row to a Bezirk name (row-wise apply), then translate the
# name into its id via DISTRICT_ID_MAP.
kg["district"] = kg.apply(resolve_bezirk, axis="columns")
kg["district_id"] = kg["district"].map(DISTRICT_ID_MAP)

# 5) Reverse geocode neighborhood from lat/lon (Ortsteil/Kiez only)
#    Priority: suburb -> neighbourhood -> quarter -> city_district
# An explicit timeout is set because geopy's default is only 1 second, which
# the public Nominatim service can exceed; a timed-out request raises and
# would otherwise end up as a missing neighborhood.
geolocator = Nominatim(
    user_agent="berlin_kindergartens_neighborhood_extractor",
    timeout=10,
)
_cache = {}  # (lat, lon) -> neighborhood

def get_neighborhood(lat, lon):
    """Reverse-geocode a coordinate pair to a locality name.

    The pair is looked up in the module-level ``_cache`` first; otherwise
    ``geolocator.reverse`` is queried (results requested in German).  From the
    returned address the first non-empty field among ``suburb``,
    ``neighbourhood``, ``quarter`` and ``city_district`` is used.

    Args:
        lat: Latitude passed to the geocoder.
        lon: Longitude passed to the geocoder.

    Returns:
        The whitespace-stripped locality name, or ``None`` when the geocoder
        returned no usable field or the request failed.  Failed requests emit
        a ``RuntimeWarning`` and are not cached, so they can be retried.
    """
    import warnings

    key = (lat, lon)
    if key in _cache:
        return _cache[key]

    try:
        location = geolocator.reverse((lat, lon), exactly_one=True, language="de")
    except Exception as exc:
        # Best effort: one failed lookup must not abort the whole run.  The
        # failure is reported and deliberately NOT cached, because errors such
        # as timeouts are transient and a later call may succeed.
        warnings.warn(
            f"Reverse geocoding failed for {key}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None
    finally:
        # Runs after every request, successful or not, so failed requests are
        # throttled as well (respect Nominatim rate limits).
        sleep(1)

    address = {}
    if location:
        address = location.raw.get("address") or {}
    name = (
        address.get("suburb")
        or address.get("neighbourhood")
        or address.get("quarter")
        or address.get("city_district")
    )
    # Non-string values and whitespace-only strings count as "not found".
    name = (name.strip() or None) if isinstance(name, str) else None
    _cache[key] = name
    return name

# Overwrite `neighborhood` with the reverse-geocoded locality whenever the
# table has both coordinate columns.
if "latitude" in kg.columns and "longitude" in kg.columns:
    # Rows lacking either coordinate get None without a geocoder call.
    kg["neighborhood"] = [
        get_neighborhood(lat, lon) if pd.notna(lat) and pd.notna(lon) else None
        for lat, lon in zip(kg["latitude"], kg["longitude"])
    ]
else:
    # No coordinate columns: fall back to the stripped input values
    # (which may be messy).
    kg["neighborhood"] = kg["neighborhood_norm"]

# 6) Postal codes: a value of 0 is replaced by the label "Unknown".
if "postal_code" in kg.columns:
    kg["postal_code"] = kg["postal_code"].replace(to_replace=0, value="Unknown")

# 7) Output schema, in export order
#    (phone_number, email, website and coordinates are intentionally left out).
final_cols = [
    "kindergarten_id",
    "district_id",
    "name",
    "address",
    "postal_code",
    "latitude",
    "longitude",
    "neighborhood",
    "district",
    "operator",
    "capacity",
    "source",
    "created_at",
    "updated_at",
]
# Any schema column the input lacks is created empty so the selection below
# cannot fail on a missing key.
for c in final_cols:
    if c in kg.columns:
        continue
    kg[c] = None

final_df = kg[final_cols]

# 8) Quality report: row count plus the number of missing values per derived column
print(f"Total rows: {len(final_df)}")
print(f"Rows missing district: {final_df['district'].isna().sum()}")
print(f"Rows missing district_id: {final_df['district_id'].isna().sum()}")
print(f"Rows missing neighborhood (after geocoding): {final_df['neighborhood'].isna().sum()}")

# 9) Write the cleaned table to CSV (without the index column)
out_path = "kindergartens_berlin_final_ready.csv"
final_df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")




Total rows: 2298
Rows missing district: 636
Rows missing district_id: 636
Rows missing neighborhood (after geocoding): 1165
Saved: kindergartens_berlin_final_ready.csv
