In [2]:
import pandas as pd

# Input files
kg_path = "/mnt/data/kindergartens_berlin_final (1).csv"
districts_path = "/mnt/data/districts_202508191131.csv"

# Load data
kg = pd.read_csv('/content/kindergartens_berlin_final (1).csv')
districts = pd.read_csv('/content/districts_202508191131.csv')

# 1) Rename id -> kindergarten_id
kg = kg.rename(columns={"id": "kindergarten_id"}).copy()

# 2) Ensure districts has district_id (stable 2-digit ids if missing)
if "district_id" not in districts.columns:
    districts = districts.sort_values("district").reset_index(drop=True)
    districts.insert(0, "district_id", [f"{i:02d}" for i in range(1, len(districts) + 1)])

official_bezirke = set(districts["district"].astype(str).str.strip())

# 3) Ortsteil (locality) -> Bezirk (borough) mapping
ortsteil_to_bezirk = {
    # Mitte
    "Moabit": "Mitte", "Tiergarten": "Mitte", "Gesundbrunnen": "Mitte", "Wedding": "Mitte", "Hansaviertel": "Mitte",
    # Reinickendorf
    "Wittenau": "Reinickendorf", "Tegel": "Reinickendorf", "Hermsdorf": "Reinickendorf", "Frohnau": "Reinickendorf",
    "Konradshöhe": "Reinickendorf", "Heiligensee": "Reinickendorf", "Waidmannslust": "Reinickendorf",
    "Märkisches Viertel": "Reinickendorf", "Borsigwalde": "Reinickendorf",
    # Charlottenburg-Wilmersdorf
    "Charlottenburg": "Charlottenburg-Wilmersdorf", "Charlottenburg-Nord": "Charlottenburg-Wilmersdorf",
    "Wilmersdorf": "Charlottenburg-Wilmersdorf", "Schmargendorf": "Charlottenburg-Wilmersdorf",
    "Grunewald": "Charlottenburg-Wilmersdorf", "Halensee": "Charlottenburg-Wilmersdorf",
    "Westend": "Charlottenburg-Wilmersdorf",
    # Tempelhof-Schöneberg
    "Tempelhof": "Tempelhof-Schöneberg", "Schöneberg": "Tempelhof-Schöneberg", "Friedenau": "Tempelhof-Schöneberg",
    "Mariendorf": "Tempelhof-Schöneberg", "Marienfelde": "Tempelhof-Schöneberg", "Lichtenrade": "Tempelhof-Schöneberg",
    # Steglitz-Zehlendorf
    "Steglitz": "Steglitz-Zehlendorf", "Lichterfelde": "Steglitz-Zehlendorf", "Zehlendorf": "Steglitz-Zehlendorf",
    "Dahlem": "Steglitz-Zehlendorf", "Nikolassee": "Steglitz-Zehlendorf", "Wannsee": "Steglitz-Zehlendorf",
    "Schlachtensee": "Steglitz-Zehlendorf", "Lankwitz": "Steglitz-Zehlendorf",
    # Pankow
    "Prenzlauer Berg": "Pankow", "Weißensee": "Pankow", "Niederschönhausen": "Pankow", "Pankow": "Pankow",
    "Heinersdorf": "Pankow", "Karow": "Pankow", "Buch": "Pankow", "Französisch Buchholz": "Pankow",
    "Wilhelmsruh": "Pankow", "Rosenthal": "Pankow",
    # Neukölln
    "Britz": "Neukölln", "Buckow": "Neukölln", "Rudow": "Neukölln", "Gropiusstadt": "Neukölln",
    # Treptow-Köpenick
    "Alt-Treptow": "Treptow-Köpenick", "Plänterwald": "Treptow-Köpenick", "Baumschulenweg": "Treptow-Köpenick",
    "Niederschöneweide": "Treptow-Köpenick", "Johannisthal": "Treptow-Köpenick", "Adlershof": "Treptow-Köpenick",
    "Oberschöneweide": "Treptow-Köpenick", "Spindlersfeld": "Treptow-Köpenick", "Altglienicke": "Treptow-Köpenick",
    "Bohnsdorf": "Treptow-Köpenick", "Grünau": "Treptow-Köpenick", "Köpenick": "Treptow-Köpenick",
    "Friedrichshagen": "Treptow-Köpenick", "Rahnsdorf": "Treptow-Köpenick", "Schmöckwitz": "Treptow-Köpenick",
    # Marzahn-Hellersdorf
    "Marzahn": "Marzahn-Hellersdorf", "Hellersdorf": "Marzahn-Hellersdorf", "Biesdorf": "Marzahn-Hellersdorf",
    "Kaulsdorf": "Marzahn-Hellersdorf", "Mahlsdorf": "Marzahn-Hellersdorf", "Hönow": "Marzahn-Hellersdorf",
    # Lichtenberg
    "Fennpfuhl": "Lichtenberg", "Rummelsburg": "Lichtenberg", "Friedrichsfelde": "Lichtenberg",
    "Karlshorst": "Lichtenberg", "Alt-Hohenschönhausen": "Lichtenberg", "Neu-Hohenschönhausen": "Lichtenberg",
    "Malchow": "Lichtenberg", "Wartenberg": "Lichtenberg", "Falkenberg": "Lichtenberg",
    # Friedrichshain-Kreuzberg
    "Friedrichshain": "Friedrichshain-Kreuzberg", "Kreuzberg": "Friedrichshain-Kreuzberg",
    # Spandau
    "Haselhorst": "Spandau", "Siemensstadt": "Spandau", "Staaken": "Spandau", "Wilhelmstadt": "Spandau",
    "Hakenfelde": "Spandau", "Falkenhagener Feld": "Spandau", "Kladow": "Spandau", "Gatow": "Spandau",
}

def norm(s):
    if pd.isna(s):
        return None
    return str(s).strip()

kg["district_norm"] = kg["district"].apply(norm)
kg["neighborhood_norm"] = kg["neighborhood"].apply(norm)

def resolve_bezirk(row):
    d = row["district_norm"]
    n = row["neighborhood_norm"]
    if d in official_bezirke:
        return d
    if d in ortsteil_to_bezirk:
        return ortsteil_to_bezirk[d]
    if n in official_bezirke:
        return n
    if n in ortsteil_to_bezirk:
        return ortsteil_to_bezirk[n]
    return None

kg["district_bezirk"] = kg.apply(resolve_bezirk, axis=1)

# 4) Attach district_id via merge on borough name
kg = kg.merge(
    districts[["district_id", "district"]].rename(columns={"district": "district_bezirk"}),
    on="district_bezirk",
    how="left"
)

# 5) Build coordinates if desired
if "coordinates" not in kg.columns:
    kg["coordinates"] = kg.apply(
        lambda r: f"{r['latitude']},{r['longitude']}" if pd.notna(r.get("latitude")) and pd.notna(r.get("longitude")) else None,
        axis=1
    )

# Final column order for DB import
final_cols = [
    "kindergarten_id", "district_id", "name", "address", "postal_code",
    "phone_number", "email", "website", "coordinates",
    "latitude", "longitude", "neighborhood", "district_bezirk",
    "operator", "capacity", "source", "created_at", "updated_at"
]
for c in final_cols:
    if c not in kg.columns:
        kg[c] = None

final_df = kg[final_cols].rename(columns={"district_bezirk": "district"})

# 6) Quality checks
print("Total rows:", len(final_df))
print("Rows missing district_id:", final_df['district_id'].isna().sum())
print(final_df['district'].value_counts(dropna=False).head(20))

# 7) Save output
out_path = "kindergartens_berlin_final_ready.csv"  # saves in current folder
final_df.to_csv(out_path, index=False)
print("Saved:", out_path)


Total rows: 2298
Rows missing district_id: 636
district
None                          636
Tempelhof-Schöneberg          224
Mitte                         193
Pankow                        189
Charlottenburg-Wilmersdorf    188
Friedrichshain-Kreuzberg      171
Steglitz-Zehlendorf           127
Treptow-Köpenick              119
Neukölln                      119
Reinickendorf                 105
Lichtenberg                   100
Marzahn-Hellersdorf            64
Spandau                        63
Name: count, dtype: int64
Saved: kindergartens_berlin_final_ready.csv
