In [18]:
# /scripts/extract_kindergartens_osm.py

import requests
import pandas as pd
import os

def get_osm_data(bbox):
    """
    Queries the Overpass API for all kindergartens within a given bounding box.

    Args:
        bbox (str): A string representing the bounding box in the format "south,west,north,east".
                    Example for Berlin: "52.3,13.0,52.6,13.8"

    Returns:
        pd.DataFrame: A DataFrame containing the extracted data.
    """
    try:
        # Overpass API endpoint
        overpass_url = "http://overpass-api.de/api/interpreter"

        # Overpass QL query to find all nodes, ways, and relations with 'amenity=kindergarten'
        # within the specified bounding box.
        overpass_query = f"""
            [out:json][timeout:25];
            (
              node["amenity"="kindergarten"]({bbox});
              way["amenity"="kindergarten"]({bbox});
              relation["amenity"="kindergarten"]({bbox});
            );
            out body;
            >;
            out skel qt;
        """

        print("Requesting data from Overpass API...")
        response = requests.get(overpass_url, data={'data': overpass_query})
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()
        elements = data.get('elements', [])

        kindergartens = []
        for element in elements:
            if element['type'] in ['node', 'way', 'relation']:
                tags = element.get('tags', {})
                kindergarten = {
                    'id': element['id'],
                    'latitude': element.get('lat', tags.get('lat')), # Coordinates for nodes are direct
                    'longitude': element.get('lon', tags.get('lon')),
                    'name': tags.get('name'),
                    'address': tags.get('addr:full'),
                    'street': tags.get('addr:street'),
                    'housenumber': tags.get('addr:housenumber'),
                    'postcode': tags.get('addr:postcode'),
                    'city': tags.get('addr:city'),
                }
                kindergartens.append(kindergarten)

        df = pd.DataFrame(kindergartens)
        return df

    except requests.exceptions.RequestException as e:
        print(f"Error interacting with the Overpass API: {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()

def main():
    """
    Main function to orchestrate the data extraction process.
    """
    # Bounding box for Berlin, Germany (south, west, north, east)
    berlin_bbox = "52.34,13.1,52.6,13.8"

    kindergarten_df = get_osm_data(berlin_bbox)

    if not kindergarten_df.empty:
        # Save the DataFrame to a CSV file in the 'sources' directory
        raw_data_path = '../sources/kindergartens_berlin_raw_osm.csv'
        os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
        kindergarten_df.to_csv(raw_data_path, index=False)
        print(f"Data successfully extracted and saved to: {raw_data_path}")
    else:
        print("No data was extracted.")


if __name__ == "__main__":
    main()

Requesting data from Overpass API...
Data successfully extracted and saved to: ../sources/kindergartens_berlin_raw_osm.csv


In [13]:
import pandas as pd
from datetime import datetime

# -------------------
# Step 1. Load CSV
# -------------------
raw_file = "/20250815_0538_osm_kindergartens.json"  # despite .json, it's CSV
df = pd.read_csv(raw_file)

# -------------------
# Step 2. Cleaning
# -------------------

# Strip and collapse extra spaces
df['name'] = df['name'].astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
df['address'] = df['address'].astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
df['district'] = df['district'].astype(str).str.strip().str.title()

# Replace placeholder "Berlin" in addresses with NaN
df['address'] = df['address'].replace("Berlin", pd.NA)

# Convert postal_code to string
df['postal_code'] = df['postal_code'].astype("string")

# Drop rows missing BOTH postal_code and district
df = df.dropna(subset=['postal_code', 'district'], how='all')

# Remove duplicates
df = df.drop_duplicates()

# Reset index
df = df.reset_index(drop=True)

# -------------------
# Step 3. Add IDs & Metadata
# -------------------

# Use original row index as "original_id"
df.insert(0, "id", df.index + 1)

# Add created_at and updated_at with current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df["created_at"] = timestamp
df["updated_at"] = timestamp

# Replace NaN with 'Unknown'
df = df.fillna("Unknown")

# -------------------
# Step 4. Save cleaned file
# -------------------
output_file = "kindergartens_berlin_cleaned.csv"
df.to_csv(output_file, index=False)

print(f" Cleaning complete! Saved as {output_file}")
df.head()



✅ Cleaning complete! Saved as kindergartens_berlin_cleaned.csv


Unnamed: 0,id,name,address,postal_code,district,latitude,longitude,operator,capacity,source,created_at,updated_at
0,1,Bunte Klänge Kindergarten,"Graf-Haeseler-Straße 23, 13403 Berlin",13403.0,Reinickendorf,52.56996,13.313863,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
1,2,Nestwärme Kita,Unknown,Unknown,Nan,52.501536,13.433399,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
2,3,Kita Fehlerstraße 2,"Fehlerstraße 2, 12161 Berlin",12161.0,Friedenau,52.476741,13.325825,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
3,4,Ganztagsbetreuung der Fläming-Grundschule,"Rheinstraße 54, 12161 Berlin",12161.0,Friedenau,52.468522,13.332504,Nachbarschaftsheim Schöneberg e.V.,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
4,5,kidsweb.de Kindertagesbetreuung,Unknown,Unknown,Nan,52.539884,13.349624,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54


Complete Transformation Logic
 continue after the initial cleaning:

In [20]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
from datetime import datetime
from geopy.geocoders import Nominatim
from time import sleep


## 2. Load Dataset

We load the previously cleaned dataset. Make sure it contains at least:
- `latitude` & `longitude`
- `postal_code`
- `address`
- `district` (optional)


In [21]:
df = pd.read_csv("kindergartens_berlin_cleaned.csv")
df.head()


Unnamed: 0,id,name,address,postal_code,district,latitude,longitude,operator,capacity,source,created_at,updated_at
0,1,Bunte Klänge Kindergarten,"Graf-Haeseler-Straße 23, 13403 Berlin",13403.0,Reinickendorf,52.5699596,13.3138632,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
1,2,Nestwärme Kita,Unknown,Unknown,Nan,52.5015358,13.4333992,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
2,3,Kita Fehlerstraße 2,"Fehlerstraße 2, 12161 Berlin",12161.0,Friedenau,52.4767411,13.3258251,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
3,4,Ganztagsbetreuung der Fläming-Grundschule,"Rheinstraße 54, 12161 Berlin",12161.0,Friedenau,52.4685223,13.3325044,Nachbarschaftsheim Schöneberg e.V.,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
4,5,kidsweb.de Kindertagesbetreuung,Unknown,Unknown,Nan,52.5398837,13.3496243,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54


## 3. Normalize Column Names

We convert column names to lowercase, replace spaces and hyphens with underscores for consistency.


In [22]:
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)


## 4. Handle Missing Values

We replace missing values with defaults such as `"Not Provided"` or `"Unknown"` to avoid errors in downstream processing.


In [23]:
default_values = {
    "website": "Not Provided",
    "phone": "Not Provided",
    "email": "Not Provided",
    "address": "Not Provided",
    "district": "Unknown",
    "postal_code": "00000",
}

for col, default in default_values.items():
    if col in df.columns:
        df[col] = df[col].fillna(default).astype(str).str.strip()

# Fill any remaining missing cells
df = df.fillna("Not Provided")


## 5. Standardize Postal Codes

Keep only valid 5-digit postal codes.


In [24]:
if "postal_code" in df.columns:
    df["postal_code"] = (
        df["postal_code"].astype(str)
        .str.extract(r"(\d{5})")  # capture 5-digit codes
        .fillna("00000")
    )


## 6. Standardize Addresses

Remove redundant city information from the address for consistency.


In [25]:
if "address" in df.columns:
    df["address"] = df["address"].str.replace(", Berlin", "", regex=False).str.strip()


## 7. Ensure Coordinates Are Numeric

Convert latitude and longitude to numeric types and coerce invalid entries to `NaN`.


In [26]:
df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")


## 8. Validate Coordinates Within Berlin

Check that coordinates fall within the expected latitude and longitude range for Berlin.


In [27]:
valid_coords = df[
    (df["latitude"].between(52.3, 52.7)) &
    (df["longitude"].between(13.1, 13.7))
]
print("Rows with valid coordinates:", len(valid_coords), "of", len(df))


Rows with valid coordinates: 1137 of 2298


## 9. Assign Districts via Postal Code

We use a predefined postal code to district mapping.


In [28]:
postal_to_district = {
    "13403": "Reinickendorf",
    "12161": "Friedenau",
    # Add more mappings as needed
}

df["district"] = df.apply(
    lambda row: postal_to_district.get(str(row["postal_code"]), row["district"]),
    axis=1
)


## 10. Reverse Geocoding for Official Bezirks

We use Nominatim to enrich each location with its official Berlin district (`Bezirk`).  
A 1-second delay is included to respect API rate limits.


In [29]:
geolocator = Nominatim(user_agent="berlin_bezirk_locator")

def get_bezirk(lat, lon):
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True, language='de')
        sleep(1)
        if location and "address" in location.raw:
            address = location.raw["address"]
            return (
                address.get("city_district") or
                address.get("borough") or
                address.get("county") or
                None
            )
        return None
    except:
        return None

df["neighborhood"] = df.apply(
    lambda row: get_bezirk(row["latitude"], row["longitude"]) if pd.notnull(row["latitude"]) else "Unknown",
    axis=1
)




## 11. Remove Duplicate Rows


In [30]:
print("Duplicate rows before:", df.duplicated().sum())
df = df.drop_duplicates()
print("Duplicate rows after:", df.duplicated().sum())


Duplicate rows before: 0
Duplicate rows after: 0


## 12. Add Timestamps

Add creation and update timestamps for dataset tracking.


In [31]:
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df["created_at"] = timestamp
df["updated_at"] = timestamp


## 13. Save Final Dataset

Save the fully cleaned and enriched dataset.


In [32]:
df.to_csv("kindergartens_berlin_final.csv", index=False)
print("Final dataset saved. Total rows:", len(df))


Final dataset saved. Total rows: 2298


In [33]:
from google.colab import files
files.download("kindergartens_berlin_final.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>