In [18]:
# /scripts/extract_kindergartens_osm.py

import requests
import pandas as pd
import os

def get_osm_data(bbox):
    """
    Queries the Overpass API for all kindergartens within a given bounding box.

    Args:
        bbox (str): A string representing the bounding box in the format "south,west,north,east".
                    Example for Berlin: "52.3,13.0,52.6,13.8"

    Returns:
        pd.DataFrame: A DataFrame containing the extracted data.
    """
    try:
        # Overpass API endpoint
        overpass_url = "http://overpass-api.de/api/interpreter"

        # Overpass QL query to find all nodes, ways, and relations with 'amenity=kindergarten'
        # within the specified bounding box.
        overpass_query = f"""
            [out:json][timeout:25];
            (
              node["amenity"="kindergarten"]({bbox});
              way["amenity"="kindergarten"]({bbox});
              relation["amenity"="kindergarten"]({bbox});
            );
            out body;
            >;
            out skel qt;
        """

        print("Requesting data from Overpass API...")
        response = requests.get(overpass_url, data={'data': overpass_query})
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()
        elements = data.get('elements', [])

        kindergartens = []
        for element in elements:
            if element['type'] in ['node', 'way', 'relation']:
                tags = element.get('tags', {})
                kindergarten = {
                    'id': element['id'],
                    'latitude': element.get('lat', tags.get('lat')), # Coordinates for nodes are direct
                    'longitude': element.get('lon', tags.get('lon')),
                    'name': tags.get('name'),
                    'address': tags.get('addr:full'),
                    'street': tags.get('addr:street'),
                    'housenumber': tags.get('addr:housenumber'),
                    'postcode': tags.get('addr:postcode'),
                    'city': tags.get('addr:city'),
                }
                kindergartens.append(kindergarten)

        df = pd.DataFrame(kindergartens)
        return df

    except requests.exceptions.RequestException as e:
        print(f"Error interacting with the Overpass API: {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()

def main():
    """
    Main function to orchestrate the data extraction process.
    """
    # Bounding box for Berlin, Germany (south, west, north, east)
    berlin_bbox = "52.34,13.1,52.6,13.8"

    kindergarten_df = get_osm_data(berlin_bbox)

    if not kindergarten_df.empty:
        # Save the DataFrame to a CSV file in the 'sources' directory
        raw_data_path = '../sources/kindergartens_berlin_raw_osm.csv'
        os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
        kindergarten_df.to_csv(raw_data_path, index=False)
        print(f"Data successfully extracted and saved to: {raw_data_path}")
    else:
        print("No data was extracted.")


if __name__ == "__main__":
    main()

Requesting data from Overpass API...
Data successfully extracted and saved to: ../sources/kindergartens_berlin_raw_osm.csv


In [13]:
import pandas as pd
from datetime import datetime

# -------------------
# Step 1. Load CSV
# -------------------
raw_file = "/20250815_0538_osm_kindergartens.json"  # despite .json, it's CSV
df = pd.read_csv(raw_file)

# -------------------
# Step 2. Cleaning
# -------------------

# Strip and collapse extra spaces
df['name'] = df['name'].astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
df['address'] = df['address'].astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
df['district'] = df['district'].astype(str).str.strip().str.title()

# Replace placeholder "Berlin" in addresses with NaN
df['address'] = df['address'].replace("Berlin", pd.NA)

# Convert postal_code to string
df['postal_code'] = df['postal_code'].astype("string")

# Drop rows missing BOTH postal_code and district
df = df.dropna(subset=['postal_code', 'district'], how='all')

# Remove duplicates
df = df.drop_duplicates()

# Reset index
df = df.reset_index(drop=True)

# -------------------
# Step 3. Add IDs & Metadata
# -------------------

# Use original row index as "original_id"
df.insert(0, "id", df.index + 1)

# Add created_at and updated_at with current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df["created_at"] = timestamp
df["updated_at"] = timestamp

# Replace NaN with 'Unknown'
df = df.fillna("Unknown")

# -------------------
# Step 4. Save cleaned file
# -------------------
output_file = "kindergartens_berlin_cleaned.csv"
df.to_csv(output_file, index=False)

print(f" Cleaning complete! Saved as {output_file}")
df.head()



✅ Cleaning complete! Saved as kindergartens_berlin_cleaned.csv


Unnamed: 0,id,name,address,postal_code,district,latitude,longitude,operator,capacity,source,created_at,updated_at
0,1,Bunte Klänge Kindergarten,"Graf-Haeseler-Straße 23, 13403 Berlin",13403.0,Reinickendorf,52.56996,13.313863,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
1,2,Nestwärme Kita,Unknown,Unknown,Nan,52.501536,13.433399,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
2,3,Kita Fehlerstraße 2,"Fehlerstraße 2, 12161 Berlin",12161.0,Friedenau,52.476741,13.325825,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
3,4,Ganztagsbetreuung der Fläming-Grundschule,"Rheinstraße 54, 12161 Berlin",12161.0,Friedenau,52.468522,13.332504,Nachbarschaftsheim Schöneberg e.V.,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54
4,5,kidsweb.de Kindertagesbetreuung,Unknown,Unknown,Nan,52.539884,13.349624,Unknown,Unknown,openstreetmap,2025-08-18 07:51:54,2025-08-18 07:51:54


Complete Transformation Logic
 continue after the initial cleaning:

In [16]:
# ===============================
# Kindergartens Berlin Data Transformation
# ===============================

# Import necessary libraries
import pandas as pd
from datetime import datetime
import re

# ===============================
# Step 0: Load Dataset
# ===============================
# Load the last cleaned dataset
df = pd.read_csv("kindergartens_berlin_cleaned.csv")

# Markdown:
"""
### Step 0: Load Dataset
- Load the previously cleaned raw data.
- Ensure the source contains the latest approved data.
"""
print(f"Initial dataset rows: {len(df)}")

# ===============================
# Step 1: Normalize Columns and Check Duplicates
# ===============================
# Normalize column names
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

# Check and remove duplicate rows
print("Duplicate rows before removal:", df.duplicated().sum())
df = df.drop_duplicates()
print("Duplicate rows after removal:", df.duplicated().sum())

# Markdown:
"""
### Step 1: Normalize Columns and Check Duplicates
- Standardize column names to lowercase with underscores.
- Remove any duplicate rows to ensure unique kindergarten entries.
"""

# ===============================
# Step 2: Handle Missing Values
# ===============================
df = df.fillna("Unknown")

# Validate missing values
missing_summary = df.isna().sum()
print("Missing values after transformation:\n", missing_summary)

# Markdown:
"""
### Step 2: Handle Missing Values
- All remaining missing values are filled with 'Unknown'.
- This ensures no nulls interfere with analysis or merging.
"""

# ===============================
# Step 3: Normalize Postal Codes
# ===============================
df["postal_code"] = (
    df["postal_code"].astype(str)
    .str.extract(r"(\d{5})")  # Keep only 5-digit codes
    .fillna("Unknown")
)

# Markdown:
"""
### Step 3: Normalize Postal Codes
- Ensure all postal codes are standardized as 5-digit strings.
- Postal codes not matching the pattern are labeled as 'Unknown'.
"""

# ===============================
# Step 4: Standardize Addresses
# ===============================
df["address"] = df["address"].str.replace(", Berlin", "", regex=False).str.strip()

# Markdown:
"""
### Step 4: Standardize Addresses
- Removed redundant ', Berlin' text from addresses.
- Trimmed whitespace for consistency.
"""

# ===============================
# Step 5: Ensure Coordinates are Float & Valid
# ===============================
df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

# Keep only coordinates within Berlin boundaries
df = df[(df["latitude"].between(52.3, 52.7)) & (df["longitude"].between(13.1, 13.7))]
print("Rows after filtering invalid coordinates:", len(df))

# Markdown:
"""
### Step 5: Validate Coordinates
- Latitude and longitude converted to numeric types.
- Only coordinates within Berlin boundaries retained.
"""

# ===============================
# Step 6: Assign Missing Districts
# ===============================
postal_to_district = {
    "13403": "Reinickendorf",
    "12161": "Friedenau",
    # Add more mappings as needed
}

df["district"] = df.apply(
    lambda row: postal_to_district.get(str(row["postal_code"]), row["district"]),
    axis=1
)

# Markdown:
"""
### Step 6: Assign Missing Districts
- Missing district values filled using postal code → district mapping.
- Postal codes not in mapping remain 'Unknown'.
"""

# ===============================
# Step 7: Normalize Website, Phone, Email
# ===============================
for col in ["website", "phone", "email"]:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown").astype(str).str.strip()

# Normalize websites
if "website" in df.columns:
    df["website"] = df["website"].apply(
        lambda x: x if re.match(r"https?://", x) else f"http://{x}" if x != "Unknown" else x
    )

# Lowercase emails
if "email" in df.columns:
    df["email"] = df["email"].str.lower()

# Markdown:
"""
### Step 7: Normalize Contact Information
- Missing values replaced with 'Unknown'.
- Websites are prefixed with 'http://' if missing.
- Emails converted to lowercase for consistency.
"""

# ===============================
# Step 8: Add Timestamps
# ===============================
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df["created_at"] = timestamp
df["updated_at"] = timestamp

# Markdown:
"""
### Step 8: Add Timestamps
- Added 'created_at' and 'updated_at' for versioning and tracking changes.
"""

# ===============================
# Step 9: Summary and Final Checks
# ===============================
print(" Transformation Complete")
print(f"Total kindergartens: {len(df)}")
if "district" in df.columns:
    print(f"District distribution:\n{df['district'].value_counts()}")

# Markdown:
"""
### Step 9: Summary
- All transformations completed.
- Dataset contains only valid coordinates within Berlin.
- Duplicate rows removed, missing values handled, and contact info standardized.
"""

# ===============================
# Step 10: Save Final Dataset
# ===============================
df.to_csv("kindergartens_berlin_final.csv", index=False)
print("Final cleaned dataset saved as kindergartens_berlin_final.csv")

# Markdown:
"""
### Step 10: Save Final Dataset
- Final cleaned dataset saved for further analysis and modeling.
"""



Initial dataset rows: 2298
Duplicate rows before removal: 0
Duplicate rows after removal: 0
Missing values after transformation:
 id             0
name           0
address        0
postal_code    0
district       0
latitude       0
longitude      0
operator       0
capacity       0
source         0
created_at     0
updated_at     0
dtype: int64
Rows after filtering invalid coordinates: 1137
✅ Transformation Complete
Total kindergartens: 1137
District distribution:
district
Nan                     481
Schöneberg               58
Kreuzberg                48
Neukölln                 45
Prenzlauer Berg          43
                       ... 
Französisch Buchholz      1
Plänterwald               1
Schmargendorf             1
Staaken                   1
Buch                      1
Name: count, Length: 75, dtype: int64
Final cleaned dataset saved as kindergartens_berlin_final.csv


'\n### Step 10: Save Final Dataset\n- Final cleaned dataset saved for further analysis and modeling.\n'

Validation & Quality Checks

In [17]:
# Check for duplicates
print("Duplicates:", df.duplicated().sum())

# Check Berlin boundary (approx lat: 52.3–52.7, lon: 13.1–13.7)
valid_coords = df[
    (df["latitude"].between(52.3, 52.7)) &
    (df["longitude"].between(13.1, 13.7))
]
print("Rows with valid coords:", len(valid_coords), "of", len(df))

# Count check
print("Total rows:", len(df))

# Website/phone/email normalization (if present)
for col in ["website", "phone", "email"]:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown").astype(str).str.strip()


Duplicates: 0
Rows with valid coords: 1137 of 1137
Total rows: 1137
