In [26]:
# /scripts/extract_kindergartens_osm.py

import requests
import pandas as pd
import os

def get_osm_data(bbox, timeout=25):
    """
    Queries the Overpass API for all kindergartens within a given bounding box.

    Args:
        bbox (str): A string representing the bounding box in the format "south,west,north,east".
                    Example for Berlin: "52.3,13.0,52.6,13.8"
        timeout (int): Server-side Overpass query timeout in seconds. The HTTP
                    request itself is given a few extra seconds on top so the
                    client does not give up before the server answers.

    Returns:
        pd.DataFrame: One row per OSM element tagged ``amenity=kindergarten`` with the
        columns id, latitude, longitude, name, address, street, housenumber,
        postcode, city and operator. An empty DataFrame is returned when the
        request fails or any other error occurs (the error is printed).
    """
    columns = [
        'id', 'latitude', 'longitude', 'name', 'address',
        'street', 'housenumber', 'postcode', 'city', 'operator',
    ]

    try:
        # Overpass API endpoint
        overpass_url = "https://overpass-api.de/api/interpreter"

        # Overpass QL query to find all nodes, ways, and relations with
        # 'amenity=kindergarten' within the specified bounding box.
        # 'out center' makes the server attach a representative lat/lon to ways
        # and relations, which otherwise carry no coordinates of their own.
        overpass_query = f"""
            [out:json][timeout:{timeout}];
            (
              node["amenity"="kindergarten"]({bbox});
              way["amenity"="kindergarten"]({bbox});
              relation["amenity"="kindergarten"]({bbox});
            );
            out center;
        """

        print("Requesting data from Overpass API...")
        # POST the query as a form field instead of attaching a body to a GET,
        # and bound the wait so a stalled server cannot hang the caller forever.
        response = requests.post(
            overpass_url,
            data={'data': overpass_query},
            timeout=timeout + 5,
        )
        response.raise_for_status()

        # Parse the JSON response
        elements = response.json().get('elements', [])

        kindergartens = []
        for element in elements:
            tags = element.get('tags', {})
            # Skip anything that is not itself a kindergarten (e.g. untagged
            # member nodes of a way); those must not become rows of their own.
            if tags.get('amenity') != 'kindergarten':
                continue

            # Nodes carry lat/lon directly; ways and relations carry them in 'center'.
            center = element.get('center', {})
            kindergartens.append({
                'id': element['id'],
                'latitude': element.get('lat', center.get('lat')),
                'longitude': element.get('lon', center.get('lon')),
                'name': tags.get('name'),
                'address': tags.get('addr:full'),
                'street': tags.get('addr:street'),
                'housenumber': tags.get('addr:housenumber'),
                'postcode': tags.get('addr:postcode'),
                'city': tags.get('addr:city'),
                'operator': tags.get('operator'),
            })

        return pd.DataFrame(kindergartens, columns=columns)

    except requests.exceptions.RequestException as e:
        print(f"Error interacting with the Overpass API: {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()

def main():
    """
    Fetch Berlin's kindergartens from OSM and store them as a raw CSV file.

    Prints a notice and writes nothing when the extraction yields no rows.
    """
    # Berlin, Germany as "south,west,north,east"
    bbox_berlin = "52.34,13.1,52.6,13.8"
    extracted = get_osm_data(bbox_berlin)

    # Guard clause: nothing to persist.
    if extracted.empty:
        print("No data was extracted.")
        return

    # Raw extracts live in the 'sources' directory; create it on first use.
    raw_data_path = '../sources/kindergartens_berlin_raw_osm.csv'
    target_dir = os.path.dirname(raw_data_path)
    os.makedirs(target_dir, exist_ok=True)

    extracted.to_csv(raw_data_path, index=False)
    print(f"Data successfully extracted and saved to: {raw_data_path}")


# Entry point: run the extraction when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Requesting data from Overpass API...
Data successfully extracted and saved to: ../sources/kindergartens_berlin_raw_osm.csv


In [28]:
# /scripts/transform_kindergartens.py

import pandas as pd
import numpy as np
import os

def _build_clean_frame(df):
    """Maps raw OSM kindergarten rows onto the final schema (pure, no file I/O)."""
    # 1. Combine street and housenumber into a single address string.
    street = df['street'].fillna('').astype(str)
    housenumber = df['housenumber'].fillna('').astype(str)
    address = (street + ' ' + housenumber).str.strip()

    # Rows without street/housenumber fall back to the raw full address
    # (if the column is present), then to the city, then to 'Unknown'.
    if 'address' in df.columns:
        full_address = df['address'].fillna('').astype(str).str.strip()
        address = address.where(address != '', full_address)
    city = df['city'].fillna('Unknown')
    address = address.where(address != '', city).fillna('Unknown')

    # 2. 'operator' is optional in the raw data; default every row to 'Unknown'.
    if 'operator' in df.columns:
        provider = df['operator'].fillna('Unknown')
    else:
        provider = pd.Series('Unknown', index=df.index)

    # 3. One timestamp for the whole batch so created_at == updated_at on insert.
    now = pd.Timestamp.now()

    # NOTE: OSM data doesn't reliably contain 'capacity' or 'age_groups' tags.
    # These columns are created with placeholder (NaN) values.
    return pd.DataFrame(
        {
            'id': df['id'],
            'name': df['name'].fillna('Unknown'),
            'address': address,
            'district': city,  # Using 'city' as a proxy for district
            'zip_code': df['postcode'].fillna('Unknown'),
            'latitude': df['latitude'],
            'longitude': df['longitude'],
            'provider': provider,  # Mapping 'operator' to 'provider'
            'capacity': np.nan,
            'age_groups': np.nan,
            'created_at': now,
            'updated_at': now,
        },
        index=df.index,
    )


def clean_data(raw_data_path, cleaned_data_path):
    """
    Cleans and normalizes the raw kindergarten data from OSM.

    Reads the raw CSV, builds the final schema (id, name, address, district,
    zip_code, latitude, longitude, provider, capacity, age_groups, created_at,
    updated_at) and writes it to ``cleaned_data_path``, creating the target
    directory if needed. Errors are printed rather than raised.

    Args:
        raw_data_path (str): The path to the raw data CSV file.
        cleaned_data_path (str): The path to save the cleaned data.
    """
    # Read textual columns as strings: otherwise a numeric-looking column with
    # gaps (postcodes, house numbers) is parsed as float and written back as
    # e.g. "10115.0", and leading zeros are lost.
    text_columns = ('name', 'address', 'street', 'housenumber', 'postcode', 'city', 'operator')

    try:
        # Load the raw data from the CSV file
        df = pd.read_csv(raw_data_path, dtype={column: str for column in text_columns})
        print("Raw data loaded successfully.")

        final_df = _build_clean_frame(df)

        # Make sure the destination directory exists so a missing folder is not
        # misreported as a missing raw file below.
        if isinstance(cleaned_data_path, (str, os.PathLike)):
            output_dir = os.path.dirname(cleaned_data_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # Save the cleaned DataFrame to a new CSV file
        final_df.to_csv(cleaned_data_path, index=False)
        print(f"Cleaned data saved to {cleaned_data_path}")

    except FileNotFoundError:
        print(f"Error: The file at {raw_data_path} was not found.")
    except Exception as e:
        print(f"An error occurred during data transformation: {e}")

# Entry point: run the transformation when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    # Input: the raw OSM extract; output: the cleaned CSV (paths are relative
    # to the current working directory).
    RAW_DATA_FILE = '../sources/kindergartens_berlin_raw_osm.csv'
    CLEANED_DATA_FILE = '../data/kindergartens_berlin_cleaned.csv'

    # Ensure the output directory ('../data') exists before writing to it
    os.makedirs(os.path.dirname(CLEANED_DATA_FILE), exist_ok=True)

    # Read the raw file, normalize it, and write the cleaned file
    clean_data(RAW_DATA_FILE, CLEANED_DATA_FILE)

Raw data loaded successfully.
Cleaned data saved to ../data/kindergartens_berlin_cleaned.csv
