# In [None]:
# Imports
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
from geopy.geocoders import Nominatim
import time

# 1. BUILD ADDRESS COLUMN
def build_address(df, street_col, house_num_col, postal_col, neigh_col, city_col):
    """
    Combine address parts into a single ``address`` string per row.

    The result has the form
    ``"<street> <house number>, <postal code> <neighbourhood> <city>"``.
    Missing (NaN/None) parts are treated as empty strings, and the separators
    they would otherwise leave behind (doubled spaces, a space before the
    comma, a leading or trailing comma) are removed, so incomplete rows still
    produce a clean string.

    Args:
        df: DataFrame holding the address parts. It is modified in place: an
            ``address`` column is added (or overwritten).
        street_col: Name of the street column.
        house_num_col: Name of the house-number column.
        postal_col: Name of the postal-code column.
        neigh_col: Name of the neighbourhood column.
        city_col: Name of the city column.

    Returns:
        The same ``df`` object, with the ``address`` column set.

    Note:
        Values are converted with ``astype(str)``, so a numeric column is
        rendered exactly as pandas prints it (a float ``12.0`` becomes
        ``"12.0"``).
    """
    def _part(col):
        # Missing values become '' so that concatenation never yields NaN.
        return df[col].fillna('').astype(str).str.strip()

    # Strip the street line on its own so a missing house number does not
    # leave a space in front of the comma.
    street_line = (_part(street_col) + ' ' + _part(house_num_col)).str.strip()
    locality_line = _part(postal_col) + ' ' + _part(neigh_col) + ' ' + _part(city_col)

    df['address'] = (
        (street_line + ', ' + locality_line)
        # An empty part leaves two adjacent separators; collapse them.
        .str.replace(r'\s+', ' ', regex=True)
        # Drop the comma/space left over when a whole line is empty.
        .str.strip(', ')
    )
    return df

# Add the combined `address` column to `data`.
# NOTE(review): `data` is not defined in the visible part of this file; it is
# presumably loaded in an earlier cell - confirm before running.
data = build_address(
    data, street_col='street', house_num_col='house_number',
    postal_col='postal_code', neigh_col='neighbourhood', city_col='city',
)

# 2. GEOCODE ADDRESSES
def geocode_address_column(df, address_column, sleep_between=1, user_agent="geo_project"):
    """
    Geocode an address column with Nominatim and add latitude/longitude columns.

    Missing or blank addresses are skipped without contacting the service.
    Each distinct address string is looked up once; later rows with the same
    string reuse that answer. Lookups that raise are logged and yield ``None``
    coordinates (best effort); they are not cached, so a later duplicate of
    the same address is tried again.

    Args:
        df: DataFrame containing the addresses. It is modified in place:
            ``latitude`` and ``longitude`` columns are added (or overwritten).
        address_column: Name of the column holding the address strings.
        sleep_between: Seconds to pause after every request sent to the
            geocoder.
        user_agent: User-Agent string passed to ``Nominatim``.

    Returns:
        The same ``df`` object. Rows that could not be geocoded hold ``None``
        in both coordinate columns.
    """
    import logging  # local import keeps the function self-contained

    logger = logging.getLogger(__name__)
    geolocator = Nominatim(user_agent=user_agent)

    resolved = {}  # address string -> (latitude, longitude)
    lats, lons = [], []
    for addr in df[address_column]:
        is_text = isinstance(addr, str)
        if is_text:
            is_blank = not addr.strip()
        else:
            is_blank = bool(pd.isna(addr))

        if is_blank:
            # Nothing to look up; do not spend a request on it.
            coords = (None, None)
        elif is_text and addr in resolved:
            coords = resolved[addr]
        else:
            coords = (None, None)
            try:
                location = geolocator.geocode(addr)
            except Exception:
                # Best effort: one failing lookup must not abort the whole
                # run, but it should be visible in the logs.
                logger.warning("Geocoding failed for %r", addr, exc_info=True)
            else:
                if location:
                    coords = (location.latitude, location.longitude)
                if is_text:
                    resolved[addr] = coords
            finally:
                time.sleep(sleep_between)  # respect rate limits!

        lats.append(coords[0])
        lons.append(coords[1])

    df['latitude'] = lats
    df['longitude'] = lons
    return df

# Uncomment to run geocoding (can be slow!)
# data = geocode_address_column(data, address_column='address', sleep_between=1)

# 3. LOAD DISTRICTS AND NEIGHBORHOODS
# Both tables are read from fixed CSV paths, districts first.
districts, neighborhoods = map(
    pd.read_csv,
    (
        "/content/districts_202508191345.csv",
        "/content/neighborhoods_202508191400.csv",
    ),
)

# 4. CONVERT DATAFRAMES TO GEODATAFRAMES
def df_to_gdf_points(df, lon_col, lat_col):
    """
    Build a point GeoDataFrame (EPSG:4326) from longitude/latitude columns.

    Rows missing either coordinate are dropped first; every remaining row
    gets a ``Point`` built from its ``(longitude, latitude)`` pair.
    """
    located = df.dropna(subset=[lon_col, lat_col])
    # x = longitude, y = latitude
    points = list(map(Point, zip(located[lon_col], located[lat_col])))
    return gpd.GeoDataFrame(located, geometry=points, crs='EPSG:4326')

# Point layer: one point per row of `data` that has both coordinates.
addresses_gdf = df_to_gdf_points(data, lon_col='longitude', lat_col='latitude')

# Polygon layers: parse the WKT text in `geometry_str` into geometries.
districts_gdf = gpd.GeoDataFrame(
    districts, geometry=districts['geometry_str'].apply(wkt.loads), crs='EPSG:4326'
)
neighborhoods_gdf = gpd.GeoDataFrame(
    neighborhoods, geometry=neighborhoods['geometry_str'].apply(wkt.loads), crs='EPSG:4326'
)

# 5. ENSURE CRS MATCHES
# The district layer is the reference; the other two are reprojected to it.
addresses_gdf = addresses_gdf.to_crs(districts_gdf.crs)
neighborhoods_gdf = neighborhoods_gdf.to_crs(districts_gdf.crs)

# 6. SPATIAL JOIN: DISTRICTS
# 7. CLEAN INDEX_CONFLICTS
# Left join keeps every address; `index_right` is the helper column sjoin
# adds, and it has to go before the next join.
addresses_gdf = gpd.sjoin(
    addresses_gdf,
    districts_gdf[['geometry', 'district']],
    how='left',
    predicate='within',
).drop(columns=['index_right'], errors='ignore')

# 8. SPATIAL JOIN: NEIGHBORHOODS
# NOTE(review): both sides of this join carry a `district` column (the left
# side gained it in step 6), so the result holds `district_left` and
# `district_right` instead of a single `district` - confirm this is intended.
addresses_gdf = gpd.sjoin(
    addresses_gdf,
    neighborhoods_gdf[['geometry', 'neighborhood', 'district']],
    how='left',
    predicate='within',
).drop(columns=['index_right'], errors='ignore')

# 9. SAVE TO CSV
addresses_gdf.to_csv("immowelt_with_geodata.csv", index=False)
