In [None]:
import pandas as pd
import sqlite3

# Input files: the cleaned connections list and the stations list that
# already includes a postcode per station.
connections_csv = "08-connections-no-dupes.csv"
stations_csv = "simoun-asmar-berlin-stations-with-postcode.csv"

connections_df = pd.read_csv(connections_csv)
stations_df = pd.read_csv(stations_csv)

# Helper that removes the known station-type / city markers from a name so
# that both datasets end up with comparable station names.
def clean_name(name):
    """Return *name* with the known station markers removed and whitespace trimmed.

    Every marker is removed wherever it occurs in the string (not only at
    the start). The markers are applied in the order listed; the order
    matters because "Bahnhof " is itself contained in the longer markers
    that precede it.
    """
    markers = (
        "U-Bahnhof ",
        "Bahnhof Berlin ",
        "Bahnhöfe Berlin ",
        "S-Bahnhof ",
        "Bahnhof ",
        "Berlin-",
    )
    cleaned = name
    for marker in markers:
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()

# Normalise the station names in both datasets so they can be matched later.
connections_df["point1_clean"] = connections_df["point1"].apply(clean_name)
stations_df["station_clean"] = stations_df["name"].apply(clean_name)

# Drop rows whose cleaned name is still a raw Wikidata-style ID, i.e. "Q"
# immediately followed by a digit. Testing for "Q" + digit (str.match anchors
# at the start of the string) instead of any leading "Q" keeps genuine station
# names that merely begin with the letter Q.
# The .copy() makes each filtered frame independent, so the column assignment
# below does not write into a slice of the unfiltered frame
# (SettingWithCopyWarning).
is_qid_connection = connections_df["point1_clean"].str.match(r"Q\d", na=False)
connections_df = connections_df[~is_qid_connection].copy()
is_qid_station = stations_df["station_clean"].str.match(r"Q\d", na=False)
stations_df = stations_df[~is_qid_station].copy()

# Store postcodes as nullable integers; values that cannot be parsed as
# numbers become <NA> instead of raising.
stations_df["postcode"] = pd.to_numeric(stations_df["postcode"], errors="coerce").astype("Int64")

# Keep only the first row per cleaned station name so the later join cannot
# multiply connection rows.
stations_df = stations_df.drop_duplicates(subset=["station_clean"])

# Local SQLite database used to join the two datasets.
# NOTE(review): conn is not closed anywhere in the visible code; close it
# once no later step needs it.
conn = sqlite3.connect("berlin_transport.db")

# Table "connections": one row per distinct (station, line) pair.
station_lines = connections_df[["point1_clean", "line"]].drop_duplicates()
station_lines.to_sql("connections", conn, if_exists="replace", index=False)

# Table "stations": cleaned name plus coordinates and postcode.
station_columns = ["station_clean", "latitude", "longitude", "postcode"]
stations_df[station_columns].to_sql("stations", conn, if_exists="replace", index=False)

# Join every (station, line) pair with the station's coordinates and postcode.
# LEFT JOIN keeps connections without a matching station; their latitude,
# longitude and postcode come back as NULL.
query = """
SELECT 
    c.point1_clean AS station,
    c.line,
    s.latitude,
    s.longitude,
    s.postcode
FROM connections c
LEFT JOIN stations s
    ON TRIM(c.point1_clean) = TRIM(s.station_clean)
ORDER BY station
"""

# Run the query and load the result into a new DataFrame.
result_df = pd.read_sql_query(query, conn)

# Postcodes can come back as floats ("10115.0") because of the NULLs produced
# by the LEFT JOIN. Convert via nullable integers to text so the ".0" suffix
# disappears and missing postcodes stay missing (written as an empty CSV
# field) instead of becoming the literal text "nan".
result_df["postcode"] = (
    pd.to_numeric(result_df["postcode"], errors="coerce")
    .astype("Int64")
    .astype("string")
)

# Hand-maintained (latitude, longitude) pairs for stations that could not be
# matched by the join. Note that the loop below sets these coordinates on
# every row of the listed stations, whether or not the join found a match.
manual_coords = {
    "Beusselstraße": (52.534444, 13.329444),
    "Charlottenburg": (52.50505, 13.30452),
    "Fehrbelliner Platz (U3)": (52.4897, 13.3153),
    "Fehrbelliner Platz (U7)": (52.4897, 13.3153),
    "Friedrichstraße": (52.52000, 13.38700),
    "Hackescher Markt": (52.52333, 13.40278),
    "Hermannstraße": (52.46722, 13.43194),
    "Hohenzollerndamm": (52.48722, 13.31250),
    "Köllnische Heide": (52.46583, 13.46056),
    "Ostkreuz": (52.50278, 13.46917),
    "Südkreuz": (52.47500, 13.36500),
    "Zoologischer Garten": (52.50750, 13.33417),
}

for station in manual_coords:
    lat, lon = manual_coords[station]
    # Select all rows (one per line) that belong to this station.
    mask = result_df["station"].eq(station)
    result_df.loc[mask, "latitude"] = lat
    result_df.loc[mask, "longitude"] = lon

# Write the enriched station-line dataset for the next transformation step.
output_csv = "merged_ubahn_line.csv"
result_df.to_csv(output_csv, index=False)

✅ merged_ubahn_line.csv updated and saved with manual coordinates.
