In [44]:
#setup
import os
from pathlib import Path

import pandas as pd


In [45]:
#paths
# The GTFS folder can be overridden with the GTFS_DIR environment variable so the
# notebook is not tied to one machine; the original location stays the fallback.
gtfs_dir = Path(os.environ.get("GTFS_DIR", "/Users/hebifou/Downloads/GTFS")).expanduser()
stops_csv = gtfs_dir / "sbahn_stations_clean.csv"       # has: stop_id, stop_name, stop_lat, stop_lon
routes_txt = gtfs_dir / "routes.txt"
trips_txt = gtfs_dir / "trips.txt"
stop_times_txt = gtfs_dir / "stop_times.txt"
out_csv = gtfs_dir / "sources" / "sbahn_stations_transformed.csv"

# Fail fast with a single clear message instead of discovering a missing file
# only after the earlier inputs have already been parsed.
missing_inputs = [
    str(path)
    for path in (stops_csv, routes_txt, trips_txt, stop_times_txt)
    if not path.is_file()
]
if missing_inputs:
    raise FileNotFoundError("Missing GTFS input file(s): " + ", ".join(missing_inputs))

# Create the output folder up front so the final save cannot fail on a missing directory.
out_csv.parent.mkdir(parents=True, exist_ok=True)

#loading inputs
# low_memory=False makes pandas parse each file in one pass rather than in chunks.
stops = pd.read_csv(stops_csv)  # keeping stop_id for joining
routes = pd.read_csv(routes_txt, low_memory=False)
trips = pd.read_csv(trips_txt, low_memory=False)
stop_times = pd.read_csv(stop_times_txt, low_memory=False)

print("Raw stops shape:", stops.shape)
print("Raw routes shape:", routes.shape)
print("Raw trips shape:", trips.shape)
print("Raw stop_times shape:", stop_times.shape)


Raw stops shape: (423, 4)
Raw routes shape: (1318, 8)
Raw trips shape: (246906, 10)
Raw stop_times shape: (5497299, 8)


In [46]:
#keeping only S-Bahn routes (route_short_name starts with 'S')
# NOTE(review): this is a pure prefix test, so names such as "S3H" are kept as well;
# whether every such route is really an S-Bahn line is an assumption - TODO confirm.
short_names = routes["route_short_name"].astype(str)
is_s_line = short_names.str.startswith("S", na=False)
routes_s = routes.loc[is_s_line].copy()  # copy so later edits never touch `routes`
print("Filtered S-Bahn routes:", routes_s.shape)
print(routes_s[["route_id", "route_short_name"]].head(5))

Filtered S-Bahn routes: (53, 8)
    route_id route_short_name
0  26976_700              S3H
1  25929_109               S3
2  24864_109               S5
3  22240_109              S85
4  22239_109               S8


In [47]:
#mapping stop_id -> list of served S lines
def _distinct_sorted(names):
    """Return the distinct non-null values of `names` as a sorted list of strings."""
    return sorted(set(names.dropna().astype(str)))


# Step 1: attach the route_id of its trip to every stop_time row (left join keeps all rows).
trip_routes = trips[["trip_id", "route_id"]]
st_tr = pd.merge(stop_times, trip_routes, how="left", on="trip_id")
print("Merged stop_times + trips:", st_tr.shape)

# Step 2: the inner join keeps only rows whose route is in the filtered S route table.
s_route_names = routes_s[["route_id", "route_short_name"]]
st_tr_ro = pd.merge(st_tr, s_route_names, how="inner", on="route_id")
print("Merged with S-Bahn routes:", st_tr_ro.shape)
print(st_tr_ro.head(5))

# Step 3: collapse to one row per stop_id carrying the list of line names seen there.
names_by_stop = st_tr_ro.groupby("stop_id")["route_short_name"].apply(_distinct_sorted)
lines_per_stop = names_by_stop.reset_index().rename(columns={"route_short_name": "line"})
print("Lines per stop shape:", lines_per_stop.shape)
print(lines_per_stop.head(5))

Merged stop_times + trips: (5497299, 9)
Merged with S-Bahn routes: (364630, 10)
     trip_id arrival_time departure_time                stop_id  \
0  269576732      4:37:00        4:37:00  de:11000:900182001::1   
1  269576732      4:39:00        4:39:00  de:11000:900180512::1   
2  269576732      4:43:00        4:43:00  de:11000:900180309::1   
3  269576731      4:35:00        4:35:00  de:11000:900180309::1   
4  269576731      4:38:00        4:38:00  de:11000:900180512::1   

   stop_sequence  pickup_type  drop_off_type stop_headsign   route_id  \
0              0            0              0           NaN  26976_700   
1              1            0              0           NaN  26976_700   
2              2            0              0           NaN  26976_700   
3              0            0              0           NaN  26976_700   
4              1            0              0           NaN  26976_700   

  route_short_name  
0              S3H  
1              S3H  
2              

In [48]:
#joining lines back to our S-Bahn stops (by stop_id)
# Left join: every row of `stops` is kept; stops with no match in lines_per_stop
# end up with a missing value in the `line` column.
df = pd.merge(stops, lines_per_stop, how="left", on="stop_id")
print("After merging lines to stops:", df.shape)

After merging lines to stops: (423, 5)


In [50]:
#renaming to target schema and adding placeholders (no fkdistrict in CSV)
column_renames = {
    "stop_name": "station",
    "stop_lat": "latitude",
    "stop_lon": "longitude",
}
# postcode / neighborhood are not filled in this cell; they are created empty here
final_columns = ["station", "line", "latitude", "longitude", "postcode", "neighborhood"]

#select and order final columns
df = (
    df.rename(columns=column_renames)
    .assign(postcode=None, neighborhood=None)
    [final_columns]
)

#checks
# Duplicates are only counted here; no rows are dropped.
station_names = df["station"]
print("\nData Quality Checks")
print("Total rows:", df.shape[0])
print("Unique stations:", station_names.nunique())
print("Duplicate stations:", station_names.duplicated().sum())



Data Quality Checks
Total rows: 423
Unique stations: 201
Duplicate stations: 222


In [52]:
#checking coordinates
# Bounding box (decimal degrees) every station is expected to fall inside.
# NOTE(review): presumably chosen to cover the Berlin area - confirm the limits.
LAT_MIN, LAT_MAX = 52.3, 52.7
LON_MIN, LON_MAX = 13.0, 13.8

# between() is inclusive on both ends, so the accepted range is the same as with the
# previous strict </> comparisons. Unlike those comparisons it is False for NaN, which
# means rows with a missing latitude or longitude are now reported as invalid instead
# of silently passing the check.
lat_ok = df["latitude"].between(LAT_MIN, LAT_MAX)
lon_ok = df["longitude"].between(LON_MIN, LON_MAX)
invalid_coords = df[~(lat_ok & lon_ok)]
print("Invalid coordinates found:", len(invalid_coords))

#preview
print("\nFinal DataFrame preview:")
print(df.head(5))



Invalid coordinates found: 0

Final DataFrame preview:
                  station              line   latitude  longitude postcode  \
0      S Birkenwerder Bhf          [S1, S8]  52.688665  13.288775     None   
1      S Birkenwerder Bhf          [S1, S8]  52.688658  13.288626     None   
2  S+U Westhafen (Berlin)   [S41, S42, S46]  52.536218  13.344329     None   
3  S+U Westhafen (Berlin)   [S41, S42, S46]  52.536318  13.344298     None   
4     S Bellevue (Berlin)  [S3, S5, S7, S9]  52.519946  13.348090     None   

  neighborhood  
0         None  
1         None  
2         None  
3         None  
4         None  


In [53]:
#saving
# NOTE(review): `line` holds Python lists, so the CSV will contain their string form
# (e.g. "['S1', 'S8']") - confirm that downstream readers expect that.
df.to_csv(path_or_buf=out_csv, index=False)  # index=False: do not write the row index
print(f"\nSaved: {out_csv}")


Saved: /Users/hebifou/Downloads/GTFS/sources/sbahn_stations_transformed.csv
