Setup and Loading Data

In [57]:
import os

import pandas as pd

# Path of the S-Bahn stops CSV that the rest of the notebook transforms.
csv_path = "/Users/hebifou/Downloads/GTFS/sbahn_stations_clean.csv"

df = pd.read_csv(csv_path)

# Snapshot of the frame as loaded, before any column is renamed or dropped.
raw_shape = df.shape
raw_columns = list(df.columns)
print("Shape before transformation:", raw_shape)
print("Columns before transformation:", raw_columns)
display(df.iloc[:3])  # first 3 rows only


Shape before transformation: (423, 4)
Columns before transformation: ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']


Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,de:12065:900200008:1:50,S Birkenwerder Bhf,52.688665,13.288775
1,de:12065:900200008:1:51,S Birkenwerder Bhf,52.688658,13.288626
2,de:11000:900001201:1:50,S+U Westhafen (Berlin),52.536218,13.344329


Rename columns to match `ubahn` schema

In [59]:
# Source column name -> name used in the target layout.
column_mapping = {
    "stop_name": "station",
    "stop_lat": "latitude",
    "stop_lon": "longitude",
}
df = df.rename(columns=column_mapping)

# Placeholder columns: present in the output but with no data yet, so every
# row is set to None.
for placeholder in ("line", "postcode", "neighborhood", "fkdistrict"):
    df[placeholder] = None

# Keep only the columns below, in this order. Anything not listed here
# (e.g. stop_id) is dropped from the frame.
schema_columns = [
    "station",
    "line",
    "latitude",
    "longitude",
    "postcode",
    "neighborhood",
    "fkdistrict",
]
df = df.loc[:, schema_columns]
print("Shape after renaming/reordering:", df.shape)
display(df.iloc[:3])  # first 3 rows after the transformation

Shape after renaming/reordering: (423, 7)


Unnamed: 0,station,line,latitude,longitude,postcode,neighborhood,fkdistrict
0,S Birkenwerder Bhf,,52.688665,13.288775,,,
1,S Birkenwerder Bhf,,52.688658,13.288626,,,
2,S+U Westhafen (Berlin),,52.536218,13.344329,,,


Data Quality Check

In [60]:
# Flag rows whose coordinates fall outside the bounding box
# latitude 52.3..52.7, longitude 13.0..13.8 (bounds themselves are accepted).
# Plain comparisons are used on purpose: a NaN coordinate compares False on
# both sides and is therefore NOT reported as invalid by this check.
lat = df["latitude"]
lon = df["longitude"]
lat_out_of_range = (lat < 52.3) | (lat > 52.7)
lon_out_of_range = (lon < 13.0) | (lon > 13.8)

invalid_coords = df[lat_out_of_range | lon_out_of_range]
print(f"Invalid coordinates found: {len(invalid_coords)}")
if len(invalid_coords) > 0:
    display(invalid_coords)




Invalid coordinates found: 0


In [61]:
# Headline counts: number of rows in the frame versus number of distinct
# values in the "station" column.
print("\n=== Summary ===")
row_count = len(df)
distinct_station_names = df["station"].nunique()
print("Total stations:", row_count)
print("Unique station names:", distinct_station_names)


=== Summary ===
Total stations: 423
Unique station names: 201


In [62]:
# Select every row whose station name occurs more than once.
# keep=False marks all occurrences, not only the second and later ones.
repeated_name_mask = df["station"].duplicated(keep=False)
duplicate_stations = df.loc[repeated_name_mask]

# The number printed is a row count, not a count of distinct names.
print(f"Duplicate station names found: {len(duplicate_stations)}")
if len(duplicate_stations) > 0:
    display(duplicate_stations.sort_values(by="station"))


Duplicate station names found: 367


Unnamed: 0,station,line,latitude,longitude,postcode,neighborhood,fkdistrict
412,Flughafen BER,,52.364331,13.508164,,,
411,Flughafen BER,,52.364226,13.508213,,,
194,S Adlershof (Berlin),,52.434867,13.541224,,,
195,S Adlershof (Berlin),,52.434946,13.541382,,,
153,S Ahrensfelde Bhf (Berlin),,52.571320,13.565711,,,
...,...,...,...,...,...,...,...
305,S+U Yorckstr. (Berlin),,52.491334,13.372158,,,
273,S+U Yorckstr. (Großgörschenstr.) (Berlin),,52.492301,13.367777,,,
274,S+U Yorckstr. (Großgörschenstr.) (Berlin),,52.492245,13.367968,,,
20,S+U Zoologischer Garten Bhf (Berlin),,52.507031,13.331849,,,


Save transformed dataset

In [63]:
# Write the transformed frame to <output_dir>/sbahn_stations_transformed.csv,
# creating the directory first if it does not exist yet.
output_dir = "/Users/hebifou/Downloads/GTFS/sources"
os.makedirs(output_dir, exist_ok=True)

# Derive the file path from output_dir so the directory that gets created and
# the directory that gets written to can never drift apart.
output_path = os.path.join(output_dir, "sbahn_stations_transformed.csv")
df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column
print("S-Bahn stations transformed and saved to sources/sbahn_stations_transformed.csv")

S-Bahn stations transformed and saved to sources/sbahn_stations_transformed.csv


In [65]:
# Final report on the transformed frame: dtypes/null counts, column list,
# a preview of the first rows, and the headline counts.
print("=== Final DataFrame Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()   # check column types and nulls
print("\nColumns:", df.columns.tolist())

print("\n=== First 5 rows ===")
display(df.head())

print("\n=== Summary ===")
print("Total stations:", df.shape[0])  # row count of the frame
print("Unique station names:", df['station'].nunique())


=== Final DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station       423 non-null    object 
 1   line          0 non-null      object 
 2   latitude      423 non-null    float64
 3   longitude     423 non-null    float64
 4   postcode      0 non-null      object 
 5   neighborhood  0 non-null      object 
 6   fkdistrict    0 non-null      object 
dtypes: float64(2), object(5)
memory usage: 23.3+ KB
None

Columns: ['station', 'line', 'latitude', 'longitude', 'postcode', 'neighborhood', 'fkdistrict']

=== First 5 rows ===


Unnamed: 0,station,line,latitude,longitude,postcode,neighborhood,fkdistrict
0,S Birkenwerder Bhf,,52.688665,13.288775,,,
1,S Birkenwerder Bhf,,52.688658,13.288626,,,
2,S+U Westhafen (Berlin),,52.536218,13.344329,,,
3,S+U Westhafen (Berlin),,52.536318,13.344298,,,
4,S Bellevue (Berlin),,52.519946,13.34809,,,



=== Summary ===
Total stations: 423
Unique station names: 201
