# SBB Data

##  Arrival and Departure Time

In [1]:
import pandas as pd

# Linie is actually the Train Number
# Line Text is actually the Train Line
# Stations can be matched via OPUIC number

interested_columns = {
    "Day of operation": "day_of_operation",
    "Product ID": "product_id",
    "OPUIC": "opuic",
    "Linie": "train_id",
    "Line Text": "train_line",
    "Stop name": "stop_name",
    "OPUIC": "opuic",
    "Arrival time": "arrival_time",
    "Arrival forecast": "arrival_forecast",
    "Arrival forecast status": "arrival_forecast_status",
    "Arrival delay": "arrival_delay",
    "Departure time": "departure_time",
    "Departure forecast": "departure_forecast",
    "Departure forecast status": "departure_forecast_status",
    "Departure delay": "departure_delay",
}
df_arrival_departure = pd.read_csv("./data/arrival_departure_files/sbb_departure_arrival_times.csv", sep=";", usecols=interested_columns.keys())
# Rename columns
df_arrival_departure.rename(columns=interested_columns, inplace=True)
# Filter out entries with "NAN" or null values
df_arrival_departure = df_arrival_departure[df_arrival_departure["arrival_time"].notnull()]
df_arrival_departure = df_arrival_departure[df_arrival_departure["departure_time"].notnull()]

# Save actual time value in arrival and departure columns instead of something like '2025-03-02T11:28:00' -> '11:28:00'
df_arrival_departure['arrival_time'] = pd.to_datetime(df_arrival_departure['arrival_time']).dt.time
df_arrival_departure['arrival_forecast'] = pd.to_datetime(df_arrival_departure['arrival_forecast']).dt.time
df_arrival_departure['departure_time'] = pd.to_datetime(df_arrival_departure['departure_time']).dt.time
df_arrival_departure['departure_forecast'] = pd.to_datetime(df_arrival_departure['departure_forecast']).dt.time

num_train_lines = len(df_arrival_departure["train_line"].unique())
num_stations = len(df_arrival_departure["stop_name"].unique())


print(f"Number of Train Lines {num_train_lines}")
print(f"Number of Stations {num_stations}")
df_arrival_departure.head()

Number of Train Lines 105
Number of Stations 570


Unnamed: 0,day_of_operation,product_id,train_id,train_line,opuic,stop_name,arrival_time,arrival_forecast,arrival_forecast_status,departure_time,departure_forecast,departure_forecast_status,arrival_delay,departure_delay
0,2025-03-02,Zug,2364,IR35,8500218,Olten,11:28:00,11:27:33,REAL,11:36:00,11:36:38,REAL,False,False
2,2025-03-02,Zug,2366,IR35,8503202,Thalwil,11:38:00,11:39:53,REAL,11:39:00,11:40:36,REAL,False,False
4,2025-03-02,Zug,2369,IR35,8508005,Burgdorf,11:49:00,11:49:31,REAL,11:51:00,11:51:38,REAL,False,False
5,2025-03-02,Zug,2369,IR35,8509414,Walenstadt,14:15:00,14:14:52,REAL,14:15:00,14:15:49,REAL,False,False
7,2025-03-02,Zug,2371,IR35,8508100,Langenthal,13:08:00,13:09:18,REAL,13:09:00,13:10:00,REAL,False,False


In [67]:
# Save cleaned version as csv
#df_arrival_departure.to_csv("./data/sbb_departure_arrival_times_cleaned.csv", index=False)

# Stations

In [2]:
interested_columns = {
    "Station abbreviation": "station_abbreviation",
    "Stop name": "stop_name",
    "OPUIC": "opuic",
    "KM": "km",
    "Geopos": "geopos",
}


df_stations = pd.read_csv("./data/sbb_stations.csv", sep=";", usecols=interested_columns.keys())
# Rename columns
df_stations.rename(columns=interested_columns, inplace=True)

# Create separate column for longitude and latitude from geopos
df_stations[['longitude', 'latitude']] = df_stations['geopos'].str.split(',', expand=True)

# Convert to correct types
df_stations['longitude'] = df_stations['longitude'].astype(float)
df_stations['latitude'] = df_stations['latitude'].astype(float)

# Remove geopos
df_stations.drop('geopos', axis=1, inplace=True)

num_stations =  len(df_stations["opuic"].unique())

print(f"Total number of available stations {num_stations}")
df_stations.head()

Total number of available stations 1355


Unnamed: 0,station_abbreviation,stop_name,km,opuic,longitude,latitude
0,ABO,Aarburg-Oftringen,43.00505,8502000,47.320268,7.908223
1,AHAU,Altishausen,29.09142,8506199,47.596488,9.162377
2,ALG,Algetshausen-Henau,103.23323,8506220,47.448548,9.109942
3,ALME,Al Motto (c bin),160.92853,8515373,46.143997,8.949152
4,ALT,Altstatten SG,41.17789,8506319,47.374235,9.55652


In [69]:
# Save cleaned version as csv
#df_stations.to_csv("./data/sbb_stations_cleaned.csv", index=False)