In [1]:
import pandas as pd
import matplotlib as plt
from data import get_train_data


# Load the train feed through the loader imported from the local `data` module.
raw_data = get_train_data()

print(f"Number of train groups: {len(raw_data)}")

# Take the first group and the first train inside it as a sample record,
# so the available fields can be inspected before building DataFrames.
first_train_key = list(raw_data)[0]
first_group = raw_data[first_train_key]
first_train = first_group[0]

print(f"Keys in first train: {list(first_train.keys())}")
print(f"Station values in first train: {(first_train['stations'][0].keys())}")

Calling API for fresh data
Success!
Number of train groups: 166
Keys in first train: ['routeName', 'trainNum', 'trainNumRaw', 'trainID', 'lat', 'lon', 'trainTimely', 'iconColor', 'textColor', 'stations', 'heading', 'eventCode', 'eventTZ', 'eventName', 'origCode', 'originTZ', 'origName', 'destCode', 'destTZ', 'destName', 'trainState', 'velocity', 'statusMsg', 'createdAt', 'updatedAt', 'lastValTS', 'objectID', 'provider', 'providerShort', 'onlyOfTrainNum', 'alerts']
Station values in first train: dict_keys(['name', 'code', 'tz', 'bus', 'schArr', 'schDep', 'arr', 'dep', 'arrCmnt', 'depCmnt', 'status', 'stopIconColor', 'platform'])


In [2]:
# Flatten raw_data (group key -> list of train dicts) into one row per train.
# Each entry: (output column, key in the raw train dict, fallback if the key is absent).
train_field_map = (
  ('route_name', 'routeName', ''),
  ('train_id', 'trainID', ''),
  ('lat', 'lat', None),
  ('lon', 'lon', None),
  ('heading', 'heading', ''),
  ('velocity', 'velocity', None),
  ('train_state', 'trainState', ''),
  ('status_msg', 'statusMsg', ''),
  ('origin_code', 'origCode', ''),
  ('dest_code', 'destCode', ''),
)

train_records = [
  {
    # The group key is carried onto every train in that group.
    'train_number': group_key,
    **{column: entry.get(key, fallback) for column, key, fallback in train_field_map},
    # Only the stop count is kept here; the stops themselves are not copied.
    'num_stations': len(entry.get('stations', [])),
  }
  for group_key, group in raw_data.items()
  for entry in group
]

trains_df = pd.DataFrame(train_records)
print("TRAINS DataFrame:")
print(trains_df.head())
print(f"Shape: {trains_df.shape}")

TRAINS DataFrame:
  train_number       route_name train_id        lat         lon heading  \
0            1   Sunset Limited     1-30  31.758026 -106.498408       W   
1            2   Sunset Limited     2-30  32.350885 -108.709613       E   
2            3  Southwest Chief     3-31  41.879008  -87.639306       N   
3            3  Southwest Chief     3-30  35.476568 -105.231722      SE   
4            4  Southwest Chief     4-29  41.302868  -89.732535       E   

    velocity   train_state status_msg origin_code dest_code  num_stations  
0   0.093206        Active                    NOL       LAX            22  
1  20.200769        Active                    LAX       NOL            22  
2   0.000000  Predeparture                    CHI       LAX            32  
3  48.284351        Active                    CHI       LAX            32  
4  79.150238        Active                    LAX       CHI            32  
Shape: (182, 12)


In [16]:
# Flatten raw_data into one row per (train, station stop).
# Each entry: (output column, key in the raw station dict, fallback if the key is absent).
station_field_map = (
  ('station_name', 'name', ''),
  ('station_code', 'code', ''),
  ('scheduled_arrival', 'schArr', None),
  ('actual_arrival', 'arr', None),
  ('scheduled_departure', 'schDep', None),
  ('actual_departure', 'dep', None),
  ('status', 'status', ''),
  ('timezone', 'tz', ''),
)

station_records = []
for train_number, train_list in raw_data.items():
  for train in train_list:
    # Train-level identifiers are repeated on every stop row of that train.
    train_id = train.get('trainID', '')
    route_name = train.get('routeName', '')

    station_records.extend(
      {
        'train_number': train_number,
        'train_id': train_id,
        'route_name': route_name,
        **{column: stop.get(key, fallback) for column, key, fallback in station_field_map},
      }
      for stop in train.get('stations', [])
    )

stations_df = pd.DataFrame(station_records)

# Parse the four timestamp columns as UTC datetimes; values that cannot be
# parsed become NaT because of errors='coerce'.
time_columns = ['scheduled_arrival', 'actual_arrival', 'scheduled_departure', 'actual_departure']
stations_df = stations_df.assign(**{
  column: pd.to_datetime(stations_df[column], errors='coerce', utc=True)
  for column in time_columns
})

print("\nSTATIONS DataFrame:")
print(stations_df.head())
print(f"Shape: {stations_df.shape}")



STATIONS DataFrame:
  train_number train_id      route_name  station_name station_code  \
0            1     1-30  Sunset Limited   New Orleans          NOL   
1            1     1-30  Sunset Limited     Schriever          SCH   
2            1     1-30  Sunset Limited    New Iberia          NIB   
3            1     1-30  Sunset Limited     Lafayette          LFT   
4            1     1-30  Sunset Limited  Lake Charles          LCH   

          scheduled_arrival            actual_arrival  \
0 2025-07-30 14:00:00+00:00 2025-07-30 14:00:00+00:00   
1 2025-07-30 15:30:00+00:00 2025-07-30 15:33:00+00:00   
2 2025-07-30 16:56:00+00:00 2025-07-30 17:05:00+00:00   
3 2025-07-30 17:24:00+00:00 2025-07-30 17:33:00+00:00   
4 2025-07-30 18:55:00+00:00 2025-07-30 18:54:00+00:00   

        scheduled_departure          actual_departure    status  \
0 2025-07-30 14:00:00+00:00 2025-07-30 14:00:00+00:00  Departed   
1 2025-07-30 15:30:00+00:00 2025-07-30 15:35:00+00:00  Departed   
2 2025-07-30 1

In [22]:
# Add per-stop delay columns: actual minus scheduled time, expressed in minutes.
# Positive values mean later than scheduled, negative values mean earlier.
# Departure is processed first so the new columns keep the same order.
for event in ("departure", "arrival"):
  elapsed = stations_df[f"actual_{event}"] - stations_df[f"scheduled_{event}"]
  stations_df[f"{event}_delay_minutes"] = elapsed.dt.total_seconds() / 60

print(f"Departure delays: {stations_df.departure_delay_minutes.head()}")
print(f"Arrival delays: {stations_df.arrival_delay_minutes.head()}")

# The totals count rows whose delay is strictly greater than zero minutes.
print(f"Total departure delays: {stations_df.departure_delay_minutes.gt(0).sum()}")
print(f"Total arrival delays: {stations_df.arrival_delay_minutes.gt(0).sum()}")


Departure delays: 0     0.0
1     5.0
2    10.0
3    13.0
4     1.0
Name: departure_delay_minutes, dtype: float64
Arrival delays: 0    0.0
1    3.0
2    9.0
3    9.0
4   -1.0
Name: arrival_delay_minutes, dtype: float64
Total departure delays: 2004
Total arrival delays: 1989
