In [43]:
"""
Getting realtime data feed from the MTA and separating based on N/S Bound trains
"""
from nyct_gtfs import NYCTFeed

# Open the realtime feed requested with "N" and keep the line-N trips that
# pass the underway=True filter.
feed = NYCTFeed("N")
trains = feed.filter_trips(line_id=['N'], underway=True)

# Bucket the filtered trips by their direction code.
trains_by_direction = {
    code: [live_trip for live_trip in trains if live_trip.direction == code]
    for code in ('N', 'S')
}
northbound_trains = trains_by_direction['N']
southbound_trains = trains_by_direction['S']

# Collect the trip ids of each bucket.
n_bound_trip_ids = [live_trip.trip_id for live_trip in northbound_trains]
s_bound_trip_ids = [live_trip.trip_id for live_trip in southbound_trains]

# Cell output: northbound ids followed by southbound ids.
n_bound_trip_ids + s_bound_trip_ids

['098150_N..N33R',
 '098800_N..N33R',
 '099300_N..N33R',
 '100200_N..N16R',
 '101200_N..N33R',
 '101900_N..N33R',
 '102700_N..N33R',
 '090650_N..S34R',
 '092250_N..S34R',
 '092900_N..S34R',
 '093750_N..S34R',
 '094500_N..S34R',
 '095450_N..S34R',
 '097000_N..S34R',
 '097100_N..S16R',
 '098400_N..S',
 '099000_N..S34R',
 '100100_N..S16R',
 '101100_N..S34R',
 '101600_N..S34R',
 '102200_N..S34R']

In [44]:
"""
Creating data frame to hold scheduled times for each trip
Downloads GTFS data from https://rrgtfsfeeds.s3.amazonaws.com/gtfs_supplemented.zip
"""
import pandas as pd
import re
import requests
import zipfile
import io
import os
from datetime import datetime

# Project directory. Set the MTA_DELAY_PROJECT_DIR environment variable to run
# this notebook on another machine; when it is unset the original local path
# is used so existing behaviour is unchanged.
PROJECT_DIR = os.environ.get(
    "MTA_DELAY_PROJECT_DIR", "/Users/mitchel/Desktop/beep/mtaDelayPredictor"
)
DATA_DIR = os.path.join(PROJECT_DIR, "data_files")

# Create data_files directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

# Download the GTFS archive. Without a timeout requests waits indefinitely on
# a stalled connection, which would freeze the notebook; the tuple is
# (connect timeout, read timeout) in seconds.
print("Downloading GTFS data...")
url = "https://rrgtfsfeeds.s3.amazonaws.com/gtfs_supplemented.zip"
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()  # Raise an exception for bad status codes

# Extract stop_times.txt from the in-memory zip file
print("Extracting stop_times.txt...")
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    file_list = zip_file.namelist()

    # The member may sit under a sub-directory, so match on the path suffix
    # and take the first hit.
    stop_times_file_path = next(
        (member for member in file_list if member.endswith('stop_times.txt')),
        None,
    )
    if stop_times_file_path is None:
        raise FileNotFoundError("stop_times.txt not found in the archive")

    with zip_file.open(stop_times_file_path) as stop_times_file:
        # Read the content and create a DataFrame
        stop_times = pd.read_csv(stop_times_file)

# Generate filename with current datetime (minute resolution)
current_datetime = datetime.now().strftime("%m_%d_%Y_%H_%M")
filename = os.path.join(DATA_DIR, f'stop_times_{current_datetime}.csv')

# Get the absolute path for clarity in the log messages
abs_filename = os.path.abspath(filename)

# Save to CSV with datetime appended
print(f"Saving to {abs_filename}...")
stop_times.to_csv(filename, index=False)

# Verify the file was created and report its size
if os.path.exists(filename):
    file_size = os.path.getsize(filename)
    print(f"Successfully saved stop_times data to {abs_filename}")
    print(f"File size: {file_size:,} bytes")
    print(f"Number of rows: {len(stop_times):,}")
else:
    print(f"ERROR: File was not created at {abs_filename}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Contents of data_files directory: {os.listdir(DATA_DIR) if os.path.exists(DATA_DIR) else 'Directory does not exist'}")


# Slice the static schedule once per live trip id. The live id is matched as an
# escaped (literal) pattern anywhere inside the schedule's trip_id values.
if not (n_bound_trip_ids or s_bound_trip_ids):
    print("No trip IDs defined - skipping trip processing")
else:
    all_trip_ids = n_bound_trip_ids + s_bound_trip_ids
    scheduled_trip_ids = stop_times['trip_id']

    # Keyed by live trip id; a repeated id simply overwrites its earlier entry.
    trip_dfs = {
        live_id: stop_times[
            scheduled_trip_ids.str.contains(re.escape(live_id), na=False)
        ]
        for live_id in all_trip_ids
    }

    print(f"Processed {len(trip_dfs)} trip DataFrames")

Downloading GTFS data...
Extracting stop_times.txt...
Saving to /Users/mitchel/Desktop/beep/mtaDelayPredictor/data_files/stop_times_07_22_2025_17_11.csv...
Successfully saved stop_times data to /Users/mitchel/Desktop/beep/mtaDelayPredictor/data_files/stop_times_07_22_2025_17_11.csv
File size: 136,082,764 bytes
Number of rows: 2,297,464
Processed 21 trip DataFrames


In [36]:
"""
Combining static and real time schedule to compute delay information
"""
import pandas as pd
from nyct_gtfs.gtfs_static_types import Stations

# Station lookup object from nyct_gtfs; process_trains() calls its
# get_station_name() to turn a raw stop id into a station name.
stations = Stations()


# === Shared delay calculation logic ===
def process_trains(train_list, direction_label, schedule=None, station_lookup=None):
    """Compute the schedule deviation of each live train at its current stop.

    For every train, the first schedule row whose ``trip_id`` contains the
    live trip id (as a literal substring) and whose ``stop_id`` equals the
    train's ``location`` is taken as the scheduled stop. The delay is the
    train's ``last_position_update`` minus that row's ``arrival_time``.

    Args:
        train_list: Iterable of objects exposing ``trip_id``, ``location`` and
            ``last_position_update``.
        direction_label: Value written to the ``direction`` column.
        schedule: DataFrame with ``trip_id``, ``stop_id``, ``arrival_time`` and
            ``stop_sequence`` columns. Defaults to the notebook-level
            ``stop_times`` frame.
        station_lookup: Object with ``get_station_name(stop_id)``. Defaults to
            the notebook-level ``stations`` object.

    Returns:
        DataFrame sorted by ``trip_id`` and ``stop_sequence`` with columns
        trip_id, stop_id_raw, stop_id, timestamp, delay_min, status, direction
        and stop_sequence. It is empty (but has those columns) when no train
        could be matched.
    """
    if schedule is None:
        schedule = stop_times
    if station_lookup is None:
        station_lookup = stations

    columns = [
        "trip_id", "stop_id_raw", "stop_id", "timestamp",
        "delay_min", "status", "direction", "stop_sequence",
    ]
    scheduled_trip_ids = schedule["trip_id"]
    results = []

    for train in train_list:
        trip_id = train.trip_id
        current_time = train.last_position_update
        stop_id = train.location

        if not (stop_id and trip_id):
            continue

        # regex=False: live ids such as "098150_N..N33R" contain dots, which a
        # regex search would treat as wildcards and could match another trip.
        matches = schedule[
            scheduled_trip_ids.str.contains(trip_id, regex=False, na=False)
            & (schedule["stop_id"] == stop_id)
        ]
        if matches.empty:
            continue

        # One lookup serves both the arrival time and the stop sequence.
        first = matches.iloc[0]

        try:
            if current_time is None:
                raise ValueError("train has no position timestamp")
            now = pd.Timestamp(current_time)

            # Parse as an offset from midnight rather than a clock time: GTFS
            # allows values of 24:00:00 and above for trips that run past
            # midnight, which a "%H:%M:%S" parse rejects.
            offset = pd.to_timedelta(first["arrival_time"])
            if pd.isna(offset):
                raise ValueError("missing arrival_time")

            scheduled_dt = now.normalize() + offset
            delay = (now - scheduled_dt).total_seconds() / 60.0

            # The service day is not known here, so the schedule is anchored to
            # the observation's calendar date. A deviation beyond half a day
            # means the anchor is off by one day; fold it back.
            if delay > 720:
                delay -= 1440
            elif delay < -720:
                delay += 1440

            results.append({
                "trip_id": trip_id,
                "stop_id_raw": stop_id,
                "stop_id": station_lookup.get_station_name(stop_id),
                "timestamp": current_time,
                "delay_min": round(delay, 2),
                "status": "on_time" if abs(delay) < 1 else "delayed" if delay > 0 else "early",
                "direction": direction_label,
                "stop_sequence": first["stop_sequence"],
            })
        except Exception as e:
            # Best effort: report the train that could not be scored and move on.
            print(f"Error parsing time for {trip_id} at {stop_id}: {e}")

    # Passing the column list keeps the frame well-formed when nothing matched,
    # so the sort below (and later concatenation) still works.
    df = pd.DataFrame(results, columns=columns)
    return df.sort_values(by=["trip_id", "stop_sequence"]).reset_index(drop=True)


# === Score each direction with the shared delay logic ===
northbound_df = process_trains(northbound_trains, direction_label="N")
southbound_df = process_trains(southbound_trains, direction_label="S")

# === Stack both directions into one frame ordered by trip and stop ===
direction_frames = [northbound_df, southbound_df]
all_delays_df = (
    pd.concat(direction_frames)
    .sort_values(by=["trip_id", "stop_sequence"])
    .reset_index(drop=True)
)

# === Rush hour logic ===
def classify_rush_hour(ts):
    """Return the rush-hour window that the hour of ``ts`` falls into.

    Args:
        ts: Any object with an ``hour`` attribute.

    Returns:
        "morning" when 7 <= hour < 10, "evening" when 16 <= hour < 19,
        otherwise None.
    """
    hour = ts.hour
    # (label, first hour included, first hour excluded)
    windows = (
        ("morning", 7, 10),
        ("evening", 16, 19),
    )
    return next(
        (label for label, start, stop in windows if start <= hour < stop),
        None,
    )
    
# === Tag every observation with its rush-hour window (None outside them) ===
rush_hour_labels = all_delays_df["timestamp"].apply(classify_rush_hour)
all_delays_df["rush_hour"] = rush_hour_labels

all_delays_df

Unnamed: 0,trip_id,stop_id_raw,stop_id,timestamp,delay_min,status,direction,stop_sequence,rush_hour
0,082650_N..S34R,N09S,Avenue U,2025-07-22 16:30:48,95.8,delayed,S,26,evening
1,088400_N..S34R,N10S,86 St,2025-07-22 16:30:42,35.7,delayed,S,27,evening
2,089250_N..S34R,N10S,86 St,2025-07-22 16:26:50,24.33,delayed,S,27,evening
3,089850_N..S34R,N07S,Bay Pkwy,2025-07-22 16:30:45,29.75,delayed,S,24,evening
4,090650_N..S34R,N05S,18 Av,2025-07-22 16:30:48,24.8,delayed,S,22,evening
5,092250_N..S34R,R41S,59 St,2025-07-22 16:30:48,14.8,delayed,S,18,evening
6,092900_N..S34R,R36S,36 St,2025-07-22 16:30:15,13.25,delayed,S,17,evening
7,093750_N..S34R,R36S,36 St,2025-07-22 16:30:48,1.3,delayed,S,17,evening
8,093900_N..N35R,R15N,49 St,2025-07-22 16:30:48,1.8,delayed,N,17,evening
9,094500_N..S34R,Q01S,Canal St,2025-07-22 16:23:45,7.75,delayed,S,15,evening


In [37]:
import datetime
import pandas as pd
from sqlalchemy import create_engine, text

import os
from getpass import getpass
from urllib.parse import quote_plus

# === Database config ===
# Settings are read from the environment so no secret lives in the notebook.
# Non-secret values fall back to the previous local defaults. The password has
# no default: export DB_PASSWORD for unattended runs, otherwise it is asked for
# interactively.
DB_USER = os.environ.get("DB_USER", "postgres")
DB_PASSWORD = os.environ.get("DB_PASSWORD") or getpass("Postgres password: ")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "train_delays")

# === Create DB engine ===
# User and password are percent-encoded so characters such as "@" or "/" in
# them cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# === Create tables ===
create_train_delays = """
CREATE TABLE IF NOT EXISTS train_delays (
    id SERIAL PRIMARY KEY,
    trip_id TEXT,
    stop_id TEXT,
    station_name TEXT,
    timestamp TIMESTAMP,
    delay_min REAL,
    status TEXT,
    direction TEXT,
    stop_sequence INTEGER,
    rush_hour TEXT
);
"""

# engine.begin() commits on success. With engine.connect() and no explicit
# commit, SQLAlchemy 2.x rolls the DDL back when the block exits, and to_sql
# below would then auto-create the table without the id primary key.
with engine.begin() as conn:
    conn.execute(text(create_train_delays))
    print("Table created.")

# === Clean and insert delay data ===
if not all_delays_df.empty:
    delay_cols = ['trip_id', 'stop_id_raw', 'stop_id', 'timestamp', 'delay_min', 'status', 'direction', 'stop_sequence', 'rush_hour']
    # Map the notebook's column names onto the table's: the resolved station
    # name goes to station_name and the raw stop id becomes stop_id. Renaming
    # by label does not depend on the column order.
    df = all_delays_df[delay_cols].rename(
        columns={'stop_id': 'station_name', 'stop_id_raw': 'stop_id'}
    )
    df.to_sql('train_delays', engine, if_exists='append', index=False)
    print(f"Inserted {len(df)} delay records.")
else:
    print("No delay data found.")

Table created.
Inserted 15 delay records.


In [None]:
"""
Build Hourly Features + Labels from Postgres
"""
import pandas as pd

# === Get hourly aggregated delay data ===
# delay_rate is the share of observations in the hour with delay_min > 1.
delay_sql = """
    SELECT 
        DATE_TRUNC('hour', timestamp) AS hour,
        COUNT(*) FILTER (WHERE delay_min > 1) * 1.0 / COUNT(*) AS delay_rate,
        COUNT(*) AS total_trips,
        MAX(rush_hour) AS rush_hour  -- could be 'morning', 'evening', or NULL
    FROM train_delays
    GROUP BY hour
    ORDER BY hour
"""

# === Get matching weather data ===
weather_sql = """
    SELECT 
        time AS hour,
        temperature_f,
        precipitation,
        snowfall,
        humidity,
        windspeed
    FROM weather_hourly
"""

# === Load from Postgres ===
df_delays = pd.read_sql(delay_sql, engine)
df_weather = pd.read_sql(weather_sql, engine)

# === Merge both datasets on hour (only hours present in both are kept)
df = pd.merge(df_delays, df_weather, on="hour", how="inner")

# === Add time-based features
df["hour_of_day"] = df["hour"].dt.hour
df["day_of_week"] = df["hour"].dt.dayofweek
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# === Drop incomplete rows
# rush_hour is NULL by design for hours outside the rush windows. Give those
# hours an explicit label first; otherwise dropna() would throw away every
# non-rush-hour row and leave only rush-hour data.
df["rush_hour"] = df["rush_hour"].fillna("none")
df = df.dropna()

# === Show preview
df.head()