In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Load Data
data_path = '/Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/data/rail_data/2020_05.csv'
df = pd.read_csv(data_path)

# Step 2: Data Preprocessing
df['scheduled_time'] = pd.to_datetime(df['scheduled_time'])
df['actual_time'] = pd.to_datetime(df['actual_time'])
df['delay_minutes'] = df['delay_minutes'].fillna(0)

df['hour_of_day'] = df['scheduled_time'].dt.hour
df['day_of_week'] = df['scheduled_time'].dt.dayofweek

# Step 3: Prepare Features and Target
features = ['hour_of_day', 'day_of_week', 'from_id', 'to_id']
target = 'delay_minutes'

X = df[features]
y = df[target]

X = X.dropna()
y = y[X.index]

# Step 4: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Predictions and Evaluation
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"\nModel Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")

# Step 7: Prediction Function
def predict_delay(hour_of_day, day_of_week, from_id, to_id):
    # Create a feature vector
    input_data = pd.DataFrame([{
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'from_id': from_id,
        'to_id': to_id
    }])
    
    # Predict delay
    predicted_delay = model.predict(input_data)[0]
    return predicted_delay

# Example prediction
from_id = df['from_id'].iloc[0]  # Example 'from_id'
to_id = df['to_id'].iloc[0]  # Example 'to_id'

predicted = predict_delay(18, 6, from_id, to_id)
print(f"\nPredicted Delay (minutes): {predicted:.2f}")

# Create dictionaries for station name to ID mapping
from_station_id = df[['from', 'from_id']].drop_duplicates().set_index('from')['from_id'].to_dict()
to_station_id = df[['to', 'to_id']].drop_duplicates().set_index('to')['to_id'].to_dict()

# Example of how to use the prediction function with station names
def predict_delay_by_station_names(hour_of_day, day_of_week, from_station, to_station):
    from_id = from_station_id.get(from_station)
    to_id = to_station_id.get(to_station)
    if from_id is None or to_id is None:
        return "Invalid station name(s)"
    return predict_delay(hour_of_day, day_of_week, from_id, to_id)

# Example usage
from_station = list(from_station_id.keys())[0]  # Get the first station name
to_station = list(to_station_id.keys())[0]  # Get the first station name
predicted_by_name = predict_delay_by_station_names(18, 6, from_station, to_station)
print(f"\nPredicted Delay for {from_station} to {to_station} (minutes): {predicted_by_name:.2f}")


Model Performance:
Mean Absolute Error (MAE): 1.87 minutes
Root Mean Squared Error (RMSE): 4.67 minutes

Predicted Delay (minutes): 0.18

Predicted Delay for Newark Penn Station to Newark Penn Station (minutes): 0.18


In [9]:
import joblib
import os

# Create the directory if it doesn't exist
model_dir = '/Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/models'
os.makedirs(model_dir, exist_ok=True)

# Save the trained model
model_path = os.path.join(model_dir, 'delay_prediction_model.joblib')
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

Model saved to /Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/models/delay_prediction_model.joblib


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
import streamlit as st

# Load the trained model
model = joblib.load('/Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/models/delay_prediction_model.joblib')

# Function to map day of week to number
def day_to_number(day):
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    return days.index(day)

# Prediction function
def predict_delay(hour_of_day, day_of_week, from_id, to_id):
    # Create a feature vector
    input_data = pd.DataFrame([{
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'from_id': from_id,
        'to_id': to_id
    }])
    
    # Predict delay
    predicted_delay = model.predict(input_data)[0]
    return predicted_delay

# Get inputs from Streamlit
hour_of_day = st.session_state.get('hour_of_day', 0)
from_id = st.session_state.get('from_id', 0)
to_id = st.session_state.get('to_id', 0)
day_of_week = st.session_state.get('day_of_week', 'Monday')

# Convert day of week to number
day_number = day_to_number(day_of_week)

# Make prediction
predicted_delay = predict_delay(hour_of_day, day_number, from_id, to_id)

# Display prediction
st.write(f"## Predicted Delay")
st.write(f"The predicted delay for your journey is: **{predicted_delay:.2f} minutes**")

# Provide some context
if predicted_delay < 5:
    st.write("Your train is likely to be on time or only slightly delayed.")
elif predicted_delay < 15:
    st.write("There might be a minor delay. Consider allowing a little extra time for your journey.")
else:
    st.write("There could be a significant delay. Please plan accordingly and check for any service updates.")

