In [9]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine

# === DB Connection ===
# Connection settings come from the environment so that no credentials are
# stored in (or committed with) the notebook. The non-secret settings fall
# back to the local defaults this notebook has always used; the password
# deliberately has no fallback.
DB_USER = os.environ.get("DB_USER", "postgres")
DB_PASSWORD = os.environ.get("DB_PASSWORD")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "train_delays")

if DB_PASSWORD is None:
    raise RuntimeError(
        "DB_PASSWORD is not set; export it in the environment before starting the kernel."
    )

# Percent-encode user and password so that characters such as '@', ':' or '/'
# cannot break the structure of the connection URL.
engine = create_engine(
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# === Query: Delay Aggregation ===
# One row per (rush_hour, hour bucket): mean delay and number of trips,
# restricted to rows labelled 'morning' or 'evening'.
delay_sql = """
SELECT 
    DATE_TRUNC('hour', timestamp) AS hour,
    rush_hour,
    AVG(delay_min) AS avg_delay_min,
    COUNT(*) AS total_trips
FROM train_delays
WHERE rush_hour IN ('morning', 'evening')
GROUP BY rush_hour, hour
ORDER BY rush_hour, hour
"""
delay_df = pd.read_sql(delay_sql, engine)

# === Query: Weather Data ===
# Loads every column and row of weather_hourly.
weather_sql = """
SELECT * FROM weather_hourly
"""
weather_df = pd.read_sql(weather_sql, engine)

# Last expression of the cell: render the aggregated delays.
delay_df


Unnamed: 0,hour,rush_hour,avg_delay_min,total_trips
0,2025-07-22 16:00:00,evening,-56.0,2
1,2025-07-22 17:00:00,evening,3.855862,29
2,2025-07-23 17:00:00,evening,1.2935,20
3,2025-07-23 18:00:00,evening,0.896949,59


In [None]:
# # NOTE(review): every line of this cell is commented out, so executing it is
# # a no-op. It sketches a linear-regression baseline: join the hourly delay
# # aggregates with hourly weather, derive calendar features, fit on the first
# # 80% of rows and report MAE / RMSE on the remaining 20%.
# 
# # === Merge ===
# # NOTE(review): assumes weather_df has a `time` column whose values line up
# # with the hour-truncated `hour` values of delay_df — TODO confirm. The
# # inner join silently drops delay rows that have no matching weather row.
# delay_df['hour'] = pd.to_datetime(delay_df['hour'])
# weather_df['time'] = pd.to_datetime(weather_df['time'])
# data = pd.merge(delay_df, weather_df, left_on='hour', right_on='time', how='inner')
# 
# # === Feature Engineering ===
# # Calendar features derived from the hour bucket; is_weekend is 1 for
# # dayofweek 5 or 6 and 0 otherwise.
# data['hour_of_day'] = data['hour'].dt.hour
# data['day_of_week'] = data['hour'].dt.dayofweek
# data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)
# 
# # === Drop NaNs ===
# # NOTE(review): only rows with a missing target are dropped. NaNs in the
# # weather feature columns are left in place and reach the regressor
# # unchanged (numeric columns are 'passthrough') — verify they cannot occur.
# data = data.dropna(subset=['avg_delay_min'])
# 
# # === Features and Target ===
# # NOTE(review): the weather column names below are presumably columns of
# # weather_hourly — confirm against the table schema.
# features = [
#     'temperature_f', 'precipitation', 'snowfall', 'humidity', 'windspeed',
#     'hour_of_day', 'day_of_week', 'is_weekend', 'rush_hour'
# ]
# target = 'avg_delay_min'
# 
# X = data[features]
# y = data[target]
# 
# # === One-hot encode rush_hour ===
# # Numeric columns pass through untouched; rush_hour is one-hot encoded and
# # categories unseen during fit are ignored at predict time.
# categorical = ['rush_hour']
# numeric = [col for col in features if col not in categorical]
# 
# preprocessor = ColumnTransformer([
#     ('num', 'passthrough', numeric),
#     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
# ])
# 
# model = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor', LinearRegression())
# ])
# 
# # === Train/Test Split ===
# # NOTE(review): shuffle=False keeps the row order of `data`, so the test
# # set is the last 20% of rows. delay_sql orders by rush_hour first and hour
# # second, so that tail is presumably not the chronologically latest data —
# # confirm the row order before treating this as a time-based split.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# model.fit(X_train, y_train)
# 
# # === Evaluation ===
# # MAE and RMSE on the held-out rows, rounded to two decimals for printing.
# y_pred = model.predict(X_test)
# mae = mean_absolute_error(y_test, y_pred)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# print("MAE:", round(mae, 2))
# print("RMSE:", round(rmse, 2))
# 
# # === Comparison DataFrame ===
# # NOTE(review): 'Hour' is filled from X_test.index, i.e. the row labels of
# # `data`, not the `hour` timestamps; presumably data.loc[X_test.index, 'hour']
# # was intended — confirm.
# comparison = pd.DataFrame({
#     'Hour': X_test.index,
#     'Actual Avg Delay (min)': np.round(y_test.values, 2),
#     'Predicted Avg Delay (min)': np.round(y_pred, 2),
#     'Difference': np.round(abs(y_test.values - y_pred), 2)
# })
# 
# print(comparison.head(10))