In [8]:
import pandas as pd
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [9]:
holiday_dates = [
    date(2015, 11, 11), date(2015, 12, 25), date(2016, 1, 26), 
    date(2016, 8, 15), date(2016, 10, 30), date(2016, 12, 25), 
    date(2017, 1, 26), date(2017, 5, 1), date(2017, 8, 15) # Extended list for Test Set coverage
]
df_ready = pd.read_csv('train_aWnotuB.csv')
df_ready['DateTime'] = pd.to_datetime(df_ready['DateTime'])
df_ready['Year'] = df_ready['DateTime'].dt.year
df_ready['Month'] = df_ready['DateTime'].dt.month
df_ready['Day'] = df_ready['DateTime'].dt.day
df_ready['Hour'] = df_ready['DateTime'].dt.hour
df_ready['DayOfWeek'] = df_ready['DateTime'].dt.dayofweek
df_ready['DayOfYear'] = df_ready['DateTime'].dt.dayofyear
df_ready['is_weekend'] = df_ready['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
df_ready['Date'] = df_ready['DateTime'].dt.date
df_ready['is_holiday'] = df_ready['Date'].apply(lambda x: 1 if x in holiday_dates else 0)
df_ready.drop(columns=['Date'], inplace=True)
df_ready['Junction'] = df_ready['Junction'].astype(str)
df_traffic_encoded = pd.get_dummies(df_ready, columns=['Junction'], drop_first=True)

In [10]:
df_traffic_encoded['Vehicles_Lag1'] = df_traffic_encoded['Vehicles'].shift(1)
df_traffic_encoded['Vehicles_Lag24'] = df_traffic_encoded['Vehicles'].shift(24)
df_traffic_lagged = df_traffic_encoded.dropna()
print(f"Data rows used for training after lagging: {len(df_traffic_lagged)}")

Data rows used for training after lagging: 48096


In [11]:
Y = df_traffic_lagged['Vehicles']
X = df_traffic_lagged.drop(columns=['Vehicles', 'DateTime', 'ID'], errors='ignore')
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, shuffle=False
)
feature_cols = X_train.columns.to_list()


In [12]:
model = RandomForestRegressor(n_estimators=100, 
                              random_state=42, 
                              n_jobs=-1,
                              max_depth=15) 

print("\n--- Model Training Started (Random Forest Regressor with Lag Features) ---")
model.fit(X_train, Y_train)
print("--- Model Training Complete ---")


--- Model Training Started (Random Forest Regressor with Lag Features) ---
--- Model Training Complete ---


In [13]:
Y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
r_squared = r2_score(Y_test, Y_pred)

print(f"\n--- Final Validation Results (Improved Model) ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-Squared Score (R²): {r_squared:.4f}")


--- Final Validation Results (Improved Model) ---
Root Mean Squared Error (RMSE): 5.08
R-Squared Score (R²): 0.7401


In [14]:
test_file = 'datasets_8494_11879_test_BdBKkAj.csv'
df_test = pd.read_csv(test_file)
df_train_raw = pd.read_csv('train_aWnotuB.csv')
df_test['Vehicles'] = 0 # Create dummy column for concatenation
df_train_raw['DateTime'] = pd.to_datetime(df_train_raw['DateTime'])
df_test['DateTime'] = pd.to_datetime(df_test['DateTime'])
df_full = pd.concat([df_train_raw.tail(24), df_test], ignore_index=True)
df_full['Year'] = df_full['DateTime'].dt.year
df_full['Month'] = df_full['DateTime'].dt.month
df_full['Day'] = df_full['DateTime'].dt.day
df_full['Hour'] = df_full['DateTime'].dt.hour
df_full['DayOfWeek'] = df_full['DateTime'].dt.dayofweek
df_full['DayOfYear'] = df_full['DateTime'].dt.dayofyear
df_full['is_weekend'] = df_full['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
df_full['Date'] = df_full['DateTime'].dt.date
df_full['is_holiday'] = df_full['Date'].apply(lambda x: 1 if x in holiday_dates else 0)
df_full.drop(columns=['Date'], inplace=True)
df_full['Vehicles_Lag1'] = df_full['Vehicles'].shift(1)
df_full['Vehicles_Lag24'] = df_full['Vehicles'].shift(24)
df_full['Junction'] = df_full['Junction'].astype(str)
df_full_encoded = pd.get_dummies(df_full, columns=['Junction'], drop_first=True)
df_final_test_set = df_full_encoded.iloc[24:].copy()
for col in feature_cols:
    if col not in df_final_test_set.columns:
        df_final_test_set[col] = False 

X_final_test = df_final_test_set[feature_cols]

In [15]:
final_predictions = model.predict(X_final_test)

final_predictions = np.maximum(0, np.round(final_predictions)).astype(int)

In [16]:
df_submission = pd.DataFrame({
    'ID': df_final_test_set['ID'], 
    'Vehicles': final_predictions
})

df_submission.to_csv('project9_traffic_predictions_week4_improved.csv', index=False)

print("\n--- Final Output ---")
print("Submission file created: project9_traffic_predictions_week4_improved.csv")


--- Final Output ---
Submission file created: project9_traffic_predictions_week4_improved.csv
