# 04. Model XGBoost
Trains one XGBoost regressor per stop on the processed trip data and evaluates the predictions on a held-out set of dates.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import utils
import os

# Load the processed trip data and split it by service date (80% train / 20% test).
processed_file = 'data/processed_data/processed_trip_data.csv'
if not os.path.exists(processed_file):
    # Later cells test `df_process is not None` before doing any work.
    print(f"File not found: {processed_file}. Please run 02_process_data.ipynb first.")
    df_process = None
else:
    df_process = pd.read_csv(processed_file)
    # Dates are compared as strings, so the ordering below is lexicographic.
    df_process['start_date'] = df_process['start_date'].astype(str)
    print(f"Loaded processed data from {processed_file}")

    # The first 80% of the sorted distinct dates go to training; the
    # remaining dates are held out for testing.
    unique_dates = sorted(df_process['start_date'].unique())
    split_idx = int(len(unique_dates) * 0.8)
    train_dates, test_dates = unique_dates[:split_idx], unique_dates[split_idx:]

    # Select whole days at a time so no date is shared between the splits.
    in_train = df_process['start_date'].isin(train_dates)
    in_test = df_process['start_date'].isin(test_dates)
    df_train = df_process[in_train].copy()
    df_test = df_process[in_test].copy()

In [None]:
# Build the sequence arrays for the train and test splits.
if df_process is not None:
    n_past_trips = 5

    # Map each route/direction key to the sorted list of its distinct
    # stop_sequence values, taken over the full dataset so that both
    # splits are built against the same stop lists.
    route_keys = df_process['route_direction_key']
    stops_dict = {
        rd_key: sorted(df_process.loc[route_keys == rd_key, 'stop_sequence'].unique())
        for rd_key in route_keys.unique()
    }

    # NOTE(review): the helper's return layout is defined in utils; only the
    # positions unpacked here are relied upon — confirm against utils.
    train_sequences = utils.create_trip_based_sequences_multi_route(
        df_train, n_past_trips, stops_dict=stops_dict
    )
    X_delays_train, X_features_train, X_agg_train, y_train, _, _, n_stops = train_sequences

    test_sequences = utils.create_trip_based_sequences_multi_route(
        df_test, n_past_trips, stops_dict=stops_dict
    )
    X_delays_test, X_features_test, X_agg_test, y_test, _, _, _ = test_sequences

In [None]:
# Assemble the flat, scaled feature matrices that the XGBoost models consume.
if df_process is not None:
    # Collapse each sample's delay block into a single row, then place the
    # feature and aggregate columns next to it.
    train_parts = [
        X_delays_train.reshape(len(X_delays_train), -1),
        X_features_train,
        X_agg_train,
    ]
    X_train_flat = np.concatenate(train_parts, axis=1)

    test_parts = [
        X_delays_test.reshape(len(X_delays_test), -1),
        X_features_test,
        X_agg_test,
    ]
    X_test_flat = np.concatenate(test_parts, axis=1)

    # Fit the scaler on the training rows only, then apply the same fitted
    # scaler to both splits.
    scaler = RobustScaler().fit(X_train_flat)
    X_train_scaled = scaler.transform(X_train_flat)
    X_test_scaled = scaler.transform(X_test_flat)

In [None]:
# Fit one XGBoost regressor per stop and score all predictions together.
if df_process is not None:
    print("Training XGBoost...")

    # Hyperparameters shared by every per-stop model.
    xgb_params = dict(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        n_jobs=-1,
        random_state=42,
    )

    per_stop_preds = []
    for stop_idx in range(n_stops):
        # Column `stop_idx` of y_train is the target for this stop's model.
        model = xgb.XGBRegressor(**xgb_params)
        model.fit(X_train_scaled, y_train[:, stop_idx])
        per_stop_preds.append(model.predict(X_test_scaled))

    # The list holds one prediction vector per stop; transpose so that each
    # stop becomes a column (presumably matching y_test's layout — confirm).
    y_pred_xgb_all = np.array(per_stop_preds).T

    # Metrics are computed over every (sample, stop) pair at once.
    y_true_flat = y_test.flatten()
    y_pred_flat = y_pred_xgb_all.flatten()
    mae_xgb = mean_absolute_error(y_true_flat, y_pred_flat)
    r2_xgb = r2_score(y_true_flat, y_pred_flat)

    print(f"XGBoost MAE: {mae_xgb:.2f}, R2: {r2_xgb:.4f}")