# XGBoost Regressor for Delay Prediction
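This notebook demonstrates how to use an XGBoost gradient boosting regressor (`xgboost.XGBRegressor`) to predict the delay at a target stop using features such as current stop, current delay, target stop, day type, and (optionally) weather measurements. It includes data loading, preprocessing, model training, model export/import, evaluation, hyperparameter tuning, feature-importance plots, and example predictions.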

In [1]:
# Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [2]:
# Experimental day_type
# Column -> compact dtype map. Its key order doubles as the `usecols` list,
# so the set of loaded columns and their dtypes cannot drift apart.
day_type_dtypes = {
    'to_station': 'bool',
    'current_stop_index': np.int8,
    'current_delay': np.float16,
    'target_stop_index': np.int8,
    'target_delay': np.float16,
    'day_type': np.int8,
}
combinations_df = pd.read_csv(
    '../input/line401_combinations_exp.csv',
    usecols=list(day_type_dtypes),
    dtype=day_type_dtypes,
)

# `target_delay` is the regression target; every other loaded column is a feature.
y = combinations_df['target_delay']
X_raw = combinations_df[[col for col in day_type_dtypes if col != 'target_delay']]

# No encoding step is applied here: X_encoded is a plain copy of X_raw.
X_encoded = X_raw.copy()

In [2]:
# Experimental day_type and WEATHER
# Column -> compact dtype map. Its key order doubles as the `usecols` list,
# so the set of loaded columns and their dtypes cannot drift apart.
# NOTE(review): the '째C' in the temperature column name looks mis-encoded
# (possibly '°C'); it has to match the CSV header exactly — confirm against the file.
weather_dtypes = {
    'to_station': 'bool',
    'current_stop_index': np.int8,
    'current_delay': np.float16,
    'target_stop_index': np.int8,
    'target_delay': np.float16,
    'day_type': np.int8,
    'FR/windspeed (0.1 m/s)': np.float16,
    'TG/temperature (0.1 째C)': np.float16,
    'RH/precipitation (0.1mm)': np.float16,
}
combinations_df = pd.read_csv(
    '../input/line401_combinations_with_weather.csv',
    usecols=list(weather_dtypes),
    dtype=weather_dtypes,
)

# `target_delay` is the regression target; every other loaded column is a feature.
y = combinations_df['target_delay']
X_raw = combinations_df[[col for col in weather_dtypes if col != 'target_delay']]

# No encoding step is applied here: X_encoded is a plain copy of X_raw.
X_encoded = X_raw.copy()

In [None]:
# Split data and train model
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Earlier configurations that were tried before the current one:
#   - n_estimators=100, learning_rate=0.2, max_depth=6, subsample=1.0, colsample_bytree=1.0
#   - n_estimators=100 with every other setting left at its default
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=300,
    max_depth=9,
    learning_rate=0.2,
    subsample=1.0,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the fitted estimator is also the cell's displayed output.
model.fit(X_train, y_train)

#### Exporting the model

In [None]:
# Export the model:
import joblib

# Persist the fitted XGBoost regressor. The training cell binds it to `model`;
# the previously used name `gb` is not assigned by any earlier cell, so dumping
# it raised NameError on a fresh kernel (Restart & Run All).
joblib.dump(model, './data/models/gradient_boosting_model_L401.joblib')

#### Importing the model

In [None]:
import joblib

# Restore a previously exported regressor and bind it to `model`, the name the
# evaluation, plotting and prediction cells read. Binding only `gb` left the
# loaded estimator unused by the rest of the notebook; `gb` is kept as an alias
# so any code that still refers to it keeps working.
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
model = gb = joblib.load('./data/models/gradient_boosting_model_L401.joblib')

#### Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Evaluate model
# Predictions for the held-out split, flattened to 1-D.
y_pred = model.predict(X_test).flatten()

# Quantities shared by several of the metrics below.
abs_err = np.abs(y_test - y_pred)
target_range = np.max(y_test) - np.min(y_test)

# Standard regression metrics from scikit-learn.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5

# NRMSE: RMSE normalized by the range of y_test.
nrmse = rmse / target_range

# SMAPE: symmetric mean absolute percentage error; the 1e-8 term guards
# against a zero denominator.
smape_denominator = np.abs(y_test) + np.abs(y_pred) + 1e-8
smape = 100 * np.mean(2 * abs_err / smape_denominator)

# P90 AE: 90th percentile of the absolute error.
p90_ae = np.percentile(abs_err, 90)

# Report. MAPE and SMAPE are computed above but not printed here.
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")
print(f"Normalized RMSE (NRMSE): {nrmse:.4f}")
print(f"90th Percentile Absolute Error (P90 AE): {p90_ae:.2f}")


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Candidate values the randomized search samples from.
# An earlier, coarser grid used:
#   n_estimators [50, 100, 200], max_depth [3, 6, 9], learning_rate [0.01, 0.1, 0.2],
#   subsample [0.8, 1.0], colsample_bytree [0.8, 1.0]
param_grid = {
    'n_estimators': [200, 250, 300],
    'max_depth': [9, 11, 14],
    'learning_rate': [0.2, 0.3, 0.4],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8]
}

# Randomized (not exhaustive) search with 3-fold cross-validation.
# `random_state` fixes which parameter combinations get sampled, so re-running
# the notebook evaluates the same candidates; without it the reported best
# parameters could change from run to run. `n_iter=10` spells out the
# scikit-learn default number of sampled candidates.
grid_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

#### Feature plotting

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Per-feature 'weight' importance scores from the trained booster
# (a dict mapping feature name -> score).
importance = model.get_booster().get_score(importance_type='weight')

# Tabulate the scores, most important feature first.
importance_df = pd.DataFrame(
    list(importance.items()), columns=['Feature', 'Importance']
).sort_values(by='Importance', ascending=False)

# Keep the top-N rows and reverse them: barh draws the first row at the bottom,
# so the highest-scoring feature ends up at the top of the chart.
top_n = 20
top_features = importance_df.head(top_n).iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.set_xlabel('Importance Score')
ax.set_title(f'Top {top_n} Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# Plot feature importance for the trained model with XGBoost's built-in helper;
# as the cell's last expression, the call's return value is shown as the output.
xgb.plot_importance(model)

In [None]:
# Draw one tree of the trained model (selected with `num_trees=2`) via matplotlib.
# NOTE(review): recent XGBoost releases appear to deprecate `num_trees` in favour
# of `tree_idx` — confirm against the installed version before changing it.
xgb.plot_tree(model, num_trees=2)
# Build a Graphviz rendering of the same tree; as the cell's last expression it
# is what the notebook displays as the cell output.
xgb.to_graphviz(model, num_trees=2)

In [None]:
# Example predictions: Glaspoort to Piazza and Evoluon to Piazza
# Each scenario is a single-row frame with every training feature set to 0,
# then overridden with the scenario's values. Features that are not overridden
# (e.g. day_type) therefore stay at 0.
# NOTE(review): stop indices 13 / 10 / 15 presumably map to Glaspoort / Evoluon /
# Piazza, going by the labels below — confirm against the stop list.

# Scenario 1: currently at stop 13, 60 (delay units) early, heading for stop 15.
X_new = pd.DataFrame(0, index=[0], columns=X_train.columns).assign(**{
    'current_stop_index': 13,
    'current_delay': -60,
    'target_stop_index': 15,
    'to_station': 1,
})
y_new_pred = model.predict(X_new)
print(f'Glaspoort to Piazza delay (GB): {y_new_pred[0]:.2f}')

# Scenario 2: currently at stop 10, 120 (delay units) late, heading for stop 15,
# with an example temperature value supplied.
X_evoluon = pd.DataFrame(0, index=[0], columns=X_train.columns).assign(**{
    'current_stop_index': 10,
    'current_delay': 120,
    'target_stop_index': 15,
    'TG/temperature (0.1 째C)': 200,  # Example temperature
    'to_station': 1,
})
y_evoluon_pred = model.predict(X_evoluon)
print(f'Evoluon to Piazza delay (GB): {y_evoluon_pred[0]:.2f}')