# XGBoost


In [21]:
# import

import matplotlib.pyplot as plt

import numpy as np
np.random.seed(42)

import pickle

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer


In [22]:
# Window size and stride used to pick which Data/donnees_*.pkl file is loaded.
# The 20w_5s combination was determined in the "search window size" study.
taille_fenetre_to_run, taille_stride_to_run = 20, 5

In [23]:
# Load the pickled dataset bundle matching the configured window/stride.
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(f"Data/donnees_{taille_fenetre_to_run}w_{taille_stride_to_run}s.pkl", "rb") as fichier_donnees:
    data = pickle.load(fichier_donnees)

In [24]:
# Display the names of the entries stored in the loaded bundle.
data.keys()

dict_keys(['X_np_label', 'X_np_binary', 'y_np', 'features_names_label', 'features_names_binary', 'idx_explicabilite', 'X_label_explicabilite', 'X_binary_explicabilite', 'y_explicabilite'])

In [25]:
# Pull the training inputs and targets out of the loaded bundle
# (presumably the label-encoded variant of the features — TODO confirm).
X_np_label, y_np = data["X_np_label"], data["y_np"]

In [26]:
# Pull the "explicabilite" inputs and targets out of the loaded bundle
# (presumably samples set aside for explainability analysis — TODO confirm).
X_label_explicabilite, y_explicabilite = (
    data["X_label_explicabilite"],
    data["y_explicabilite"],
)

In [27]:
# Feature names stored in the bundle under "features_names_label"
# (presumably aligned with the columns of X_np_label — TODO confirm).
features_label = data["features_names_label"]

In [28]:
# Display the dimensions of X_np_label as a quick sanity check.
X_np_label.shape

(19010, 152)

In [29]:
# Hyper-parameter grid explored by the search. n_estimators, max_depth and
# learning_rate are pinned to a single value each, so only gamma, reg_alpha
# and reg_lambda actually vary (3 * 2 * 2 = 12 candidates).
param_grid = {
    'n_estimators': [500],
    'max_depth': [10],
    'learning_rate': [0.1],
    'gamma': [0, 1, 5],             # Minimum loss reduction required to split a node
    'reg_alpha': [0, 0.1],          # L1 regularisation
    'reg_lambda': [1, 2]            # L2 regularisation
}

# Custom scorers for evaluating the models. Each one is built with
# greater_is_better=False, so scikit-learn reports the negated metric.
# NOTE(review): this dict is not passed to GridSearchCV below, which ranks the
# candidates with the built-in "neg_mean_squared_error" scorer only.
scoring = {
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "RMSE": make_scorer(root_mean_squared_error, greater_is_better=False),
    "MAPE": make_scorer(mean_absolute_percentage_error, greater_is_better=False)
}


print("Performing GridSearch for XGBoost...")
# use_label_encoder is a deprecated XGBClassifier-only option with no meaning
# for a regressor, so it is intentionally not passed here.
xgb = XGBRegressor(random_state=42, verbosity=0)

# Shuffled K-fold with a fixed seed so the fold assignment is reproducible.
n_splits = 5
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

gd = GridSearchCV(xgb, param_grid, scoring="neg_mean_squared_error", cv=kf, n_jobs=-1, verbose=2)

gd.fit(X_np_label, y_np)

print("Best parameters found: ", gd.best_params_)
print("Best score found: ", - gd.best_score_)  # Flip the sign to get back a positive MSE
print("Best estimator found: ", gd.best_estimator_)

Performing GridSearch for XGBoost...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=1; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=2; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=1; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=1; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=1; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=2; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=2; total time= 3.5min
[CV] END gamma=0, learning_rate=0.1, max_depth=10, n_estimators=500, reg_alpha=0, reg_lambda=1; t

In [30]:
# Previous run for 20w_5s with:
# param_grid = {
#     'n_estimators': [100, 200],         
#     'max_depth': [3, 5],  
# }

# Best parameters found:  {'max_depth': 5, 'n_estimators': 200}
# Best score found:  0.03334463831295349
# Best estimator found:  XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=None, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=5, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=200, n_jobs=None, num_parallel_tree=None,
#              predictor=None, random_state=42, ...)

In [31]:
# Previous run for 20w_5s with param_grid = {
#     'n_estimators': [200, 500],      
#     'max_depth': [5, 10],           
#     'learning_rate': [0.01, 0.1],                     
# }

# Best parameters found:  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}
# Best score found:  0.02637333205149689
# Best estimator found:  XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.1, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=10, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=500, n_jobs=None, num_parallel_tree=None,
#              predictor=None, random_state=42, ...)


In [None]:
# Previous run for 20w_5s with param_grid = {
#     'n_estimators': [500],      
#     'max_depth': [10],           
#     'learning_rate': [0.1],          
#     'gamma': [0, 1, 5],             # Minimum loss reduction required to split a node
#     'reg_alpha': [0, 0.1],          # L1 regularisation
#     'reg_lambda': [1, 2]            # L2 regularisation
# }  

# Best parameters found:  {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 1}
# Best score found:  0.02460594927348995
# Best estimator found:  XGBRegressor(base_score=None, booster=None, callbacks=None,
#              colsample_bylevel=None, colsample_bynode=None,
#              colsample_bytree=None, early_stopping_rounds=None,
#              enable_categorical=False, eval_metric=None, feature_types=None,
#              gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
#              interaction_constraints=None, learning_rate=0.1, max_bin=None,
#              max_cat_threshold=None, max_cat_to_onehot=None,
#              max_delta_step=None, max_depth=10, max_leaves=None,
#              min_child_weight=None, missing=nan, monotone_constraints=None,
#              n_estimators=500, n_jobs=None, num_parallel_tree=None,
#              predictor=None, random_state=42, ...)