HistGradientBoostingClassifier

In [3]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
import pandas as pd



In [4]:
#Importation des données clean
X_train=pd.read_csv("data/X_train_after_selection.csv")
y_train=pd.read_csv("data/y_train.csv")
X_val=pd.read_csv("data/X_val_after_selection.csv")
y_val=pd.read_csv("data/y_val.csv")
X_test=pd.read_csv("data/X_test.csv")

data_test = pd.read_csv("data/test.csv")

In [13]:
X_full, y_full  = pd.concat([X_train,X_val],axis=0,ignore_index=True), pd.concat([y_train,y_val],axis=0,ignore_index=True)

In [14]:
X_full

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Geography_Germany,Gender_Male
0,646.0,38.0,4.0,0.00,2.0,0,479.56,0,1
1,595.0,30.0,4.0,146329.57,1.0,0,79329.70,0,0
2,648.0,31.0,7.0,98592.88,1.0,1,187925.75,0,1
3,687.0,35.0,4.0,0.00,2.0,1,83470.40,0,0
4,676.0,26.0,10.0,0.00,2.0,0,83342.73,0,1
...,...,...,...,...,...,...,...,...,...
14994,696.0,44.0,1.0,0.00,2.0,1,121789.30,0,0
14995,523.0,33.0,7.0,0.00,2.0,1,92320.36,0,1
14996,607.0,31.0,4.0,0.00,2.0,0,60917.24,0,0
14997,667.0,36.0,1.0,0.00,2.0,1,102299.81,0,1


In [15]:
categorical_features = ['IsActiveMember', 'Geography_Germany', 'Gender_Male']

for col in categorical_features:
    X_full[col] = X_full[col].astype('int')
    X_test[col] = X_test[col].astype('int')

In [26]:

def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.3),
        'max_iter': trial.suggest_int('max_iter', 50, 500),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'early_stopping': True,
        'validation_fraction': 0.1,
        'random_state': trial.suggest_int('random_state',0, 10000)
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx].ravel()
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx].ravel()
        
        model = HistGradientBoostingClassifier(**params)
        model.fit(X_train_fold, y_train_fold)
        
        y_val_pred = model.predict_proba(X_val_fold)[:, 1]
        auc = roc_auc_score(y_val_fold, y_val_pred)
        auc_scores.append(auc)
    
    return np.mean(auc_scores)

# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=750)

# Best hyperparameters
print("Best parameters:", study.best_params)
print("Best AUC score:", study.best_value)



[I 2024-12-31 17:33:07,937] A new study created in memory with name: no-name-58935a83-d47b-42d0-84aa-9d89f5bcb92d
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2024-12-31 17:33:09,094] Trial 0 finished with value: 0.9363789879416778 and parameters: {'learning_rate': 0.02247182840542502, 'max_iter': 156, 'max_leaf_nodes': 11, 'max_depth': 14, 'min_samples_leaf': 7, 'l2_regularization': 0.18452423035396648, 'random_state': 4358}. Best is trial 0 with value: 0.9363789879416778.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2024-12-31 17:33:09,955] Trial 1 finished with value: 0.9310887706151785 and parameters: {'learning_rate': 0.16229273869845404, 'max_iter': 491, 'max_leaf_nodes': 47, 'max_depth': 10, 'min_samples_leaf': 22, 'l2_regularization': 0

Best parameters: {'learning_rate': 0.05864689166323513, 'max_iter': 269, 'max_leaf_nodes': 31, 'max_depth': 3, 'min_samples_leaf': 27, 'l2_regularization': 0.8032649185759841, 'random_state': 1779}
Best AUC score: 0.9386104925733989


In [27]:
study.best_trial

FrozenTrial(number=164, state=1, values=[0.9386104925733989], datetime_start=datetime.datetime(2024, 12, 31, 17, 35, 49, 523963), datetime_complete=datetime.datetime(2024, 12, 31, 17, 35, 50, 529300), params={'learning_rate': 0.05864689166323513, 'max_iter': 269, 'max_leaf_nodes': 31, 'max_depth': 3, 'min_samples_leaf': 27, 'l2_regularization': 0.8032649185759841, 'random_state': 1779}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.0001, step=None), 'max_iter': IntDistribution(high=500, log=False, low=50, step=1), 'max_leaf_nodes': IntDistribution(high=50, log=False, low=10, step=1), 'max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'min_samples_leaf': IntDistribution(high=50, log=False, low=5, step=1), 'l2_regularization': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'random_state': IntDistribution(high=10000, log=False, low=0, step=1)}, trial_id=164, value=None)

In [None]:
# Train a final model using the best parameters
best_params = study.best_params
final_model = HistGradientBoostingClassifier(**best_params)
final_model.fit(X_train, y_train)

In [36]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

models=[]
for train_idx, val_idx in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = HistGradientBoostingClassifier(**best_params)
    model.fit(X_train_fold, y_train_fold)
    a=model.fit(X_train_fold, y_train_fold)
    y_val_pred = model.predict_proba(X_val_fold)[:, 1]
    auc = roc_auc_score(y_val_fold, y_val_pred)
    models.append(a)
    auc_scores.append(auc)

auc_scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0.9469737268801665,
 0.9333403375363284,
 0.9370807239127747,
 0.94921337594539,
 0.9227661251988876]

In [37]:
models

[HistGradientBoostingClassifier(l2_regularization=0.8032649185759841,
                                learning_rate=0.05864689166323513, max_depth=3,
                                max_iter=269, min_samples_leaf=27,
                                random_state=1779),
 HistGradientBoostingClassifier(l2_regularization=0.8032649185759841,
                                learning_rate=0.05864689166323513, max_depth=3,
                                max_iter=269, min_samples_leaf=27,
                                random_state=1779),
 HistGradientBoostingClassifier(l2_regularization=0.8032649185759841,
                                learning_rate=0.05864689166323513, max_depth=3,
                                max_iter=269, min_samples_leaf=27,
                                random_state=1779),
 HistGradientBoostingClassifier(l2_regularization=0.8032649185759841,
                                learning_rate=0.05864689166323513, max_depth=3,
                                max_iter=26

In [None]:
models[0].predict_proba(X_val)[:,1]

array([0.0174481 , 0.00395904, 0.10411834, ..., 0.01290845, 0.00710993,
       0.02653008])

In [44]:


# Evaluate the model on a validation set
y_val_pred = models[4].predict_proba(X_val)[:, 1]
final_auc = roc_auc_score(y_val, y_val_pred)
print(f"Validation ROC AUC: {final_auc:.4f}")


Validation ROC AUC: 0.9321


In [49]:
y_pred_proba_best = final_model.predict_proba(X_test)[:, 1]

sample_submission_test_N = data_test[['id']].copy()  # Copie uniquement la colonne 'id' de data_test
sample_submission_test_N["Exited"] = y_pred_proba_best  # Ajout des probabilités de churn

sample_submission_test_N.to_csv("results_hist.csv", index=False)
