### 1. Import des données

In [2]:
import pandas as pd
import numpy as np
from numpy import sort
from itertools import chain

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import matplotlib.pyplot as plt

# from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

# from functions import *
from functions import fun as f
import re

df_all_data = pd.read_csv("data/all_data_before_ml.csv")



In [1]:
# list(df_all_data)

### 2. Transformation en features numériques adaptées à Scikit-Learn

On a besoin de features numériques en vue réaliser une régression logistique de type Ridge et un modèle de type XGBoost<br>
pour les variables qui sont toutes catégorielles indiquées ci-dessous suite à la conclusion faite lors de "1. Statistiques descriptives"<br>
Les variables utilisées sont:
- embauche comme variable cible
- sexe       
- diplome       
- specialite 
- dispo
- note_Q     
- salaire_Q  
- age_D        
- note_D       
- salaire_D    

Bonus: on pourrait créer des nouvelles variables préconisées par les arbres CHAID pour note, salaire et age<br>
(on pourrait se contenter de créer "note_chaid" en vue de l'utiliser pour xgboost)

On renomme specialite, sexe, diplome en specialite_C, sexe_C, diplome_C

In [29]:
df_all_data = df_all_data.rename(columns={'specialite': 'C_specialite', 'sexe': 'C_sexe', 'diplome': 'C_diplome','dispo': 'C_dispo' })
# df_all_data.dtypes

### Modèle n°1 : régression logistique de type Ridge
Je propose de réaliser un premier modèle avec les variables : C_specialite, C_sexe, C_diplome, C_dispo, note, salaire et age
- sans interactions
- puis avec interactions

Ensuite, on pourra voir ce que donne XGBoost<br>

### I Création de toutes les variables pour Scikit-Learn
- en vue de réaliser une régression logistique (Ridge), les variables catégorielles vont être utilisées
sous la forme de "dummy" variables (variables indicatrices)
- En vue de réaliser une régression XGboost, les variables catégorielles vont être utilisées sous la forme 
de "label encodings" c'est-à-dire que les variables vont être transformées en données numériques grâce à la fonction
"label encoding" de Scikit-Learn


On commence ici la transformation de toutes les features 

On transforme en dummy variables : specialite, sexe, diplome et dispo<br>

In [30]:
target_var_name = "embauche"
all_categorical = [name_var for name_var in list(df_all_data) if name_var[0:1] == 'C'] 

In [31]:
# all_categorical

['C_sexe', 'C_diplome', 'C_specialite', 'C_dispo']

In [9]:
# df_all_data[all_categorical].dtypes

In [33]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,C_dispo,...,annee,mois,jour,age_Q,exp_Q,note_Q,salaire_Q,age_D,note_D,salaire_D
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,2010,1,1,[18-30[,[7-9[,[64-76[,[34964-38348[,[18-24[,[71-76[,[34964-36216[
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,2010,1,1,[35-42[,[0-7[,[64-76[,[34964-38348[,[40-43[,[71-76[,[37615-39172[
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,2010,1,1,[30-35[,[12-23[,[64-76[,[34964-38348[,[31-34[,[61-67[,[36216-37615[
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,2010,1,1,[30-35[,[0-7[,[87-144[,[34964-38348[,[31-34[,[90-98[,[36216-37615[
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,2010,1,1,[18-30[,[12-23[,[9-64[,[38348-53977[,[24-28[,[9-54[,[41358-53977[


### Création des variables du type interactions

In [35]:
all_categorical

['C_sexe', 'C_diplome', 'C_specialite', 'C_dispo']

In [36]:
df_with_interaction = f.interaction_order(df_all_data[all_categorical],all_categorical, 2)

In [8]:
# list(df_with_interaction)

In [38]:
df_all_categorical = df_all_data[all_categorical].applymap(str)
df_with_interaction = df_with_interaction.applymap(str)

In [39]:
df_before_dummies = pd.concat([df_all_categorical,df_with_interaction], axis = 1)

In [7]:
# list(df_before_dummies)

In [41]:
df_all_dummies = pd.get_dummies(df_before_dummies)

In [42]:
df_all_data = pd.concat([df_all_data,df_all_dummies], axis = 1)

In [43]:
liste_dummy_variables = list(df_all_dummies)

On ajoute maintenant toutes les versions encodées avec "label encodings" utiles pour XGBoost<br> 

In [6]:
# list(df_before_dummies)

In [45]:
for each in list(df_before_dummies):
    f.to_le(df_before_dummies, each)

In [46]:
df_before_dummies.head()

Unnamed: 0,C_sexe,C_diplome,C_specialite,C_dispo,C_sexe_+_C_diplome,C_sexe_+_C_specialite,C_sexe_+_C_dispo,C_diplome_+_C_specialite,C_diplome_+_C_dispo,C_specialite_+_C_dispo,C_sexe_labels_encoding,C_diplome_labels_encoding,C_specialite_labels_encoding,C_dispo_labels_encoding,C_sexe_+_C_diplome_labels_encoding,C_sexe_+_C_specialite_labels_encoding,C_sexe_+_C_dispo_labels_encoding,C_diplome_+_C_specialite_labels_encoding,C_diplome_+_C_dispo_labels_encoding,C_specialite_+_C_dispo_labels_encoding
0,M,master,geologie,oui,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui,1,3,3,1,7,7,3,15,7,7
1,M,licence,geologie,non,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non,1,2,3,0,6,7,2,11,4,6
2,M,licence,geologie,non,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non,1,2,3,0,6,7,2,11,4,6
3,M,master,geologie,oui,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui,1,3,3,1,7,7,3,15,7,7
4,M,bac,geologie,non,M_bac,M_geologie,M_non,bac_geologie,bac_non,geologie_non,1,0,3,0,4,7,2,3,0,6


In [47]:
liste_var_label_encodings = [each for each in list(df_before_dummies) if 'encoding' in each]

In [5]:
# liste_var_label_encodings

In [49]:
df_all_data = pd.concat([df_all_data,df_before_dummies[liste_var_label_encodings]], axis = 1)

In [10]:
# df_all_data.head()

#### !!! SPLIT DES DONNEES !!!
- données d'apprentissage avec 80% des données 
- données de test avec 20% des données

In [53]:
Y_train = df_all_data[[target_var_name]]
X_train_all_columns, X_test_all_columns, Y_train, Y_test = train_test_split(df_all_data,Y_train,test_size = 0.20,random_state = 35)


## I Régression logistique avec régularisation L2
### On utilise de la cross validation avec  GridSearchCV

Liste des variables pour la régression logistique

In [52]:
# df_all_data.shape

(18320, 95)

On sélectionne les variables choisies pour la régression logistique

In [39]:
#------------------------------------------------------------------------------------------------------------#
#------------------ ATTENTION LISTE DES VARIABLES DU MODELE n°1 ---------------------------------------------#
#------------------------------------------------------------------------------------------------------------#
liste_feat_for_logistic_regression_modele_1 = ['C_sexe_F',
'C_sexe_M',
'C_diplome_bac',
'C_diplome_doctorat',
'C_diplome_licence',
'C_diplome_master',
'C_specialite_archeologie',
'C_specialite_detective',
'C_specialite_forage',
'C_specialite_geologie',
'C_dispo_non',
'C_dispo_oui',
'note',
'salaire',
'age']

In [4]:
init = []
init.append(liste_dummy_variables)
numeric_var_to_add = ['note', 'salaire', 'age']
init.append(numeric_var_to_add)

liste_feat_for_logistic_regression_modele_2 = list(chain(*init))
# liste_feat_for_logistic_regression_modele_2

#### !!! CHOISIR SELECTION MODELE n°1 !!!

In [41]:
X_train = X_train_all_columns[liste_feat_for_logistic_regression_modele_1]

#### !!! OU SELECTION MODELE n°2 !!!

In [56]:
X_train = X_train_all_columns[liste_feat_for_logistic_regression_modele_2]

In [57]:
# X_train_all_columns.shape

(14656, 95)

In [54]:
# X_train.shape

In [3]:
# list(X_train)

Centrage-réduction des variables numériques utilisées: note, salaire et age

In [60]:
all_numeric_to_scale = ['note', 'salaire', 'age']
scaler = StandardScaler()
X_train_scaled = pd.DataFrame.from_records(scaler.fit_transform(X_train[all_numeric_to_scale]))

In [61]:
X_train_scaled.columns = all_numeric_to_scale
X_train = X_train.drop(all_numeric_to_scale, axis=1)

In [62]:
# Vérification: % d'embauchés proche de 11.4%
# len(Y_train[Y_train["embauche"] == 1])/len(Y_train)

In [63]:
X_train.index = X_train_scaled.index
X_train = pd.concat([X_train,X_train_scaled], axis = 1)

In [11]:
# X_train[all_numeric_to_scale].head()

In [65]:
# X_train.shape

In [66]:
%%time

model = LogisticRegression(solver = "newton-cg")
parameters = {'C':[0.009, 0.005,0.001, 0.1,1,2,3,4,5,10]}

grid_logistic = GridSearchCV(model,parameters, scoring='roc_auc', cv=5, n_jobs = 4, return_train_score = True)
grid_logistic.fit(X_train.values, Y_train.values.ravel())

CPU times: user 2.36 s, sys: 1.28 s, total: 3.64 s
Wall time: 38.8 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='newton-cg',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=4,
             param_grid={'C': [0.009, 0.005, 0.001, 0.1, 1, 2, 3, 4, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [None]:
Modèle n°1: sans interactions + numériques standardisées : AUC train = 0.595 , AUC validation = 0.583

In [52]:
pd.DataFrame(grid_logistic.cv_results_)[pd.DataFrame(grid_logistic.cv_results_)['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,1.195362,0.232043,0.025135,0.016045,0.1,{'C': 0.1},0.572653,0.559626,0.581374,0.603972,...,0.583462,0.016576,1,0.598837,0.600308,0.594338,0.591163,0.592778,0.595485,0.003517


Modèle n°2: interactions + variables numériques centrées: AUC train = 0.78, AUC validation = 0.77

In [67]:
pd.DataFrame(grid_logistic.cv_results_)[pd.DataFrame(grid_logistic.cv_results_)['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
6,3.464404,0.428528,0.018088,0.00626,3,{'C': 3},0.771149,0.760693,0.779562,0.780633,...,0.770273,0.009011,1,0.778286,0.780699,0.775219,0.776489,0.782728,0.778684,0.002735


## II XGBoost avec Holdout

Liste variables Modèle n°1

In [73]:
liste_feat_le_for_xgboost = [each for each in list(df_all_data) if 'labels_encoding' in each and '_+_' not in each]

In [74]:
init = []
init.append(liste_feat_le_for_xgboost)
numeric_var_to_add = ['note', 'salaire', 'age']
init.append(numeric_var_to_add)

liste_feat_for_xgboost_modele_1 = list(chain(*init))

In [12]:
# liste_feat_for_xgboost_modele_1

Liste variables Modèle n°2

In [61]:
liste_feat_le_for_xgboost = [each for each in list(df_all_data) if 'labels_encoding' in each]

In [62]:
init = []
init.append(liste_feat_le_for_xgboost)
numeric_var_to_add = ['note', 'salaire', 'age']
init.append(numeric_var_to_add)

liste_feat_for_xgboost_modele_2 = list(chain(*init))

#### !!! ATTENTION NOUVEAU SPLIT DES DONNEES (80% vers 20%) !!!

In [57]:
Y_train = Y_train[[target_var_name]]
X_train_all_columns, X_valid_all_columns, Y_train, Y_valid = train_test_split(X_train_all_columns,Y_train,test_size = 0.20,random_state = 56)

In [63]:
X_train_all_columns.shape

(11724, 95)

#### !!! CHOISIR SELECTION MODELE n°1 !!!

In [76]:
X_train = X_train_all_columns[liste_feat_for_xgboost_modele_1]
X_valid = X_valid_all_columns[liste_feat_for_xgboost_modele_1]

#### !!! OU SELECTION MODELE n°2 !!!

In [64]:
X_train = X_train_all_columns[liste_feat_for_xgboost_modele_2]
X_valid = X_valid_all_columns[liste_feat_for_xgboost_modele_2]

In [13]:
# list(X_train)

In [78]:
X_train.shape

(11724, 7)

Un premier paramétrage du type suivant a été testé:<br>
    
    subsample_value = np.random.uniform(0.4, 1)
    reg_lambda_value = np.random.uniform(1, 5)
    reg_alpha_value = np.random.uniform(0.1, 1)
    n_estimators_value = np.random.randint(150, 300)
    max_depth_value = np.random.randint(7,15)
    learning_rate_value = np.random.uniform(0.03, 0.4)
    colsample_bytree_value = np.random.uniform(0.7,1)
        
Après des résultats meilleurs obtenus avec le paramétrage qui suit notamment pour le modèle n°2<br>
Je propose donc ci-dessous, les résultats avec ce paramétrage faisant intervenir l'hyperparamètre "min_child_weight"<br>
et "reg_lambda".

In [67]:
%%time
n_iter = 100

df_result_models = pd.DataFrame(columns = ['colsample_bytree','learning_rate','max_depth','n_estimators','subsample','min_child_weight','reg_lambda','AUC_train','AUC_valid'])

for each in range(1,n_iter):
    
    colsample_bytree_value = np.random.uniform(0.3,0.7)
    learning_rate_value = np.random.uniform(0.03, 0.3)
    max_depth_value = np.random.randint(7,20)
    n_estimators_value = np.random.randint(400, 500)
    subsample_value = np.random.uniform(0.4, 0.6)
    reg_lambda_value = np.random.uniform(5, 20)
    min_child_weight_value = np.random.randint(0,300)
    
    xgb_model = xgb.XGBClassifier(
        colsample_bytree = colsample_bytree_value,
        learning_rate = learning_rate_value,
        max_depth = max_depth_value,
        n_estimators = n_estimators_value,
        subsample = subsample_value,
        reg_lambda = reg_lambda_value,
        n_jobs = 4,
        min_child_weight = min_child_weight_value,
        objective='binary:logistic')
        
    result = xgb_model.fit(X_train, Y_train.values.ravel())
    predicted_values = xgb_model.predict_proba(X_train)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)
    
    predicted_values = xgb_model.predict_proba(X_valid)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
        
    AUC_valid_value = roc_auc_score(Y_valid.values, predicted_values)
    
    if AUC_train_value != None and AUC_valid_value != None:
    
        df_result_models = df_result_models.append({'colsample_bytree': colsample_bytree_value, 
                                 'learning_rate':learning_rate_value,
                                 'max_depth':max_depth_value,
                                 'n_estimators':n_estimators_value,
                                 'subsample':subsample_value,
                                 'reg_lambda':reg_lambda_value,
                                 'min_child_weight':min_child_weight_value,                 
                                 'AUC_train':AUC_train_value,
                                 'AUC_valid':AUC_valid_value}, ignore_index= True)

CPU times: user 11min 27s, sys: 13.1 s, total: 11min 40s
Wall time: 3min 22s


Modèle n°1: avec variables catégorielles sans interactions et variables continues brutes<br>
AUC train = 0.91, AUC valid = 0.84

In [80]:
df_result_models[df_result_models['AUC_valid'] == max(df_result_models['AUC_valid'])]

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,min_child_weight,reg_lambda,AUC_train,AUC_valid
81,0.447386,0.061286,14.0,455.0,0.57636,12.0,11.016465,0.913873,0.839476


Modèle n°2 : avec variables catégorielles + interactions et variables continues brutes<br>
AUC train = 0.8836, AUC valid = 0.8535<br>
Il semble que le modèle n°2 fasse moins de sur-apprentissage

In [98]:
df_result_models[df_result_models['AUC_valid'] == max(df_result_models['AUC_valid'])]

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,min_child_weight,reg_lambda,AUC_train,AUC_valid
72,0.460285,0.056231,7.0,423.0,0.485432,19.0,16.080494,0.883656,0.853516


Automatisation de "tuning" learning rate versus n_estimators

J'ai donc obtenu le meilleur modèle avec les hyperparamètres ci-dessous:

In [119]:
df_result_best_model = df_result_models[df_result_models['AUC_valid'] == max(df_result_models['AUC_valid'])]
colsample_bytree_value = np.expand_dims(df_result_best_model['colsample_bytree'], axis=1)[0][0]
max_depth_value = int(np.expand_dims(df_result_best_model['max_depth'], axis=1)[0][0])
subsample_value = np.expand_dims(df_result_best_model['subsample'], axis=1)[0][0]
min_child_weight_value = int(np.expand_dims(df_result_best_model['min_child_weight'], axis=1)[0][0])
reg_lambda_value = np.expand_dims(df_result_best_model['reg_lambda'], axis=1)[0][0]

Une pratique connue : multiplier le learning rate par un facteur et augmenter le nombre d'itérations<br>
du modèle (le nombre d'arbres réalisés par XGBoost) par ce même facteur

In [120]:
%%time

learning_rate_init = np.expand_dims(df_result_best_model['learning_rate'], axis=1)[0][0]
n_estimators_init = int(np.expand_dims(df_result_best_model['n_estimators'], axis=1)[0][0])

#learning_rate_init = 0.065352
#n_estimators_init = 418

factor_trick = range(1,10)

for each in factor_trick:
    
    print ("Learning rate",learning_rate_init/each,"N_estimators", n_estimators_init*each)
    xgb_model = xgb.XGBClassifier( 
            colsample_bytree = colsample_bytree_value,
            learning_rate = learning_rate_init/each,
            max_depth = max_depth_value,
            n_estimators = n_estimators_init*each,
            subsample = subsample_value,
            n_jobs = 4,
            min_child_weight = min_child_weight_value,
            reg_lambda = reg_lambda_value,
            objective='binary:logistic')

    
    xgb_model.fit(X_train, Y_train.values.ravel())
    # print (xgb_model.predict_proba(X_train)[:,1])
    predicted_values = xgb_model.predict_proba(X_train)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)

    predicted_values = xgb_model.predict_proba(X_valid)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_valid_value = roc_auc_score(Y_valid.values, predicted_values)
    
    print (AUC_valid_value, AUC_valid_value - AUC_train_value)

Learning rate 0.12353585221467042 N_estimators 489
0.8368408755975704 -0.08573057902626258
Learning rate 0.06176792610733521 N_estimators 978
0.8386403256532834 -0.08507337632624856
Learning rate 0.04117861740489014 N_estimators 1467
0.835568230832824 -0.08935915634069436
Learning rate 0.030883963053667605 N_estimators 1956
0.83835164983286 -0.08672005693513696
Learning rate 0.024707170442934086 N_estimators 2445
0.8379562650515798 -0.0872359005675366
Learning rate 0.02058930870244507 N_estimators 2934
0.8388402645483627 -0.08635598013883161
Learning rate 0.01764797888781006 N_estimators 3423
0.8377293681032314 -0.08684101036858805
Learning rate 0.015441981526833803 N_estimators 3912
0.8386077513389166 -0.08619329301802481
Learning rate 0.013726205801630047 N_estimators 4401
0.8384830703425471 -0.086460416515598
CPU times: user 7min 52s, sys: 3.26 s, total: 7min 56s
Wall time: 2min 8s


In [None]:
MODELE n°1:
Learning rate 0.12353585221467042 N_estimators 489
0.8368408755975704 -0.08573057902626258
Learning rate 0.06176792610733521 N_estimators 978
0.8386403256532834 -0.08507337632624856
Learning rate 0.04117861740489014 N_estimators 1467
0.835568230832824 -0.08935915634069436
Learning rate 0.030883963053667605 N_estimators 1956
0.83835164983286 -0.08672005693513696
Learning rate 0.024707170442934086 N_estimators 2445
0.8379562650515798 -0.0872359005675366
Learning rate 0.02058930870244507 N_estimators 2934
0.8388402645483627 -0.08635598013883161
Learning rate 0.01764797888781006 N_estimators 3423
0.8377293681032314 -0.08684101036858805
Learning rate 0.015441981526833803 N_estimators 3912
0.8386077513389166 -0.08619329301802481
Learning rate 0.013726205801630047 N_estimators 4401
0.8384830703425471 -0.086460416515598

In [None]:
MODELE n° 2:
Learning rate 0.05623051444048664 N_estimators 423
0.8535155547967361 -0.030140651815787067
Learning rate 0.02811525722024332 N_estimators 846
0.8543422684303223 -0.030058999820529064
Learning rate 0.018743504813495546 N_estimators 1269
0.8531403885554077 -0.03128340481474201
Learning rate 0.01405762861012166 N_estimators 1692
0.8541243574997304 -0.030888915354620128
Learning rate 0.01124610288809733 N_estimators 2115
0.8532673160562164 -0.03195308682142539
Learning rate 0.009371752406747773 N_estimators 2538
0.8537671632939146 -0.03138692765399809
Learning rate 0.008032930634355234 N_estimators 2961
0.8535649778943963 -0.03171570855572636
Learning rate 0.00702881430506083 N_estimators 3384
0.8537941213471838 -0.031202438157430956
Learning rate 0.006247834937831849 N_estimators 3807
0.8539839509722872 -0.030938824327446768

Stabilité du modèle XGBoost n°2: on lance le meilleur modèle avec trois random states différents

In [71]:
%%time

random_states = [41,78,168]

colsample_bytree_value = 0.460285
max_depth_value = 7
subsample_value = 0.485432 
min_child_weight_value = 19
reg_lambda_value = 16.080494

learning_rate_value = 0.056231
n_estimators_value = 423

for rs in random_states:
    xgb_model = xgb.XGBClassifier( 
            colsample_bytree = colsample_bytree_value,
            learning_rate = learning_rate_value,
            max_depth = max_depth_value,
            n_estimators = n_estimators_value,
            subsample = subsample_value,
            n_jobs = 4,
            min_child_weight = min_child_weight_value,
            reg_lambda = reg_lambda_value,
            objective='binary:logistic',
            random_state = rs)

    
    xgb_model.fit(X_train, Y_train.values.ravel())
    predicted_values = xgb_model.predict_proba(X_train)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)

    predicted_values = xgb_model.predict_proba(X_valid)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_valid_value = roc_auc_score(Y_valid.values, predicted_values)
    
    print (AUC_train_value, AUC_valid_value, AUC_valid_value - AUC_train_value)

0.8851318546121951 0.8534155853491967 -0.03171626926299842
0.8851165851803631 0.8520171363358614 -0.03309944884450167
0.8854342110212378 0.850760217102189 -0.034673993919048796
CPU times: user 24.7 s, sys: 165 ms, total: 24.9 s
Wall time: 6.48 s


Pour XGBoost n°1, on utilise les memes random states

In [79]:
%%time

random_states = [41,78,168]


colsample_bytree_value = 0.447386
max_depth_value = 14
subsample_value = 0.57636 
min_child_weight_value = 12
reg_lambda_value = 11.016465

learning_rate_value = 0.061286
n_estimators_value = 455

for rs in random_states:
    xgb_model = xgb.XGBClassifier( 
            colsample_bytree = colsample_bytree_value,
            learning_rate = learning_rate_value,
            max_depth = max_depth_value,
            n_estimators = n_estimators_value,
            subsample = subsample_value,
            n_jobs = 4,
            min_child_weight = min_child_weight_value,
            reg_lambda = reg_lambda_value,
            objective='binary:logistic',
            random_state = rs)

    
    xgb_model.fit(X_train, Y_train.values.ravel())
    predicted_values = xgb_model.predict_proba(X_train)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)

    predicted_values = xgb_model.predict_proba(X_valid)[:,1]
    predicted_values = np.expand_dims(predicted_values, axis=1)
    AUC_valid_value = roc_auc_score(Y_valid.values, predicted_values)
    
    print (AUC_train_value, AUC_valid_value, AUC_valid_value - AUC_train_value)

0.9157885419771842 0.839282825922864 -0.07650571605432022
0.9127288077432698 0.840043267675497 -0.07268554006777284
0.9136622284721408 0.8409497322166708 -0.07271249625547005
CPU times: user 28.9 s, sys: 97.2 ms, total: 29 s
Wall time: 7.45 s


Conclusion du test des random_states: les deux modèles n°1 et n°2 sont stables<br>
en termes de performances

A priori on retient comme meilleur modèle le XGBoost avec interactions<br>
Regardons maintenant ce que donne la fonctionnalité "feature importance" de Scikit-learn<br>
ainsi que la performance du modèle sur le test set

Si on veut sauvegarder les modèles

In [None]:
# stockage en .pkl
# joblib.dump(grid_logistic, "reg_log_ridge_modele_2.pkl", compress=9)
# joblib.dump(xgb_model, "reg_log_ridge_modele_2.pkl", compress=9)
# pour le reload
# reglog_model = joblib.load("reg_log_ridge_modele_2.pkl")

### Feature importances que pour XGBoost
C'est la décroissance suivant l'impureté d'un noeud

#### Modèle n°1 XGBoost

In [122]:
list(X_train)

['C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'C_specialite_labels_encoding',
 'C_dispo_labels_encoding',
 'note',
 'salaire',
 'age']

In [125]:
dict_feature_importance = {}
for each in list(zip(list(X_train),xgb_model.feature_importances_)):
    dict_feature_importance[each[0]] = each[1]

In [126]:
sorted_importances = sorted(dict_feature_importance.items() , reverse = True, key=lambda x: x[1])
for elem in sorted_importances :
    print(elem[0] , " ::" , elem[1] )

C_dispo_labels_encoding  :: 0.25664505
C_sexe_labels_encoding  :: 0.16907978
C_specialite_labels_encoding  :: 0.16082922
C_diplome_labels_encoding  :: 0.1317353
note  :: 0.11420831
salaire  :: 0.097790316
age  :: 0.06971201


#### Modèle n°2 XGBoost

In [107]:
list(X_train)

['C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'C_specialite_labels_encoding',
 'C_dispo_labels_encoding',
 'C_sexe_+_C_diplome_labels_encoding',
 'C_sexe_+_C_specialite_labels_encoding',
 'C_sexe_+_C_dispo_labels_encoding',
 'C_diplome_+_C_specialite_labels_encoding',
 'C_diplome_+_C_dispo_labels_encoding',
 'C_specialite_+_C_dispo_labels_encoding',
 'note',
 'salaire',
 'age']

In [108]:
sort(xgb_model.feature_importances_)

array([0.03213315, 0.04660275, 0.05709283, 0.05794229, 0.06018776,
       0.06120392, 0.06787067, 0.0737716 , 0.08094995, 0.08274093,
       0.08817929, 0.09438447, 0.19694044], dtype=float32)

In [109]:
dict_feature_importance = {}
for each in list(zip(list(X_train),xgb_model.feature_importances_)):
    dict_feature_importance[each[0]] = each[1]

In [112]:
sorted_importances = sorted(dict_feature_importance.items() , reverse = True, key=lambda x: x[1])
for elem in sorted_importances :
    print(elem[0] , " ::" , elem[1] )

C_specialite_+_C_dispo_labels_encoding  :: 0.19694044
C_sexe_labels_encoding  :: 0.09438447
C_diplome_+_C_specialite_labels_encoding  :: 0.08817929
note  :: 0.082740925
C_sexe_+_C_diplome_labels_encoding  :: 0.080949955
C_dispo_labels_encoding  :: 0.073771596
C_sexe_+_C_dispo_labels_encoding  :: 0.06787067
C_sexe_+_C_specialite_labels_encoding  :: 0.061203923
C_diplome_+_C_dispo_labels_encoding  :: 0.060187757
salaire  :: 0.05794229
C_diplome_labels_encoding  :: 0.057092834
C_specialite_labels_encoding  :: 0.04660275
age  :: 0.032133155


### Prédiction du modèle n°2 : Regression logistique Ridge sur le test set

In [350]:
all_numeric_to_scale = ['note', 'salaire', 'age']
scaler = StandardScaler()
df_temp = X_test_all_columns[liste_feat_for_logistic_regression_modele_2]
X_test_scaled = pd.DataFrame.from_records(scaler.fit_transform(df_temp[all_numeric_to_scale]))
X_test_scaled.columns = all_numeric_to_scale
df_temp = df_temp.drop(all_numeric_to_scale, axis=1)
df_temp.index = X_test_scaled.index
df_temp = pd.concat([df_temp,X_test_scaled], axis = 1)

Unnamed: 0,C_sexe_F,C_sexe_M,C_diplome_bac,C_diplome_doctorat,C_diplome_licence,C_diplome_master,C_specialite_archeologie,C_specialite_detective,C_specialite_forage,C_specialite_geologie,...,C_diplome_+_C_dispo_master_non,C_diplome_+_C_dispo_master_oui,C_specialite_+_C_dispo_archeologie_non,C_specialite_+_C_dispo_archeologie_oui,C_specialite_+_C_dispo_detective_non,C_specialite_+_C_dispo_detective_oui,C_specialite_+_C_dispo_forage_non,C_specialite_+_C_dispo_forage_oui,C_specialite_+_C_dispo_geologie_non,C_specialite_+_C_dispo_geologie_oui
4807,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
11503,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2783,1,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
767,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5949,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [357]:
# predicted_values = grid_logistic.predict(X_test_all_columns[liste_feat_for_logistic_regression_modele_2])
predicted_values = grid_logistic.predict_proba(df_temp)[:,1]
predicted_values = np.expand_dims(predicted_values, axis=1)
AUC_test_value = roc_auc_score(Y_test.values, predicted_values)
AUC_test_value

0.77971091778056

L'AUC vaut 0.78 sur le test set, pour la régression logistique Ridge<br>
sachant que les AUC sur le train set et sur le validation set étaient respectivement de 0.78 et 0.77 

### Prédiction sur test set XGBoost modèle n°1

In [127]:
df_temp = X_test_all_columns[liste_feat_for_xgboost_modele_1]

In [128]:
predicted_values = xgb_model.predict_proba(df_temp)[:,1]
predicted_values = np.expand_dims(predicted_values, axis=1)
AUC_test_value = roc_auc_score(Y_test.values, predicted_values)
AUC_test_value

0.8299312634667076

### Prédiction sur test set XGBoost modèle n°2 sans le trick

In [102]:
df_temp = X_test_all_columns[liste_feat_for_xgboost_modele_2]

In [103]:
predicted_values = xgb_model.predict_proba(df_temp)[:,1]
predicted_values = np.expand_dims(predicted_values, axis=1)
AUC_test_value = roc_auc_score(Y_test.values, predicted_values)
AUC_test_value

0.8445589101658421