### 1. Import des données

In [26]:
import pandas as pd
import numpy as np
from itertools import chain

from sklearn.model_selection import train_test_split

from itertools import combinations

# from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

# from functions import *
from functions import fun as f
# mf.to_le()

df_all_data = pd.read_csv("data/all_data_before_ml.csv")

In [27]:
# ?to_le

In [28]:
df_all_data.shape

(18320, 21)

In [29]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'sexe',
 'diplome',
 'specialite',
 'note',
 'dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D']

### 2. Transformation en features numériques adaptées à Scikit-Learn

On réalise une régression logistique pour les variables qui sont toutes catégorielles indiquées ci-dessous
suite à la conclusion faite lors de "1. Statistiques descriptives".
Pour information, on ne prend pas annee +  mois = 4*12 = 48 modalités.
Les variables utilisées sont:
- embauche comme variable cible
- sexe       
- diplome       
- specialite   
- note_Q     
- salaire_Q  
- age_D        
- note_D       
- salaire_D    

Bonus: on peut créer les nouvelles variables préconisées par les arbres CHAID pour note, salaire et age<br>
mais on pourra se contenter de créer "note_chaid" en vue de l'utiliser pour xgboost

On renomme specialite, sexe, diplome en specialite_C, sexe_C, diplome_C

In [30]:
df_all_data = df_all_data.rename(columns={'specialite': 'C_specialite', 'sexe': 'C_sexe', 'diplome': 'C_diplome'})
df_all_data.dtypes

date             object
cheveux          object
age               int64
exp               int64
salaire           int64
C_sexe           object
C_diplome        object
C_specialite     object
note            float64
dispo            object
embauche          int64
annee             int64
mois              int64
jour              int64
age_Q            object
exp_Q            object
note_Q           object
salaire_Q        object
age_D            object
note_D           object
salaire_D        object
dtype: object

### Modèle n°1 : régression logistique baseline 
avec les variables : C_specialite, C_sexe, C_diplome, note, salaire et age
    

### Modèle n°2: xgboost 
On va utiliser les variables : C_specialite, C_sexe, C_diplome, note_Q, salaire_Q, age_D
ou C_specialite, C_sexe, C_diplome, note_D, salaire_Q, age_D

On peut envisager de créer la variable note avec note_CHAID

### I Création de toutes les variables pour Scikit-Learn
- en vue de réaliser une régression logistique (Ridge), les variables catégorielles vont être utilisées
sous la forme de "dummy" variables (variables indicatrices)
- En vue de réaliser une régression XGboost, les variables catégorielles vont être utilisées sous la forme 
de "label encodings" c'est-à-dire que les variables vont être transformées en données numériques grâce à la fonction
"label encoding" de Scikit-Learn


On commence ici la transformation de toutes les features 

On transforme en dummy variables : specialite, sexe et diplome<br>

In [31]:
target_var_name = "embauche"
all_categorical = [name_var for name_var in list(df_all_data) if name_var[0:1] == 'C'] 

In [32]:
all_categorical

['C_sexe', 'C_diplome', 'C_specialite']

In [33]:
df_all_data[all_categorical].dtypes

C_sexe          object
C_diplome       object
C_specialite    object
dtype: object

Création des "dummy" variables pour : specialite, sexe et diplome

In [34]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,dispo,...,annee,mois,jour,age_Q,exp_Q,note_Q,salaire_Q,age_D,note_D,salaire_D
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,2010,1,1,[0-25[,[25-50[,[25-50[,[50-75[,[0-10[,[40-50[,[50-60[
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,2010,1,1,[50-75[,[0-25[,[25-50[,[50-75[,[70-80[,[40-50[,[70-80[
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,2010,1,1,[25-50[,[75-100[,[25-50[,[50-75[,[30-40[,[20-30[,[60-70[
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,2010,1,1,[25-50[,[0-25[,[75-100[,[50-75[,[30-40[,[80-90[,[60-70[
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,2010,1,1,[0-25[,[75-100[,[0-25[,[75-100[,[10-20[,[0-10[,[90-100[


In [35]:
df_all_dummies = pd.get_dummies(df_all_data[all_categorical])

In [36]:
df_all_dummies.head()

Unnamed: 0,C_sexe_F,C_sexe_M,C_diplome_bac,C_diplome_doctorat,C_diplome_licence,C_diplome_master,C_specialite_archeologie,C_specialite_detective,C_specialite_forage,C_specialite_geologie
0,0,1,0,0,0,1,0,0,0,1
1,0,1,0,0,1,0,0,0,0,1
2,0,1,0,0,1,0,0,0,0,1
3,0,1,0,0,0,1,0,0,0,1
4,0,1,1,0,0,0,0,0,0,1


In [37]:
df_all_data = pd.concat([df_all_data, df_all_dummies], axis = 1)

In [38]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,dispo,...,C_sexe_F,C_sexe_M,C_diplome_bac,C_diplome_doctorat,C_diplome_licence,C_diplome_master,C_specialite_archeologie,C_specialite_detective,C_specialite_forage,C_specialite_geologie
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,0,1,0,0,0,1,0,0,0,1
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,0,1,0,0,1,0,0,0,0,1
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,0,1,0,0,1,0,0,0,0,1
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,0,1,0,0,0,1,0,0,0,1
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,0,1,1,0,0,0,0,0,0,1


Maintenant, on crée toutes les variables de type "label encodings" pour :<br> 
C_specialite, C_sexe, C_diplome, note_Q, salaire_Q, age_D, note_D et salaire_D

In [39]:
liste_feat_for_label_encodings = ['C_specialite', 'C_sexe', 'C_diplome', 'note_Q', 'salaire_Q', 'age_D', 'note_D','salaire_D']

In [40]:
for each in liste_feat_for_label_encodings:
    # to_le(df_all_data, each)
    f.to_le(df_all_data, each)

In [41]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'C_sexe',
 'C_diplome',
 'C_specialite',
 'note',
 'dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D',
 'C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_specialite_labels_encoding',
 'C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'note_Q_labels_encoding',
 'salaire_Q_labels_encoding',
 'age_D_labels_encoding',
 'note_D_labels_encoding',
 'salaire_D_labels_encoding']

In [42]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,dispo,...,C_specialite_forage,C_specialite_geologie,C_specialite_labels_encoding,C_sexe_labels_encoding,C_diplome_labels_encoding,note_Q_labels_encoding,salaire_Q_labels_encoding,age_D_labels_encoding,note_D_labels_encoding,salaire_D_labels_encoding
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,0,1,3,1,3,1,2,0,4,5
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,0,1,3,1,2,1,2,7,4,7
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,0,1,3,1,2,1,2,3,2,6
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,0,1,3,1,3,3,2,3,8,6
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,0,1,3,1,0,0,3,1,0,9


## I Régression logistique avec régularisation L2
### On utilise de la cross validation avec  GridSearchCV

Liste des variables pour la régression logistique

In [78]:
# del liste_feat_for_logistic_regression

init = []
init.append(list(df_all_dummies))
numeric_var_to_add = ['note', 'salaire', 'age']
init.append(numeric_var_to_add)

liste_feat_for_logistic_regression = list(chain(*init))
liste_feat_for_logistic_regression

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'note',
 'salaire',
 'age']

In [142]:
df_all_data.shape

(18320, 39)

Split des données en :
- données d'apprentissage pour la cross validation avec 70% des données 
- données de test avec 30% des données

In [280]:
# X_train = df_all_data[liste_feat_for_logistic_regression]

Y_train = df_all_data[[target_var_name]]
X_train_all_columns, X_test_all_columns, Y_train, Y_test = train_test_split(df_all_data,Y_train,test_size = 0.2,random_state = 56)

In [281]:
Y_train.shape

(14656, 1)

In [282]:
X_train = X_train_all_columns[liste_feat_for_logistic_regression]
# Y_train = df_all_data[[target_var_name]]

Centrage-réduction des variables numériques utilisées: note, salaire et age

In [283]:
X_train_all_columns.shape

(14656, 39)

In [284]:
X_train.shape

(14656, 13)

In [285]:
list(X_train)

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'note',
 'salaire',
 'age']

In [286]:
all_numeric_to_scale = numeric_var_to_add
scaler = StandardScaler()
X_train_scaled = pd.DataFrame.from_records(scaler.fit_transform(X_train[all_numeric_to_scale]))

In [287]:
X_train_scaled.columns = all_numeric_to_scale

In [288]:
X_train_scaled.shape

(14656, 3)

In [289]:
X_train = X_train.drop(list(X_train_scaled), axis=1)

In [290]:
list(X_train)

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie']

In [291]:
X_train.shape

(14656, 10)

In [292]:
X_train_scaled.head()

Unnamed: 0,note,salaire,age
0,0.09153,-0.052302,-1.228076
1,-2.779327,0.840715,-0.771923
2,0.412267,1.066834,0.710577
3,1.755389,-0.123856,-1.912307
4,1.037967,-0.220534,-1.342115


In [293]:
X_train.head()

Unnamed: 0,C_sexe_F,C_sexe_M,C_diplome_bac,C_diplome_doctorat,C_diplome_licence,C_diplome_master,C_specialite_archeologie,C_specialite_detective,C_specialite_forage,C_specialite_geologie
13373,0,1,0,0,1,0,0,0,1,0
7661,0,1,0,0,1,0,0,0,0,1
5296,1,0,0,0,0,1,0,1,0,0
14835,1,0,0,1,0,0,0,0,1,0
3020,1,0,0,0,0,1,0,0,0,1


In [294]:
list(X_train_scaled)

['note', 'salaire', 'age']

In [295]:
X_train = pd.concat([X_train.reset_index(drop = True), X_train_scaled.reset_index(drop = True)], axis = 1)

In [297]:
X_train.shape

(14656, 13)

In [298]:
# X_train, X_test, Y_train, Y_test = train_test_split(X_train,Y_train,test_size = 0.2,random_state = 56)

Le % de la variable cible est respecté

In [299]:
len(Y_train[Y_train["embauche"] == 1])/len(Y_train)

0.11360534934497817

In [300]:
X_test_all_columns.shape

(3664, 39)

In [301]:
X_train.shape

(14656, 13)

In [302]:
Y_train.shape

(14656, 1)

In [303]:
%%time

model = LogisticRegression(solver = "newton-cg")
parameters = {'C':[0.009, 0.005,0.001, 0.1,1,2,3,4,5,10]}

grid_logistic = GridSearchCV(model,parameters, scoring='roc_auc', cv=5, n_jobs = 4)
grid_logistic.fit(X_train.values, Y_train.values.ravel())

CPU times: user 595 ms, sys: 389 ms, total: 984 ms
Wall time: 9.9 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='newton-cg',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=4,
             param_grid={'C': [0.009, 0.005, 0.001, 0.1, 1, 2, 3, 4, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [304]:
pd.DataFrame(grid_logistic.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.44021,0.10671,0.008791,0.001642,0.009,{'C': 0.009},0.584586,0.611686,0.565267,0.600378,0.588418,0.590066,0.015629,8
1,0.543081,0.018522,0.011792,0.003851,0.005,{'C': 0.005},0.578868,0.609764,0.562978,0.594879,0.58552,0.586401,0.015645,9
2,0.440375,0.045001,0.010315,0.001561,0.001,{'C': 0.001},0.562153,0.603218,0.55349,0.574928,0.571858,0.573129,0.01683,10
3,0.624376,0.040887,0.014052,0.006795,0.1,{'C': 0.1},0.594639,0.612635,0.564577,0.605989,0.591745,0.593917,0.016505,7
4,0.60329,0.031123,0.012512,0.003446,1.0,{'C': 1},0.595571,0.612012,0.564198,0.606273,0.591721,0.593955,0.016557,5
5,0.659368,0.080863,0.013064,0.00588,2.0,{'C': 2},0.59557,0.611951,0.564205,0.606326,0.591726,0.593956,0.01655,4
6,0.548541,0.045636,0.015652,0.005904,3.0,{'C': 3},0.595598,0.611945,0.564188,0.606341,0.591723,0.593959,0.016557,2
7,0.584779,0.056701,0.01186,0.004036,4.0,{'C': 4},0.595613,0.61194,0.564185,0.606335,0.591727,0.59396,0.016556,1
8,0.563729,0.031355,0.012267,0.004104,5.0,{'C': 5},0.595615,0.611935,0.564169,0.606338,0.591725,0.593957,0.016562,3
9,0.506384,0.118205,0.010401,0.005843,10.0,{'C': 10},0.595623,0.611913,0.564157,0.60634,0.591735,0.593954,0.016562,6


Courbe de ROC: 
il faut apparemment faire la cross validation avec KFold 
pour pouvoir tracer la courbe de ROC

## II XGBoost d'abord avec Holdout

Liste des variables pour XGBoost

In [43]:
liste_feat_for_xgboost = ['C_specialite_labels_encoding',
'C_sexe_labels_encoding',
'C_diplome_labels_encoding',
# 'note_Q_labels_encoding',
# 'salaire_Q_labels_encoding',
#'age_D_labels_encoding',
#'note_D_labels_encoding',
#'salaire_D_labels_encoding'
'age','salaire','note']

In [44]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'C_sexe',
 'C_diplome',
 'C_specialite',
 'note',
 'dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D',
 'C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_specialite_labels_encoding',
 'C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'note_Q_labels_encoding',
 'salaire_Q_labels_encoding',
 'age_D_labels_encoding',
 'note_D_labels_encoding',
 'salaire_D_labels_encoding']

In [45]:
X_train = df_all_data[liste_feat_for_xgboost]
Y_train = df_all_data[[target_var_name]]

In [370]:
X_train.index.name = 'id'
X_train_all_columns[liste_feat_for_xgboost].index.name = 'id'

On enlève les données de test

In [374]:
X_train_all_columns[liste_feat_for_xgboost].shape

(14656, 8)

In [373]:
df_temp_1 = pd.merge(X_train, X_train_all_columns[liste_feat_for_xgboost], on = "id", how = "left")
df_temp_1.head()

Unnamed: 0_level_0,C_specialite_labels_encoding_x,C_sexe_labels_encoding_x,C_diplome_labels_encoding_x,note_Q_labels_encoding_x,salaire_Q_labels_encoding_x,age_D_labels_encoding_x,note_D_labels_encoding_x,salaire_D_labels_encoding_x,C_specialite_labels_encoding_y,C_sexe_labels_encoding_y,C_diplome_labels_encoding_y,note_Q_labels_encoding_y,salaire_Q_labels_encoding_y,age_D_labels_encoding_y,note_D_labels_encoding_y,salaire_D_labels_encoding_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,3,1,3,1,2,0,4,5,,,,,,,,
1,3,1,2,1,2,7,4,7,3.0,1.0,2.0,1.0,2.0,7.0,4.0,7.0
2,3,1,2,1,2,3,2,6,3.0,1.0,2.0,1.0,2.0,3.0,2.0,6.0
3,3,1,3,3,2,3,8,6,3.0,1.0,3.0,3.0,2.0,3.0,8.0,6.0
4,3,1,0,0,3,1,0,9,3.0,1.0,0.0,0.0,3.0,1.0,0.0,9.0


In [None]:
df_temp_2 = merge(Y_train, Y_test, on = "id", how = "left")

In [349]:
X_train.shape

(18320, 8)

In [364]:
Y_test.shape

(3664, 1)

On splitte les données en train et validation

In [331]:
X_train.shape

(5650, 8)

In [309]:
Y_train.shape

(18320, 1)

In [327]:
X_test_all_columns.shape[0]

3664

In [312]:
len(Y_train[Y_train["embauche"] == 1])/len(Y_train)

0.11408296943231441

In [None]:
NB) A FAIRE :
On doit tirer "X_test_all_columns.shape[0]" observations parmi: X_train,Y_train et avoir tjs 11,39% d 'embauche

In [321]:
(X_test_all_columns.shape[0]/df_all_data.shape[0])

0.2

In [47]:
#X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train,test_size = 0.2,random_state = 125)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train,test_size = 0.4,random_state = 125)

In [48]:
X_train.shape

(10992, 6)

In [49]:
Y_train.shape

(10992, 1)

In [50]:
X_valid.shape

(7328, 6)

In [None]:
silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=10)

In [60]:
%%time
n_iter = 100

# df_result_models = pd.DataFrame(columns = ['colsample_bytree','learning_rate','max_depth','n_estimators','subsample','min_child_weight','reg_lambda','AUC_train','AUC_test'])
df_result_models = pd.DataFrame(columns = ['colsample_bytree','gamma','learning_rate','max_depth','n_estimators','subsample','reg_lambda','AUC_train','AUCE_test'])

for each in range(1,n_iter):
    
    # print ("Modèle n°", each)
    # colsample_bytree_value = np.random.uniform(0.3,0.7)
    colsample_bytree_value = 0.4
    # gamma_value = np.random.uniform(0, 0.5)
    gamma_value = 10
    # learning_rate_value = np.random.uniform(0.03, 0.3)
    #  learning_rate_value = np.random.uniform(0.01, 0.1)
    learning_rate_value = 0.1
    # max_depth_value = np.random.randint(2, 6)
    
    # TEST 1 & 2: max_depth_value = np.random.randint(7,20)
    # TEST 3: 
    # max_depth_value = np.random.randint(3,7) 
    max_depth_value = 4
    
    
    
    # TEST 1: n_estimators_value = np.random.randint(400, 500)
    # TEST 2: n_estimators_value = np.random.randint(100, 150)
    n_estimators_value = np.random.randint(400, 500)
    
    # n_estimators_value = np.random.randint(400, 3000)
    # n_estimators_value = np.random.randint(400, 5000)
    # subsample_value = np.random.uniform(0.4, 0.6)
    # subsample_value = np.random.uniform(0,1)
    subsample_value= 0.8
    
    # reg_alpha_value = np.random.uniform(0, 0.05)
    # reg_alpha_value = np.random.uniform(0, 10)
    # reg_lambda_value = np.random.uniform(0, 10)
    reg_lambda_value = np.random.uniform(5, 20)
    
    # print ("max_depth", max_depth_value)
    
    min_child_weight_value = np.random.randint(0,300)
    
    xgb_model = xgb.XGBClassifier(
        colsample_bytree = colsample_bytree_value,
        # gamma = gamma_value,
        learning_rate = learning_rate_value,
        max_depth = max_depth_value,
        n_estimators = n_estimators_value,
        subsample = subsample_value,
        # reg_alpha = reg_alpha_value,
        reg_lambda = reg_lambda_value,
        n_jobs = 4,
        # min_child_weight = min_child_weight_value,
        objective='binary:logistic')
        
        # eval_metric = "rmse"
          
    result = xgb_model.fit(X_train, Y_train)
    
    predicted_values = xgb_model.predict(X_train)
    # print ("predicted_values", predicted_values)
    predicted_values = np.expand_dims(predicted_values, axis=1)
        
    # RMSE_train_value = np.sqrt(np.mean((predicted_values - Y_train.values.ravel())**2, axis=0))
    # print("le score RMLSE sur le training set vaut : {}".format(RMSE))
    
    print (Y_train.values.shape)
    print (predicted_values.shape)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)
    
    
    predicted_values = xgb_model.predict(X_valid)
    predicted_values = np.expand_dims(predicted_values, axis=1)
        
    # RMSE_test_value = np.sqrt(np.mean((predicted_values - Y_valid.values.ravel())**2, axis=0))
    # print("le score RMSE sur le test set vaut : {}".format(RMSE))
    AUC_test_value = roc_auc_score(Y_valid.values, predicted_values)
    
    if AUC_train_value != None and AUC_test_value != None:
    
        df_result_models = df_result_models.append({'colsample_bytree': colsample_bytree_value, 
                                 # 'gamma':gamma_value,
                                 'learning_rate':learning_rate_value,
                                 'max_depth':max_depth_value,
                                 'n_estimators':n_estimators_value,
                                 'subsample':subsample_value,
                                 # 'reg_alpha':reg_alpha_value,
                                 'reg_lambda':reg_lambda_value,
                                 # 'min_child_weight':min_child_weight_value,                 
                                 'AUC_train':AUC_train_value,
                                 'AUC_test':AUC_test_value}, ignore_index= True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)
CPU times: user 8min 1s, sys: 4.78 s, total: 8min 6s
Wall time: 2min 16s


In [61]:
df_result_models

Unnamed: 0,colsample_bytree,gamma,learning_rate,max_depth,n_estimators,subsample,reg_lambda,AUC_train,AUCE_test,AUC_test,min_child_weight
0,0.4,,0.1,4.0,472.0,0.8,18.807112,0.502700,,0.505274,155.0
1,0.4,,0.1,4.0,488.0,0.8,16.654420,0.596662,,0.581990,12.0
2,0.4,,0.1,4.0,416.0,0.8,12.935308,0.500000,,0.500000,212.0
3,0.4,,0.1,4.0,441.0,0.8,17.657723,0.501950,,0.502896,142.0
4,0.4,,0.1,4.0,482.0,0.8,17.884033,0.500801,,0.500000,182.0
5,0.4,,0.1,4.0,477.0,0.8,12.578378,0.505851,,0.508455,144.0
6,0.4,,0.1,4.0,428.0,0.8,7.732646,0.500000,,0.500000,247.0
7,0.4,,0.1,4.0,435.0,0.8,19.747272,0.547371,,0.543797,53.0
8,0.4,,0.1,4.0,454.0,0.8,17.940649,0.577570,,0.572521,22.0
9,0.4,,0.1,4.0,435.0,0.8,10.181586,0.546263,,0.544006,73.0
