### 1. Import des données

In [146]:
import pandas as pd
import numpy as np
from itertools import chain

from sklearn.model_selection import train_test_split

from itertools import combinations

# from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

# from functions import *
from functions import fun as f
# mf.to_le()
import re

df_all_data = pd.read_csv("data/all_data_before_ml.csv")

In [147]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'sexe',
 'diplome',
 'specialite',
 'note',
 'dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D']

### 2. Transformation en features numériques adaptées à Scikit-Learn

On réalise une régression logistique pour les variables qui sont toutes catégorielles indiquées ci-dessous
suite à la conclusion faite lors de "1. Statistiques descriptives".
Les variables utilisées sont:
- embauche comme variable cible
- sexe       
- diplome       
- specialite 
- dispo
- note_Q     
- salaire_Q  
- age_D        
- note_D       
- salaire_D    

Bonus: on peut créer les nouvelles variables préconisées par les arbres CHAID pour note, salaire et age<br>
mais on pourra se contenter de créer "note_chaid" en vue de l'utiliser pour xgboost

On renomme specialite, sexe, diplome en specialite_C, sexe_C, diplome_C

In [148]:
df_all_data = df_all_data.rename(columns={'specialite': 'C_specialite', 'sexe': 'C_sexe', 'diplome': 'C_diplome','dispo': 'C_dispo' })
df_all_data.dtypes

date             object
cheveux          object
age               int64
exp               int64
salaire           int64
C_sexe           object
C_diplome        object
C_specialite     object
note            float64
C_dispo          object
embauche          int64
annee             int64
mois              int64
jour              int64
age_Q            object
exp_Q            object
note_Q           object
salaire_Q        object
age_D            object
note_D           object
salaire_D        object
dtype: object

### Modèle n°1 : régression logistique baseline 
Je propose de réaliser un premier modèle avec les variables : C_specialite, C_sexe, C_diplome, C_dispo, note, salaire et age
- sans interactions
- puis avec interactions

Ensuite, on pourra voir ce que donne XGBoost<br>
On peut envisager de créer la variable note avec note_CHAID

### I Création de toutes les variables pour Scikit-Learn
- en vue de réaliser une régression logistique (Ridge), les variables catégorielles vont être utilisées
sous la forme de "dummy" variables (variables indicatrices)
- En vue de réaliser une régression XGboost, les variables catégorielles vont être utilisées sous la forme 
de "label encodings" c'est-à-dire que les variables vont être transformées en données numériques grâce à la fonction
"label encoding" de Scikit-Learn


On commence ici la transformation de toutes les features 

On transforme en dummy variables : specialite, sexe, diplome et dispo<br>

In [149]:
target_var_name = "embauche"
all_categorical = [name_var for name_var in list(df_all_data) if name_var[0:1] == 'C'] 

In [137]:
all_categorical

['C_sexe', 'C_diplome', 'C_specialite', 'C_dispo']

In [150]:
df_all_data[all_categorical].dtypes

C_sexe          object
C_diplome       object
C_specialite    object
C_dispo         object
dtype: object

In [151]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,C_dispo,...,annee,mois,jour,age_Q,exp_Q,note_Q,salaire_Q,age_D,note_D,salaire_D
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,2010,1,1,[18-30[,[7-9[,[64-76[,[34964-38348[,[18-24[,[71-76[,[34964-36216[
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,2010,1,1,[35-42[,[0-7[,[64-76[,[34964-38348[,[40-43[,[71-76[,[37615-39172[
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,2010,1,1,[30-35[,[12-23[,[64-76[,[34964-38348[,[31-34[,[61-67[,[36216-37615[
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,2010,1,1,[30-35[,[0-7[,[87-144[,[34964-38348[,[31-34[,[90-98[,[36216-37615[
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,2010,1,1,[18-30[,[12-23[,[9-64[,[38348-53977[,[24-28[,[9-54[,[41358-53977[


In [140]:
# df_all_dummies = pd.get_dummies(df_all_data[all_categorical])
# df_all_dummies.shape
# df_all_data = pd.concat([df_all_data, df_all_dummies], axis = 1)

Maintenant, on crée toutes les variables de type "label encodings" pour :<br> 
C_specialite, C_sexe, C_diplome, C_dispo, note_Q, salaire_Q, age_D, note_D et salaire_D

In [152]:
liste_feat_for_label_encodings = ['C_specialite', 'C_sexe', 'C_diplome', 'C_dispo','note_Q', 'salaire_Q', 'age_D', 'note_D','salaire_D']
for each in liste_feat_for_label_encodings:
    f.to_le(df_all_data, each)

In [153]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'C_sexe',
 'C_diplome',
 'C_specialite',
 'note',
 'C_dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D',
 'C_specialite_labels_encoding',
 'C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'C_dispo_labels_encoding',
 'note_Q_labels_encoding',
 'salaire_Q_labels_encoding',
 'age_D_labels_encoding',
 'note_D_labels_encoding',
 'salaire_D_labels_encoding']

In [154]:
df_all_data.head()

Unnamed: 0,date,cheveux,age,exp,salaire,C_sexe,C_diplome,C_specialite,note,C_dispo,...,salaire_D,C_specialite_labels_encoding,C_sexe_labels_encoding,C_diplome_labels_encoding,C_dispo_labels_encoding,note_Q_labels_encoding,salaire_Q_labels_encoding,age_D_labels_encoding,note_D_labels_encoding,salaire_D_labels_encoding
0,2010-01-01,blond,22,8,35554,M,master,geologie,72.41,oui,...,[34964-36216[,3,1,3,1,0,2,0,3,5
1,2010-01-01,brun,42,6,38102,M,licence,geologie,72.46,non,...,[37615-39172[,3,1,2,0,0,2,7,3,7
2,2010-01-01,brun,34,14,36232,M,licence,geologie,65.43,non,...,[36216-37615[,3,1,2,0,0,2,3,1,6
3,2010-01-01,brun,32,7,37425,M,master,geologie,92.64,oui,...,[36216-37615[,3,1,3,1,2,2,3,8,6
4,2010-01-01,brun,25,13,46881,M,bac,geologie,29.52,non,...,[41358-53977[,3,1,0,0,3,3,1,7,9


Ajout des variables du type interactions

In [155]:
# def interaction_order(df,list_of_categ_var, order, only_ordre):
def interaction_order(df,list_of_categ_var, order):
    
    initial_list_df = list_of_categ_var
    print (list(df))

    # if only_ordre == True:
    all_interactions = list(combinations(list_of_categ_var,order))

    # print ("Nb d'interactions", len(all_interactions))
    # print ("Interaction entre : ", all_interactions)

    for i in range(0, len(all_interactions)):
        # all_inter_variables = []
        # print (list(all_interactions[i]))

        for each in list(all_interactions[i]):
            # New columns of interaction
            for each in range(0,order):

                # Name of interaction variables
                # var_inter_name = 'inter_' + str(i)
                
                var_inter_name = "_+_".join(list(all_interactions[i]))

                # df[var_inter_name] = df[list(all_interactions[i])[0]] + '_' + df[list(all_interactions[i])[1]]
                if each == 0:
                    # print ("each", each)
                    
                    # print (var_inter_name)
                    df[var_inter_name] = df[list(all_interactions[i])[each]]
                else:
                    df[var_inter_name] = df[var_inter_name] + '_' + df[list(all_interactions[i])[each]]

                # all_inter_variables.append(var_inter_name)
                # print (all_inter_variables)

    # Keep only interaction variables in the dataframe
    # print (initial_list_df)
    df = df.drop(initial_list_df, axis=1)
    # df = df[all_inter_variables]
    return df

In [156]:
df_with_interaction = interaction_order(df_all_data[all_categorical],all_categorical, 2)

['C_sexe', 'C_diplome', 'C_specialite', 'C_dispo']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [157]:
df_with_interaction.head()

Unnamed: 0,C_sexe_+_C_diplome,C_sexe_+_C_specialite,C_sexe_+_C_dispo,C_diplome_+_C_specialite,C_diplome_+_C_dispo,C_specialite_+_C_dispo
0,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui
1,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non
2,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non
3,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui
4,M_bac,M_geologie,M_non,bac_geologie,bac_non,geologie_non


In [158]:
df_all_categorical = df_all_data[all_categorical].applymap(str)
df_with_interaction = df_with_interaction.applymap(str)

In [159]:
df_before_dummies = pd.concat([df_all_categorical,df_with_interaction], axis = 1)

In [160]:
df_before_dummies.head()

Unnamed: 0,C_sexe,C_diplome,C_specialite,C_dispo,C_sexe_+_C_diplome,C_sexe_+_C_specialite,C_sexe_+_C_dispo,C_diplome_+_C_specialite,C_diplome_+_C_dispo,C_specialite_+_C_dispo
0,M,master,geologie,oui,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui
1,M,licence,geologie,non,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non
2,M,licence,geologie,non,M_licence,M_geologie,M_non,licence_geologie,licence_non,geologie_non
3,M,master,geologie,oui,M_master,M_geologie,M_oui,master_geologie,master_oui,geologie_oui
4,M,bac,geologie,non,M_bac,M_geologie,M_non,bac_geologie,bac_non,geologie_non


In [161]:
df_all_dummies = pd.get_dummies(df_before_dummies)

In [162]:
list(df_all_dummies)

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_dispo_non',
 'C_dispo_oui',
 'C_sexe_+_C_diplome_F_bac',
 'C_sexe_+_C_diplome_F_doctorat',
 'C_sexe_+_C_diplome_F_licence',
 'C_sexe_+_C_diplome_F_master',
 'C_sexe_+_C_diplome_M_bac',
 'C_sexe_+_C_diplome_M_doctorat',
 'C_sexe_+_C_diplome_M_licence',
 'C_sexe_+_C_diplome_M_master',
 'C_sexe_+_C_specialite_F_archeologie',
 'C_sexe_+_C_specialite_F_detective',
 'C_sexe_+_C_specialite_F_forage',
 'C_sexe_+_C_specialite_F_geologie',
 'C_sexe_+_C_specialite_M_archeologie',
 'C_sexe_+_C_specialite_M_detective',
 'C_sexe_+_C_specialite_M_forage',
 'C_sexe_+_C_specialite_M_geologie',
 'C_sexe_+_C_dispo_F_non',
 'C_sexe_+_C_dispo_F_oui',
 'C_sexe_+_C_dispo_M_non',
 'C_sexe_+_C_dispo_M_oui',
 'C_diplome_+_C_specialite_bac_archeologie',
 'C_diplome_+_C_specialite_bac_detective',


In [163]:
df_all_data = pd.concat([df_all_data,df_all_dummies], axis = 1)

In [164]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'C_sexe',
 'C_diplome',
 'C_specialite',
 'note',
 'C_dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D',
 'C_specialite_labels_encoding',
 'C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'C_dispo_labels_encoding',
 'note_Q_labels_encoding',
 'salaire_Q_labels_encoding',
 'age_D_labels_encoding',
 'note_D_labels_encoding',
 'salaire_D_labels_encoding',
 'C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_dispo_non',
 'C_dispo_oui',
 'C_sexe_+_C_diplome_F_bac',
 'C_sexe_+_C_diplome_F_doctorat',
 'C_sexe_+_C_diplome_F_licence',
 'C_sexe_+_C_diplome_F_master',
 'C_sexe_+_C_diplome_M_bac',
 'C_sexe_+_C_diplome_M_doctorat',
 'C_sexe_+_C_diplome_M_licence',
 'C_sexe_+_C_diplome_M_master',
 'C_s

In [165]:
liste_dummy_variables = list(df_all_dummies)

## I Régression logistique avec régularisation L2
### On utilise de la cross validation avec  GridSearchCV

Liste des variables pour la régression logistique

In [166]:
# del liste_feat_for_logistic_regression

init = []
# init.append(list(df_all_dummies))
init.append(liste_dummy_variables)
numeric_var_to_add = ['note', 'salaire', 'age']
init.append(numeric_var_to_add)

liste_feat_for_logistic_regression = list(chain(*init))
liste_feat_for_logistic_regression

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_dispo_non',
 'C_dispo_oui',
 'C_sexe_+_C_diplome_F_bac',
 'C_sexe_+_C_diplome_F_doctorat',
 'C_sexe_+_C_diplome_F_licence',
 'C_sexe_+_C_diplome_F_master',
 'C_sexe_+_C_diplome_M_bac',
 'C_sexe_+_C_diplome_M_doctorat',
 'C_sexe_+_C_diplome_M_licence',
 'C_sexe_+_C_diplome_M_master',
 'C_sexe_+_C_specialite_F_archeologie',
 'C_sexe_+_C_specialite_F_detective',
 'C_sexe_+_C_specialite_F_forage',
 'C_sexe_+_C_specialite_F_geologie',
 'C_sexe_+_C_specialite_M_archeologie',
 'C_sexe_+_C_specialite_M_detective',
 'C_sexe_+_C_specialite_M_forage',
 'C_sexe_+_C_specialite_M_geologie',
 'C_sexe_+_C_dispo_F_non',
 'C_sexe_+_C_dispo_F_oui',
 'C_sexe_+_C_dispo_M_non',
 'C_sexe_+_C_dispo_M_oui',
 'C_diplome_+_C_specialite_bac_archeologie',
 'C_diplome_+_C_specialite_bac_detective',


In [169]:
df_all_data.shape

(18320, 94)

Holdout en faisant un split des données :
- données d'apprentissage avec 70% des données 
- données de validation avec 30% des données

In [170]:
Y_train = df_all_data[[target_var_name]]
X_train_all_columns, X_valid_all_columns, Y_train, Y_valid = train_test_split(df_all_data,Y_train,test_size = 0.3,random_state = 56)

In [171]:
Y_train.shape

(12824, 1)

On sélectionne les variables choisies pour la régression logistique

In [172]:
X_train = X_train_all_columns[liste_feat_for_logistic_regression]

In [173]:
X_train_all_columns.shape

(12824, 94)

In [174]:
X_train.shape

(12824, 67)

In [175]:
list(X_train)

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_dispo_non',
 'C_dispo_oui',
 'C_sexe_+_C_diplome_F_bac',
 'C_sexe_+_C_diplome_F_doctorat',
 'C_sexe_+_C_diplome_F_licence',
 'C_sexe_+_C_diplome_F_master',
 'C_sexe_+_C_diplome_M_bac',
 'C_sexe_+_C_diplome_M_doctorat',
 'C_sexe_+_C_diplome_M_licence',
 'C_sexe_+_C_diplome_M_master',
 'C_sexe_+_C_specialite_F_archeologie',
 'C_sexe_+_C_specialite_F_detective',
 'C_sexe_+_C_specialite_F_forage',
 'C_sexe_+_C_specialite_F_geologie',
 'C_sexe_+_C_specialite_M_archeologie',
 'C_sexe_+_C_specialite_M_detective',
 'C_sexe_+_C_specialite_M_forage',
 'C_sexe_+_C_specialite_M_geologie',
 'C_sexe_+_C_dispo_F_non',
 'C_sexe_+_C_dispo_F_oui',
 'C_sexe_+_C_dispo_M_non',
 'C_sexe_+_C_dispo_M_oui',
 'C_diplome_+_C_specialite_bac_archeologie',
 'C_diplome_+_C_specialite_bac_detective',


Centrage-réduction des variables numériques utilisées: note, salaire et age

In [176]:
all_numeric_to_scale = numeric_var_to_add
scaler = StandardScaler()
X_train_scaled = pd.DataFrame.from_records(scaler.fit_transform(X_train[all_numeric_to_scale]))

In [177]:
all_numeric_to_scale

['note', 'salaire', 'age']

In [178]:
X_train_scaled.columns = all_numeric_to_scale

In [179]:
X_train_scaled.head()

Unnamed: 0,note,salaire,age
0,-0.129135,-0.900863,0.706774
1,-0.948578,0.37779,2.638501
2,1.142052,-1.459383,0.593143
3,0.25549,0.076045,-0.202275
4,-0.752472,1.287844,0.706774


In [180]:
X_train = X_train.drop(list(X_train_scaled), axis=1)

In [181]:
list(X_train)

['C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_dispo_non',
 'C_dispo_oui',
 'C_sexe_+_C_diplome_F_bac',
 'C_sexe_+_C_diplome_F_doctorat',
 'C_sexe_+_C_diplome_F_licence',
 'C_sexe_+_C_diplome_F_master',
 'C_sexe_+_C_diplome_M_bac',
 'C_sexe_+_C_diplome_M_doctorat',
 'C_sexe_+_C_diplome_M_licence',
 'C_sexe_+_C_diplome_M_master',
 'C_sexe_+_C_specialite_F_archeologie',
 'C_sexe_+_C_specialite_F_detective',
 'C_sexe_+_C_specialite_F_forage',
 'C_sexe_+_C_specialite_F_geologie',
 'C_sexe_+_C_specialite_M_archeologie',
 'C_sexe_+_C_specialite_M_detective',
 'C_sexe_+_C_specialite_M_forage',
 'C_sexe_+_C_specialite_M_geologie',
 'C_sexe_+_C_dispo_F_non',
 'C_sexe_+_C_dispo_F_oui',
 'C_sexe_+_C_dispo_M_non',
 'C_sexe_+_C_dispo_M_oui',
 'C_diplome_+_C_specialite_bac_archeologie',
 'C_diplome_+_C_specialite_bac_detective',


In [206]:
X_train = pd.concat([X_train.reset_index(drop = True), X_train_scaled.reset_index(drop = True)], axis = 1)

In [207]:
X_train.shape

(12824, 45)

Filtre sur les strates de specialité

In [208]:
list(df_all_data["C_specialite"].unique())

['geologie', 'forage', 'archeologie', 'detective']

In [214]:
X_train_with_Y = pd.concat([X_train,Y_train], axis = 1)

In [215]:
X_train_arch = X_train_with_Y[X_train_with_Y['C_specialite_archeologie'] == 1]

In [None]:
'C_specialite_detective',
'C_specialite_forage',
'C_specialite_geologie',

In [220]:
liste_features = [each for each in list(X_train_arch) if each != "embauche"]
X_train = X_train_arch[liste_features]
Y_train = X_train_arch[["embauche"]]
X_train, X_test, Y_train, Y_test = train_test_split(X_train,Y_train,test_size = 0.3,random_state = 56)

Le % de la variable cible est respecté

In [221]:
len(Y_train[Y_train["embauche"] == 1])/len(Y_train)

0.07142857142857142

In [222]:
X_test_all_columns.shape

(5496, 71)

In [223]:
X_train.shape

(616, 45)

In [224]:
Y_train.shape

(616, 1)

In [228]:
X_train.describe()

Unnamed: 0,C_sexe_F,C_sexe_M,C_diplome_bac,C_diplome_doctorat,C_diplome_licence,C_diplome_master,C_specialite_archeologie,C_specialite_detective,C_specialite_forage,C_specialite_geologie,...,C_diplome_+_C_specialite_licence_detective,C_diplome_+_C_specialite_licence_forage,C_diplome_+_C_specialite_licence_geologie,C_diplome_+_C_specialite_master_archeologie,C_diplome_+_C_specialite_master_detective,C_diplome_+_C_specialite_master_forage,C_diplome_+_C_specialite_master_geologie,note,salaire,age
count,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,...,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0
mean,0.800325,0.199675,0.087662,0.202922,0.308442,0.400974,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.400974,0.0,0.0,0.0,0.354783,0.209298,-0.07444
std,0.400081,0.400081,0.283033,0.402502,0.462225,0.490494,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.490494,0.0,0.0,0.0,1.033054,0.982615,1.038391
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.152435,-2.398949,-2.020371
25%,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.359968,-0.481069,-0.884061
50%,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281462,0.176426,-0.088644
75%,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.089232,0.85726,0.706774
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.064008,3.238046,3.206656


In [225]:
%%time

model = LogisticRegression(solver = "newton-cg")
parameters = {'C':[0.009, 0.005,0.001, 0.1,1,2,3,4,5,10]}

grid_logistic = GridSearchCV(model,parameters, scoring='roc_auc', cv=5, n_jobs = 4)
grid_logistic.fit(X_train.values, Y_train.values.ravel())

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [304]:
# Sans interactions + numériques standardisées
pd.DataFrame(grid_logistic.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.44021,0.10671,0.008791,0.001642,0.009,{'C': 0.009},0.584586,0.611686,0.565267,0.600378,0.588418,0.590066,0.015629,8
1,0.543081,0.018522,0.011792,0.003851,0.005,{'C': 0.005},0.578868,0.609764,0.562978,0.594879,0.58552,0.586401,0.015645,9
2,0.440375,0.045001,0.010315,0.001561,0.001,{'C': 0.001},0.562153,0.603218,0.55349,0.574928,0.571858,0.573129,0.01683,10
3,0.624376,0.040887,0.014052,0.006795,0.1,{'C': 0.1},0.594639,0.612635,0.564577,0.605989,0.591745,0.593917,0.016505,7
4,0.60329,0.031123,0.012512,0.003446,1.0,{'C': 1},0.595571,0.612012,0.564198,0.606273,0.591721,0.593955,0.016557,5
5,0.659368,0.080863,0.013064,0.00588,2.0,{'C': 2},0.59557,0.611951,0.564205,0.606326,0.591726,0.593956,0.01655,4
6,0.548541,0.045636,0.015652,0.005904,3.0,{'C': 3},0.595598,0.611945,0.564188,0.606341,0.591723,0.593959,0.016557,2
7,0.584779,0.056701,0.01186,0.004036,4.0,{'C': 4},0.595613,0.61194,0.564185,0.606335,0.591727,0.59396,0.016556,1
8,0.563729,0.031355,0.012267,0.004104,5.0,{'C': 5},0.595615,0.611935,0.564169,0.606338,0.591725,0.593957,0.016562,3
9,0.506384,0.118205,0.010401,0.005843,10.0,{'C': 10},0.595623,0.611913,0.564157,0.60634,0.591735,0.593954,0.016562,6


In [101]:
# Interactions + variables numériques centrées
pd.DataFrame(grid_logistic.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.888455,0.069528,0.014209,0.006792,0.009,{'C': 0.009},0.686968,0.713513,0.651188,0.693301,0.711016,0.691201,0.02242,8
1,0.911016,0.112461,0.020527,0.013179,0.005,{'C': 0.005},0.67906,0.701753,0.642384,0.682084,0.701078,0.681275,0.021583,9
2,0.800881,0.201881,0.016011,0.011623,0.001,{'C': 0.001},0.655523,0.665709,0.612853,0.653871,0.675483,0.652691,0.021376,10
3,1.066026,0.081099,0.015464,0.00245,0.1,{'C': 0.1},0.709817,0.730402,0.676268,0.718062,0.72858,0.712629,0.019644,7
4,1.393171,0.119134,0.011381,0.003215,1.0,{'C': 1},0.713718,0.7307,0.679633,0.722315,0.729766,0.715229,0.018814,6
5,1.767996,0.320787,0.021259,0.01493,2.0,{'C': 2},0.713709,0.730397,0.680015,0.72238,0.729761,0.715255,0.018624,5
6,1.926846,0.172896,0.012146,0.004514,3.0,{'C': 3},0.713702,0.730271,0.680134,0.722521,0.729752,0.715279,0.018568,4
7,1.568978,0.139979,0.013918,0.006367,4.0,{'C': 4},0.713716,0.730187,0.680275,0.722572,0.729761,0.715305,0.018507,2
8,1.681402,0.178404,0.014597,0.004278,5.0,{'C': 5},0.713693,0.730106,0.680359,0.722617,0.729764,0.71531,0.018466,1
9,1.630143,0.275507,0.010286,0.004609,10.0,{'C': 10},0.713674,0.730008,0.680413,0.722582,0.729745,0.715287,0.018424,3


Courbe de ROC: 
il faut apparemment faire la cross validation avec KFold 
pour pouvoir tracer la courbe de ROC

## II XGBoost d'abord avec Holdout

Liste des variables pour XGBoost

In [43]:
liste_feat_for_xgboost = ['C_specialite_labels_encoding',
'C_sexe_labels_encoding',
'C_diplome_labels_encoding',
# 'note_Q_labels_encoding',
# 'salaire_Q_labels_encoding',
#'age_D_labels_encoding',
#'note_D_labels_encoding',
#'salaire_D_labels_encoding'
'age','salaire','note']

In [44]:
list(df_all_data)

['date',
 'cheveux',
 'age',
 'exp',
 'salaire',
 'C_sexe',
 'C_diplome',
 'C_specialite',
 'note',
 'dispo',
 'embauche',
 'annee',
 'mois',
 'jour',
 'age_Q',
 'exp_Q',
 'note_Q',
 'salaire_Q',
 'age_D',
 'note_D',
 'salaire_D',
 'C_sexe_F',
 'C_sexe_M',
 'C_diplome_bac',
 'C_diplome_doctorat',
 'C_diplome_licence',
 'C_diplome_master',
 'C_specialite_archeologie',
 'C_specialite_detective',
 'C_specialite_forage',
 'C_specialite_geologie',
 'C_specialite_labels_encoding',
 'C_sexe_labels_encoding',
 'C_diplome_labels_encoding',
 'note_Q_labels_encoding',
 'salaire_Q_labels_encoding',
 'age_D_labels_encoding',
 'note_D_labels_encoding',
 'salaire_D_labels_encoding']

In [45]:
X_train = df_all_data[liste_feat_for_xgboost]
Y_train = df_all_data[[target_var_name]]

In [370]:
X_train.index.name = 'id'
X_train_all_columns[liste_feat_for_xgboost].index.name = 'id'

On enlève les données de test

In [374]:
X_train_all_columns[liste_feat_for_xgboost].shape

(14656, 8)

In [373]:
df_temp_1 = pd.merge(X_train, X_train_all_columns[liste_feat_for_xgboost], on = "id", how = "left")
df_temp_1.head()

Unnamed: 0_level_0,C_specialite_labels_encoding_x,C_sexe_labels_encoding_x,C_diplome_labels_encoding_x,note_Q_labels_encoding_x,salaire_Q_labels_encoding_x,age_D_labels_encoding_x,note_D_labels_encoding_x,salaire_D_labels_encoding_x,C_specialite_labels_encoding_y,C_sexe_labels_encoding_y,C_diplome_labels_encoding_y,note_Q_labels_encoding_y,salaire_Q_labels_encoding_y,age_D_labels_encoding_y,note_D_labels_encoding_y,salaire_D_labels_encoding_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,3,1,3,1,2,0,4,5,,,,,,,,
1,3,1,2,1,2,7,4,7,3.0,1.0,2.0,1.0,2.0,7.0,4.0,7.0
2,3,1,2,1,2,3,2,6,3.0,1.0,2.0,1.0,2.0,3.0,2.0,6.0
3,3,1,3,3,2,3,8,6,3.0,1.0,3.0,3.0,2.0,3.0,8.0,6.0
4,3,1,0,0,3,1,0,9,3.0,1.0,0.0,0.0,3.0,1.0,0.0,9.0


In [None]:
df_temp_2 = merge(Y_train, Y_test, on = "id", how = "left")

In [349]:
X_train.shape

(18320, 8)

In [364]:
Y_test.shape

(3664, 1)

On splitte les données en train et validation

In [331]:
X_train.shape

(5650, 8)

In [309]:
Y_train.shape

(18320, 1)

In [327]:
X_test_all_columns.shape[0]

3664

In [312]:
len(Y_train[Y_train["embauche"] == 1])/len(Y_train)

0.11408296943231441

In [None]:
NB) A FAIRE :
On doit tirer "X_test_all_columns.shape[0]" observations parmi: X_train,Y_train et avoir tjs 11,39% d 'embauche

In [321]:
(X_test_all_columns.shape[0]/df_all_data.shape[0])

0.2

In [47]:
#X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train,test_size = 0.2,random_state = 125)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train,test_size = 0.4,random_state = 125)

In [48]:
X_train.shape

(10992, 6)

In [49]:
Y_train.shape

(10992, 1)

In [50]:
X_valid.shape

(7328, 6)

In [None]:
silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=10)

In [62]:
%%time
n_iter = 100

# df_result_models = pd.DataFrame(columns = ['colsample_bytree','learning_rate','max_depth','n_estimators','subsample','min_child_weight','reg_lambda','AUC_train','AUC_test'])
df_result_models = pd.DataFrame(columns = ['colsample_bytree','gamma','learning_rate','max_depth','n_estimators','subsample','reg_lambda','AUC_train','AUCE_test'])

for each in range(1,n_iter):
    
    # print ("Modèle n°", each)
    # colsample_bytree_value = np.random.uniform(0.3,0.7)
    colsample_bytree_value = 0.4
    # gamma_value = np.random.uniform(0, 0.5)
    gamma_value = 10
    # learning_rate_value = np.random.uniform(0.03, 0.3)
    #  learning_rate_value = np.random.uniform(0.01, 0.1)
    learning_rate_value = 0.1
    # max_depth_value = np.random.randint(2, 6)
    
    # TEST 1 & 2: max_depth_value = np.random.randint(7,20)
    # TEST 3: 
    # max_depth_value = np.random.randint(3,7) 
    max_depth_value = 4
    
    
    
    # TEST 1: n_estimators_value = np.random.randint(400, 500)
    # TEST 2: n_estimators_value = np.random.randint(100, 150)
    n_estimators_value = np.random.randint(400, 500)
    
    # n_estimators_value = np.random.randint(400, 3000)
    # n_estimators_value = np.random.randint(400, 5000)
    # subsample_value = np.random.uniform(0.4, 0.6)
    # subsample_value = np.random.uniform(0,1)
    subsample_value= 0.8
    
    # reg_alpha_value = np.random.uniform(0, 0.05)
    # reg_alpha_value = np.random.uniform(0, 10)
    # reg_lambda_value = np.random.uniform(0, 10)
    reg_lambda_value = np.random.uniform(5, 20)
    
    # print ("max_depth", max_depth_value)
    
    min_child_weight_value = np.random.randint(0,300)
    
    xgb_model = xgb.XGBClassifier(
        colsample_bytree = colsample_bytree_value,
        # gamma = gamma_value,
        learning_rate = learning_rate_value,
        max_depth = max_depth_value,
        n_estimators = n_estimators_value,
        subsample = subsample_value,
        # reg_alpha = reg_alpha_value,
        reg_lambda = reg_lambda_value,
        n_jobs = 4,
        # min_child_weight = min_child_weight_value,
        objective='binary:logistic')
        
        # eval_metric = "rmse"
          
    result = xgb_model.fit(X_train, Y_train)
    
    predicted_values = xgb_model.predict(X_train)
    # print ("predicted_values", predicted_values)
    predicted_values = np.expand_dims(predicted_values, axis=1)
        
    # RMSE_train_value = np.sqrt(np.mean((predicted_values - Y_train.values.ravel())**2, axis=0))
    # print("le score RMLSE sur le training set vaut : {}".format(RMSE))
    
    print (Y_train.values.shape)
    print (predicted_values.shape)
    AUC_train_value = roc_auc_score(Y_train.values, predicted_values)
    
    
    predicted_values = xgb_model.predict(X_valid)
    predicted_values = np.expand_dims(predicted_values, axis=1)
        
    # RMSE_test_value = np.sqrt(np.mean((predicted_values - Y_valid.values.ravel())**2, axis=0))
    # print("le score RMSE sur le test set vaut : {}".format(RMSE))
    AUC_test_value = roc_auc_score(Y_valid.values, predicted_values)
    
    if AUC_train_value != None and AUC_test_value != None:
    
        df_result_models = df_result_models.append({'colsample_bytree': colsample_bytree_value, 
                                 # 'gamma':gamma_value,
                                 'learning_rate':learning_rate_value,
                                 'max_depth':max_depth_value,
                                 'n_estimators':n_estimators_value,
                                 'subsample':subsample_value,
                                 # 'reg_alpha':reg_alpha_value,
                                 'reg_lambda':reg_lambda_value,
                                 # 'min_child_weight':min_child_weight_value,                 
                                 'AUC_train':AUC_train_value,
                                 'AUC_test':AUC_test_value}, ignore_index= True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(10992, 1)
(10992, 1)
CPU times: user 9min 27s, sys: 3.03 s, total: 9min 30s
Wall time: 2min 28s


In [63]:
df_result_models

Unnamed: 0,colsample_bytree,gamma,learning_rate,max_depth,n_estimators,subsample,reg_lambda,AUC_train,AUCE_test,AUC_test
0,0.4,,0.1,4.0,401.0,0.8,12.982385,0.580033,,0.558407
1,0.4,,0.1,4.0,407.0,0.8,13.134500,0.583185,,0.565178
2,0.4,,0.1,4.0,463.0,0.8,16.279111,0.593377,,0.578290
3,0.4,,0.1,4.0,427.0,0.8,19.787850,0.580382,,0.560036
4,0.4,,0.1,4.0,459.0,0.8,15.982710,0.592433,,0.576067
5,0.4,,0.1,4.0,425.0,0.8,7.491000,0.594086,,0.568646
6,0.4,,0.1,4.0,420.0,0.8,12.987913,0.584786,,0.563603
7,0.4,,0.1,4.0,461.0,0.8,12.901857,0.593675,,0.576661
8,0.4,,0.1,4.0,425.0,0.8,10.591981,0.589538,,0.566939
9,0.4,,0.1,4.0,447.0,0.8,17.163944,0.587629,,0.568646
