---
---
# Previsão do Nível de Satisfação dos Clientes do Santander

### _Santander Customer Satisfaction_
---
---

## Treinamento do Modelo // _Model Training_

In [1]:
# Versão da Linguagem Python // Python language version
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.12


In [2]:
# Imports

#! pip install xgboost

import joblib
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import randint as sp_randint
import sklearn
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score

from utils import *

import warnings
warnings.filterwarnings("ignore")

np.random.seed(31415)

In [3]:
# Versões dos pacotes usados neste jupyter notebook // Versions of packages used in this jupyter notebook
#!pip install -q -U watermark
%reload_ext watermark
%watermark -a "Tatiana Novaes Carvalho" --iversions

Author: Tatiana Novaes Carvalho

matplotlib: 3.5.1
seaborn   : 0.11.2
joblib    : 1.1.0
sklearn   : 1.1.2
pandas    : 1.4.2
numpy     : 1.22.3



### Carga dos dados // _Data load_

In [4]:
# Carrega os dados // Load the data
df_train = pd.read_csv('../datasets/df_train_preprocess.csv',  index_col = 0)
df_test = pd.read_csv('../datasets/df_test_preprocess.csv',  index_col = 0)

#dic_cut = joblib.load(open('../datasets/dic_cut.pkl', 'rb'))

In [5]:
print(df_train.shape)
print(df_test.shape)

(92164, 142)
(4095, 142)


In [6]:
df_train.head(5)

Unnamed: 0,PCA1,PCA2,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38,TARGET
0,-0.96297,-0.139903,-1.024949,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402,0
1,-2.047607,-0.895903,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,2.023054,0
2,-1.993145,-0.846273,0.536261,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.145718,0
3,-0.885292,-0.045037,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402,0
4,-2.750361,-1.202725,-1.107118,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.231266,0


In [7]:
df_test.head(5)

Unnamed: 0,PCA1,PCA2,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38,TARGET
10,4.004402,-1.080939,0.207585,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402,0
14,2.504976,15.271996,-0.778442,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.082372,0
18,-0.476002,-0.221313,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.974407,0
29,-0.925017,-0.140909,1.193613,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.101512,0
80,-2.740984,-1.178466,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.107538,0


In [8]:
# Separando variáveis preditoras da variável-alvo (datasets treino e teste)
# Separating predictor variables from the target variable (training and test datasets)

target = 'TARGET'

X_train = df_train.drop(target, axis = 1)
y_train = df_train[target]

X_test = df_test.drop(target, axis = 1)
y_test = df_test[target]

In [9]:
X_train.head()

Unnamed: 0,PCA1,PCA2,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var13_corto_hace3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38
0,-0.96297,-0.139903,-1.024949,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402
1,-2.047607,-0.895903,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,2.023054
2,-1.993145,-0.846273,0.536261,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.145718
3,-0.885292,-0.045037,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402
4,-2.750361,-1.202725,-1.107118,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.231266


In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92164 entries, 0 to 92163
Columns: 141 entries, PCA1 to var38
dtypes: float64(141)
memory usage: 99.8 MB


In [11]:
X_test.head()

Unnamed: 0,PCA1,PCA2,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var13_corto_hace3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38
10,4.004402,-1.080939,0.207585,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.068402
14,2.504976,15.271996,-0.778442,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,-0.082372
18,-0.476002,-0.221313,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.974407
29,-0.925017,-0.140909,1.193613,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.101512
80,-2.740984,-1.178466,-0.860611,-0.055846,-0.208291,-0.227918,-0.043196,-0.046181,-0.040202,-0.036714,...,-0.055948,-0.043562,-0.024608,-0.012503,-0.008193,-0.012715,-0.01133,-0.008997,-0.013473,0.107538


In [12]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4095 entries, 10 to 25085
Columns: 141 entries, PCA1 to var38
dtypes: float64(141)
memory usage: 4.4 MB


> Modelos // _Models_

In [13]:
# Checando valores faltantes // Checking missing values
print(f'Valores faltantes nos dados de treino: {X_train.isnull().any().sum()}')
print(f'Valores faltantes nos dados de teste: {X_test.isnull().any().sum()}')


Valores faltantes nos dados de treino: 0
Valores faltantes nos dados de teste: 0


### Construção, Treinamento e Avaliação do Modelo 1 com Regressão Logística (Benchmark)
### _Construction, Training and Evaluation of Model 1 with Logistic Regression (Benchmark)_

In [14]:
# Treinamento do Modelo // Model Training

# Define lista de hiperparâmetros // Define hyperparameter list
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   #'penalty': ['l1', 'l2']
                  }

# Criação de modelo com GridSearch // Model creation with GridSearch
model_v1 = GridSearchCV(LogisticRegression(), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1) # n_jobs -1 para não impor limites para a execução

# Treinamento  // Training
model_v1.fit(X_train, y_train)

In [15]:
# Seleção do melhor modelo // Selection of the best model
model_v1.best_estimator_

In [16]:
# Treinamento // Training

# Construindo o modelo novamente com os melhores hiperparâmetros // Building the model again with the best hyperparameters
# Isso é necessário pois a versão final não deve ter o GridSearchCV

model_v1 = LogisticRegression(C = 10)
model_v1.fit(X_train, y_train)

In [17]:
# Previsões com dados de teste // Predictions with test data

# Previsões
y_pred_v1 = model_v1.predict(X_test)

# Previsões no formato de probabilidade para cada classe
y_pred_proba_v1 = model_v1.predict_proba(X_test)

# Previsões no formato de probabilidade filtrando para a classe positiva (para calcular Curva ROC)
y_pred_proba_v1 = model_v1.predict_proba(X_test)[:,1]


In [18]:
# Chama função para avaliação do modelo // Call function for model evaluation
cm_v1, roc_auc_v1, auc_v1, accuracy_v1 = evaluate_classification_model(y_test, y_pred_v1, y_pred_proba_v1)
print(cm_v1, roc_auc_v1, auc_v1, accuracy_v1)

[[2925 1004]
 [  75   91]] 0.6463285056745179 0.6966486459965594 0.7365079365079366


In [19]:
# Feature Importance

feature_importance(model_v1, X_train.columns)

Top 10 - Variáveis mais importantes para o resultado do modelo:
--------------------------------------------------
saldo_medio_var5_hace2
saldo_medio_var5_ult1
saldo_medio_var5_ult3
var15
saldo_medio_var5_hace3
num_aport_var13_hace3
saldo_medio_var13_largo_hace2
saldo_medio_var44_hace2
saldo_medio_var8_hace3
saldo_medio_var8_hace2


In [21]:
# Consolidação da avaliação para comparação dos modelos // Consolidation of the evaluation to compare the models

# Cria um dataframe para receber as métricas de cada modelo
df_models = pd.DataFrame()

# Dicionário com as métricas do modelo_v1
dict_model_v1 = {'Nome': 'modelo_v1', 
                 'Algoritmo': 'Regressão Logística', 
                 'ROC_AUC Score': roc_auc_v1,
                 'AUC Score': auc_v1,
                 'Acurácia': accuracy_v1}
dict_model_v1 = pd.DataFrame.from_dict(dict_model_v1, orient='index').T

# Adiciona o dict ao dataframe
df_models = dict_model_v1

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.646329,0.696649,0.736508


### Construção, Treinamento e Avaliação do Modelo 2 com Random Forest
#### _Construction, Training and Evaluation of Model 2 with Random Forest_

In [22]:
# Chama função para seleção de hiperparâmetros com Random Forest 
# Call function for selection of hyperparameters with Random Forest
random_forest_param_selection(X_train, y_train)


In [23]:
# Treinamento // Training

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(criterion='entropy', max_features=10, min_samples_leaf=3,
                       min_samples_split=6, n_estimators=300)
model_v2.fit(X_train, y_train)


In [24]:
# Previsões em teste // Test predictions

# Previsões
y_pred_v2 = model_v2.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v2 = model_v2.predict_proba(X_test)[:,1]

In [25]:
# Avaliação do modelo // Model evaluation
cm_v2, roc_auc_v2, auc_v2, accuracy_v2 = evaluate_classification_model(y_test, y_pred_v2, y_pred_proba_v2)
print(cm_v2, roc_auc_v2, auc_v2, accuracy_v2)

[[3356  573]
 [  82   84]] 0.6800927303001776 0.7525030128148122 0.8400488400488401


In [26]:
# Feature Importance

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
var15
PCA2
PCA1
var38
num_var45_hace3
saldo_var30
num_var4
num_med_var22_ult3
saldo_medio_var5_ult3
num_med_var45_ult3


In [28]:
# Dicionário com as métricas do modelo_v2 // Dictionary with model_v2 metrics

dict_model_v2 = {'Nome': 'modelo_v2', 
                 'Algoritmo': 'Random Forest', 
                 'ROC_AUC Score': roc_auc_v2,
                 'AUC Score': auc_v2,
                 'Acurácia': accuracy_v2}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v2 = pd.DataFrame.from_dict(dict_model_v2, orient='index').T
df_list = [df_models, dict_model_v2]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.646329,0.696649,0.736508
0,modelo_v2,Random Forest,0.680093,0.752503,0.840049


### Construção, Treinamento e Avaliação do Modelo 3 com Decision Tree
#### _Construction, Training and Evaluation of Model 3 with Decision Tree_

In [29]:
# Chama função para seleção de hiperparâmetros com Decision Tree
decision_tree_param_selection(X_train,  y_train)


In [30]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o RandomizedSearchCV)
#model_v4 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, min_samples_split=7)
model_v3 = DecisionTreeClassifier(criterion='log_loss', max_depth=7, max_features=10, min_samples_leaf=3)
model_v3.fit(X_train, y_train)

In [31]:
# Previsões em teste
y_pred_v3 = model_v3.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v3 = model_v3.predict_proba(X_test)[:,1]

In [32]:
# Avaliação do modelo
cm_v3, roc_auc_v3, auc_v3, accuracy_v3 = evaluate_classification_model(y_test, y_pred_v3, y_pred_proba_v3)
print(cm_v3, roc_auc_v3, auc_v3, accuracy_v3)

[[2705 1224]
 [  67   99]] 0.6424279454289543 0.6751748659182417 0.6847374847374847


In [33]:
# Feature Importance

# Variáveis mais relevantes
indices = np.argsort(-model_v3.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v3:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v3:
--------------------------------------------------
saldo_medio_var5_hace2
num_var45_hace3
var15
saldo_var30
PCA1
num_var30
saldo_var5
num_med_var22_ult3
num_var13_0
saldo_medio_var5_ult3


In [35]:
# Dicionário com as métricas do modelo_v3
dict_model_v3 = {'Nome': 'modelo_v3', 
                 'Algoritmo': 'Decision Tree', 
                 'ROC_AUC Score': roc_auc_v3,
                 'AUC Score': auc_v3,
                 'Acurácia': accuracy_v3}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v3 = pd.DataFrame.from_dict(dict_model_v3, orient='index').T
df_list = [df_models, dict_model_v3]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.646329,0.696649,0.736508
0,modelo_v2,Random Forest,0.680093,0.752503,0.840049
0,modelo_v3,Decision Tree,0.642428,0.675175,0.684737


### Construção, Treinamento e Avaliação do Modelo 4 com Gradient Boosting Classifier
#### _Construction, Training and Evaluation of Model 4 with Gradient Boosting Classifier_

In [36]:
# Chamada da função para seleção de hiperparâmetros com Gradient Boosting

gradient_boosting_param_selection(X_train, y_train)

In [37]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros
model_v4 = GradientBoostingClassifier(max_depth=6, min_samples_split=4)
model_v4.fit(X_train, y_train)

In [38]:
# Previsões em teste
y_pred_v4 = model_v4.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v4 = model_v4.predict_proba(X_test)[:,1]

In [39]:
# Avaliação do modelo
cm_v4, roc_auc_v4, auc_v4, accuracy_v4 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v4, roc_auc_v4, auc_v4, accuracy_v4)

[[3689  240]
 [ 123   43]] 0.5989759496116306 0.7814942641525634 0.9113553113553113


In [41]:
# Dicionário com as métricas do modelo_v4
dict_model_v4 = {'Nome': 'modelo_v4', 
                 'Algoritmo': 'Gradient Boosting', 
                 'ROC_AUC Score': roc_auc_v4,
                 'AUC Score': auc_v4,
                 'Acurácia': accuracy_v4}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v4 = pd.DataFrame.from_dict(dict_model_v4, orient='index').T
df_list = [df_models, dict_model_v4]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.646329,0.696649,0.736508
0,modelo_v2,Random Forest,0.680093,0.752503,0.840049
0,modelo_v3,Decision Tree,0.642428,0.675175,0.684737
0,modelo_v4,Gradient Boosting,0.598976,0.781494,0.911355


### Construção, Treinamento e Avaliação do Modelo 5 com Adaboost Classifier
#### _Construction, Training and Evaluation of Model 5 with Adaboost Classifier_

In [42]:
# Chamada da função para seleção de hiperparâmetros com XGBoost

xgb_param_selection(X_train, y_train, 10)

{'eta': 1, 'max_depth': 7}

In [43]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o GridSearchCV)
model_v5 = XGBClassifier(eta = 1, max_depth = 7)
model_v5.fit(X_train, y_train)

In [44]:
# Previsões em teste
y_pred_v5 = model_v5.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v5 = model_v5.predict_proba(X_test)[:,1]

In [45]:
# Avaliação do modelo
cm_v5, roc_auc_v5, auc_v5, accuracy_v5 = evaluate_classification_model(y_test, y_pred_v5, y_pred_proba_v5)
print(cm_v5, roc_auc_v5, auc_v5, accuracy_v5)

[[3719  210]
 [ 135   31]] 0.5666491366330683 0.6988779756337644 0.9157509157509157


In [46]:
# Feature Importance

# Variáveis mais relevantes
indices = np.argsort(-model_v5.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v5:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v5:
--------------------------------------------------
num_var30_0
ind_var37
ind_var37_0
num_var30
saldo_var30
num_med_var22_ult3
num_var45_hace3
var15
num_meses_var5_ult3
imp_op_var39_efect_ult1


In [48]:
# Dicionário com as métricas do modelo_v5
dict_model_v5 = {'Nome': 'modelo_v5', 
                 'Algoritmo': 'XGBoost', 
                 'ROC_AUC Score': roc_auc_v5,
                 'AUC Score': auc_v5,
                 'Acurácia': accuracy_v5}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v5 = pd.DataFrame.from_dict(dict_model_v5, orient='index').T
df_list = [df_models, dict_model_v5]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.646329,0.696649,0.736508
0,modelo_v2,Random Forest,0.680093,0.752503,0.840049
0,modelo_v3,Decision Tree,0.642428,0.675175,0.684737
0,modelo_v4,Gradient Boosting,0.598976,0.781494,0.911355
0,modelo_v5,XGBoost,0.566649,0.698878,0.915751


### Seleção do Melhor Modelo // _Selection of the Best Model_

In [49]:
# Seleção do modelo com maior AUC Score, por se tratar de uma métrica global // Selection of the model with the highest AUC Score, as it is a global metric
# O score AUC é o ideal para comparar modelos de diferentes algoritmos // The AUC score is ideal for comparing models from different algorithms

df_best_model = df_models[df_models['AUC Score'] == df_models['AUC Score'].max()]
df_best_model


Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v4,Gradient Boosting,0.598976,0.781494,0.911355


In [50]:
# Salva o melhor modelo em disco
df_best_model.to_csv('../models/best_model.csv')

**Conclusão:** O melhor modelo encontrado foi o Gradient Boosting, que apresentou AUC Score de 0.782402.

---
**_Conclusion:_** _The best model found was Gradient Boosting, which presented an AUC Score of 0.782402._
    


### Recuperação do melhor modelo treinado// _Recovery of the best trained model_

In [51]:
# Recuperação do nome do melhor modelo
model = df_best_model.Nome.to_string(index = False)
print(model)

# Nome do arquivo do melhor modelo
best_model_file_name = 'model' + model[-3:]
print(best_model_file_name)

modelo_v4
model_v4


In [55]:
# Salva o melhor modelo em disco com o nome best_model
save_model('best_model', best_model)

In [None]:
type(best_model)