---
---
# Previsão de Customer Churn em Operadoras de Telecom 
---
---

## Treinamento do Modelo // _Model Training_

In [1]:
# Versão da Linguagem Python // Python language version
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.12


In [2]:
# Imports

#! pip install xgboost

import joblib
import pickle
#import json
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import randint as sp_randint
import sklearn
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

np.random.seed(31415)

In [3]:
# Versões dos pacotes usados neste jupyter notebook // Versions of packages used in this jupyter notebook
#!pip install -q -U watermark
%reload_ext watermark
%watermark -a "Tatiana Novaes Carvalho" --iversions

Author: Tatiana Novaes Carvalho

matplotlib: 3.5.1
joblib    : 1.1.0
pandas    : 1.4.2
seaborn   : 0.11.2
sklearn   : 1.1.2
numpy     : 1.22.3



### Carga dos dados // _Data load_

In [4]:
# Carrega os dados // Load the data
df_train = pd.read_csv('../datasets/df_train_balanced.csv',  index_col = 0)
df_test = pd.read_csv('../datasets/df_test_proc.csv',  index_col = 0)

dic_cut = joblib.load(open('../datasets/dic_cut.pkl', 'rb'))

In [5]:
print(df_train.shape)
print(df_test.shape)

(5454, 99)
(1667, 99)


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1667 entries, 1 to 1667
Data columns (total 99 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   account_length         1667 non-null   int64
 1   international_plan     1667 non-null   int64
 2   voice_mail_plan        1667 non-null   int64
 3   number_vmail_messages  1667 non-null   int64
 4   churn                  1667 non-null   int64
 5   state_AK               1667 non-null   int64
 6   state_AL               1667 non-null   int64
 7   state_AR               1667 non-null   int64
 8   state_AZ               1667 non-null   int64
 9   state_CA               1667 non-null   int64
 10  state_CO               1667 non-null   int64
 11  state_CT               1667 non-null   int64
 12  state_DC               1667 non-null   int64
 13  state_DE               1667 non-null   int64
 14  state_FL               1667 non-null   int64
 15  state_GA               1667 non-null  

In [7]:
df_train.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,churn,state_AK,state_AL,state_AR,state_AZ,state_CA,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
0,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [8]:
df_test.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,churn,state_AK,state_AL,state_AR,state_AZ,state_CA,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
1,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [9]:
dic_cut

{'total_day_minutes': array([ 17.2802,  81.56  , 145.52  , 209.48  , 273.44  , 337.4   ]),
 'total_day_calls': array([ 41.882,  65.6  ,  89.2  , 112.8  , 136.4  , 160.   ]),
 'total_eve_minutes': array([ 48.8976, 109.68  , 170.16  , 230.64  , 291.12  , 351.6   ]),
 'total_eve_calls': array([ 41.883,  65.4  ,  88.8  , 112.2  , 135.6  , 159.   ]),
 'total_night_minutes': array([ 49.7976, 110.58  , 171.06  , 231.54  , 292.02  , 352.5   ]),
 'total_night_calls': array([ 41.884,  65.2  ,  88.4  , 111.6  , 134.8  , 158.   ]),
 'total_intl_minutes': array([ 1.9836,  5.28  ,  8.56  , 11.84  , 15.12  , 18.4   ]),
 'total_intl_calls': array([ 0.99,  3.  ,  5.  ,  7.  ,  9.  , 11.  ])}

In [10]:
# Separando variáveis preditoras da variável-alvo (datasets treino e teste)
# Separating predictor variables from the target variable (training and test datasets)

target = 'churn'

X_train = df_train.drop(target, axis = 1)
y_train = df_train[target]

X_test = df_test.drop(target, axis = 1)
y_test = df_test[target]

In [11]:
X_train.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
0,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5454 entries, 0 to 5453
Data columns (total 98 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   account_length         5454 non-null   int64
 1   international_plan     5454 non-null   int64
 2   voice_mail_plan        5454 non-null   int64
 3   number_vmail_messages  5454 non-null   int64
 4   state_AK               5454 non-null   int64
 5   state_AL               5454 non-null   int64
 6   state_AR               5454 non-null   int64
 7   state_AZ               5454 non-null   int64
 8   state_CA               5454 non-null   int64
 9   state_CO               5454 non-null   int64
 10  state_CT               5454 non-null   int64
 11  state_DC               5454 non-null   int64
 12  state_DE               5454 non-null   int64
 13  state_FL               5454 non-null   int64
 14  state_GA               5454 non-null   int64
 15  state_HI               5454 non-null  

In [13]:
X_test.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
1,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [14]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1667 entries, 1 to 1667
Data columns (total 98 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   account_length         1667 non-null   int64
 1   international_plan     1667 non-null   int64
 2   voice_mail_plan        1667 non-null   int64
 3   number_vmail_messages  1667 non-null   int64
 4   state_AK               1667 non-null   int64
 5   state_AL               1667 non-null   int64
 6   state_AR               1667 non-null   int64
 7   state_AZ               1667 non-null   int64
 8   state_CA               1667 non-null   int64
 9   state_CO               1667 non-null   int64
 10  state_CT               1667 non-null   int64
 11  state_DC               1667 non-null   int64
 12  state_DE               1667 non-null   int64
 13  state_FL               1667 non-null   int64
 14  state_GA               1667 non-null   int64
 15  state_HI               1667 non-null  

> Modelos // _Models_

In [15]:
# Funções auxiliares // Auxiliary functions

def evaluate_classification_model(y_test, y_pred, y_pred_proba):
    """ 
    Avalia modelos de classificação por meio de matriz de confusão, AUC, curva ROC e acurácia.
    Evaluates classification models through confusion matrix, AUC, ROC curve and accuracy.
    """
    
    # Matriz de confusão
    cm = confusion_matrix(y_test, y_pred)
    
    # Extração de cada valor da Confusion Matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Cálculo da métrica global AUC (Area Under The Curve) com dados reais e previsões em teste
    roc_auc = roc_auc_score(y_test, y_pred)

    # Cálculo da curva ROC com dados e previsões em teste
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    # AUC em teste
    auc_ = auc(fpr, tpr)

    # Acurácia em teste
    accuracy = accuracy_score(y_test, y_pred)
    
    return cm, roc_auc, auc_, accuracy


def feature_importance(model, cols_list):
    """ 
    Imprime as 10 variáveis mais importantes para o resultado do modelo.
    Prints the 10 most important variables for the model result
    """
    indices = np.argsort(-abs(model.coef_))

    print("Top 10 - Variáveis mais importantes para o resultado do modelo:")
    print(50*'-')
    for feature in cols_list[indices][0,:10]:
        print(feature) 

def save_model(model_name, model):
    """
    Salva o modelo em disco com pickle.
    Save the model to disk with pickle.
    """
    with open(f'../models/{model_name}.pkl', 'wb') as pickle_file:
        joblib.dump(model, pickle_file)
        



In [16]:
# Checando valores faltantes // Checking missing values
print(f'Valores faltantes nos dados de treino: {X_train.isnull().any().sum()}')
print(f'Valores faltantes nos dados de teste: {X_test.isnull().any().sum()}')


Valores faltantes nos dados de treino: 0
Valores faltantes nos dados de teste: 0


### Construção, Treinamento e Avaliação do Modelo 1 com Regressão Logística (Benchmark)
### _Construction, Training and Evaluation of Model 1 with Logistic Regression (Benchmark)_

In [17]:
# Treinamento do Modelo // Model Training

# Define lista de hiperparâmetros // Define hyperparameter list
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   #'penalty': ['l1', 'l2']
                  }

# Criação de modelo com GridSearch // Model creation with GridSearch
model_v1 = GridSearchCV(LogisticRegression(), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1) # n_jobs -1 para não impor limites para a execução

# Treinamento  // Training
model_v1.fit(X_train, y_train)

In [18]:
# Seleção do melhor modelo // Selection of the best model
model_v1.best_estimator_

In [19]:
# Treinamento // Training

# Construindo o modelo novamente com os melhores hiperparâmetros // Building the model again with the best hyperparameters
# Isso é necessário pois a versão final não deve ter o GridSearchCV

model_v1 = LogisticRegression(C = 0.1)
model_v1.fit(X_train, y_train)

In [20]:
# Previsões com dados de teste // Predictions with test data

# Previsões
y_pred_v1 = model_v1.predict(X_test)

# Previsões no formato de probabilidade para cada classe
y_pred_proba_v1 = model_v1.predict_proba(X_test)

# Previsões no formato de probabilidade filtrando para a classe positiva (para calcular Curva ROC)
y_pred_proba_v1 = model_v1.predict_proba(X_test)[:,1]


In [21]:
# Chama função para avaliação do modelo // Call function for model evaluation
cm_v1, roc_auc_v1, auc_v1, accuracy_v1 = evaluate_classification_model(y_test, y_pred_v1, y_pred_proba_v1)
print(cm_v1, roc_auc_v1, auc_v1, accuracy_v1)

[[1074  369]
 [  81  143]] 0.6913378007128007 0.7511818136818138 0.7300539892021596


In [22]:
# Feature Importance

feature_importance(model_v1, X_train.columns)

Top 10 - Variáveis mais importantes para o resultado do modelo:
--------------------------------------------------
international_plan
total_day_minutes_5
number_vmail_messages
state_MT
total_day_minutes_3
state_VA
total_night_calls_5
state_TX
total_eve_calls_5
state_SC


In [23]:
# Salva o modelo em disco // Save the model to disk
save_model('model_v1', model_v1)

In [24]:
# Consolidação da avaliação para comparação dos modelos // Consolidation of the evaluation to compare the models

# Cria um dataframe para receber as métricas de cada modelo
df_models = pd.DataFrame()

# Dicionário com as métricas do modelo_v1
dict_model_v1 = {'Nome': 'modelo_v1', 
                 'Algoritmo': 'Regressão Logística', 
                 'ROC_AUC Score': roc_auc_v1,
                 'AUC Score': auc_v1,
                 'Acurácia': accuracy_v1}
dict_model_v1 = pd.DataFrame.from_dict(dict_model_v1, orient='index').T

# Adiciona o dict ao dataframe
df_models = dict_model_v1

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054


### Construção, Treinamento e Avaliação do Modelo 2 com Random Forest
#### _Construction, Training and Evaluation of Model 2 with Random Forest_

In [25]:
# Função para seleção de hiperparâmetros com Random Forest // Function for selecting hyperparameters with Random Forest
def random_forest_param_selection(X, y):
    """
    Função para seleção de hiperparâmetros com Random Forest. 
    Function for selecting hyperparameters with Random Forest
    """
    n_estimators = [100, 200, 300, 400, 500]
    max_depth = [3, None]
    max_features = sp_randint(1, 11)
    min_samples_split = sp_randint(2, 11)
    min_samples_leaf = sp_randint(2, 11) 
    bootstrap = [True, False]
    criterion = ["gini", "entropy"]
    n_iter_search = 20
        
    param_dist = {'n_estimators': n_estimators,
                  'max_depth': max_depth,
                  'max_features': max_features,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'bootstrap': bootstrap,
                  'criterion': criterion
                  }
    rand_search = RandomizedSearchCV(RandomForestClassifier(),
                                     param_distributions = param_dist,
                                     n_iter = n_iter_search,
                                     scoring = 'roc_auc',
                                     n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [26]:
# Chama função para seleção de hiperparâmetros com Random Forest 
# Call function for selection of hyperparameters with Random Forest
random_forest_param_selection(X_train, y_train)


In [27]:
# Treinamento // Training

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(criterion='entropy', max_features=10, min_samples_leaf=3,
                       min_samples_split=6, n_estimators=300)
model_v2.fit(X_train, y_train)


In [28]:
# Previsões em teste // Test predictions

# Previsões
y_pred_v2 = model_v2.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v2 = model_v2.predict_proba(X_test)[:,1]

In [29]:
# Avaliação do modelo // Model evaluation
cm_v2, roc_auc_v2, auc_v2, accuracy_v2 = evaluate_classification_model(y_test, y_pred_v2, y_pred_proba_v2)
print(cm_v2, roc_auc_v2, auc_v2, accuracy_v2)

[[1279  164]
 [ 101  123]] 0.7177275146025146 0.794580982080982 0.8410317936412718


In [30]:
# Feature Importance

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
international_plan
total_day_minutes_5
number_vmail_messages
total_day_minutes_3
voice_mail_plan
total_intl_calls_1
total_day_minutes_4
total_eve_minutes_2
total_eve_minutes_4
total_day_minutes_2


In [31]:
# Salva o modelo em disco // Save the model to disk
save_model('model_v2', model_v2)

In [32]:
# Dicionário com as métricas do modelo_v2 // Dictionary with model_v2 metrics

dict_model_v2 = {'Nome': 'modelo_v2', 
                 'Algoritmo': 'Random Forest', 
                 'ROC_AUC Score': roc_auc_v2,
                 'AUC Score': auc_v2,
                 'Acurácia': accuracy_v2}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v2 = pd.DataFrame.from_dict(dict_model_v2, orient='index').T
df_list = [df_models, dict_model_v2]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032


### Construção, Treinamento e Avaliação do Modelo 3 com KNN
#### _Construction, Training and Evaluation of Model 3 with KNN_

In [33]:
# Função para seleção de hiperparâmetros com KNN
def knn_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com KNN. """
    
    # Lista de possíveis valores de K
    neighbors = list(range(1, 30, 2))

    # Lista para os scores
    cv_scores = []

    # Validação cruzada para determinar o melhor valor de k
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors = k)
        scores = cross_val_score(knn, X_train, y_train, cv = 5, scoring = 'accuracy')
        cv_scores.append(scores.mean())   

    # Ajustando o erro de classificação
    error = [1 - x for x in cv_scores]

    # Determinando o melhor valor de k (com menor erro)
    optimal_k = neighbors[error.index(min(error))]
    return optimal_k


In [34]:
# Seleção de hiperparâmetro
optimal_k = knn_param_selection(X_train, y_train)
print(f'O valor ideal de k é {optimal_k}')

O valor ideal de k é 1


In [35]:
# Treinamento

# Criamos o modelo versão 3
model_v3 = KNeighborsClassifier(n_neighbors = optimal_k)

# Treinamento
model_v3.fit(X_train, y_train)

In [36]:
# Previsões com dados de teste

# Previsões com dados de teste
y_pred_v3 = model_v3.predict(X_test)

# Obtemos as previsões no formato de probabilidade para cada classe
y_pred_proba_v3 = model_v3.predict_proba(X_test)

# Obtemos as previsões no formato de probabilidade filtrando para a classe positiva
# Precisamos disso para calcula a Curva ROC
y_pred_proba_v3 = model_v3.predict_proba(X_test)[:,1]

In [37]:
# Avaliação do modelo
cm_v3, roc_auc_v3, auc_v3, accuracy_v3 = evaluate_classification_model(y_test, y_pred_v3, y_pred_proba_v3)
print(cm_v3, roc_auc_v3, auc_v3, accuracy_v3)

[[1215  228]
 [ 149   75]] 0.5884086352836353 0.5884086352836353 0.7738452309538092


In [38]:
#Obs: Com o algoritmo KNN não extraímos as variáveis mais importantes, pois o conceito do algoritmo é diferente.

In [39]:
# Salva o modelo em disco
save_model('model_v3', model_v3)

In [40]:
# Dicionário com as métricas do modelo_v3
dict_model_v3 = {'Nome': 'modelo_v3', 
                  'Algoritmo': 'KNN', 
                  'ROC_AUC Score': roc_auc_v3,
                  'AUC Score': auc_v3,
                  'Acurácia': accuracy_v3}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v3 = pd.DataFrame.from_dict(dict_model_v3, orient='index').T
df_list = [df_models, dict_model_v3]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032
0,modelo_v3,KNN,0.588409,0.588409,0.773845


### Construção, Treinamento e Avaliação do Modelo 4 com Decision Tree
#### _Construction, Training and Evaluation of Model 4 with Decision Tree_

In [41]:
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [42]:
# Função para seleção de hiperparâmetros com Decision Tree
def decision_tree_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com Decision Tree. """
    max_features = sp_randint(1, 11)
    criterion = ['entropy', 'gini', 'log_loss']
    min_samples_split= [2, 3, 4, 5, 7]
    min_samples_leaf= [1, 2, 3, 4, 6]
    max_depth= [2, 3, 4, 5, 6, 7]
    n_iter_search = 20
    param_dist = {'max_features': max_features,
                  'criterion': criterion,
                  'min_samples_split':min_samples_split,
                  'min_samples_leaf':min_samples_leaf,
                  'max_depth':max_depth}
    
    rand_search = RandomizedSearchCV(DecisionTreeClassifier(),
                                     param_distributions = param_dist,
                                     n_iter = n_iter_search,
                                     scoring = 'roc_auc',
                                     n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [43]:
# Chama função para seleção de hiperparâmetros com Decision Tree
decision_tree_param_selection(X_train,  y_train)


In [44]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o RandomizedSearchCV)
#model_v4 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, min_samples_split=7)
model_v4 = DecisionTreeClassifier(criterion='log_loss', max_depth=6, max_features=9, min_samples_split=3)
model_v4.fit(X_train, y_train)

In [45]:
# Previsões em teste
y_pred_v4 = model_v4.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v4 = model_v4.predict_proba(X_test)[:,1]

In [46]:
# Avaliação do modelo
cm_v4, roc_auc_v4, auc_v4, accuracy_v4 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v4, roc_auc_v4, auc_v4, accuracy_v4)

[[721 722]
 [ 46 178]] 0.6471481783981784 0.6878743441243442 0.5392921415716857


In [47]:
# Feature Importance

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
international_plan
total_day_minutes_5
number_vmail_messages
total_day_minutes_3
voice_mail_plan
total_intl_calls_1
total_day_minutes_4
total_eve_minutes_2
total_eve_minutes_4
total_day_minutes_2


In [48]:
# Salva o modelo em disco
save_model('model_v4', model_v4)

In [49]:
# Dicionário com as métricas do modelo_v4
dict_model_v4 = {'Nome': 'modelo_v4', 
                 'Algoritmo': 'Decision Tree', 
                 'ROC_AUC Score': roc_auc_v4,
                 'AUC Score': auc_v4,
                 'Acurácia': accuracy_v4}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v4 = pd.DataFrame.from_dict(dict_model_v4, orient='index').T
df_list = [df_models, dict_model_v4]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032
0,modelo_v3,KNN,0.588409,0.588409,0.773845
0,modelo_v4,Decision Tree,0.647148,0.687874,0.539292


### Construção, Treinamento e Avaliação do Modelo 5 com SVM
#### _Construction, Training and Evaluation of Model 5 with SVM_

In [50]:
#https://scikit-learn.org/stable/modules/svm.html

In [51]:
# Função para seleção de hiperparâmetros
def svc_param_selection(X_train, y_train, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(SVC(kernel = 'rbf'), param_grid, cv = nfolds)
    grid_search.fit(X_train, y_train)
    grid_search.best_params_
    return grid_search.best_params_

In [52]:
# Aplica a função
svc_param_selection(X_train, y_train, 5)

{'C': 10, 'gamma': 0.1}

In [53]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o GridSearchCV)
# model_v5 = SVC(C = 10, gamma = 0.1, probability = True)
model_v5 = SVC(C = 10, gamma = 0.1, probability = True)
model_v5.fit(X_train, y_train)

In [54]:
# Previsões em teste
y_pred_v5 = model_v5.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v5 = model_v5.predict_proba(X_test)[:,1]

In [55]:
# Avaliação do modelo
cm_v5, roc_auc_v5, auc_v5, accuracy_v5 = evaluate_classification_model(y_test, y_pred_v5, y_pred_proba_v5)
print(cm_v5, roc_auc_v5, auc_v5, accuracy_v5)

[[1298  145]
 [ 143   81]] 0.6305610211860211 0.7075196762696763 0.8272345530893821


In [56]:
# Salva o modelo em disco
save_model('model_v5', model_v5)

In [57]:
# Dicionário com as métricas do modelo_v5
dict_model_v5 = {'Nome': 'modelo_v5', 
                 'Algoritmo': 'SVM', 
                 'ROC_AUC Score': roc_auc_v5,
                 'AUC Score': auc_v5,
                 'Acurácia': accuracy_v5}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v5 = pd.DataFrame.from_dict(dict_model_v5, orient='index').T
df_list = [df_models, dict_model_v5]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032
0,modelo_v3,KNN,0.588409,0.588409,0.773845
0,modelo_v4,Decision Tree,0.647148,0.687874,0.539292
0,modelo_v5,SVM,0.630561,0.70752,0.827235


### Construção, Treinamento e Avaliação do Modelo 6 com Gradient Boosting Classifier
#### _Construction, Training and Evaluation of Model 6 with Gradient Boosting Classifier_

In [58]:
# Função para seleção de hiperparâmetros com Gradient Boosting

def gradient_boosting_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com Gradient Boosting Classifier. """
    learning_rate = [0.0001, 0.001, 0.01, 0.1, 1, 10]
    n_estimators = [50, 100, 150, 200, 300, 400, 500, 600]
    criterion = ['friedman_mse', 'squared_error', 'squared_error']
    min_samples_split= [2, 3, 4, 5, 7]
    min_samples_leaf= [1, 2, 3, 4, 6]
    max_depth= [2, 3, 4, 5, 6, 7]
    n_iter_search = 20
    param_dist = {'learning_rate': learning_rate,
                  'n_estimators': n_estimators,
                  'criterion': criterion,
                  'min_samples_split':min_samples_split,
                  'min_samples_leaf':min_samples_leaf,
                  'max_depth':max_depth}
    
    rand_search = RandomizedSearchCV(GradientBoostingClassifier(),
                                     param_distributions = param_dist,
                                     n_iter = n_iter_search,
                                     scoring = 'roc_auc',
                                     n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [59]:
# Aplica a função
gradient_boosting_param_selection(X_train, y_train)

In [60]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros
model_v6 = GradientBoostingClassifier(max_depth=7, min_samples_leaf=4, min_samples_split=5, n_estimators=400)
model_v6.fit(X_train, y_train)

In [61]:
# Previsões em teste
y_pred_v6 = model_v6.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v6 = model_v6.predict_proba(X_test)[:,1]

In [62]:
# Avaliação do modelo
cm_v6, roc_auc_v6, auc_v6, accuracy_v6 = evaluate_classification_model(y_test, y_pred_v6, y_pred_proba_v6)
print(cm_v6, roc_auc_v6, auc_v6, accuracy_v6)

[[1261  182]
 [ 127   97]] 0.653454794079794 0.730651668151668 0.8146370725854829


In [63]:
# Salva o modelo em disco
save_model('model_v6', model_v6)

In [64]:
# Dicionário com as métricas do modelo_v6
dict_model_v6 = {'Nome': 'modelo_v6', 
                 'Algoritmo': 'Gradient Boosting', 
                 'ROC_AUC Score': roc_auc_v6,
                 'AUC Score': auc_v6,
                 'Acurácia': accuracy_v6}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v6 = pd.DataFrame.from_dict(dict_model_v6, orient='index').T
df_list = [df_models, dict_model_v6]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032
0,modelo_v3,KNN,0.588409,0.588409,0.773845
0,modelo_v4,Decision Tree,0.647148,0.687874,0.539292
0,modelo_v5,SVM,0.630561,0.70752,0.827235
0,modelo_v6,Gradient Boosting,0.653455,0.730652,0.814637


### Construção, Treinamento e Avaliação do Modelo 7 com Adaboost Classifier
#### _Construction, Training and Evaluation of Model 7 with Adaboost Classifier_

In [65]:
# Função para seleção de hiperparâmetros com XGBoost
# https://xgboost.readthedocs.io/en/latest/parameter.html

def xgb_param_selection(X_train, y_train, nfolds):
    """ Função para seleção de hiperparâmetros com XGBoost. """
    learning_rate = [0.0001, 0.001, 0.01, 0.1, 1, 10]
    max_depth= [2, 3, 4, 5, 6, 7]
    param_grid = {'eta': learning_rate,
                  'max_depth': max_depth}
    
    grid_search = GridSearchCV(XGBClassifier(),
                                     param_grid = param_grid,
                                     cv = nfolds,)

    grid_search.fit(X_train, y_train)
    grid_search.best_params_
    return  grid_search.best_params_

In [66]:
# Aplica a função
xgb_param_selection(X_train, y_train, 10)

{'eta': 1, 'max_depth': 7}

In [67]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o GridSearchCV)
model_v7 = XGBClassifier(eta = 1, max_depth = 7)
model_v7.fit(X_train, y_train)

In [68]:
# Previsões em teste
y_pred_v7 = model_v7.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v7 = model_v7.predict_proba(X_test)[:,1]

In [69]:
# Avaliação do modelo
cm_v7, roc_auc_v7, auc_v7, accuracy_v7 = evaluate_classification_model(y_test, y_pred_v7, y_pred_proba_v7)
print(cm_v7, roc_auc_v7, auc_v7, accuracy_v7)

[[1258  185]
 [ 118  106]] 0.6725045787545787 0.7250365062865064 0.8182363527294542


In [70]:
# Feature Importance

# Variáveis mais relevantes
indices = np.argsort(-model_v7.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v7:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v7:
--------------------------------------------------
total_day_minutes_5
international_plan
number_vmail_messages
total_night_calls_5
total_eve_calls_5
state_HI
voice_mail_plan
total_day_minutes_4
state_VA
state_OH


In [71]:
# Salva o modelo em disco
save_model('model_v7', model_v7)

In [72]:
# Dicionário com as métricas do modelo_v7
dict_model_v7 = {'Nome': 'modelo_v7', 
                 'Algoritmo': 'XGBoost', 
                 'ROC_AUC Score': roc_auc_v7,
                 'AUC Score': auc_v7,
                 'Acurácia': accuracy_v7}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v7 = pd.DataFrame.from_dict(dict_model_v7, orient='index').T
df_list = [df_models, dict_model_v7]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.691338,0.751182,0.730054
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032
0,modelo_v3,KNN,0.588409,0.588409,0.773845
0,modelo_v4,Decision Tree,0.647148,0.687874,0.539292
0,modelo_v5,SVM,0.630561,0.70752,0.827235
0,modelo_v6,Gradient Boosting,0.653455,0.730652,0.814637
0,modelo_v7,XGBoost,0.672505,0.725037,0.818236


### Seleção do Melhor Modelo // _Selection of the Best Model_

In [73]:
# Seleção do modelo com maior AUC Score, por se tratar de uma métrica global // Selection of the model with the highest AUC Score, as it is a global metric
# O score AUC é o ideal para comparar modelos de diferentes algoritmos // The AUC score is ideal for comparing models from different algorithms

df_best_model = df_models[df_models['AUC Score'] == df_models['AUC Score'].max()]
df_best_model


Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v2,Random Forest,0.717728,0.794581,0.841032


In [74]:
# Salva o melhor modelo em disco
df_best_model.to_csv('../models/best_model.csv')

**Conclusão:** O melhor modelo encontrado foi o Random Forest, que apresentou AUC Score de 0.794581. Com mais recursos de tempo, o treinamento ainda poderia ser aperfeiçoado com seleção de atributos ou redução de dimensionalidade dos atributos com o algoritmo PCA, por exemplo.

---
**_Conclusion:_** _The best model found was Random Forest, which presented an AUC Score of 0.794581. With more time resources, the training could still be improved with attribute selection or dimensionality reduction of the attributes with the PCA algorithm, for example._
    


### Previsões com o Melhor Modelo Treinado // _Forecasts with the Best Trained Model_

In [75]:
# Recuperação do nome do melhor modelo
model = df_best_model.Nome.to_string(index = False)
print(model)

# Nome do arquivo do melhor modelo
best_model_file_name = 'model' + model[-3:]
print(best_model_file_name)

modelo_v2
model_v2


In [76]:
# Carga do melhor modelo do disco
best_model = joblib.load(open(f'../models/{best_model_file_name}.pkl', 'rb'))
best_model


In [77]:
# Salva o melhor modelo em disco com o nome best_model
save_model('../models/best_model', best_model)

In [78]:
# Recuperação das colunas iniciais // Recovery of initial columns
df_original = pd.read_csv('../datasets/projeto4_telecom_treino.csv', index_col = 0, nrows=1)

print(len(df_original.columns))
df_original.columns


20


Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'churn'],
      dtype='object')

In [79]:
# Dados brutos de um novo consumidor (exemplo) // Raw data from a new consumer (example)
# O número de colunas deve ser o mesmo do que foi usado em treino // The number of columns must be the same as the one used in training

new_costumer = ['KS', 114, 'area_code_408', 'yes', 'yes', 32, 244.2, 120, 32.07, 154.4, 82, 22.54, 154.7, 86, 15.01, 12, 5, 3.7, 2]


In [80]:
# Converte o objeto para array // Convert object to array
arr_costumer = np.array(new_costumer).reshape(1, -1)

df_new = pd.DataFrame(arr_costumer, columns=df_original.columns[:-1])
display(df_new)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,KS,114,area_code_408,yes,yes,32,244.2,120,32.07,154.4,82,22.54,154.7,86,15.01,12,5,3.7,2


In [81]:
# Lista de colunas categóricas e numéricas // List of categorical and numerical columns
cat_features = ['state',
                'area_code', 
                'international_plan', 
                'voice_mail_plan',
               ]

# Lista de colunas numéricas // List of numerical columns     
num_features = ['account_length', 'number_vmail_messages',
                'total_day_minutes', 'total_day_calls', 'total_day_charge',
                'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
                'total_night_minutes', 'total_night_calls', 'total_night_charge',
                'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
                'number_customer_service_calls',
                ]

binary_features = ['account_length',
                   'international_plan', 
                   'voice_mail_plan',
                   'number_vmail_messages',
                  ]


In [82]:
# Conversão dos tipos de dados das variáveis // Conversion of variables data types

# Categóricas
for feat in cat_features:
    df_new[feat] = df_new[feat].astype('category')

# Numéricas
for feat in num_features:
    df_new[feat] = df_new[feat].astype('float64')



In [83]:
# Aplicação das mesmas transformações efetuadas no dataset de treino nos novos dados
# Application of the same transformations performed on the train dataset in the new data

df_new = df_new.drop(columns = ['number_customer_service_calls'])
df_new = df_new.drop(columns = ['total_day_charge','total_eve_charge','total_night_charge','total_intl_charge'])
df_new['account_length'] = np.where(df_new['account_length'] >= 100.86, 1, 0)
df_new['number_vmail_messages'] = np.where(df_new['number_vmail_messages'] >= 7.97, 1, 0)
df_new['area_code'] = df_new['area_code'].apply(lambda x: x[-3:])

# Formatação das demais variáveis binárias
df_new['international_plan'] = df_new['international_plan'].apply(lambda x: 1 if x == 'yes' else 0)
df_new['voice_mail_plan'] = df_new['voice_mail_plan'].apply(lambda x: 1 if x == 'yes' else 0)

In [84]:
df_new

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls
0,KS,1,408,1,1,1,244.2,120.0,154.4,82.0,154.7,86.0,12.0,5.0


In [85]:
df_new.isnull().any().sum()

0

In [86]:
dic_cut

{'total_day_minutes': array([ 17.2802,  81.56  , 145.52  , 209.48  , 273.44  , 337.4   ]),
 'total_day_calls': array([ 41.882,  65.6  ,  89.2  , 112.8  , 136.4  , 160.   ]),
 'total_eve_minutes': array([ 48.8976, 109.68  , 170.16  , 230.64  , 291.12  , 351.6   ]),
 'total_eve_calls': array([ 41.883,  65.4  ,  88.8  , 112.2  , 135.6  , 159.   ]),
 'total_night_minutes': array([ 49.7976, 110.58  , 171.06  , 231.54  , 292.02  , 352.5   ]),
 'total_night_calls': array([ 41.884,  65.2  ,  88.4  , 111.6  , 134.8  , 158.   ]),
 'total_intl_minutes': array([ 1.9836,  5.28  ,  8.56  , 11.84  , 15.12  , 18.4   ]),
 'total_intl_calls': array([ 0.99,  3.  ,  5.  ,  7.  ,  9.  , 11.  ])}

In [87]:
# Discretização de variáveis numéricas
cols_discret =  [col for col in df_new.columns if col.startswith('total')]
for col in cols_discret:
    df_new[col] = pd.cut(df_new[col], bins=dic_cut[col], include_lowest=True, labels=['1','2','3','4','5']) 


In [88]:
df_new.isnull().any().sum()

0

In [89]:
df_new

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls
0,KS,1,408,1,1,1,4,4,2,2,2,2,4,2


In [90]:
# Aplicação de one hot encoding em variáveis categóricas
onehot_features = ['state', 'area_code']
cols_tot =  [col for col in df_new.columns if col.startswith('total')]
onehot_features.extend(cols_tot)

for col in onehot_features:
    onehots = pd.get_dummies(df_new[col], prefix = col)
    df_new = df_new.join(onehots)
    
# Remoção das colunas originais
df_new = df_new.drop(columns = onehot_features)

    
# Resultado
display(df_new.head())


Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_KS,area_code_408,total_day_minutes_1,total_day_minutes_2,total_day_minutes_3,total_day_minutes_4,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
0,1,1,1,1,1,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [91]:
df_new.isnull().any().sum()

0

In [92]:
df_train.columns.drop('churn')

Index(['account_length', 'international_plan', 'voice_mail_plan',
       'number_vmail_messages', 'state_AK', 'state_AL', 'state_AR', 'state_AZ',
       'state_CA', 'state_CO', 'state_CT', 'state_DC', 'state_DE', 'state_FL',
       'state_GA', 'state_HI', 'state_IA', 'state_ID', 'state_IL', 'state_IN',
       'state_KS', 'state_KY', 'state_LA', 'state_MA', 'state_MD', 'state_ME',
       'state_MI', 'state_MN', 'state_MO', 'state_MS', 'state_MT', 'state_NC',
       'state_ND', 'state_NE', 'state_NH', 'state_NJ', 'state_NM', 'state_NV',
       'state_NY', 'state_OH', 'state_OK', 'state_OR', 'state_PA', 'state_RI',
       'state_SC', 'state_SD', 'state_TN', 'state_TX', 'state_UT', 'state_VA',
       'state_VT', 'state_WA', 'state_WI', 'state_WV', 'state_WY',
       'area_code_408', 'area_code_415', 'area_code_510',
       'total_day_minutes_1', 'total_day_minutes_2', 'total_day_minutes_3',
       'total_day_minutes_4', 'total_day_minutes_5', 'total_day_calls_1',
       'total_day_calls_2'

In [93]:
# Completando as colunas não informadas com 0
cols_base = df_train.columns.drop('churn')
cols_new = [col for col in cols_base if col not in df_new.columns]
for col in cols_new:
    df_new[col] = 0

In [94]:
print(f'Número de variáveis dos dados novos: {df_new.shape[1]}')
print(f'Número de variáveis dos dados de treino: {df_train.shape[1]}, sendo uma a target')

Número de variáveis dos dados novos: 98
Número de variáveis dos dados de treino: 99, sendo uma a target


In [95]:
# Previsões de classe
pred_new = model_v2.predict(df_new)

# Verifica o valor e imprime o resultado final
if pred_new == 1:
    print('Churn costumer positive!')
else:
    print('Churn costumer negative!')

Churn costumer negative!


In [96]:
type(best_model)

sklearn.ensemble._forest.RandomForestClassifier