---
---
# Previsão de Customer Churn em Operadoras de Telecom 
---
---

## Treinamento do Modelo // _Model Training_

In [1]:
# Versão da Linguagem Python // Python language version
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.12


In [2]:
# Imports
import joblib
import pickle
import json
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import sklearn
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
#import warnings
#warnings.filterwarnings("ignore")

In [3]:
# Versões dos pacotes usados neste jupyter notebook // Versions of packages used in this jupyter notebook
#!pip install -q -U watermark
%reload_ext watermark
%watermark -a "Tatiana Novaes Carvalho" --iversions

Author: Tatiana Novaes Carvalho

seaborn   : 0.11.2
pandas    : 1.4.2
sklearn   : 1.1.2
joblib    : 1.1.0
json      : 2.0.9
matplotlib: 3.5.1
numpy     : 1.22.3



### Carga dos dados // Data load

In [4]:
# Carrega os dados // Load the data
df_train = pd.read_csv('../datasets/df_train_balanced.csv',  index_col = 0)
df_test = pd.read_csv('../datasets/df_test_proc.csv',  index_col = 0)
#df_train_stats = pd.read_csv('../datasets/df_train_stats.csv',  index_col = 0)

dic_cut = joblib.load(open('../datasets/dic_cut.pkl', 'rb'))

In [5]:
print(df_train.shape)
print(df_test.shape)

(5454, 99)
(1667, 99)


In [6]:
df_train.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,churn,state_AK,state_AL,state_AR,state_AZ,state_CA,...,total_intl_minutes_0,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_calls_0,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4
0,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [7]:
df_test.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,churn,state_AK,state_AL,state_AR,state_AZ,state_CA,...,total_intl_minutes_1.0,total_intl_minutes_2.0,total_intl_minutes_3.0,total_intl_minutes_4.0,total_intl_minutes_5.0,total_intl_calls_1.0,total_intl_calls_2.0,total_intl_calls_3.0,total_intl_calls_4.0,total_intl_calls_5.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [8]:
df_train_stats

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
count,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,...,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0
mean,0.465897,0.204804,2.193619,1.931243,2.006784,1.915108,1.953245,1.938027,2.021085,0.847268,...,0.022186,0.014851,0.022002,0.025486,0.021452,0.023836,0.026769,0.246058,0.508618,0.245325
std,0.498881,0.403595,0.964579,0.890195,0.851612,0.83957,0.833363,0.872912,0.828009,0.98808,...,0.1473,0.120969,0.146704,0.15761,0.144899,0.152551,0.161423,0.430752,0.499972,0.430319
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,0.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
dic_cut

{'total_day_minutes': array([ 17.2802,  81.56  , 145.52  , 209.48  , 273.44  , 337.4   ]),
 'total_day_calls': array([ 41.882,  65.6  ,  89.2  , 112.8  , 136.4  , 160.   ]),
 'total_eve_minutes': array([ 48.8976, 109.68  , 170.16  , 230.64  , 291.12  , 351.6   ]),
 'total_eve_calls': array([ 41.883,  65.4  ,  88.8  , 112.2  , 135.6  , 159.   ]),
 'total_night_minutes': array([ 49.7976, 110.58  , 171.06  , 231.54  , 292.02  , 352.5   ]),
 'total_night_calls': array([ 41.884,  65.2  ,  88.4  , 111.6  , 134.8  , 158.   ]),
 'total_intl_minutes': array([ 1.9836,  5.28  ,  8.56  , 11.84  , 15.12  , 18.4   ]),
 'total_intl_calls': array([ 0.99,  3.  ,  5.  ,  7.  ,  9.  , 11.  ])}

In [10]:
# Separando variáveis preditoras da variável-alvo (datasets treino e teste)

target = 'churn'

X_train = df_train.drop(target, axis = 1)
y_train = df_train[target]

X_test = df_test.drop(target, axis = 1)
y_test = df_test[target]

In [11]:
X_train.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,...,total_intl_minutes_0,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_calls_0,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4
0,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [12]:
X_test.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,...,total_intl_minutes_1.0,total_intl_minutes_2.0,total_intl_minutes_3.0,total_intl_minutes_4.0,total_intl_minutes_5.0,total_intl_calls_1.0,total_intl_calls_2.0,total_intl_calls_3.0,total_intl_calls_4.0,total_intl_calls_5.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


> Modelos

In [13]:
# Funções auxiliares

def evaluate_classification_model(y_test, y_pred, y_pred_proba):
    """ Avalia modelos de classificação por meio de matriz de confusão, AUC, curva ROC e acurácia."""
    
    # Matriz de confusão
    cm = confusion_matrix(y_test, y_pred)
    
    # Extração de cada valor da Confusion Matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Cálculo da métrica global AUC (Area Under The Curve) com dados reais e previsões em teste
    roc_auc = roc_auc_score(y_test, y_pred)

    # Cálculo da curva ROC com dados e previsões em teste
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    # AUC em teste
    auc_ = auc(fpr, tpr)

    # Acurácia em teste
    accuracy = accuracy_score(y_test, y_pred)
    
    return cm, roc_auc, auc_, accuracy


def feature_importance(model, criteria, cols_list):
    """ Imprime as 10 variáveis mais importantes para o resultado do modelo. """
    indices = np.argsort(-abs(f'model.{criteria}_'))

    print("Top 10 - Variáveis mais importantes para o resultado do modelo:")
    print(50*'-')
    for feature in cols_list[indices][0,:10]:
        print(feature) 

def save_model(model_name, model):
   # Salva o modelo em disco
    with open(f'../models/{model_name}.pkl', 'wb') as pickle_file:
        joblib.dump({model}, f'../models/{model_name}.pkl')
        



### Construção, Treinamento e Avaliação do Modelo 1 com Regressão Logística (Benchmark)

In [14]:
# Checando valores faltantes
print(f'Valores faltantes nos dados de treino: {X_train.isnull().any().sum()}')
print(f'Valores faltantes nos dados de teste: {X_test.isnull().any().sum()}')


Valores faltantes nos dados de treino: 0
Valores faltantes nos dados de teste: 0

Valores faltantes nos dados de teste: 0


In [35]:
X_test.isnull().any().sum()

8

In [15]:
# Treinamento do Modelo

# Define lista de hiperparâmetros
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   'penalty': ['l1', 'l2']}

# Criação de modelo com GridSearch 
# Vários modelos serão criados com diferentes combinações de hiperparâmetros
model_v1 = GridSearchCV(LogisticRegression(), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1) # n_jobs -1 para não impor limites para a execução

# Treinamento do modelo
model_v1.fit(X_train, y_train)

45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.80542829       

In [16]:
# Previsões com dados de teste

# Previsões com dados de teste
y_pred_v1 = model_v1.predict(X_test)

# Obtemos as previsões no formato de probabilidade para cada classe
y_pred_proba_v1 = model_v1.predict_proba(X_test)

# Obtemos as previsões no formato de probabilidade filtrando para a classe positiva
# Precisamos disso para calcula a Curva ROC
y_pred_proba_v1 = model_v1.predict_proba(X_test)[:,1]


Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...



In [17]:
# Chama função para avaliação do modelo
cm_v1, roc_auc_v1, auc_v1, accuracy_v1 = evaluate_classification_model(y_test, y_pred_v1, y_pred_proba_v1)
print(cm_v1, roc_auc_v1, auc_v1, accuracy_v1)

[[1072  371]
 [  75  149]] 0.7040376571626572 0.7686924561924563 0.7324535092981403


In [18]:
# Salva o modelo em disco
save_model('model_v1', model_v1)

In [19]:
# Consolidação da avaliação para comparação dos modelos

# Cria um dataframe para receber as métricas de cada modelo
df_models = pd.DataFrame()

# Dicionário com as métricas do modelo_v1
dict_model_v1 = {'Nome': 'modelo_v1', 
                 'Algoritmo': 'Regressão Logística', 
                 'ROC_AUC Score': roc_auc_v1,
                 'AUC Score': auc_v1,
                 'Acurácia': accuracy_v1}
dict_model_v1 = pd.DataFrame.from_dict(dict_model_v1, orient='index').T

# Adiciona o dict ao dataframe
#df_models = pd.concat(df_models, dict_model_v1)
df_models = dict_model_v1

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.704038,0.768692,0.732454


### Construção, Treinamento e Avaliação do Modelo 2 com Random Forest

In [20]:
# Função para seleção de hiperparâmetros com Random Forest
def random_forest_param_selection(X, y):
    """ Função para seleção de hiperparâmetros com Random Forest. """
    n_estimators = [100, 200, 300, 400, 500]
    min_samples_split= [2, 5, 10]
    min_samples_leaf= [1, 2, 4]    
    param_grid = {'n_estimators':n_estimators, 'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf}
    rand_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter = 15, scoring = 'roc_auc', n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [21]:
# Chama função para seleção de hiperparâmetros com Random Forest
model_v2 = random_forest_param_selection(X_train,  y_train)
model_v2

In [22]:
# Treinamento

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)


In [23]:
# Previsões em teste
y_pred_v2 = model_v2.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v2 = model_v2.predict_proba(X_test)[:,1]

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...



In [24]:
# Avaliação do modelo
cm_v2, roc_auc_v2, auc_v2, accuracy_v2 = evaluate_classification_model(y_test, y_pred_v2, y_pred_proba_v2)
print(cm_v2, roc_auc_v2, auc_v2, accuracy_v2)

[[1297  146]
 [  99  125]] 0.7284288065538066 0.800520988020988 0.8530293941211757


In [25]:
# Feature Importante

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
international_plan
number_vmail_messages
total_day_minutes_2
total_day_minutes_4
voice_mail_plan
total_day_minutes_3
total_intl_calls_0
total_eve_minutes_1
total_day_minutes_1
total_eve_minutes_3


In [26]:
# Salva o modelo em disco
save_model('model_v2', model_v2)

In [27]:
# Dicionário com as métricas do modelo_v2
dict_model_v2 = {'Nome': 'modelo_v2', 
                 'Algoritmo': 'Random Forest', 
                 'ROC_AUC Score': roc_auc_v2,
                 'AUC Score': auc_v2,
                 'Acurácia': accuracy_v2}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v2 = pd.DataFrame.from_dict(dict_model_v2, orient='index').T
df_list = [df_models, dict_model_v2]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.704038,0.768692,0.732454
0,modelo_v2,Random Forest,0.728429,0.800521,0.853029


### Construção, Treinamento e Avaliação do Modelo 3 com KNN

In [28]:
# Função para seleção de hiperparâmetros com KNN
def knn_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com KNN. """
    
    # Lista de possíveis valores de K
    neighbors = list(range(1, 30, 2))

    # Lista para os scores
    cv_scores = []

    # Validação cruzada para determinar o melhor valor de k
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors = k)
        scores = cross_val_score(knn, X_train, y_train, cv = 5, scoring = 'accuracy')
        cv_scores.append(scores.mean())   

    # Ajustando o erro de classificação
    error = [1 - x for x in cv_scores]

    # Determinando o melhor valor de k (com menor erro)
    optimal_k = neighbors[error.index(min(error))]
    return optimal_k


In [29]:
# Seleção de hiperparâmetro
optimal_k = knn_param_selection(X_train, y_train)
print(f'O valor ideal de k é {optimal_k}')

O valor ideal de k é 1


In [30]:
# Treinamento

# Criamos o modelo versão 3
model_v3 = KNeighborsClassifier(n_neighbors = optimal_k)

# Treinamento
model_v3.fit(X_train, y_train)

In [31]:
# Previsões com dados de teste

# Previsões com dados de teste
y_pred_v3 = model_v3.predict(X_test)

# Obtemos as previsões no formato de probabilidade para cada classe
y_pred_proba_v3 = model_v3.predict_proba(X_test)

# Obtemos as previsões no formato de probabilidade filtrando para a classe positiva
# Precisamos disso para calcula a Curva ROC
y_pred_proba_v3 = model_v3.predict_proba(X_test)[:,1]

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...



In [32]:
# Avaliação do modelo
cm_v3, roc_auc_v3, auc_v3, accuracy_v3 = evaluate_classification_model(y_test, y_pred_v3, y_pred_proba_v3)
print(cm_v3, roc_auc_v3, auc_v3, accuracy_v3)

[[1195  248]
 [ 154   70]] 0.5703179140679141 0.5703179140679141 0.7588482303539292


In [33]:
#Obs: Com o algoritmo KNN não extraímos as variáveis mais importantes, pois o conceito do algoritmo é diferente.

In [34]:
# Salva o modelo em disco
save_model('model_v3', model_v3)

In [35]:
# Dicionário com as métricas do modelo_v3
dict_model_v3 = {'Nome': 'modelo_v3', 
                  'Algoritmo': 'KNN', 
                  'ROC_AUC Score': roc_auc_v3,
                  'AUC Score': auc_v3,
                  'Acurácia': accuracy_v3}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v3 = pd.DataFrame.from_dict(dict_model_v3, orient='index').T
df_list = [df_models, dict_model_v3]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.704038,0.768692,0.732454
0,modelo_v2,Random Forest,0.728429,0.800521,0.853029
0,modelo_v3,KNN,0.570318,0.570318,0.758848


### Construção, Treinamento e Avaliação do Modelo 4 com Decision Tree

In [36]:
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [37]:
# Função para seleção de hiperparâmetros com Decision Tree
def decision_tree_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com Decision Tree. """
    n_estimators = [100, 200, 300, 400, 500]
    min_samples_split= [2, 3, 4, 5, 7]
    min_samples_leaf= [1, 2, 3, 4, 6]
    max_depth= [2, 3, 4, 5, 6, 7]
    param_grid = {'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf, 'max_depth':max_depth}
    rand_search = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, n_iter = 15, scoring = 'roc_auc', n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [38]:
# Chama função para seleção de hiperparâmetros com Decision Tree
model_v4 = decision_tree_param_selection(X_train,  y_train)
model_v4

In [39]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o RandomizedSearchCV)
model_v4 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, min_samples_split=7)
model_v4.fit(X_train, y_train)

In [40]:
# Previsões em teste
y_pred_v4 = model_v4.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v4 = model_v4.predict_proba(X_test)[:,1]

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...



In [41]:
# Avaliação do modelo
cm_v4, roc_auc_v4, auc_v4, accuracy_v4 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v4, roc_auc_v4, auc_v4, accuracy_v4)

[[1118  325]
 [  77  147]] 0.7155123873873874 0.7760880729630729 0.7588482303539292


In [42]:
# Feature Importante

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
international_plan
number_vmail_messages
total_day_minutes_2
total_day_minutes_4
voice_mail_plan
total_day_minutes_3
total_intl_calls_0
total_eve_minutes_1
total_day_minutes_1
total_eve_minutes_3


In [43]:
# Salva o modelo em disco
save_model('model_v4', model_v4)

In [44]:
# Dicionário com as métricas do modelo_v4
dict_model_v4 = {'Nome': 'modelo_v4', 
                 'Algoritmo': 'Decision Tree', 
                 'ROC_AUC Score': roc_auc_v4,
                 'AUC Score': auc_v4,
                 'Acurácia': accuracy_v4}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v4 = pd.DataFrame.from_dict(dict_model_v4, orient='index').T
df_list = [df_models, dict_model_v4]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.704038,0.768692,0.732454
0,modelo_v2,Random Forest,0.728429,0.800521,0.853029
0,modelo_v3,KNN,0.570318,0.570318,0.758848
0,modelo_v4,Decision Tree,0.715512,0.776088,0.758848


### Construção, Treinamento e Avaliação do Modelo 5 com SVM

In [45]:
#https://scikit-learn.org/stable/modules/svm.html

In [46]:
# Função para seleção de hiperparâmetros
def svc_param_selection(X_train, y_train, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(SVC(kernel = 'rbf'), param_grid, cv = nfolds)
    grid_search.fit(X_train, y_train)
    grid_search.best_params_
    return grid_search.best_params_

In [47]:
# Aplica a função
svc_param_selection(X_train, y_train, 5)

{'C': 10, 'gamma': 0.1}

In [48]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o GridSearchCV)
model_v5 = SVC(C = 10, gamma = 0.1, probability = True)
model_v5.fit(X_train, y_train)

In [49]:
# Previsões em teste
y_pred_v5 = model_v5.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v5 = model_v5.predict_proba(X_test)[:,1]

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...

Feature names unseen at fit time:
- total_day_calls_1.0
- total_day_calls_2.0
- total_day_calls_3.0
- total_day_calls_4.0
- total_day_calls_5.0
- ...
Feature names seen at fit time, yet now missing:
- total_day_calls_0
- total_day_calls_1
- total_day_calls_2
- total_day_calls_3
- total_day_calls_4
- ...



In [50]:
# Avaliação do modelo
cm_v5, roc_auc_v5, auc_v5, accuracy_v5 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v5, roc_auc_v5, auc_v5, accuracy_v5)

[[1118  325]
 [  77  147]] 0.7155123873873874 0.7760880729630729 0.7588482303539292


In [51]:
# Salva o modelo em disco
save_model('model_v5', model_v5)

In [52]:
# Dicionário com as métricas do modelo_v5
dict_model_v5 = {'Nome': 'modelo_v5', 
                 'Algoritmo': 'SVM', 
                 'ROC_AUC Score': roc_auc_v5,
                 'AUC Score': auc_v5,
                 'Acurácia': accuracy_v5}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v5 = pd.DataFrame.from_dict(dict_model_v5, orient='index').T
df_list = [df_models, dict_model_v5]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.704038,0.768692,0.732454
0,modelo_v2,Random Forest,0.728429,0.800521,0.853029
0,modelo_v3,KNN,0.570318,0.570318,0.758848
0,modelo_v4,Decision Tree,0.715512,0.776088,0.758848
0,modelo_v5,SVM,0.715512,0.776088,0.758848


### Seleção do Melhor Modelo

In [53]:
# Seleção do modelo com maior AUC Score, por se tratar de uma métrica global
# O score AUC é o ideal para comparar modelos de diferentes algoritmos

df_best_model = df_models[df_models['AUC Score'] == df_models['AUC Score'].max()]
df_best_model

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v2,Random Forest,0.728429,0.800521,0.853029


## Previsões com o Melhor Modelo Treinado

In [54]:
# Obtemos o nome do melhor modelo
model = df_best_model.Nome.to_string(index = False)
model

'modelo_v2'

In [55]:
# Carregamos o melhor modelo do disco
best_model = joblib.load('../models/model' + model[-3:] + '.pkl')
best_model

{RandomForestClassifier(min_samples_split=10, n_estimators=500)}

In [173]:
# Recuperação das colunas iniciais
df_original = pd.read_csv('../datasets/projeto4_telecom_treino.csv', index_col = 0, nrows=1)

print(len(df_original.columns))
df_original.columns


20


Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'churn'],
      dtype='object')

In [227]:
# Dados brutos de um novo consumidor (exemplo)
# O número de colunas deve ser o mesmo do que foi usado em treino
new_costumer = ['KS', 114, 'area_code_408', 'yes', 'yes', 32, 244.2, 120, 32.07, 154.4, 82, 22.54, 154.7, 86, 15.01, 12, 5, 3.7, 2]


In [228]:
# Converte o objeto para array
arr_costumer = np.array(new_costumer).reshape(1, -1)

df_new = pd.DataFrame(arr_costumer, columns=df_original.columns[:-1])
display(df_new)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,KS,114,area_code_408,yes,yes,32,244.2,120,32.07,154.4,82,22.54,154.7,86,15.01,12,5,3.7,2


In [229]:
# Lista de colunas categóricas e numéricas // List of categorical and numerical columns
cat_features = ['state',
                'area_code', 
                'international_plan', 
                'voice_mail_plan',
               ]

# Lista de colunas numéricas // List of numerical columns     
num_features = ['account_length', 'number_vmail_messages',
                'total_day_minutes', 'total_day_calls', 'total_day_charge',
                'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
                'total_night_minutes', 'total_night_calls', 'total_night_charge',
                'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
                'number_customer_service_calls',
                ]

binary_features = ['account_length',
                   'international_plan', 
                   'voice_mail_plan',
                   'number_vmail_messages',
                  ]




In [230]:
# Conversão dos tipos de dados das variáveis // Conversion of variables data types

# Categóricas
for feat in cat_features:
    df_new[feat] = df_new[feat].astype('category')

# Numéricas
for feat in num_features:
    df_new[feat] = df_new[feat].astype('float64')



In [231]:
# Aplicação das mesmas transformações efetuadas no dataset de treino nos novos dados
# Application of the same transformations performed on the train dataset in the new data

#arr_costumer = [] # completar
df_new = df_new.drop(columns = ['number_customer_service_calls'])
df_new = df_new.drop(columns = ['total_day_charge','total_eve_charge','total_night_charge','total_intl_charge'])
df_new['account_length'] = np.where(df_new['account_length'] >= 100.86, 1, 0)
df_new['number_vmail_messages'] = np.where(df_new['number_vmail_messages'] >= 7.97, 1, 0)
df_new['area_code'] = df_new['area_code'].apply(lambda x: x[-3:])

# Formatação das demais variáveis binárias
df_new['international_plan'] = df_new['international_plan'].apply(lambda x: 1 if x == 'yes' else 0)
df_new['voice_mail_plan'] = df_new['voice_mail_plan'].apply(lambda x: 1 if x == 'yes' else 0)

In [232]:
df_new

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls
0,KS,1,408,1,1,1,244.2,120.0,154.4,82.0,154.7,86.0,12.0,5.0


In [233]:

# Discretização de variáveis numéricas
cols_discret =  [col for col in df_new.columns if col.startswith('total')]
for col in cols_discret:
    df_new[col] = pd.cut(df_new[col], bins=dic_cut[col], include_lowest=True, labels=['1','2','3','4','5']) 

In [234]:
df_new

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls
0,KS,1,408,1,1,1,4,4,2,2,2,2,4,2


In [235]:
onehot_features

['state',
 'area_code',
 'total_day_minutes',
 'total_day_calls',
 'total_eve_minutes',
 'total_eve_calls',
 'total_night_minutes',
 'total_night_calls',
 'total_intl_minutes',
 'total_intl_calls']

In [236]:
# Aplicação de one hot encoding em variáveis categóricas
onehot_features = ['state', 'area_code']
cols_tot =  [col for col in df_new.columns if col.startswith('total')]
onehot_features.extend(cols_tot)

for col in onehot_features:
    onehots = pd.get_dummies(df_new[col], prefix = col)
    df_new = df_new.join(onehots)
    
# Remoção das colunas originais
df_new = df_new.drop(columns = onehot_features)


# Padronização dos novos dados // New data standardization
# Devem ser usados a média e o desvio de treino para padronizar o conjunto de novos dados

#train_mean = df_train_stats[df_train_stats.index == 'mean']
#train_std = df_train_stats[df_train_stats.index == 'std']

#cols_padr =  [col for col in df_new.select_dtypes(include=['int','float64']).columns if col.startswith('total')]
#for col in cols_padr:
#    df_new[col] = (df_new[col] - train_mean[col].values) / train_std[col].values
    
    
# Resultado
display(df_new.head())


Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_KS,area_code_408,total_day_minutes_1,total_day_minutes_2,total_day_minutes_3,total_day_minutes_4,...,total_intl_minutes_1,total_intl_minutes_2,total_intl_minutes_3,total_intl_minutes_4,total_intl_minutes_5,total_intl_calls_1,total_intl_calls_2,total_intl_calls_3,total_intl_calls_4,total_intl_calls_5
0,1,1,1,1,1,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [237]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   account_length         1 non-null      int32   
 1   international_plan     1 non-null      category
 2   voice_mail_plan        1 non-null      category
 3   number_vmail_messages  1 non-null      int32   
 4   state_KS               1 non-null      uint8   
 5   area_code_408          1 non-null      uint8   
 6   total_day_minutes_1    1 non-null      uint8   
 7   total_day_minutes_2    1 non-null      uint8   
 8   total_day_minutes_3    1 non-null      uint8   
 9   total_day_minutes_4    1 non-null      uint8   
 10  total_day_minutes_5    1 non-null      uint8   
 11  total_day_calls_1      1 non-null      uint8   
 12  total_day_calls_2      1 non-null      uint8   
 13  total_day_calls_3      1 non-null      uint8   
 14  total_day_calls_4      1 non-null      uint8  

In [238]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5454 entries, 0 to 5453
Data columns (total 99 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   account_length         5454 non-null   int64
 1   international_plan     5454 non-null   int64
 2   voice_mail_plan        5454 non-null   int64
 3   number_vmail_messages  5454 non-null   int64
 4   churn                  5454 non-null   int64
 5   state_AK               5454 non-null   int64
 6   state_AL               5454 non-null   int64
 7   state_AR               5454 non-null   int64
 8   state_AZ               5454 non-null   int64
 9   state_CA               5454 non-null   int64
 10  state_CO               5454 non-null   int64
 11  state_CT               5454 non-null   int64
 12  state_DC               5454 non-null   int64
 13  state_DE               5454 non-null   int64
 14  state_FL               5454 non-null   int64
 15  state_GA               5454 non-null  

In [239]:
cols_train = df_train.columns
cols_new = df_new.columns
print(len(cols_train))
print(len(cols_new))

99
46


In [240]:
cols_train_set = set(cols_train)
cols_new_set = set(cols_new)
cols_ = cols_train_set - cols_new_set
print(len(cols_))

61


In [241]:
cols_ # VERIFICAR ERRO NAS VARIÁVEIS TOTAL_XXX

{'area_code_415',
 'area_code_510',
 'churn',
 'state_AK',
 'state_AL',
 'state_AR',
 'state_AZ',
 'state_CA',
 'state_CO',
 'state_CT',
 'state_DC',
 'state_DE',
 'state_FL',
 'state_GA',
 'state_HI',
 'state_IA',
 'state_ID',
 'state_IL',
 'state_IN',
 'state_KY',
 'state_LA',
 'state_MA',
 'state_MD',
 'state_ME',
 'state_MI',
 'state_MN',
 'state_MO',
 'state_MS',
 'state_MT',
 'state_NC',
 'state_ND',
 'state_NE',
 'state_NH',
 'state_NJ',
 'state_NM',
 'state_NV',
 'state_NY',
 'state_OH',
 'state_OK',
 'state_OR',
 'state_PA',
 'state_RI',
 'state_SC',
 'state_SD',
 'state_TN',
 'state_TX',
 'state_UT',
 'state_VA',
 'state_VT',
 'state_WA',
 'state_WI',
 'state_WV',
 'state_WY',
 'total_day_calls_0',
 'total_day_minutes_0',
 'total_eve_calls_0',
 'total_eve_minutes_0',
 'total_intl_calls_0',
 'total_intl_minutes_0',
 'total_night_calls_0',
 'total_night_minutes_0'}

In [242]:
# Completando as colunas não informadas com 0
cols_new = [col for col in df_train.columns if col not in df_new.columns]
for col in cols_new:
    df_new[col] = 0

In [243]:
print(f'Número de variáveis dos dados novos: {df_new.shape[1]}')
print(f'Número de variáveis dos dados de treino: {df_train.shape[1]}, sendo uma a target')

Número de variáveis dos dados novos: 107
Número de variáveis dos dados de treino: 99, sendo uma a target


In [244]:
# Previsões de classe => ERRO!!!
#pred_new_costumer = melhor_modelo.predict(arr_costumer.reshape(1, -1))
pred_new = model_v2.predict(df_new)

# Verifica o valor e imprime o resultado final
if pred_new_costumer == 1:
    print('Churn costumer positive!')
else:
    print('Churn costumer negative!')

Feature names unseen at fit time:
- churn
- total_day_calls_5
- total_day_minutes_5
- total_eve_calls_5
- total_eve_minutes_5
- ...



ValueError: X has 107 features, but RandomForestClassifier is expecting 98 features as input.