---
---
# Previsão de Customer Churn em Operadoras de Telecom 
---
---

## Treinamento do Modelo // _Model Training_

In [1]:
# Versão da Linguagem Python // Python language version
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.12


In [2]:
# Imports
# Imports
import joblib
import pickle
import json
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import sklearn
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
#import warnings
#warnings.filterwarnings("ignore")

In [3]:
# Versões dos pacotes usados neste jupyter notebook // Versions of packages used in this jupyter notebook
#!pip install -q -U watermark
%reload_ext watermark
%watermark -a "Tatiana Novaes Carvalho" --iversions

Author: Tatiana Novaes Carvalho

numpy     : 1.22.3
seaborn   : 0.11.2
pandas    : 1.4.2
sklearn   : 1.1.2
matplotlib: 3.5.1
joblib    : 1.1.0
json      : 2.0.9



### Carga dos dados // Data load

In [4]:
# Carrega os dados // Load the data
df_train = pd.read_csv('../datasets/df_train_balanced.csv',  index_col = 0)
df_test = pd.read_csv('../datasets/df_test_proc.csv',  index_col = 0)
df_train_stats = pd.read_csv('../datasets/df_train_stats.csv',  index_col = 0)


In [5]:
print(df_train.shape)
print(df_test.shape)

(5454, 67)
(1667, 67)


In [6]:
df_train.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
0,1,0,1,1,1.192061,0.452991,-0.168486,-0.073857,0.859001,-0.466632,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1,1,-0.498612,1.129743,-0.206217,0.142255,1.062986,0.170877,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0.837591,0.661222,-1.681685,0.520451,-0.867513,0.224003,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,1.752351,-1.577264,-2.859279,-0.668165,-0.146205,-0.572884,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,-0.415303,0.609165,-1.143526,1.168787,-0.356499,1.127142,...,0,0,0,0,0,0,0,0,1,0


In [7]:
df_test.head(5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
1,1,0,0,0,70.9,123.0,211.9,73.0,236.0,73.0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,223.6,86.0,244.8,139.0,94.2,81.0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,1,1,294.7,95.0,237.3,105.0,300.3,127.0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,216.8,123.0,126.4,88.0,220.6,82.0,...,0,0,0,0,0,0,0,0,1,0
5,1,0,0,0,197.4,78.0,124.0,101.0,204.5,107.0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
df_train_stats

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,total_intl_minutes,total_intl_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
count,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,...,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0,5454.0
mean,0.404107,0.172901,-1.645117e-14,-1.405224e-15,-9.184622e-15,5.812429e-15,-5.682794e-15,1.760367e-15,2.437931e-15,7.639592e-15,...,0.018519,0.015218,0.021819,0.027869,0.018519,0.023652,0.020535,0.250458,0.513018,0.236524
std,0.490763,0.378196,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.134829,0.122431,0.146105,0.164614,0.134829,0.151978,0.141836,0.433317,0.499876,0.424986
min,0.0,0.0,-2.850852,-3.086941,-3.111479,-3.153453,-3.233321,-3.069797,-3.39358,-1.604942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-0.751396,-0.692281,-0.6921529,-0.668165,-0.6949127,-0.7319161,-0.6473041,-0.6021389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,-0.0493994,0.0365285,0.01588371,0.004233631,-0.003204684,0.01149999,0.003712177,-0.1007371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,0.0,0.7689841,0.7132801,0.6993194,0.6825352,0.7212583,0.7021355,0.6792994,0.4006646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,2.373081,3.055882,2.893656,3.167824,3.125971,3.092797,3.154971,3.409075,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Separando variáveis preditoras da variável-alvo (datasets treino e teste)

target = 'churn'

X_train = df_train.drop(target, axis = 1)
y_train = df_train[target]

X_test = df_test.drop(target, axis = 1)
y_test = df_test[target]

In [10]:
X_train.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
0,1,0,1,1,1.192061,0.452991,-0.168486,-0.073857,0.859001,-0.466632,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1,1,-0.498612,1.129743,-0.206217,0.142255,1.062986,0.170877,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0.837591,0.661222,-1.681685,0.520451,-0.867513,0.224003,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,1.752351,-1.577264,-2.859279,-0.668165,-0.146205,-0.572884,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,-0.415303,0.609165,-1.143526,1.168787,-0.356499,1.127142,...,0,0,0,0,0,0,0,0,1,0


In [11]:
X_test.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_eve_minutes,total_eve_calls,total_night_minutes,total_night_calls,...,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,area_code_408,area_code_415,area_code_510
1,1,0,0,0,70.9,123.0,211.9,73.0,236.0,73.0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,223.6,86.0,244.8,139.0,94.2,81.0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,1,1,294.7,95.0,237.3,105.0,300.3,127.0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,216.8,123.0,126.4,88.0,220.6,82.0,...,0,0,0,0,0,0,0,0,1,0
5,1,0,0,0,197.4,78.0,124.0,101.0,204.5,107.0,...,0,0,0,0,0,0,0,0,1,0


> Modelos

In [12]:
# Funções auxiliares

def evaluate_classification_model(y_test, y_pred, y_pred_proba):
    """ Avalia modelos de classificação por meio de matriz de confusão, AUC, curva ROC e acurácia."""
    
    # Matriz de confusão
    cm = confusion_matrix(y_test, y_pred)
    
    # Extração de cada valor da Confusion Matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Cálculo da métrica global AUC (Area Under The Curve) com dados reais e previsões em teste
    roc_auc = roc_auc_score(y_test, y_pred)

    # Cálculo da curva ROC com dados e previsões em teste
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    # AUC em teste
    auc_ = auc(fpr, tpr)

    # Acurácia em teste
    accuracy = accuracy_score(y_test, y_pred)
    
    return cm, roc_auc, auc_, accuracy


def feature_importance(model, criteria, cols_list):
    """ Imprime as 10 variáveis mais importantes para o resultado do modelo. """
    indices = np.argsort(-abs(f'model.{criteria}_'[0,:10]))

    print("Top 10 - Variáveis mais importantes para o resultado do modelo:")
    print(50*'-')
    for feature in cols_list[indices]:
        print(feature) 

def save_model(model_name, model):
   # Salva o modelo em disco
    with open(f'../models/{model_name}.pkl', 'wb') as pickle_file:
        joblib.dump({model}, f'../models/{model_name}.pkl')
        



  indices = np.argsort(-abs(f'model.{criteria}_'[0,:10]))


### Construção, Treinamento e Avaliação do Modelo 1 com Regressão Logística (Benchmark)

In [13]:
# Treinamento do Modelo

# Define lista de hiperparâmetros
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 
                   'penalty': ['l1', 'l2']}

# Criação de modelo com GridSearch 
# Vários modelos serão criados com diferentes combinações de hiperparâmetros
model_v1 = GridSearchCV(LogisticRegression(), 
                         tuned_params_v1, 
                         scoring = 'roc_auc', 
                         n_jobs = -1) # n_jobs -1 para não impor limites para a execução

# Treinamento do modelo
model_v1.fit(X_train, y_train)

45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.83349287       

In [14]:
# Previsões com dados de teste

# Previsões com dados de teste
y_pred_v1 = model_v1.predict(X_test)

# Obtemos as previsões no formato de probabilidade para cada classe
y_pred_proba_v1 = model_v1.predict_proba(X_test)

# Obtemos as previsões no formato de probabilidade filtrando para a classe positiva
# Precisamos disso para calcula a Curva ROC
y_pred_proba_v1 = model_v1.predict_proba(X_test)[:,1]


In [15]:
# Chama função para avaliação do modelo
cm_v1, roc_auc_v1, auc_v1, accuracy_v1 = evaluate_classification_model(y_test, y_pred_v1, y_pred_proba_v1)
print(cm_v1, roc_auc_v1, auc_v1, accuracy_v1)

[[   0 1443]
 [   0  224]] 0.5 0.5 0.13437312537492502


In [16]:
# Salva o modelo em disco
save_model('model_v1', model_v1)

In [17]:
# Consolidação da avaliação para comparação dos modelos

# Cria um dataframe para receber as métricas de cada modelo
df_models = pd.DataFrame()

# Dicionário com as métricas do modelo_v1
dict_model_v1 = {'Nome': 'modelo_v1', 
                 'Algoritmo': 'Regressão Logística', 
                 'ROC_AUC Score': roc_auc_v1,
                 'AUC Score': auc_v1,
                 'Acurácia': accuracy_v1}
dict_model_v1 = pd.DataFrame.from_dict(dict_model_v1, orient='index').T

# Adiciona o dict ao dataframe
#df_models = pd.concat(df_models, dict_model_v1)
df_models = dict_model_v1

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.5,0.5,0.134373


### Construção, Treinamento e Avaliação do Modelo 2 com Random Forest

In [18]:
# Função para seleção de hiperparâmetros com Random Forest
def random_forest_param_selection(X, y):
    """ Função para seleção de hiperparâmetros com Random Forest. """
    n_estimators = [100, 200, 300, 400, 500]
    min_samples_split= [2, 5, 10]
    min_samples_leaf= [1, 2, 4]    
    param_grid = {'n_estimators':n_estimators, 'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf}
    rand_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter = 15, scoring = 'roc_auc', n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [19]:
# Chama função para seleção de hiperparâmetros com Random Forest
model_v2 = random_forest_param_selection(X_train,  y_train)
model_v2

In [20]:
# Treinamento

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)


In [21]:
# Previsões em teste
y_pred_v2 = model_v2.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v2 = model_v2.predict_proba(X_test)[:,1]

In [22]:
# Avaliação do modelo
cm_v2, roc_auc_v2, auc_v2, accuracy_v2 = evaluate_classification_model(y_test, y_pred_v2, y_pred_proba_v2)
print(cm_v2, roc_auc_v2, auc_v2, accuracy_v2)

[[ 388 1055]
 [  19  205]] 0.5920314201564201 0.6516588704088704 0.3557288542291542


In [23]:
# Feature Importante

# Construindo o modelo novamente com os melhores hiperparâmetros
# Isso é necessário pois a versão final não deve ter o GridSearchCV
model_v2 = RandomForestClassifier(min_samples_split=10, n_estimators=500)
model_v2.fit(X_train, y_train)

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
total_day_minutes
total_intl_calls
number_vmail_messages
total_eve_minutes
international_plan
total_night_minutes
total_intl_minutes
total_day_calls
total_eve_calls
total_night_calls


In [24]:
# Salva o modelo em disco
save_model('model_v2', model_v2)

In [25]:
# Dicionário com as métricas do modelo_v2
dict_model_v2 = {'Nome': 'modelo_v2', 
                 'Algoritmo': 'Random Forest', 
                 'ROC_AUC Score': roc_auc_v2,
                 'AUC Score': auc_v2,
                 'Acurácia': accuracy_v2}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v2 = pd.DataFrame.from_dict(dict_model_v2, orient='index').T
df_list = [df_models, dict_model_v2]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.5,0.5,0.134373
0,modelo_v2,Random Forest,0.592031,0.651659,0.355729


### Construção, Treinamento e Avaliação do Modelo 3 com KNN

In [30]:
# Função para seleção de hiperparâmetros com KNN
def knn_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com KNN. """
    
    # Lista de possíveis valores de K
    neighbors = list(range(1, 30, 2))

    # Lista para os scores
    cv_scores = []

    # Validação cruzada para determinar o melhor valor de k
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors = k)
        scores = cross_val_score(knn, X_train, y_train, cv = 5, scoring = 'accuracy')
        cv_scores.append(scores.mean())   

    # Ajustando o erro de classificação
    error = [1 - x for x in cv_scores]

    # Determinando o melhor valor de k (com menor erro)
    optimal_k = neighbors[error.index(min(error))]
    return optimal_k


In [31]:
# Seleção de hiperparâmetro
optimal_k = knn_param_selection(X_train, y_train)
print(f'O valor ideal de k é {optimal_k}')

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

O valor ideal de k é 1


Traceback (most recent call last):
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "C:\Users\tatia\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  Fi

In [32]:
# Treinamento

# Criamos o modelo versão 3
model_v3 = KNeighborsClassifier(n_neighbors = optimal_k)

# Treinamento
model_v3.fit(X_train, y_train)

In [33]:
# Previsões com dados de teste

# Previsões com dados de teste
y_pred_v3 = model_v3.predict(X_test)

# Obtemos as previsões no formato de probabilidade para cada classe
y_pred_proba_v3 = model_v3.predict_proba(X_test)

# Obtemos as previsões no formato de probabilidade filtrando para a classe positiva
# Precisamos disso para calcula a Curva ROC
y_pred_proba_v3 = model_v3.predict_proba(X_test)[:,1]

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# Avaliação do modelo
cm_v3, roc_auc_v3, auc_v3, accuracy_v3 = evaluate_classification_model(y_test, y_pred_v3, y_pred_proba_v3)
print(cm_v3, roc_auc_v3, auc_v3, accuracy_v3)

In [None]:
#Obs: Com o algoritmo KNN não extraímos as variáveis mais importantes, pois o conceito do algoritmo é diferente.

In [None]:
# Salva o modelo em disco
save_model('model_v3', model_v3)

In [None]:
# Dicionário com as métricas do modelo_v3
dict_model_v3 = {'Nome': 'modelo_v3', 
                  'Algoritmo': 'KNN', 
                  'ROC_AUC Score': roc_auc_v3,
                  'AUC Score': auc_v3,
                  'Acurácia': accuracy_v3}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v3 = pd.DataFrame.from_dict(dict_model_v3, orient='index').T
df_list = [df_models, dict_model_v3]
df_models = pd.concat(df_list)

display(df_models)

### Construção, Treinamento e Avaliação do Modelo 4 com Decision Tree

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [34]:
# Função para seleção de hiperparâmetros com Decision Tree
def decision_tree_param_selection(X_train, y_train):
    """ Função para seleção de hiperparâmetros com Decision Tree. """
    n_estimators = [100, 200, 300, 400, 500]
    min_samples_split= [2, 3, 4, 5, 7]
    min_samples_leaf= [1, 2, 3, 4, 6]
    max_depth= [2, 3, 4, 5, 6, 7]
    param_grid = {'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf, 'max_depth':max_depth}
    rand_search = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, n_iter = 15, scoring = 'roc_auc', n_jobs  = -1)
    rand_search.fit(X_train, y_train)
    rand_search.best_estimator_
    return rand_search.best_estimator_

In [35]:
# Chama função para seleção de hiperparâmetros com Decision Tree
model_v4 = decision_tree_param_selection(X_train,  y_train)
model_v4

In [36]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o RandomizedSearchCV)
model_v4 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, min_samples_split=7)
model_v4.fit(X_train, y_train)

In [37]:
# Previsões em teste
y_pred_v4 = model_v4.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v4 = model_v4.predict_proba(X_test)[:,1]

In [38]:
# Avaliação do modelo
cm_v4, roc_auc_v4, auc_v4, accuracy_v4 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v4, roc_auc_v4, auc_v4, accuracy_v4)

[[ 348 1095]
 [  12  212]] 0.5937964062964063 0.6015524453024453 0.3359328134373125


In [39]:
# Feature Importante

# Variáveis mais relevantes
indices = np.argsort(-model_v2.feature_importances_)
print("Variáveis mais importantes para o resultado do modelo_v2:")
print(50*'-')
for feature in X_train.columns[indices][:10]:
    print(feature)

Variáveis mais importantes para o resultado do modelo_v2:
--------------------------------------------------
total_day_minutes
total_intl_calls
number_vmail_messages
total_eve_minutes
international_plan
total_night_minutes
total_intl_minutes
total_day_calls
total_eve_calls
total_night_calls


In [40]:
# Salva o modelo em disco
save_model('model_v4', model_v4)

In [41]:
# Dicionário com as métricas do modelo_v4
dict_model_v4 = {'Nome': 'modelo_v4', 
                 'Algoritmo': 'Decision Tree', 
                 'ROC_AUC Score': roc_auc_v4,
                 'AUC Score': auc_v4,
                 'Acurácia': accuracy_v4}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v4 = pd.DataFrame.from_dict(dict_model_v4, orient='index').T
df_list = [df_models, dict_model_v4]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.5,0.5,0.134373
0,modelo_v2,Random Forest,0.592031,0.651659,0.355729
0,modelo_v4,Decision Tree,0.593796,0.601552,0.335933


### Construção, Treinamento e Avaliação do Modelo 5 com SVM

In [42]:
#https://scikit-learn.org/stable/modules/svm.html

In [43]:
# Função para seleção de hiperparâmetros
def svc_param_selection(X_train, y_train, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(SVC(kernel = 'rbf'), param_grid, cv = nfolds)
    grid_search.fit(X_train, y_train)
    grid_search.best_params_
    return grid_search.best_params_

In [44]:
# Aplica a função
svc_param_selection(X_train, y_train, 5)

{'C': 10, 'gamma': 0.1}

In [45]:
# Treinamento

# Criação do modelo com os melhores hiperparâmetros (versão final não deve ter o GridSearchCV)
model_v5 = SVC(C = 10, gamma = 0.1, probability = True)
model_v5.fit(X_train, y_train)

In [46]:
# Previsões em teste
y_pred_v5 = model_v5.predict(X_test)

# Obtém as previsões para a classe positiva
y_pred_proba_v5 = model_v5.predict_proba(X_test)[:,1]

In [47]:
# Avaliação do modelo
cm_v5, roc_auc_v5, auc_v5, accuracy_v5 = evaluate_classification_model(y_test, y_pred_v4, y_pred_proba_v4)
print(cm_v5, roc_auc_v5, auc_v5, accuracy_v5)

[[ 348 1095]
 [  12  212]] 0.5937964062964063 0.6015524453024453 0.3359328134373125


In [48]:
# Salva o modelo em disco
save_model('model_v5', model_v5)

In [49]:
# Dicionário com as métricas do modelo_v5
dict_model_v5 = {'Nome': 'modelo_v5', 
                 'Algoritmo': 'SVM', 
                 'ROC_AUC Score': roc_auc_v5,
                 'AUC Score': auc_v5,
                 'Acurácia': accuracy_v5}

# Adiciona o dict ao dataframe de consolidação das métricas
dict_model_v5 = pd.DataFrame.from_dict(dict_model_v5, orient='index').T
df_list = [df_models, dict_model_v5]
df_models = pd.concat(df_list)

display(df_models)

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v1,Regressão Logística,0.5,0.5,0.134373
0,modelo_v2,Random Forest,0.592031,0.651659,0.355729
0,modelo_v4,Decision Tree,0.593796,0.601552,0.335933
0,modelo_v5,SVM,0.593796,0.601552,0.335933


### Seleção do Melhor Modelo

In [50]:
# Seleção do modelo com maior AUC Score, por se tratar de uma métrica global
# O score AUC é o ideal para comparar modelos de diferentes algoritmos

df_best_model = df_models[df_models['AUC Score'] == df_models['AUC Score'].max()]
df_best_model

Unnamed: 0,Nome,Algoritmo,ROC_AUC Score,AUC Score,Acurácia
0,modelo_v2,Random Forest,0.592031,0.651659,0.355729


## Previsões com o Melhor Modelo Treinado

In [51]:
# Obtemos o nome do melhor modelo
model = df_best_model.Nome.to_string(index = False)
model

'modelo_v2'

In [62]:
# Carregamos o melhor modelo do disco
best_model = joblib.load('../models/model' + model[-3:] + '.pkl')
best_model

{RandomForestClassifier(min_samples_split=10, n_estimators=500)}

In [54]:
# Recuperação das colunas iniciais
df_original = pd.read_csv('../datasets/projeto4_telecom_treino.csv', index_col = 0, nrows=1)

print(len(df_original.columns))
df_original.columns


20


Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 'churn'],
      dtype='object')

In [55]:
# Dados brutos de um novo consumidor (exemplo)
# O número de colunas deve ser o mesmo do que foi usado em treino
new_costumer = ['KS', 114, 'area_code_408', 'yes', 'yes', 32, 244.2, 120, 32.07, 154.4, 82, 22.54, 154.7, 86, 15.01, 12, 5, 3.7, 2]


In [56]:
# Converte o objeto para array
arr_costumer = np.array(new_costumer).reshape(1, -1)

df_new = pd.DataFrame(arr_costumer, columns=df_original.columns[:-1])
display(df_new)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,KS,114,area_code_408,yes,yes,32,244.2,120,32.07,154.4,82,22.54,154.7,86,15.01,12,5,3.7,2


In [57]:
# Lista de colunas categóricas // List of categorical columns
cat_features = ['state',
                'area_code', 
                'international_plan', 
                'voice_mail_plan',
                ]

# Lista de colunas numéricas // List of numerical columns     
num_features = ['account_length', 'number_vmail_messages',
                'total_day_minutes', 'total_day_calls', 'total_day_charge',
                'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
                'total_night_minutes', 'total_night_calls', 'total_night_charge',
                'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
                'number_customer_service_calls',
                ]


# Conversão dos tipos de dados das variáveis // Conversion of variables data types

# Categóricas
for feat in cat_features:
    df_new[feat] = df_new[feat].astype('category')

# Numéricas
for feat in num_features:
    df_new[feat] = df_new[feat].astype('float64')



In [58]:
# Lista de colunas categóricas // List of categorical columns
binary_features = ['account_length',
                   'international_plan', 
                   'voice_mail_plan',
                   'number_vmail_messages',
                  ]

# Lista de colunas numéricas // List of numerical columns     
onehot_features = ['state', 'area_code',
                       'total_day_minutes', 'total_day_calls',
                       'total_eve_minutes', 'total_eve_calls',
                       'total_night_minutes', 'total_night_calls',
                       'total_intl_minutes', 'total_intl_calls', 
                      ]


In [59]:
# Aplicação das mesmas transformações efetuadas no dataset de treino nos novos dados
# Application of the same transformations performed on the train dataset in the new data

#arr_costumer = [] # completar
df_new = df_new.drop(columns = ['number_customer_service_calls'])
df_new = df_new.drop(columns = ['total_day_charge','total_eve_charge','total_night_charge','total_intl_charge'])
df_new['account_length'] = np.where(df_new['account_length'] >= 100.86, 1, 0)
df_new['number_vmail_messages'] = np.where(df_new['number_vmail_messages'] >= 7.97, 1, 0)
df_new['area_code'] = df_new['area_code'].apply(lambda x: x[-3:])

# Formatação das demais variáveis binárias
df_new['international_plan'] = df_new['international_plan'].apply(lambda x: 1 if x == 'yes' else 0)
df_new['voice_mail_plan'] = df_new['voice_mail_plan'].apply(lambda x: 1 if x == 'yes' else 0)

# Aplicação de one hot encoding em variáveis categóricas nominais binárias
for col in onehot_features:

    # Aplicação do objeto encoder treinado (datasets de treino, de teste e novos dados)
    # (fazemos isso em dados de treino e teste e também em novos dados usados no modelo)
    onehots = pd.get_dummies(df_new[col], prefix = col)
    df_new = df_new.join(onehots)
    
# Remoção das colunas originais
df_new = df_new.drop(columns = onehot_features)


# Padronização dos novos dados // New data standardization
# Devem ser usados a média e o desvio de treino para padronizar o conjunto de novos dados

train_mean = df_train_stats[df_train_stats.index == 'mean']
train_std = df_train_stats[df_train_stats.index == 'std']

cols_padr =  [col for col in df_new.select_dtypes(include=['int','float64']).columns if col.startswith('total')]
for col in cols_padr:
    df_new[col] = (df_new[col] - train_mean[col].values) / train_std[col].values
    
    
# Resultado
display(df_new.head())


Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,state_KS,area_code_408,total_day_minutes_244.2,total_day_calls_120.0,total_eve_minutes_154.4,total_eve_calls_82.0,total_night_minutes_154.7,total_night_calls_86.0,total_intl_minutes_12.0,total_intl_calls_5.0
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [60]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   account_length             1 non-null      int32   
 1   international_plan         1 non-null      category
 2   voice_mail_plan            1 non-null      category
 3   number_vmail_messages      1 non-null      int32   
 4   state_KS                   1 non-null      uint8   
 5   area_code_408              1 non-null      uint8   
 6   total_day_minutes_244.2    1 non-null      uint8   
 7   total_day_calls_120.0      1 non-null      uint8   
 8   total_eve_minutes_154.4    1 non-null      uint8   
 9   total_eve_calls_82.0       1 non-null      uint8   
 10  total_night_minutes_154.7  1 non-null      uint8   
 11  total_night_calls_86.0     1 non-null      uint8   
 12  total_intl_minutes_12.0    1 non-null      uint8   
 13  total_intl_calls_5.0       1 non-null  

In [None]:
# Previsões de classe => ERRO!!!
#pred_new_costumer = melhor_modelo.predict(arr_costumer.reshape(1, -1))
pred_new = model_v2.predict(df_new)

# Verifica o valor e imprime o resultado final
if pred_new_costumer == 1:
    print('Churn costumer positive!')
else:
    print('Churn costumer negative!')