In [1]:
import sklearn.model_selection

In [2]:
from hypertuner import SKLearnModelSelection
from report import Report
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [3]:
# Load the raw dataset from the working directory; its 'Churn' column is used
# as the prediction target further down.
data = pd.read_csv('dataset.csv')
# Preview the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,Idade,UsoMensal,Plano,SatisfacaoCliente,TempoContrato,ValorMensal,Churn
0,56,52,Premium,1,Curto,75.48,0
1,69,65,Basico,4,Curto,79.25,0
2,46,76,Standard,3,Longo,183.56,0
3,32,42,Basico,2,Longo,162.5,0
4,60,74,Standard,2,Longo,186.23,1


In [4]:
def split_dataset(data: pd.DataFrame, target_col: str, test_size=0.2, random_state=None, stratify: bool = False) -> tuple:
    """Split a DataFrame into train/test feature frames and target vectors.

    Parameters
    ----------
    data : pd.DataFrame
        Full dataset holding both the feature columns and the target column.
    target_col : str
        Name of the column to predict; it is removed from the features.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default None
        Forwarded unchanged to ``train_test_split``; set it for a
        reproducible split.
    stratify : bool, default False
        When True, the target column is passed as ``stratify`` so that both
        partitions keep the class proportions of ``target_col``. The default
        keeps the previous, non-stratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    # Fail early with a readable message (same exception type as DataFrame.drop).
    if target_col not in data.columns:
        raise KeyError(f"target column {target_col!r} not found in data")

    X = data.drop(columns=target_col)
    y = data[target_col]
    # train_test_split returns a list; convert it so the result matches the
    # declared ``tuple`` return type (unpacking and indexing work as before).
    return tuple(
        train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=y if stratify else None,
        )
    )

In [5]:
# 70/30 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = split_dataset(
    data=data,
    target_col='Churn',
    test_size=0.3,
    random_state=42,
)
# Show the shape of every partition to confirm the split sizes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((700, 6), (300, 6), (700,), (300,))

In [6]:
# Object-dtype (text) feature columns are the ones that get one-hot encoded.
categorical_cols = X_train.select_dtypes(include=['object']).columns
print(categorical_cols)
# handle_unknown='ignore': a category that shows up only in the test split (or
# in future data) is encoded as all zeros instead of making transform() raise.
# The encoder is fitted on the training split only.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_cols])

Index(['Plano', 'TempoContrato'], dtype='object')


In [7]:
# One-hot encode both splits with the already-fitted encoder and label the
# resulting columns with the encoder's generated feature names.
# The new frames get a fresh 0..n-1 index because they are built from arrays.
X_train_cat = pd.DataFrame(
    encoder.transform(X_train[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
)
X_test_cat = pd.DataFrame(
    encoder.transform(X_test[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
)

In [8]:
# Put the non-categorical columns next to the one-hot columns. The index of the
# split is reset first so both pieces share the same 0..n-1 row labels and
# line up row by row.
X_train_preprocessed_cat = pd.concat(
    [
        X_train.drop(columns=categorical_cols).reset_index(drop=True),
        X_train_cat,
    ],
    axis="columns",
)
X_test_preprocessed_cat = pd.concat(
    [
        X_test.drop(columns=categorical_cols).reset_index(drop=True),
        X_test_cat,
    ],
    axis="columns",
)

In [9]:
# Preview the training frame after one-hot encoding (numeric columns are still unscaled here).
X_train_preprocessed_cat.head()

Unnamed: 0,Idade,UsoMensal,SatisfacaoCliente,ValorMensal,Plano_Basico,Plano_Premium,Plano_Standard,TempoContrato_Curto,TempoContrato_Longo,TempoContrato_Medio
0,65,80,4,174.1,0.0,0.0,1.0,0.0,0.0,1.0
1,49,18,3,101.59,1.0,0.0,0.0,0.0,1.0,0.0
2,19,91,4,87.93,1.0,0.0,0.0,0.0,0.0,1.0
3,52,0,1,90.74,0.0,0.0,1.0,0.0,1.0,0.0
4,62,60,1,134.59,1.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Integer and float feature columns of the raw training split are the ones to scale.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
print(numeric_cols)
scaler = StandardScaler()
# Fit on the untouched training split rather than on X_train_preprocessed_cat:
# that frame is overwritten with scaled values in a later cell, so re-running
# this cell afterwards would otherwise fit the scaler on already-scaled data.
# On a fresh run the values are identical (same rows, same columns).
scaler.fit(X_train[numeric_cols])

Index(['Idade', 'UsoMensal', 'SatisfacaoCliente', 'ValorMensal'], dtype='object')


In [11]:
# Scale the numeric columns of each split with the scaler fitted on the training data.
# The values are read from X_train / X_test instead of the *_preprocessed_cat
# frames, because a later cell overwrites those frames with scaled values;
# this keeps the cell idempotent (re-running it never scales scaled data).
# The results get a fresh 0..n-1 index, matching the index-reset frames they
# are written back into.
X_train_num = pd.DataFrame(scaler.transform(X_train[numeric_cols]), columns=numeric_cols)
X_test_num = pd.DataFrame(scaler.transform(X_test[numeric_cols]), columns=numeric_cols)
print(X_train_num.head())

      Idade  UsoMensal  SatisfacaoCliente  ValorMensal
0  1.180292   1.069020           0.676895     1.130872
1  0.178225  -1.055509          -0.029255    -0.544723
2 -1.700652   1.445952           0.676895    -0.860385
3  0.366112  -1.672308          -1.441554    -0.795450
4  0.992405   0.383688          -1.441554     0.217856


In [12]:
# Swap the raw numeric columns for their scaled counterparts; column order and
# the one-hot columns are left as they were.
X_train_preprocessed_cat = X_train_preprocessed_cat.assign(**X_train_num)
X_test_preprocessed_cat = X_test_preprocessed_cat.assign(**X_test_num)

In [13]:
# Final names for the fully preprocessed splits (plain aliases, no copy is made).
X_train_preprocessed, X_test_preprocessed = X_train_preprocessed_cat, X_test_preprocessed_cat

# Preview both frames, training split first.
display(X_train_preprocessed.head(), X_test_preprocessed.head())

Unnamed: 0,Idade,UsoMensal,SatisfacaoCliente,ValorMensal,Plano_Basico,Plano_Premium,Plano_Standard,TempoContrato_Curto,TempoContrato_Longo,TempoContrato_Medio
0,1.180292,1.06902,0.676895,1.130872,0.0,0.0,1.0,0.0,0.0,1.0
1,0.178225,-1.055509,-0.029255,-0.544723,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.700652,1.445952,0.676895,-0.860385,1.0,0.0,0.0,0.0,0.0,1.0
3,0.366112,-1.672308,-1.441554,-0.79545,0.0,0.0,1.0,0.0,1.0,0.0
4,0.992405,0.383688,-1.441554,0.217856,1.0,0.0,0.0,1.0,0.0,0.0


Unnamed: 0,Idade,UsoMensal,SatisfacaoCliente,ValorMensal,Plano_Basico,Plano_Premium,Plano_Standard,TempoContrato_Curto,TempoContrato_Longo,TempoContrato_Medio
0,-0.635955,1.274619,-0.029255,1.51609,0.0,0.0,1.0,0.0,0.0,1.0
1,0.929775,-0.610043,0.676895,1.19858,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.387505,-0.370177,-0.735404,0.323,1.0,0.0,0.0,1.0,0.0,0.0
3,-1.262247,1.548752,0.676895,0.871364,0.0,0.0,1.0,1.0,0.0,0.0
4,1.493438,0.726354,-1.441554,-1.544164,0.0,1.0,0.0,1.0,0.0,0.0


In [14]:
# One hyper-parameter grid per candidate model, listed in the same order as the
# model classes handed to SKLearnModelSelection.
params = [
    # RandomForestClassifier
    dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10, 15],
        min_samples_split=[2, 4, 6],
        min_samples_leaf=[1, 2, 4],
    ),
    # GradientBoostingClassifier
    dict(
        loss=['log_loss', 'exponential'],
        learning_rate=[0.05, 0.1, 0.2],
        n_estimators=[50, 100, 200],
        max_depth=[1, 3, 5, 10],
        max_leaf_nodes=[None, 5, 10, 15],
    ),
    # AdaBoostClassifier (the base estimator itself is part of the search)
    dict(
        estimator=[RandomForestClassifier(random_state=42), GradientBoostingClassifier(random_state=42)],
        n_estimators=[25, 50, 100],
        learning_rate=[0.5, 1, 2],
    ),
]


In [15]:
# Search every candidate model over its grid from `params` on the preprocessed
# training split. keep_all_models=True presumably retains each model's search
# result for later comparison — confirm in hypertuner.
model_selector = (
    SKLearnModelSelection(
        [RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier],
        random_state=42,
    )
    .compile(params)
    .fit(X_train_preprocessed, y_train, keep_all_models=True)
)

Unnamed: 0,Model,Best estimator,Best params,Best score
0,RandomForestClassifier,"(DecisionTreeClassifier(max_depth=5, max_featu...","{'max_depth': 5, 'min_samples_leaf': 4, 'min_s...",0.754286





Unnamed: 0,Model,Best estimator,Best params,Best score
0,GradientBoostingClassifier,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.05, 'loss': 'exponential',...",0.775714





Unnamed: 0,Model,Best estimator,Best params,Best score
0,AdaBoostClassifier,(([DecisionTreeRegressor(criterion='friedman_m...,{'estimator': GradientBoostingClassifier(rando...,0.745714





In [16]:
# Show the winning estimator, its hyper-parameters and its score as exposed by the selector.
model_selector.best_model, model_selector.best_params, model_selector.best_score

(GradientBoostingClassifier(learning_rate=0.05, loss='exponential', max_depth=5,
                            random_state=42),
 {'learning_rate': 0.05,
  'loss': 'exponential',
  'max_depth': 5,
  'max_leaf_nodes': None,
  'n_estimators': 100},
 0.7757142857142857)

In [17]:
# Hand the selected model and the preprocessed held-out test split to the project's Report helper.
report = Report(model_selector.best_model, X_test_preprocessed, y_test)

In [18]:
# Accuracy from the Report helper (presumably computed on the data it was constructed with — confirm in report.py).
report.accuracy()

0.81

In [19]:
# Confusion matrix from the Report helper, displayed as the cell's output.
report.confusion_matrix()

array([[147,  35],
       [ 22,  96]])

In [20]:
# NOTE(review): judging by the recorded output, report() prints the text table
# itself and returns the metrics as a dict, which this print() then echoes —
# confirm in report.py.
print(report.report())

              precision    recall  f1-score   support

           0       0.87      0.81      0.84       182
           1       0.73      0.81      0.77       118

    accuracy                           0.81       300
   macro avg       0.80      0.81      0.80       300
weighted avg       0.82      0.81      0.81       300

{'0': {'precision': 0.8698224852071006, 'recall': 0.8076923076923077, 'f1-score': 0.8376068376068375, 'support': 182}, '1': {'precision': 0.732824427480916, 'recall': 0.8135593220338984, 'f1-score': 0.7710843373493977, 'support': 118}, 'accuracy': 0.81, 'macro avg': {'precision': 0.8013234563440084, 'recall': 0.810625814863103, 'f1-score': 0.8043455874781176, 'support': 300}, 'weighted avg': {'precision': 0.815936582501468, 'recall': 0.81, 'f1-score': 0.8114413208389112, 'support': 300}}


In [21]:
# NOTE(review): `model` is not used again in the visible cells, and what
# build_best_model() does (e.g. whether it refits) is not visible here —
# confirm in hypertuner.
model = model_selector.build_best_model()

In [22]:
# Side-by-side summary of the best result found for each candidate model.
display(model_selector.results)

Unnamed: 0,Model,Best Estimator,Best params,Best score
0,RandomForestClassifier,"(DecisionTreeClassifier(max_depth=5, max_featu...","{'max_depth': 5, 'min_samples_leaf': 4, 'min_s...",0.754286
1,GradientBoostingClassifier,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.05, 'loss': 'exponential',...",0.775714
2,AdaBoostClassifier,(([DecisionTreeRegressor(criterion='friedman_m...,{'estimator': GradientBoostingClassifier(rando...,0.745714
