In [97]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 
import numpy as np

# Use seaborn's 'whitegrid' style for plots.
sns.set_style('whitegrid')

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


### Carga de dados

In [98]:
# Load the cleaned leads dataset from ./datasets/leads_cleaned.csv into a DataFrame.
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [99]:
# Preview the first 10 rows of the dataset.
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


### Preparar dados

In [100]:
# Separate the target column ('Converted') from the predictor columns.
y = df_leads['Converted']
X = df_leads.drop('Converted', axis=1)

In [101]:
# Build the column lists by dtype: object (string) columns are treated as
# categorical, numeric-dtype columns as numeric.
categorical_features = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(include='number').columns

In [102]:
# Load the persisted preprocessor from ./preprocessor_leads.pkl.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load artifacts from a trusted source.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell
# at the top so all dependencies are visible in one place.
import joblib
preprocessor = joblib.load('./preprocessor_leads.pkl')

In [103]:
# Split features and target into train and test sets (test_size=0.2); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [104]:
# Fit the loaded preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE(review): X_train / X_test are overwritten with their transformed versions, so
# re-running this cell without first re-running the split cell feeds already-transformed
# data back into the preprocessor — presumably an error; confirm, and consider writing
# the results to new names instead.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [105]:
# Sanity check: shape of the transformed test matrix.
X_test.shape

(1815, 68)

In [106]:
# Sanity check: shape of the transformed training matrix.
X_train.shape

(7259, 68)

### Treinamento do modelo

In [107]:
# Create the boosting classifier: AdaBoost using LogisticRegression as the base
# estimator, with up to 50 boosting rounds and a fixed random_state.
boosting_model = AdaBoostClassifier(
    estimator=LogisticRegression(),
    n_estimators=50,
    learning_rate=1.0,
    random_state=51
)

In [108]:
# Train the model on the preprocessed training split.
boosting_model.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegression()
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,51

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### Avaliação do modelo

In [109]:
# Predict class labels for the test set.
y_pred = boosting_model.predict(X_test)

In [110]:
# Model metrics: score the test-set predictions with accuracy, precision, recall
# and F1, in that order.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [111]:
# Print the test-set metrics. The accuracy label previously read "Acurária",
# a misspelling of "Acurácia".
print(f'Acurácia {accuracy}')
print(f'Precisão {precision}')
print(f'recall {recall}')
print(f'f1 {f1}')

Acurária 0.7856749311294766
Precisão 0.7004279600570613
recall 0.7328358208955223
f1 0.7162654996353027


In [112]:
# Show the confusion matrix as a heatmap, with each cell annotated with its count.
# The y axis is labelled 'Real' (actual class) and the x axis 'Predição' (predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matrix,
    labels={'x': 'Predição', 'y': 'Real', 'color': 'Contagem'},
    x=['Not Converted', 'Converted'],
    y=['Not Converted', 'Converted'],
    color_continuous_scale='Viridis',
).update_traces(text=conf_matrix, texttemplate='%{z}')  # update_traces returns the figure

fig.show()

In [113]:
# Compute feature importance as the weighted mean of the absolute logistic-regression
# coefficients across the fitted base estimators. Each estimator is weighted by its
# boosting weight (estimator_weights_), so estimators that contribute little to the
# ensemble's vote also contribute little to the importance; a plain mean would count
# them all equally.
# estimator_weights_ always has n_estimators entries, while estimators_ only holds the
# estimators that were actually fitted (boosting can stop early), hence the slice.
fitted_weights = boosting_model.estimator_weights_[:len(boosting_model.estimators_)]
importances = np.average(
    [np.abs(estimator.coef_[0]) for estimator in boosting_model.estimators_],
    axis=0,
    weights=fitted_weights,
)

In [114]:
# Get the real feature names: the numeric column names first, followed by the expanded
# names reported by the preprocessor's 'cat' transformer for the categorical columns.
# NOTE(review): assumes the preprocessor emits the numeric columns before the 'cat'
# block and in this same order — confirm against how the preprocessor was built.
feature_names = numeric_features.tolist()
feature_names.extend(
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_features)
    .tolist()
)

In [115]:
# Display the assembled list of feature names.
feature_names

['Do Not Email',
 'Do Not Call',
 'TotalVisits',
 'Total Time Spent on Website',
 'Page Views Per Visit',
 'Search',
 'Newspaper Article',
 'X Education Forums',
 'Newspaper',
 'Digital Advertisement',
 'Through Recommendations',
 'A free copy of Mastering The Interview',
 'Lead Origin_API',
 'Lead Origin_Landing Page Submission',
 'Lead Origin_Lead Add Form',
 'Lead Origin_Lead Import',
 'Lead Source_Click2call',
 'Lead Source_Direct Traffic',
 'Lead Source_Facebook',
 'Lead Source_Google',
 'Lead Source_Live Chat',
 'Lead Source_NC_EDM',
 'Lead Source_Olark Chat',
 'Lead Source_Organic Search',
 'Lead Source_Pay per Click Ads',
 'Lead Source_Reference',
 'Lead Source_Referral Sites',
 'Lead Source_Social Media',
 'Lead Source_WeLearn',
 'Lead Source_Welingak Website',
 'Lead Source_bing',
 'Lead Source_blog',
 'Lead Source_testone',
 'Lead Source_welearnblog_Home',
 'Lead Source_youtubechannel',
 'Last Activity_Approached upfront',
 'Last Activity_Converted to Lead',
 'Last Activity_

In [116]:
# Build a two-column DataFrame pairing each feature name with its importance score.
df_features_importances = pd.DataFrame(
    dict(Feature=feature_names, Importance=importances)
)

In [117]:
# Sort by importance, lowest first.
df_features_importances = df_features_importances.sort_values('Importance', ascending=True)

In [118]:
# Plot the feature importances as a horizontal bar chart (one bar per feature).
fig = px.bar(
    data_frame=df_features_importances,
    x='Importance',
    y='Feature',
    orientation='h',
)
fig.show()

### Saídas do modelo

In [119]:
# Classification error of each boosting round.
# NOTE(review): in the recorded output every entry from index 28 onward is exactly 1.0 —
# presumably rounds that were never fitted because boosting stopped early; confirm
# against len(boosting_model.estimators_).
boosting_model.estimator_errors_

array([0.37994214, 0.25903465, 0.44301389, 0.33267367, 0.43273034,
       0.42626923, 0.43217547, 0.48644275, 0.4268625 , 0.46478223,
       0.46065486, 0.46999515, 0.43488796, 0.48436183, 0.46258432,
       0.47685372, 0.48452293, 0.47781898, 0.47395167, 0.48928659,
       0.47750794, 0.48789657, 0.48545929, 0.48847906, 0.48047169,
       0.48643136, 0.49735856, 0.49632236, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [120]:
# Weight of each boosting round in the ensemble.
# NOTE(review): in the recorded output every entry from index 28 onward is 0 —
# presumably the rounds that were never fitted; confirm against
# len(boosting_model.estimators_).
boosting_model.estimator_weights_

array([0.48979381, 1.05099201, 0.22893916, 0.69611714, 0.27072002,
       0.29708911, 0.27298076, 0.05424229, 0.29466371, 0.14110472,
       0.1577066 , 0.12016377, 0.26193558, 0.0625731 , 0.14994301,
       0.09265133, 0.06192807, 0.08878236, 0.10428774, 0.04286018,
       0.09002898, 0.04842316, 0.05817924, 0.04609193, 0.07815299,
       0.05428788, 0.01056587, 0.01471083, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [121]:
# Predict the conversion probability for each test row (one column per class).
y_pred_prob = boosting_model.predict_proba(X_test)

In [122]:
# Display the predicted class probabilities.
y_pred_prob

array([[0.28892438, 0.71107562],
       [0.55085138, 0.44914862],
       [0.55085138, 0.44914862],
       ...,
       [0.75407715, 0.24592285],
       [0.70374288, 0.29625712],
       [0.30284882, 0.69715118]])