In [71]:
import pandas as pd
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [72]:
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [73]:
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [74]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparar dados

In [75]:
#preparar dados
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

In [76]:
import joblib
preprocessor = joblib.load('./preprocessor_leads.pkl')

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)


X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [78]:
preprocessor.get_feature_names_out()

array(['num__Do Not Email', 'num__Do Not Call', 'num__TotalVisits',
       'num__Total Time Spent on Website', 'num__Page Views Per Visit',
       'num__Search', 'num__Newspaper Article', 'num__X Education Forums',
       'num__Newspaper', 'num__Digital Advertisement',
       'num__Through Recommendations',
       'num__A free copy of Mastering The Interview',
       'cat__Lead Origin_API', 'cat__Lead Origin_Landing Page Submission',
       'cat__Lead Origin_Lead Add Form', 'cat__Lead Origin_Lead Import',
       'cat__Lead Source_Click2call', 'cat__Lead Source_Direct Traffic',
       'cat__Lead Source_Facebook', 'cat__Lead Source_Google',
       'cat__Lead Source_Live Chat', 'cat__Lead Source_NC_EDM',
       'cat__Lead Source_Olark Chat', 'cat__Lead Source_Organic Search',
       'cat__Lead Source_Pay per Click Ads', 'cat__Lead Source_Reference',
       'cat__Lead Source_Referral Sites', 'cat__Lead Source_Social Media',
       'cat__Lead Source_WeLearn', 'cat__Lead Source_Welingak Webs

### Treinar voting classifier

In [83]:
lr_model = LogisticRegression(random_state=51)
svc_model = SVC(kernel='linear', probability=True)
tree_model = DecisionTreeClassifier(random_state=51)

voting_model = VotingClassifier(
    estimators=[
        ('logistic regression', lr_model),
        ('svc', svc_model),
        ('decision', tree_model)
    ],
   voting='hard'
)

In [84]:
voting_model.fit(X_train, y_train)

0,1,2
,estimators,"[('logistic regression', ...), ('svc', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,51
,solver,'lbfgs'
,max_iter,100

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Analise dos resultados

In [None]:
y_pred = voting_model.predict(X_test)

In [None]:
y_pred

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

In [None]:
#calcular metricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print(f'acuracia: {accuracy}')
print(f'precisão: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

acuracia: 0.7983471074380165
precisão: 0.7420382165605095
recall: 0.6955223880597015
f1: 0.7180277349768875


In [None]:
#mostrar a matriz de confusão
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(conf_matrix,
                labels=dict(x='predição', y='real', color='contagem'),
                x=['Not Converted', 'Converted'],
                 y=['Not Converted', 'Converted'],
                )

fig.update_traces(text=conf_matrix, texttemplate='%{z}')

fig.show()

In [None]:
import numpy as np
from collections import Counter

importances = []

# Coleta importâncias dos estimadores do VotingClassifier
for estimador in voting_model.estimators_:
    if hasattr(estimador, 'coef_'):
        imp = np.abs(estimador.coef_[0])
    elif hasattr(estimador, 'feature_importances_'):
        imp = estimador.feature_importances_
    else:
        print(f'Não foi possível carregar importâncias de: {type(estimador).__name__}')
        continue

    # Converte para array denso se for esparso
    if hasattr(imp, "toarray"):
        imp = imp.toarray().flatten()
    importances.append(np.array(imp))

# Descobrir o tamanho mais comum dos vetores
lengths = [len(imp) for imp in importances]
most_common_length = Counter(lengths).most_common(1)[0][0]

# Filtrar apenas vetores com o tamanho mais comum
importances_filtered = [imp for imp in importances if len(imp) == most_common_length]

# Calcular a média
importances_array = np.array(importances_filtered)
importancia_media = np.mean(importances_array, axis=0)

print("✅ Importância média calculada com sucesso:")
print(importancia_media)


✅ Importância média calculada com sucesso:
[0.22523852 0.03587513 0.11334446 0.85206433 0.08244887 0.0072784
 0.03554339 0.05729069 0.04262539 0.00572446 0.03675622 0.01354635
 0.59017145 0.62217065 1.33160526 0.1270707  0.15406345 0.332447
 0.43163126 0.12453701 0.01089549 0.31246918 0.32214603 0.19972264
 0.04277403 0.27364251 0.25843587 0.35100332 0.19591516 0.88615977
 0.1092695  0.33488185 0.05459321 0.14172053 0.08172463 0.68320062
 0.60979761 0.55129466 0.02167269 0.1209071  0.10067165 0.29920279
 0.25233692 0.6822895  0.48054617 0.31871189 0.3549633  0.16075804
 0.22973273 0.10262642 0.05419765 0.02177073 0.03318338 0.18525745
 0.48984408 0.1209071  0.45366622 0.1378337  0.05822045 0.60502148
 0.52612309 0.57091876 0.24872689 0.35476814 0.35041113 0.90905856
 0.02855717 0.25441671]


In [None]:
feature_names = preprocessor.get_feature_names_out()

In [None]:
df_feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importancia_media})

In [None]:
df_feature_importances = df_feature_importances.sort_values(by='Importance', ascending=True)

In [None]:
fig = px.bar(df_feature_importances,
             x='Importance',
             y='Feature',
             orientation='h')

fig.update_layout(height=1200, width=1000)
fig.show()

### propriedades do modelo

In [None]:
#mostrar evidencias hard voting
#selecionar um registro da base
X_sample = X_test[7].reshape(1, -1)

#predições individuais dos estimadores 
log_pred = voting_model.named_estimators_['logistic regression'].predict(X_sample)
svc_pred = voting_model.named_estimators_['svc'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision'].predict(X_sample)

voting_pred = voting_model.predict(X_sample)


In [None]:
print(f'reg logistic: {log_pred}')
print(f'svc: {svc_pred}')
print(f'tree: {tree_pred}')
print(f'votingpred: {voting_pred}')

reg logistic: [0]
svc: [0]
tree: [1]
votingpred: [0]


In [None]:
#mostrar evidencias soft voting
#selecionar um registro da base
X_sample = X_test[340].reshape(1, -1)

#predições individuais dos estimadores 
log_proba = voting_model.named_estimators_['logistic regression'].predict_proba(X_sample)
svc_proba = voting_model.named_estimators_['svc'].predict_proba(X_sample)
tree_proba = voting_model.named_estimators_['decision'].predict_proba(X_sample)

voting_proba = voting_model.predict_proba(X_sample)


AttributeError: This 'VotingClassifier' has no attribute 'predict_proba'

In [None]:


print(f'reg logistic: {log_proba}')
print(f'svc: {svc_proba}')
print(f'tree: {tree_proba}')
print(f'votingproba: {voting_proba}')