In [14]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the cleaned leads dataset from a CSV file located relative to the notebook.
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [3]:
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação de dados

In [4]:
# Prepare the model inputs: every column except the target goes into X,
# and the 'Converted' column alone becomes the target y.
X = df_leads.drop('Converted', axis=1)
y = df_leads.loc[:, 'Converted']


In [5]:
# Build the column lists by dtype: numeric columns vs. object (string) columns.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

In [6]:
# Load the persisted preprocessor from disk (joblib is imported in the imports cell
# at the top of the notebook so that all dependencies are declared in one place).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load this artifact if it comes from a trusted source.
preprocessor = joblib.load('./preprocessor_leads.pkl')

In [8]:
# Split into train and test sets: 20% of the rows are held out for testing,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [11]:
# Apply the preprocessor: fit it on the training split only, then reuse the
# fitted transformer on the test split.
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so the split cell must be re-run before this cell is executed again.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# The transformer may return a scipy sparse matrix or an already-dense array;
# densify only when a sparse matrix came back instead of assuming `.toarray()` exists.
if hasattr(X_train, 'toarray'):
    X_train = X_train.toarray()
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()

In [12]:
# Show (rows, columns) of the transformed training matrix.
X_train.shape

(7259, 68)

In [13]:
# Show (rows, columns) of the transformed test matrix.
X_test.shape

(1815, 68)

### Treinar modelo

In [16]:
# Meta-model: combines the outputs of the base estimators below.
lr_model = LogisticRegression(random_state=51)

# Base estimators.
tree_model = DecisionTreeClassifier(random_state=51)
svc_model = SVC(kernel='linear')
# SGD shuffles the training data on every epoch, so it is seeded like the other
# estimators; without a seed the fitted coefficients (and every downstream
# metric) change from run to run.
sgd_model = SGDClassifier(penalty='elasticnet', random_state=51)

# Build the stacking ensemble. The estimator names are used later as keys of
# `named_estimators_`, so they must stay unchanged.
stacking_model = StackingClassifier(
    estimators=[
        ('sgd classifier', sgd_model),
        ('svc', svc_model),
        ('decision tree', tree_model)
    ],
    final_estimator=lr_model,
    # Only the base estimators' outputs (not the original features) feed the meta-model.
    passthrough=False
)

In [17]:
# Train the stacking model on the transformed training split.
stacking_model.fit(X_train, y_train)

0,1,2
,estimators,"[('sgd classifier', ...), ('svc', ...), ...]"
,final_estimator,LogisticRegre...ndom_state=51)
,cv,
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,loss,'hinge'
,penalty,'elasticnet'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,51
,solver,'lbfgs'
,max_iter,100


### Avaliação do modelo

In [18]:
# Predict the class of every row in the held-out test split.
y_pred = stacking_model.predict(X_test)

In [19]:
# Score the test predictions with each metric, in the order
# accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [20]:
# Report the four metrics, one per line.
print(
    f'accuracy: {accuracy}',
    f'precisão: {precision}',
    f'recall: {recall}',
    f'f1: {f1}',
    sep='\n',
)

accuracy: 0.8011019283746557
precisão: 0.7448494453248812
recall: 0.7014925373134329
f1: 0.722521137586472


In [22]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, y_pred)

# Same tick labels on both axes; axis/colour titles are passed as a plain dict.
class_labels = ['Not Converted', 'Converted']
axis_titles = {'x': 'predição', 'y': 'real', 'color': 'contagem'}
fig = px.imshow(conf_matrix, labels=axis_titles, x=class_labels, y=class_labels)

# Write the count inside each heatmap cell.
fig.update_traces(text=conf_matrix, texttemplate='%{z}')
fig.show()

In [23]:
# Compute feature importances across the fitted base estimators of the stacking
# classifier: one importance vector per estimator that exposes one.
importances = []

for estimador in stacking_model.estimators_:
    if hasattr(estimador, 'coef_'):
        # Linear models: magnitude of the first row of the coefficient matrix.
        raw_importance = np.abs(estimador.coef_[0])
        print(f'Coeficientes do modelo {type(estimador).__name__}')
    elif hasattr(estimador, 'feature_importances_'):
        # Tree-based models.
        raw_importance = np.abs(estimador.feature_importances_)
        print(f'Feature Importances do modelo {type(estimador).__name__}')
    else:
        # Neither coefficients nor feature importances are available.
        print(f'Não foi possível calcular a importância para {type(estimador).__name__}')
        continue

    # Coefficient magnitudes are unbounded while tree importances sum to 1, so
    # each vector is rescaled to sum to 1 before being stored; otherwise a later
    # average across estimators is dominated by whichever model has the largest scale.
    total = raw_importance.sum()
    importances.append(raw_importance / total if total > 0 else raw_importance)


Coeficientes do modelo SGDClassifier
Coeficientes do modelo SVC
Feature Importances do modelo DecisionTreeClassifier


In [24]:
#média das importâncias
# Average the per-estimator importance vectors feature by feature.
importancia_media = np.asarray(importances).mean(axis=0)

In [25]:
#obter os nomes das features
# Names of the columns produced for the categorical features by the
# transformer registered under the key 'cat'.
cat_transformer = preprocessor.named_transformers_['cat']
encoded_cat_names = cat_transformer.get_feature_names_out(categorical_features).tolist()

# NOTE(review): assumes the preprocessor emits the numeric columns first,
# followed by the categorical ones, with nothing else in between — confirm
# against the preprocessor definition.
feature_names = numeric_features.tolist() + encoded_cat_names

In [27]:
# Build a two-column DataFrame pairing each feature name with its mean importance.
df_features_importances = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': importancia_media,
    }
)

In [28]:
# Sort the DataFrame by importance, smallest first (ascending is the default order).
df_features_importances = df_features_importances.sort_values('Importance')

In [29]:
# Horizontal bar chart: one bar per feature, bar length = mean importance.
fig = px.bar(data_frame=df_features_importances, x='Importance', y='Feature', orientation='h')
fig.show()

### Propriedades dos dados

In [32]:
# Predict a single, specific example: row 7 of the test matrix, promoted to a
# one-row 2-D array because the estimators expect 2-D input.
X_sample = np.atleast_2d(X_test[7])

# Individual predictions of each fitted base estimator, looked up by name.
base_estimators = stacking_model.named_estimators_
sgd_pred = base_estimators['sgd classifier'].predict(X_sample)
svc_pred = base_estimators['svc'].predict(X_sample)
tree_pred = base_estimators['decision tree'].predict(X_sample)

# Final prediction of the whole stacking ensemble.
stacking_pred = stacking_model.predict(X_sample)

In [34]:
# Display the predictions, one per line, in the order: SGD, SVC, decision tree, stacking.
print(
    f'{sgd_pred[0]}',
    f'{svc_pred[0]}',
    f'{tree_pred[0]}',
    f'{stacking_pred[0]}',
    sep='\n',
)

0
0
1
0
