In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample

# Importando dados

In [2]:
df = pd.read_csv('df_bin_concat2.csv')

# Treino e teste

In [3]:
x = df.drop('satisf_econ_gen', axis=1)
y = df['satisf_econ_gen']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

In [5]:
df['satisf_econ_gen'].value_counts()

0.0    34808
1.0     8333
Name: satisf_econ_gen, dtype: int64

# Balanceamento

In [6]:
#undersample = RandomUnderSampler()

In [7]:
#x_train, y_train = undersample.fit_resample(x_train, y_train)

# Normalização

In [8]:
#sc = StandardScaler()

In [9]:
#x_train = sc.fit_transform(x_train)
#x_test = sc.fit_transform(x_test)

# Gradient Boosting Classifier

In [10]:
gbc = GradientBoostingClassifier()

gbc.fit(x_train,y_train)

pred_gbc = gbc.predict(x_test)

print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92     10443
         1.0       0.70      0.54      0.61      2500

    accuracy                           0.87     12943
   macro avg       0.80      0.74      0.76     12943
weighted avg       0.86      0.87      0.86     12943



In [11]:
xgb = XGBClassifier(n_jobs=-1)

xgb.fit(x_train,y_train)

pred_xgb = xgb.predict(x_test)

print(classification_report(y_test, pred_xgb))

              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92     10443
         1.0       0.67      0.54      0.60      2500

    accuracy                           0.86     12943
   macro avg       0.78      0.74      0.76     12943
weighted avg       0.85      0.86      0.85     12943



In [12]:
rf = RandomForestClassifier(n_jobs=-1)

rf.fit(x_train,y_train)

pred_rf = rf.predict(x_test)

print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

         0.0       0.88      0.96      0.92     10443
         1.0       0.73      0.48      0.58      2500

    accuracy                           0.86     12943
   macro avg       0.81      0.72      0.75     12943
weighted avg       0.85      0.86      0.85     12943



In [13]:
svm = SVC(probability=True, max_iter=1000)

svm.fit(x_train,y_train)

pred_svm = svm.predict(x_test)

print(classification_report(y_test, pred_svm))

              precision    recall  f1-score   support

         0.0       0.83      0.33      0.47     10443
         1.0       0.20      0.71      0.31      2500

    accuracy                           0.41     12943
   macro avg       0.51      0.52      0.39     12943
weighted avg       0.71      0.41      0.44     12943



In [14]:
nb = BernoulliNB()

nb.fit(x_train,y_train)

pred_nb = nb.predict(x_test)

print(classification_report(y_test, pred_nb))

              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88     10443
         1.0       0.51      0.51      0.51      2500

    accuracy                           0.81     12943
   macro avg       0.70      0.70      0.70     12943
weighted avg       0.81      0.81      0.81     12943



In [None]:
param_grid_gbc = {
    'learning_rate':[0.001, 0.01, 0.01, 0.25, 0.4, 0.5],
    'n_estimators':[100, 200, 300, 1000], 
    'min_samples_split':[1, 2, 3, 4, 5], 
    'min_samples_leaf':[1, 2, 3, 4, 5], 
    'min_weight_fraction_leaf':[0.0, 0.25, 0.5], 
    'max_depth':[3, 4, 5, 6],
    'max_features':['sqrt', 'log2', 2, 3] 

}

In [16]:
param_grid_xgb = {
    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8],
    'max_features':['sqrt', 'log2', 1, 2, 3],
    'n_estimators':[20, 50, 70, 100],
    'learning_rate':[0.001, 0.01, 0.01, 0.25, 0.4, 0.5]

}

In [None]:
param_grid_rf = {
    'bootstrap':[True, False],
    'max_depth':[80, 90, 100, 110],
    'max_features':['sqrt', 'log2', 2, 3],
    'min_samples_leaf':[3, 4, 5],
    'min_samples_split':[8, 10, 12],
    'n_estimators':[100, 200, 300, 1000],
    'criterion':['gini', 'entropy']
}

In [18]:
param_grid_nb = {
    'alpha': [np.linspace(0.1, 2.0, 20)],
    'binarize': [np.linspace(0.0, 1.0, 20)]
    
}

In [19]:
gbc_Grid = RandomizedSearchCV(estimator = gbc, param_distributions=param_grid_gbc,  cv=10, verbose=2, n_jobs=18)

In [20]:
xgb_Grid = RandomizedSearchCV(estimator = xgb, param_distributions=param_grid_xgb, cv=10, verbose=2, n_jobs=18)

In [21]:
rf_Grid = RandomizedSearchCV(estimator = rf, param_distributions=param_grid_rf, cv=10, verbose=2, n_jobs=18)

In [22]:
nb_Grid = RandomizedSearchCV(estimator = nb, param_distributions=param_grid_nb, cv=10, verbose=2, n_jobs=18)

In [23]:
gbc_Grid.fit(x_train,y_train)

pred_gbc = gbc_Grid.predict(x_test)

print(classification_report(y_test, pred_gbc))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
              precision    recall  f1-score   support

         0.0       0.81      1.00      0.89     10443
         1.0       0.00      0.00      0.00      2500

    accuracy                           0.81     12943
   macro avg       0.40      0.50      0.45     12943
weighted avg       0.65      0.81      0.72     12943



In [24]:
xgb_Grid.fit(x_train,y_train)

pred_xgb = xgb_Grid.predict(x_test)

print(classification_report(y_test, pred_xgb))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Parameters: { "max_features" } are not used.

              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92     10443
         1.0       0.70      0.54      0.61      2500

    accuracy                           0.87     12943
   macro avg       0.80      0.74      0.76     12943
weighted avg       0.86      0.87      0.86     12943



In [25]:
rf_Grid.fit(x_train,y_train)

pred_rf = rf_Grid.predict(x_test)

print(classification_report(y_test, pred_rf))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
              precision    recall  f1-score   support

         0.0       0.88      0.96      0.92     10443
         1.0       0.74      0.47      0.58      2500

    accuracy                           0.87     12943
   macro avg       0.81      0.72      0.75     12943
weighted avg       0.86      0.87      0.85     12943

