In [None]:
!pip install scikit-optimize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE


# Baseline experiment: gradient boosting with default hyperparameters, no resampling.
pd.set_option('display.max_columns', None)
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1Lma-amQu0WjV-PCAr2LOe377H99z4BoR')

# Column 'y' is the target; every other column is used as a feature.
X = df.drop('y', axis=1)
y = df['y']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Default hyperparameters; random_state is pinned so reruns fit the same model.
model = GradientBoostingClassifier(random_state=42)

# Fit on the training split only.
model.fit(X_train, y_train)

# Hard class labels drive the threshold-based metrics (accuracy, precision, recall, F1).
y_pred = model.predict(X_test)
# ROC AUC needs a continuous score, not hard labels: use the probability of the
# positive class (second column of predict_proba, following model.classes_ order).
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
AUC = roc_auc_score(y_test, y_prob)
print(f'Acurácia com hiperparâmetros padrão: {accuracy:.2f}')
print(f'Precisão com hiperparâmetros padrão: {precision:.2f}')
print(f'Recall com hiperparâmetros padrão: {recall:.2f}')
print(f'F1 com hiperparâmetros padrão: {f1:.2f}')
print(f'AUC com hiperparâmetros padrão: {AUC:.2f}')

Acurácia com hiperparâmetros padrão: 0.90
Precisão com hiperparâmetros padrão: 0.59
Recall com hiperparâmetros padrão: 0.37
F1 com hiperparâmetros padrão: 0.45
AUC com hiperparâmetros padrão: 0.67


* Com Resampling SMOTEENN

In [None]:
# Experiment: default gradient boosting trained on SMOTEENN-resampled data.
pd.set_option('display.max_columns', None)
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1Lma-amQu0WjV-PCAr2LOe377H99z4BoR')

# Column 'y' is the target; every other column is used as a feature.
X = df.drop('y', axis=1)
y = df['y']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Resample the TRAINING split only, so the test split keeps the original class balance.
# X_train / y_train are overwritten with the resampled data.
smote = SMOTEENN(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Default hyperparameters; random_state is pinned so reruns fit the same model.
model = GradientBoostingClassifier(random_state=42)

# Fit on the resampled training data.
model.fit(X_train, y_train)

# Hard class labels drive the threshold-based metrics (accuracy, precision, recall, F1).
y_pred = model.predict(X_test)
# ROC AUC needs a continuous score, not hard labels: use the probability of the
# positive class (second column of predict_proba, following model.classes_ order).
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate on the untouched test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
AUC = roc_auc_score(y_test, y_prob)
print(f'Acurácia com hiperparâmetros padrão: {accuracy:.2f}')
print(f'Precisão com hiperparâmetros padrão: {precision:.2f}')
print(f'Recall com hiperparâmetros padrão: {recall:.2f}')
print(f'F1 com hiperparâmetros padrão: {f1:.2f}')
print(f'AUC com hiperparâmetros padrão: {AUC:.2f}')

Acurácia com hiperparâmetros padrão: 0.85
Precisão com hiperparâmetros padrão: 0.41
Recall com hiperparâmetros padrão: 0.82
F1 com hiperparâmetros padrão: 0.54
AUC com hiperparâmetros padrão: 0.83


* Com Resampling SMOTE

In [None]:
# Experiment: default gradient boosting trained on SMOTE-oversampled data.
pd.set_option('display.max_columns', None)
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1Lma-amQu0WjV-PCAr2LOe377H99z4BoR')

# Column 'y' is the target; every other column is used as a feature.
X = df.drop('y', axis=1)
y = df['y']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Resample the TRAINING split only, so the test split keeps the original class balance.
# This cell is the plain-SMOTE experiment, so it must use SMOTE; it previously
# instantiated SMOTEENN, which just repeated the SMOTEENN experiment.
# X_train / y_train are overwritten with the resampled data.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Default hyperparameters; random_state is pinned so reruns fit the same model.
model = GradientBoostingClassifier(random_state=42)

# Fit on the resampled training data.
model.fit(X_train, y_train)

# Hard class labels drive the threshold-based metrics (accuracy, precision, recall, F1).
y_pred = model.predict(X_test)
# ROC AUC needs a continuous score, not hard labels: use the probability of the
# positive class (second column of predict_proba, following model.classes_ order).
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate on the untouched test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
AUC = roc_auc_score(y_test, y_prob)
print(f'Acurácia com hiperparâmetros padrão: {accuracy:.2f}')
print(f'Precisão com hiperparâmetros padrão: {precision:.2f}')
print(f'Recall com hiperparâmetros padrão: {recall:.2f}')
print(f'F1 com hiperparâmetros padrão: {f1:.2f}')
print(f'AUC com hiperparâmetros padrão: {AUC:.2f}')

Acurácia com hiperparâmetros padrão: 0.85
Precisão com hiperparâmetros padrão: 0.41
Recall com hiperparâmetros padrão: 0.82
F1 com hiperparâmetros padrão: 0.54
AUC com hiperparâmetros padrão: 0.83


* Otimização bayesiana e validação cruzada

In [None]:
# Bayesian hyperparameter search with 3-fold cross-validation.
#
# Author's note: this cell takes about 2 hours to run.
# Best parameters without resampling: ('learning_rate', 0.03164796082647757), ('max_depth', 8), ('min_samples_leaf', 3), ('min_samples_split', 14), ('n_estimators', 300), ('subsample', 0.7807321165862303)
# Best parameters with resampling: ('learning_rate', 0.029519302007117918), ('max_depth', 10), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 427), ('subsample', 1.0)
#
# NOTE(review): `model`, X_train, y_train, X_test and y_test are not defined here; they
# are whatever the most recently executed cell left in the kernel, so the search runs on
# resampled or raw training data depending on which cell ran last. Confirm before rerunning.

# Search space: one entry per tuned hyperparameter of the gradient boosting estimator.
search_space = {
    'n_estimators': Integer(100, 500),
    'learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'max_depth': Integer(3, 10),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
    'subsample': Real(0.6, 1.0),
}

# 50 search iterations, each scored with 3-fold CV, using every available core.
# No `scoring` argument is passed, so the search falls back to its default scoring.
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=search_space,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the current training data.
bayes_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = bayes_search.best_params_
print(f'Melhores hiperparâmetros: {best_params}')

# Score the best estimator found by the search on the held-out test split.
best_model = bayes_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Acurácia com melhores hiperparâmetros: {accuracy_best:.2f}')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluate the best estimator found by the Bayesian search on the test split.
best_model = bayes_search.best_estimator_

# Positive-class probabilities feed ROC AUC; hard labels feed the other metrics.
y_prob_best = best_model.predict_proba(X_test)[:, 1]
y_pred_best = best_model.predict(X_test)

# Threshold-based metrics all take the same (y_true, y_pred) pair.
precision_best, recall_best, f1_best = (
    metric(y_test, y_pred_best)
    for metric in (precision_score, recall_score, f1_score)
)
roc_auc_best = roc_auc_score(y_test, y_prob_best)

print(f'Precisão: {precision_best:.2f}')
print(f'Revocação: {recall_best:.2f}')
print(f'F1-score: {f1_best:.2f}')
print(f'AUC-ROC: {roc_auc_best:.2f}')

Precisão: 0.46
Revocação: 0.75
F1-score: 0.57
AUC-ROC: 0.91
