In [None]:
!pip install scikit-optimize lightgbm imbalanced-learn

import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import lightgbm as lgb
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

# Load the dataset from a hosted CSV file and separate the features from the target.
# NOTE(review): make_classification is not used by the code visible in this
# notebook; the import is kept in case another cell relies on it.
from sklearn.datasets import make_classification

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

DATA_URL = 'https://drive.google.com/uc?export=download&id=1Lma-amQu0WjV-PCAr2LOe377H99z4BoR'
df = pd.read_csv(DATA_URL)

# Column 'y' is the target; every remaining column is treated as a feature.
y = df['y']
X = df.drop(columns='y')


# Hold out 20% of the rows as a test set. The split is stratified on y so that
# the train and test partitions keep the same class proportions as the full
# dataset; without it a rare class can end up under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Rebalance the training partition only (SMOTEENN: SMOTE over-sampling followed
# by Edited Nearest Neighbours cleaning). The test partition is left untouched
# so it still reflects the original class distribution.
# NOTE(review): the resampling happens once, before any cross-validation that
# is later run on X_train, so validation folds will contain synthetic samples
# derived from training-fold rows and CV scores may be optimistic. Consider
# moving the sampler into an imblearn Pipeline so it is fitted per fold.
smote = SMOTEENN(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Hyperparameter search space explored by the Bayesian optimiser.
# Integer/Real bounds are inclusive; learning_rate is sampled on a log scale,
# the other Real dimensions use the default uniform prior.
search_space = dict(
    n_estimators=Integer(100, 500),
    learning_rate=Real(0.01, 0.2, prior='log-uniform'),
    max_depth=Integer(3, 10),
    num_leaves=Integer(20, 50),
    min_child_samples=Integer(5, 30),
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.6, 1.0),
)

# Base estimator whose hyperparameters are tuned.
# subsample_freq=1 enables row bagging at every boosting iteration: with
# LightGBM's default (subsample_freq=0) bagging is disabled and any `subsample`
# value supplied by the search would be silently ignored.
# random_state pins the seeds used for bagging and column sampling so repeated
# runs are comparable.
model = lgb.LGBMClassifier(subsample_freq=1, random_state=42)

# Bayesian hyperparameter search: 50 sampled configurations, each scored with
# 3-fold cross-validation, using all available CPU cores.
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=search_space,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search and measure how long it takes. perf_counter() is a monotonic,
# high-resolution clock intended for measuring durations; unlike time.time()
# it is not affected by system clock adjustments during the run.
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed time in seconds, reported in minutes.
total_time = end_time - start_time
print(f'Tempo total de execução: {total_time / 60:.2f} minutos')

# Best hyperparameter combination found by the search.
best_params = bayes_search.best_params_
print(f'Melhores hiperparâmetros: {best_params}')

# Evaluate the tuned model on the held-out test set.
# best_estimator_ is the model refitted by the search with the best parameters.
best_model = bayes_search.best_estimator_

# Scores for the class in column 1 of predict_proba, plus hard class predictions.
# NOTE(review): assumes a binary target whose positive class is label 1 — confirm.
y_prob_best = best_model.predict_proba(X_test)[:, 1]
y_pred_best = best_model.predict(X_test)

# Threshold-based metrics use the hard predictions; AUC-ROC uses the scores.
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
precision_best = precision_score(y_true=y_test, y_pred=y_pred_best)
recall_best = recall_score(y_true=y_test, y_pred=y_pred_best)
f1_best = f1_score(y_true=y_test, y_pred=y_pred_best)
roc_auc_best = roc_auc_score(y_true=y_test, y_score=y_prob_best)

# One metric per output line, rounded to two decimals.
print(
    f'Acurácia: {accuracy_best:.2f}',
    f'Precisão: {precision_best:.2f}',
    f'Revocação: {recall_best:.2f}',
    f'F1-score: {f1_best:.2f}',
    f'AUC-ROC: {roc_auc_best:.2f}',
    sep='\n',
)
# Best hyperparameters recorded from an earlier run: OrderedDict([('colsample_bytree', 1.0), ('learning_rate', 0.055719924550648614), ('max_depth', 8), ('min_child_samples', 5), ('n_estimators', 219), ('num_leaves', 50), ('subsample', 0.6)])


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.7.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.7.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.7.0 scikit-optimize-0.10.2


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [None]:
import joblib

# Serialise the tuned model to disk so it can be reloaded without re-running
# the search. As the last expression of the cell, the list of written file
# names returned by dump() is displayed.
joblib.dump(value=best_model, filename='best_model_lgbm.pkl')

['best_model_lgbm.pkl']