In [2]:
import optuna
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split

from project_ml_course.data_process import filter_columns_by_correlation_threshold

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
raw_df = pd.read_csv("../data/dados.csv", index_col="Unnamed: 0")

df = filter_columns_by_correlation_threshold(
    df=raw_df,
    ref_col="class",
    method_type="pearson",
    lower_threshold=0.001,
    higher_threshold=0.999,
)

X = df.drop(columns=["class"])
y = df["class"]

# Separar treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
import lightgbm as lgb


def objective(trial):
    param = {
        "objective": "binary",
        "metric": "None",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 7, 128),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        lgbm = lgb.LGBMClassifier(**param)
        lgbm.fit(X_tr, y_tr)
        y_pred = lgbm.predict(X_val)
        score = f1_score(y_val, y_pred)
        scores.append(score)
    return sum(scores) / len(scores)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, show_progress_bar=True)

print("Melhores hiperparâmetros encontrados:")
print(study.best_params)
print(f"Melhor F1 médio (CV): {study.best_value:.4f}")


[I 2025-07-15 22:13:26,569] A new study created in memory with name: no-name-a3cda32b-b4d7-4f67-872d-e8c8b4af99ff
Best trial: 0. Best value: 0.966623:   3%|▎         | 1/30 [00:00<00:22,  1.30it/s]

[I 2025-07-15 22:13:27,358] Trial 0 finished with value: 0.9666228944835833 and parameters: {'learning_rate': 0.13759404269504588, 'num_leaves': 66, 'max_depth': 5, 'min_child_samples': 6, 'subsample': 0.9028470964630217, 'colsample_bytree': 0.9351095424632166, 'reg_alpha': 8.644974296363064e-07, 'reg_lambda': 7.162737401761477}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:   7%|▋         | 2/30 [00:01<00:18,  1.50it/s]

[I 2025-07-15 22:13:27,948] Trial 1 finished with value: 0.8350089745650402 and parameters: {'learning_rate': 0.010973432297011204, 'num_leaves': 41, 'max_depth': 7, 'min_child_samples': 80, 'subsample': 0.5002427006200391, 'colsample_bytree': 0.9985923935501141, 'reg_alpha': 0.018740321104421063, 'reg_lambda': 6.962758272863654}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  10%|█         | 3/30 [00:01<00:14,  1.91it/s]

[I 2025-07-15 22:13:28,301] Trial 2 finished with value: 0.32814042007677624 and parameters: {'learning_rate': 0.009035719046205896, 'num_leaves': 104, 'max_depth': 3, 'min_child_samples': 17, 'subsample': 0.5634514916969653, 'colsample_bytree': 0.7183558861409234, 'reg_alpha': 0.06777837781824621, 'reg_lambda': 0.045072011042154386}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  13%|█▎        | 4/30 [00:02<00:12,  2.03it/s]

[I 2025-07-15 22:13:28,749] Trial 3 finished with value: 0.8395028212926243 and parameters: {'learning_rate': 0.02502256497428515, 'num_leaves': 124, 'max_depth': 4, 'min_child_samples': 54, 'subsample': 0.9325350814722708, 'colsample_bytree': 0.9876055241932058, 'reg_alpha': 1.3695657788716595e-07, 'reg_lambda': 0.0008150200253301987}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  17%|█▋        | 5/30 [00:02<00:10,  2.47it/s]

[I 2025-07-15 22:13:28,996] Trial 4 finished with value: 0.5793622224512793 and parameters: {'learning_rate': 0.01097465040894031, 'num_leaves': 101, 'max_depth': 2, 'min_child_samples': 89, 'subsample': 0.6784768295660772, 'colsample_bytree': 0.826571063273839, 'reg_alpha': 3.0213014508971856, 'reg_lambda': 3.1501576077924876e-07}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  20%|██        | 6/30 [00:03<00:12,  1.86it/s]

[I 2025-07-15 22:13:29,791] Trial 5 finished with value: 0.9054991155552333 and parameters: {'learning_rate': 0.01675941219844957, 'num_leaves': 69, 'max_depth': 9, 'min_child_samples': 62, 'subsample': 0.8030585808496069, 'colsample_bytree': 0.6666388117525824, 'reg_alpha': 0.09178896880204254, 'reg_lambda': 2.449407734376489e-08}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  23%|██▎       | 7/30 [00:03<00:13,  1.71it/s]

[I 2025-07-15 22:13:30,474] Trial 6 finished with value: 0.8434548903104737 and parameters: {'learning_rate': 0.008448325683530471, 'num_leaves': 42, 'max_depth': 8, 'min_child_samples': 38, 'subsample': 0.626598246479328, 'colsample_bytree': 0.99506242432928, 'reg_alpha': 0.8970192544393205, 'reg_lambda': 5.422979110778078}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  27%|██▋       | 8/30 [00:04<00:11,  1.93it/s]

[I 2025-07-15 22:13:30,852] Trial 7 finished with value: 0.9002079497177838 and parameters: {'learning_rate': 0.05189308814386132, 'num_leaves': 28, 'max_depth': 4, 'min_child_samples': 98, 'subsample': 0.7409325370774655, 'colsample_bytree': 0.8065977725930233, 'reg_alpha': 1.5924698859401823e-07, 'reg_lambda': 4.593177005713982e-08}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  30%|███       | 9/30 [00:04<00:09,  2.20it/s]

[I 2025-07-15 22:13:31,164] Trial 8 finished with value: 0.9352856033747592 and parameters: {'learning_rate': 0.1787389355053062, 'num_leaves': 16, 'max_depth': 3, 'min_child_samples': 84, 'subsample': 0.7867879308016714, 'colsample_bytree': 0.6714730136862341, 'reg_alpha': 2.2420734699855287e-05, 'reg_lambda': 5.941520322116823e-08}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  33%|███▎      | 10/30 [00:05<00:09,  2.10it/s]

[I 2025-07-15 22:13:31,686] Trial 9 finished with value: 0.581515738779166 and parameters: {'learning_rate': 0.004147452855630079, 'num_leaves': 43, 'max_depth': 6, 'min_child_samples': 72, 'subsample': 0.5210835862577172, 'colsample_bytree': 0.6962986581694541, 'reg_alpha': 1.7622948158943868e-08, 'reg_lambda': 0.5076569594762701}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  37%|███▋      | 11/30 [00:05<00:11,  1.66it/s]

[I 2025-07-15 22:13:32,574] Trial 10 finished with value: 0.0 and parameters: {'learning_rate': 0.0011249228534248165, 'num_leaves': 71, 'max_depth': 6, 'min_child_samples': 6, 'subsample': 0.9707080313390641, 'colsample_bytree': 0.8670076951438209, 'reg_alpha': 3.671540240283751e-05, 'reg_lambda': 3.5182908449146234e-05}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  40%|████      | 12/30 [00:06<00:11,  1.57it/s]

[I 2025-07-15 22:13:33,287] Trial 11 finished with value: 0.9610066896304295 and parameters: {'learning_rate': 0.29961586213694935, 'num_leaves': 16, 'max_depth': 5, 'min_child_samples': 32, 'subsample': 0.8547610718049374, 'colsample_bytree': 0.5191106013042561, 'reg_alpha': 3.049715410853934e-05, 'reg_lambda': 1.2754070576519272e-05}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  43%|████▎     | 13/30 [00:07<00:10,  1.66it/s]

[I 2025-07-15 22:13:33,809] Trial 12 finished with value: 0.9650306738867741 and parameters: {'learning_rate': 0.2685615879961351, 'num_leaves': 8, 'max_depth': 5, 'min_child_samples': 32, 'subsample': 0.8777966598264993, 'colsample_bytree': 0.5137064808194718, 'reg_alpha': 5.3916278088718835e-06, 'reg_lambda': 1.4998359691332863e-05}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 0. Best value: 0.966623:  47%|████▋     | 14/30 [00:07<00:10,  1.58it/s]

[I 2025-07-15 22:13:34,510] Trial 13 finished with value: 0.9651905733092956 and parameters: {'learning_rate': 0.10826294957403104, 'num_leaves': 83, 'max_depth': 5, 'min_child_samples': 32, 'subsample': 0.8925242059885949, 'colsample_bytree': 0.5136154242927522, 'reg_alpha': 1.763527793801467e-06, 'reg_lambda': 0.0009924520949159278}. Best is trial 0 with value: 0.9666228944835833.


Best trial: 14. Best value: 0.966697:  50%|█████     | 15/30 [00:09<00:15,  1.00s/it]

[I 2025-07-15 22:13:36,372] Trial 14 finished with value: 0.966697260854889 and parameters: {'learning_rate': 0.08694959890341561, 'num_leaves': 85, 'max_depth': 10, 'min_child_samples': 15, 'subsample': 0.907134218657872, 'colsample_bytree': 0.5789168465969274, 'reg_alpha': 0.0008077704882055599, 'reg_lambda': 0.010181006378922287}. Best is trial 14 with value: 0.966697260854889.


Best trial: 14. Best value: 0.966697:  53%|█████▎    | 16/30 [00:11<00:18,  1.29s/it]

[I 2025-07-15 22:13:38,345] Trial 15 finished with value: 0.965695248753953 and parameters: {'learning_rate': 0.0728937466667864, 'num_leaves': 92, 'max_depth': 9, 'min_child_samples': 5, 'subsample': 0.9901061179650267, 'colsample_bytree': 0.6035894428748707, 'reg_alpha': 0.0016070959491312625, 'reg_lambda': 0.025744779978943987}. Best is trial 14 with value: 0.966697260854889.


Best trial: 16. Best value: 0.968086:  57%|█████▋    | 17/30 [00:13<00:17,  1.33s/it]

[I 2025-07-15 22:13:39,771] Trial 16 finished with value: 0.9680861830290333 and parameters: {'learning_rate': 0.03471154762174085, 'num_leaves': 56, 'max_depth': 10, 'min_child_samples': 19, 'subsample': 0.8471028918507326, 'colsample_bytree': 0.9025055972422991, 'reg_alpha': 0.0007955304288694156, 'reg_lambda': 0.021598100373842074}. Best is trial 16 with value: 0.9680861830290333.


Best trial: 16. Best value: 0.968086:  60%|██████    | 18/30 [00:14<00:16,  1.40s/it]

[I 2025-07-15 22:13:41,316] Trial 17 finished with value: 0.9627881147195122 and parameters: {'learning_rate': 0.03770136425458345, 'num_leaves': 61, 'max_depth': 10, 'min_child_samples': 23, 'subsample': 0.8165116153936927, 'colsample_bytree': 0.5830571454290057, 'reg_alpha': 0.0006988019731219768, 'reg_lambda': 0.00880050515025218}. Best is trial 16 with value: 0.9680861830290333.


Best trial: 16. Best value: 0.968086:  63%|██████▎   | 19/30 [00:15<00:14,  1.28s/it]

[I 2025-07-15 22:13:42,329] Trial 18 finished with value: 0.9650878949643484 and parameters: {'learning_rate': 0.03057641389001179, 'num_leaves': 53, 'max_depth': 10, 'min_child_samples': 42, 'subsample': 0.7424159861121034, 'colsample_bytree': 0.7728021966219739, 'reg_alpha': 0.0063303362186473006, 'reg_lambda': 0.3244896599317204}. Best is trial 16 with value: 0.9680861830290333.


Best trial: 19. Best value: 0.973582:  67%|██████▋   | 20/30 [00:17<00:13,  1.30s/it]

[I 2025-07-15 22:13:43,684] Trial 19 finished with value: 0.9735824311931396 and parameters: {'learning_rate': 0.07428937266006047, 'num_leaves': 82, 'max_depth': 8, 'min_child_samples': 18, 'subsample': 0.9357851894197677, 'colsample_bytree': 0.901456078029736, 'reg_alpha': 0.00020272486788318193, 'reg_lambda': 0.002247818985581766}. Best is trial 19 with value: 0.9735824311931396.


Best trial: 19. Best value: 0.973582:  70%|███████   | 21/30 [00:18<00:11,  1.25s/it]

[I 2025-07-15 22:13:44,800] Trial 20 finished with value: 0.8164536402155361 and parameters: {'learning_rate': 0.004010088965571729, 'num_leaves': 120, 'max_depth': 8, 'min_child_samples': 21, 'subsample': 0.9494860276140331, 'colsample_bytree': 0.9030596469467824, 'reg_alpha': 0.00014064700146772762, 'reg_lambda': 0.00012796896297867795}. Best is trial 19 with value: 0.9735824311931396.


Best trial: 21. Best value: 0.97439:  73%|███████▎  | 22/30 [00:19<00:10,  1.32s/it] 

[I 2025-07-15 22:13:46,300] Trial 21 finished with value: 0.9743902513676577 and parameters: {'learning_rate': 0.05588119464680504, 'num_leaves': 82, 'max_depth': 9, 'min_child_samples': 14, 'subsample': 0.8466398877355169, 'colsample_bytree': 0.9156682910427868, 'reg_alpha': 0.000239851442633549, 'reg_lambda': 0.0033246794209540052}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  77%|███████▋  | 23/30 [00:20<00:08,  1.19s/it]

[I 2025-07-15 22:13:47,167] Trial 22 finished with value: 0.9672586166973443 and parameters: {'learning_rate': 0.059911946509914174, 'num_leaves': 79, 'max_depth': 8, 'min_child_samples': 46, 'subsample': 0.8232333346608975, 'colsample_bytree': 0.9214052693645501, 'reg_alpha': 0.00024607385010308615, 'reg_lambda': 0.0023870150522203497}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  80%|████████  | 24/30 [00:21<00:07,  1.18s/it]

[I 2025-07-15 22:13:48,323] Trial 23 finished with value: 0.9697022122950884 and parameters: {'learning_rate': 0.04361903445251947, 'num_leaves': 53, 'max_depth': 9, 'min_child_samples': 25, 'subsample': 0.8581209550249407, 'colsample_bytree': 0.8648455852044405, 'reg_alpha': 0.004204481441076201, 'reg_lambda': 0.2442182106036956}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  83%|████████▎ | 25/30 [00:22<00:05,  1.16s/it]

[I 2025-07-15 22:13:49,451] Trial 24 finished with value: 0.965907510613393 and parameters: {'learning_rate': 0.021576288675757643, 'num_leaves': 99, 'max_depth': 9, 'min_child_samples': 26, 'subsample': 0.7753374155434222, 'colsample_bytree': 0.8651752679169351, 'reg_alpha': 0.006169387803780111, 'reg_lambda': 0.17147229573681966}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  87%|████████▋ | 26/30 [00:24<00:04,  1.22s/it]

[I 2025-07-15 22:13:50,806] Trial 25 finished with value: 0.9743838156312059 and parameters: {'learning_rate': 0.052597611609030176, 'num_leaves': 112, 'max_depth': 7, 'min_child_samples': 12, 'subsample': 0.705235448168031, 'colsample_bytree': 0.9525308086313955, 'reg_alpha': 8.094480148125553e-05, 'reg_lambda': 8.808675198250298e-05}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  90%|█████████ | 27/30 [00:25<00:03,  1.25s/it]

[I 2025-07-15 22:13:52,122] Trial 26 finished with value: 0.973600389888141 and parameters: {'learning_rate': 0.11384924319947413, 'num_leaves': 113, 'max_depth': 7, 'min_child_samples': 14, 'subsample': 0.7195057925523389, 'colsample_bytree': 0.9568104978920755, 'reg_alpha': 7.55711860030224e-05, 'reg_lambda': 1.7220211689895406e-06}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  93%|█████████▎| 28/30 [00:26<00:02,  1.29s/it]

[I 2025-07-15 22:13:53,513] Trial 27 finished with value: 0.9735887184295416 and parameters: {'learning_rate': 0.1609308967296058, 'num_leaves': 111, 'max_depth': 7, 'min_child_samples': 14, 'subsample': 0.6935216452636197, 'colsample_bytree': 0.9389743142230612, 'reg_alpha': 3.7001958248551097e-06, 'reg_lambda': 1.3698580731103153e-06}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439:  97%|█████████▋| 29/30 [00:28<00:01,  1.32s/it]

[I 2025-07-15 22:13:54,890] Trial 28 finished with value: 0.9720924663970543 and parameters: {'learning_rate': 0.105655087628954, 'num_leaves': 113, 'max_depth': 7, 'min_child_samples': 12, 'subsample': 0.6893723314421407, 'colsample_bytree': 0.961955051224911, 'reg_alpha': 7.837161195082606e-05, 'reg_lambda': 1.366017765037326e-06}. Best is trial 21 with value: 0.9743902513676577.


Best trial: 21. Best value: 0.97439: 100%|██████████| 30/30 [00:29<00:00,  1.02it/s]

[I 2025-07-15 22:13:56,023] Trial 29 finished with value: 0.9705774116973046 and parameters: {'learning_rate': 0.1514096211199036, 'num_leaves': 115, 'max_depth': 6, 'min_child_samples': 10, 'subsample': 0.6304307714930304, 'colsample_bytree': 0.9428609811270805, 'reg_alpha': 1.0729153789735333e-05, 'reg_lambda': 0.0001278686599681631}. Best is trial 21 with value: 0.9743902513676577.
Melhores hiperparâmetros encontrados:
{'learning_rate': 0.05588119464680504, 'num_leaves': 82, 'max_depth': 9, 'min_child_samples': 14, 'subsample': 0.8466398877355169, 'colsample_bytree': 0.9156682910427868, 'reg_alpha': 0.000239851442633549, 'reg_lambda': 0.0033246794209540052}
Melhor F1 médio (CV): 0.9744



