In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [18]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [19]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [20]:
%run features_testing.ipynb

In [21]:
# Split the data into explanatory and target variables
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test.values

In [22]:
#from sklearn.preprocessing import PowerTransformer
#
#pt = PowerTransformer(method='yeo-johnson')
#X_pt = pt.fit_transform(X)
#X_test_pt = pt.transform(X_test)

In [23]:
# Standardization
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
X_test_std = sc.transform(X_test)

In [24]:
# Split the original data into the training data and the validation data
X_train, X_valid, y_train, y_valid = train_test_split(X_std, y, test_size=0.3, stratify=y, random_state=0)

In [25]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",

        "num_leaves": trial.suggest_int("num_leaves", 16, 128),  # Reduced from 256
        "max_depth": trial.suggest_int("max_depth", 3, 8),  # Reduced from -1 to 12
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),  # Narrowed range

        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),  # Narrowed
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),  # Narrowed
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Reduced from 10

        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),  # Narrowed
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-4, 1.0, log=True),  # Narrowed
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 1.0, log=True),  # Narrowed

        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),  # Reduced from 3000
        "verbose": -1,
        
        # GPU parameters
        "device": "gpu" ,
        "gpu_platform_id": 0 ,
        "gpu_device_id": 0 ,
    }

    # Initialize k-fold cross-validation
    # Reduced to 3 folds for faster training
    n_splits = 3
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    auc_scores = []
    
    # Convert X_std to DataFrame to avoid feature names warning
    # Create feature names if they don't exist
    if isinstance(X_std, np.ndarray):
        feature_names = [f'feature_{i}' for i in range(X_std.shape[1])]
        x_train_df = pd.DataFrame(X_std, columns=feature_names)
    else:
        x_train_df = X_std
    
    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_df, y)):
        x_train_fold = x_train_df.iloc[train_idx]
        y_train_fold = y[train_idx]
        x_val_fold = x_train_df.iloc[val_idx]
        y_val_fold = y[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        
        model.fit(
            x_train_fold,
            y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),  # Reduced from 100
                lgb.log_evaluation(period=0)  # Silent training
            ]
        )
        
        preds = model.predict_proba(x_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds)
        auc_scores.append(fold_auc)
        
        print(f"Trial {trial.number} - Fold {fold + 1}/{n_splits} AUC: {fold_auc:.6f}")
    
    # Return mean AUC across all folds
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"Trial {trial.number} - Mean AUC: {mean_auc:.6f} (±{std_auc:.6f})\n")
    
    return mean_auc


# Create and run the study
# n_jobs=1 when using GPU (GPU doesn't benefit from parallel trials)
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_optimization_kfold",
    sampler=optuna.samplers.TPESampler(seed=42),  # For reproducibility
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)  # Prune bad trials early
)

n_trials = 20 

study.optimize(
    objective, 
    n_trials=n_trials,
    n_jobs=1,  # LightGBM GPU doesn't support parallel training
    show_progress_bar=True,
    timeout=7200  # 2 hour timeout as safety measure
)

print("\nBest Mean AUC:", study.best_value)
print("Best hyperparameters:\n", study.best_params)

# Train final model with best parameters on full training data
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,

    "device": "gpu" ,
    "gpu_platform_id": 0 ,
    "gpu_device_id": 0 ,
})

# Split data for final validation
X_train_final, X_valid_final, y_train_final, y_valid_final = train_test_split(
    X_std, y, test_size=0.3, stratify=y, random_state=0
)

lgbm = lgb.LGBMClassifier(**best_params)
lgbm.fit(X_train_final, y_train_final, eval_set=[(X_valid_final, y_valid_final)])

lgbm_train_pred = lgbm.predict_proba(X_train_final)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid_final)[:, 1]

print(f"\nFinal Model Performance:")
print(f"Train Score: {roc_auc_score(y_train_final, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid_final, lgbm_valid_pred)}")

[I 2025-11-28 10:41:57,640] A new study created in memory with name: lgbm_optimization_kfold
  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[164]	valid_0's auc: 0.746769
Trial 0 - Fold 1/3 AUC: 0.746769
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[179]	valid_0's auc: 0.751195
Trial 0 - Fold 2/3 AUC: 0.751195
Training until validation scores don't improve for 30 rounds


Best trial: 0. Best value: 0.748312:   5%|▌         | 1/20 [00:13<04:23, 13.84s/it, 13.84/7200 seconds]

Early stopping, best iteration is:
[180]	valid_0's auc: 0.746972
Trial 0 - Fold 3/3 AUC: 0.746972
Trial 0 - Mean AUC: 0.748312 (±0.002040)

[I 2025-11-28 10:42:11,484] Trial 0 finished with value: 0.7483117661702626 and parameters: {'num_leaves': 58, 'max_depth': 8, 'learning_rate': 0.05395030966670229, 'feature_fraction': 0.8795975452591109, 'bagging_fraction': 0.7468055921327309, 'bagging_freq': 1, 'min_child_samples': 24, 'lambda_l1': 0.29154431891537513, 'lambda_l2': 0.02537815508265665, 'n_estimators': 737}. Best is trial 0 with value: 0.7483117661702626.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[173]	valid_0's auc: 0.747382
Trial 1 - Fold 1/3 AUC: 0.747382
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[257]	valid_0's auc: 0.751108
Trial 1 - Fold 2/3 AUC: 0.751108
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748852:  10%|█         | 2/20 [00:20<02:58,  9.89s/it, 20.97/7200 seconds]

Early stopping, best iteration is:
[169]	valid_0's auc: 0.748066
Trial 1 - Fold 3/3 AUC: 0.748066
Trial 1 - Mean AUC: 0.748852 (±0.001619)

[I 2025-11-28 10:42:18,605] Trial 1 finished with value: 0.7488519418259575 and parameters: {'num_leaves': 18, 'max_depth': 8, 'learning_rate': 0.06798962421591129, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'min_child_samples': 44, 'lambda_l1': 0.012561043700013555, 'lambda_l2': 0.005342937261279773, 'n_estimators': 362}. Best is trial 1 with value: 0.7488519418259575.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.731359
Trial 2 - Fold 1/3 AUC: 0.731359
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[140]	valid_0's auc: 0.728835
Trial 2 - Fold 2/3 AUC: 0.728835
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748852:  15%|█▌        | 3/20 [00:24<02:01,  7.17s/it, 24.89/7200 seconds]

Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.727149
Trial 2 - Fold 3/3 AUC: 0.727149
Trial 2 - Mean AUC: 0.729114 (±0.001730)

[I 2025-11-28 10:42:22,530] Trial 2 finished with value: 0.7291144838664537 and parameters: {'num_leaves': 85, 'max_depth': 3, 'learning_rate': 0.019594972058679168, 'feature_fraction': 0.8099085529881075, 'bagging_fraction': 0.8368209952651108, 'bagging_freq': 4, 'min_child_samples': 36, 'lambda_l1': 0.011400863701127324, 'lambda_l2': 0.0234238498471129, 'n_estimators': 141}. Best is trial 1 with value: 0.7488519418259575.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.742224
Trial 3 - Fold 1/3 AUC: 0.742224
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.74482
Trial 3 - Fold 2/3 AUC: 0.744820
Training until validation scores don't improve for 30 rounds
Did not meet ea

Best trial: 1. Best value: 0.748852:  20%|██        | 4/20 [00:39<02:39,  9.95s/it, 39.11/7200 seconds]

[I 2025-11-28 10:42:36,747] Trial 3 finished with value: 0.743333954633262 and parameters: {'num_leaves': 84, 'max_depth': 4, 'learning_rate': 0.011615865989246453, 'feature_fraction': 0.984665661176, 'bagging_fraction': 0.9896896099223678, 'bagging_freq': 5, 'min_child_samples': 44, 'lambda_l1': 0.00024586032763280086, 'lambda_l2': 0.054567254856014755, 'n_estimators': 496}. Best is trial 1 with value: 0.7488519418259575.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.738513
Trial 4 - Fold 1/3 AUC: 0.738513
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.739745
Trial 4 - Fold 2/3 AUC: 0.739745
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748852:  25%|██▌       | 5/20 [00:49<02:34, 10.27s/it, 49.94/7200 seconds]

Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.737199
Trial 4 - Fold 3/3 AUC: 0.737199
Trial 4 - Mean AUC: 0.738486 (±0.001040)

[I 2025-11-28 10:42:47,577] Trial 4 finished with value: 0.7384858109885871 and parameters: {'num_leaves': 29, 'max_depth': 5, 'learning_rate': 0.01082401838150096, 'feature_fraction': 0.9727961206236346, 'bagging_fraction': 0.777633994480005, 'bagging_freq': 4, 'min_child_samples': 45, 'lambda_l1': 0.012030178871154668, 'lambda_l2': 0.015375920235481757, 'n_estimators': 266}. Best is trial 1 with value: 0.7488519418259575.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[96]	valid_0's auc: 0.743746
Trial 5 - Fold 1/3 AUC: 0.743746
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[103]	valid_0's auc: 0.748258
Trial 5 - Fold 2/3 AUC: 0.748258
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748852:  30%|███       | 6/20 [01:00<02:24, 10.31s/it, 60.34/7200 seconds]

Early stopping, best iteration is:
[103]	valid_0's auc: 0.744061
Trial 5 - Fold 3/3 AUC: 0.744061
Trial 5 - Mean AUC: 0.745355 (±0.002057)

[I 2025-11-28 10:42:57,979] Trial 5 finished with value: 0.7453549437751413 and parameters: {'num_leaves': 125, 'max_depth': 7, 'learning_rate': 0.08699593128513321, 'feature_fraction': 0.9684482051282947, 'bagging_fraction': 0.8793699936433255, 'bagging_freq': 5, 'min_child_samples': 27, 'lambda_l1': 0.0006080390190296605, 'lambda_l2': 0.00015167330688076205, 'n_estimators': 393}. Best is trial 1 with value: 0.7488519418259575.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[263]	valid_0's auc: 0.749635
Trial 6 - Fold 1/3 AUC: 0.749635
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.75091
Trial 6 - Fold 2/3 AUC: 0.750910
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.750361:  35%|███▌      | 7/20 [01:10<02:12, 10.18s/it, 70.23/7200 seconds]

Early stopping, best iteration is:
[336]	valid_0's auc: 0.750537
Trial 6 - Fold 3/3 AUC: 0.750537
Trial 6 - Mean AUC: 0.750361 (±0.000535)

[I 2025-11-28 10:43:07,875] Trial 6 finished with value: 0.750360957937794 and parameters: {'num_leaves': 59, 'max_depth': 4, 'learning_rate': 0.0674120461070276, 'feature_fraction': 0.8070259980080767, 'bagging_fraction': 0.7842803529062142, 'bagging_freq': 3, 'min_child_samples': 31, 'lambda_l1': 0.16172900811143134, 'lambda_l2': 0.00019870215385428647, 'n_estimators': 989}. Best is trial 6 with value: 0.750360957937794.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.730504
Trial 7 - Fold 1/3 AUC: 0.730504
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.729082
Trial 7 - Fold 2/3 AUC: 0.729082
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.750361:  40%|████      | 8/20 [01:16<01:48,  9.01s/it, 76.75/7200 seconds]

Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.726866
Trial 7 - Fold 3/3 AUC: 0.726866
Trial 7 - Mean AUC: 0.728817 (±0.001497)

[I 2025-11-28 10:43:14,388] Trial 7 finished with value: 0.728817231728431 and parameters: {'num_leaves': 103, 'max_depth': 4, 'learning_rate': 0.010127963257331486, 'feature_fraction': 0.9446384285364502, 'bagging_fraction': 0.9120572031542851, 'bagging_freq': 4, 'min_child_samples': 82, 'lambda_l1': 0.00019777828512462724, 'lambda_l2': 0.00271558195528294, 'n_estimators': 204}. Best is trial 6 with value: 0.750360957937794.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[517]	valid_0's auc: 0.749291
Trial 8 - Fold 1/3 AUC: 0.749291
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[510]	valid_0's auc: 0.751919
Trial 8 - Fold 2/3 AUC: 0.751919
Training until validation scores don't improve for 30 rounds
Early stoppi

Best trial: 8. Best value: 0.750423:  45%|████▌     | 9/20 [01:45<02:48, 15.31s/it, 105.92/7200 seconds]

Trial 8 - Fold 3/3 AUC: 0.750058
Trial 8 - Mean AUC: 0.750423 (±0.001103)

[I 2025-11-28 10:43:43,558] Trial 8 finished with value: 0.7504227352039287 and parameters: {'num_leaves': 113, 'max_depth': 6, 'learning_rate': 0.02142387495644906, 'feature_fraction': 0.7190675050858071, 'bagging_fraction': 0.7932946965146986, 'bagging_freq': 2, 'min_child_samples': 79, 'lambda_l1': 0.035500125258511595, 'lambda_l2': 0.35387588647792356, 'n_estimators': 525}. Best is trial 8 with value: 0.7504227352039287.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[149]	valid_0's auc: 0.747295
Trial 9 - Fold 1/3 AUC: 0.747295
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[192]	valid_0's auc: 0.750079
Trial 9 - Fold 2/3 AUC: 0.750079
Training until validation scores don't improve for 30 rounds


Best trial: 8. Best value: 0.750423:  50%|█████     | 10/20 [01:54<02:13, 13.37s/it, 114.94/7200 seconds]

Did not meet early stopping. Best iteration is:
[197]	valid_0's auc: 0.748601
Trial 9 - Fold 3/3 AUC: 0.748601
Trial 9 - Mean AUC: 0.748658 (±0.001137)

[I 2025-11-28 10:43:52,584] Trial 9 finished with value: 0.7486584149420997 and parameters: {'num_leaves': 29, 'max_depth': 7, 'learning_rate': 0.057648106701146694, 'feature_fraction': 0.8683831592708489, 'bagging_fraction': 0.9312901539863683, 'bagging_freq': 3, 'min_child_samples': 62, 'lambda_l1': 0.005130551760589831, 'lambda_l2': 0.00012637946338082883, 'n_estimators': 197}. Best is trial 8 with value: 0.7504227352039287.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[331]	valid_0's auc: 0.748834
Trial 10 - Fold 1/3 AUC: 0.748834
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[358]	valid_0's auc: 0.75159
Trial 10 - Fold 2/3 AUC: 0.751590
Training until validation scores don't improve for 30 rounds


Best trial: 8. Best value: 0.750423:  55%|█████▌    | 11/20 [02:15<02:20, 15.61s/it, 135.61/7200 seconds]

Early stopping, best iteration is:
[306]	valid_0's auc: 0.750229
Trial 10 - Fold 3/3 AUC: 0.750229
Trial 10 - Mean AUC: 0.750218 (±0.001125)

[I 2025-11-28 10:44:13,254] Trial 10 finished with value: 0.7502178790349653 and parameters: {'num_leaves': 126, 'max_depth': 6, 'learning_rate': 0.03143105612316977, 'feature_fraction': 0.7053885626844458, 'bagging_fraction': 0.7073341126300988, 'bagging_freq': 2, 'min_child_samples': 97, 'lambda_l1': 0.05753064281737121, 'lambda_l2': 0.7571129714763881, 'n_estimators': 679}. Best is trial 8 with value: 0.7504227352039287.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[446]	valid_0's auc: 0.750435
Trial 11 - Fold 1/3 AUC: 0.750435
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[600]	valid_0's auc: 0.752916
Trial 11 - Fold 2/3 AUC: 0.752916
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[435]	valid_0'

Best trial: 11. Best value: 0.751293:  60%|██████    | 12/20 [02:39<02:23, 17.99s/it, 159.06/7200 seconds]

Trial 11 - Fold 3/3 AUC: 0.750528
Trial 11 - Mean AUC: 0.751293 (±0.001148)

[I 2025-11-28 10:44:36,702] Trial 11 finished with value: 0.7512928393674313 and parameters: {'num_leaves': 56, 'max_depth': 5, 'learning_rate': 0.03085454401591916, 'feature_fraction': 0.7113514821151513, 'bagging_fraction': 0.8170436171571821, 'bagging_freq': 2, 'min_child_samples': 69, 'lambda_l1': 0.9542454897016222, 'lambda_l2': 0.5640857589810594, 'n_estimators': 954}. Best is trial 11 with value: 0.7512928393674313.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[430]	valid_0's auc: 0.749938
Trial 12 - Fold 1/3 AUC: 0.749938
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[509]	valid_0's auc: 0.7525
Trial 12 - Fold 2/3 AUC: 0.752500
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[434]	valid_0's auc: 0.750624


Best trial: 11. Best value: 0.751293:  65%|██████▌   | 13/20 [03:08<02:30, 21.52s/it, 188.70/7200 seconds]

Trial 12 - Fold 3/3 AUC: 0.750624
Trial 12 - Mean AUC: 0.751021 (±0.001083)

[I 2025-11-28 10:45:06,343] Trial 12 finished with value: 0.7510210611422469 and parameters: {'num_leaves': 54, 'max_depth': 6, 'learning_rate': 0.027617334613875093, 'feature_fraction': 0.7136065891631209, 'bagging_fraction': 0.824141732631138, 'bagging_freq': 2, 'min_child_samples': 71, 'lambda_l1': 0.9639757903159535, 'lambda_l2': 0.9036413185159636, 'n_estimators': 954}. Best is trial 11 with value: 0.7512928393674313.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[462]	valid_0's auc: 0.74953
Trial 13 - Fold 1/3 AUC: 0.749530
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[451]	valid_0's auc: 0.752517
Trial 13 - Fold 2/3 AUC: 0.752517
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[429]	valid_0's auc: 0.750319


Best trial: 11. Best value: 0.751293:  70%|███████   | 14/20 [03:29<02:08, 21.42s/it, 209.89/7200 seconds]

Trial 13 - Fold 3/3 AUC: 0.750319
Trial 13 - Mean AUC: 0.750789 (±0.001264)

[I 2025-11-28 10:45:27,531] Trial 13 finished with value: 0.750788647929804 and parameters: {'num_leaves': 51, 'max_depth': 5, 'learning_rate': 0.03568811552134348, 'feature_fraction': 0.7507692424202381, 'bagging_fraction': 0.8374399190366836, 'bagging_freq': 2, 'min_child_samples': 66, 'lambda_l1': 0.7266943511144078, 'lambda_l2': 0.170712034346924, 'n_estimators': 997}. Best is trial 11 with value: 0.7512928393674313.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[368]	valid_0's auc: 0.748697
Trial 14 - Fold 1/3 AUC: 0.748697
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[447]	valid_0's auc: 0.752103
Trial 14 - Fold 2/3 AUC: 0.752103
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[350]	valid_0's auc: 0.749657


Best trial: 11. Best value: 0.751293:  75%|███████▌  | 15/20 [03:54<01:51, 22.35s/it, 234.41/7200 seconds]

Trial 14 - Fold 3/3 AUC: 0.749657
Trial 14 - Mean AUC: 0.750152 (±0.001434)

[I 2025-11-28 10:45:52,048] Trial 14 finished with value: 0.750152375429196 and parameters: {'num_leaves': 75, 'max_depth': 6, 'learning_rate': 0.03197157364938144, 'feature_fraction': 0.7651364684774126, 'bagging_fraction': 0.8612488378706876, 'bagging_freq': 2, 'min_child_samples': 76, 'lambda_l1': 0.7534933229670142, 'lambda_l2': 0.11334487999639689, 'n_estimators': 833}. Best is trial 11 with value: 0.7512928393674313.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[728]	valid_0's auc: 0.749948
Trial 15 - Fold 1/3 AUC: 0.749948
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[806]	valid_0's auc: 0.753033
Trial 15 - Fold 2/3 AUC: 0.753033
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[788]	valid_0's auc: 0.75144


Best trial: 15. Best value: 0.751474:  80%|████████  | 16/20 [04:19<01:32, 23.13s/it, 259.34/7200 seconds]

Trial 15 - Fold 3/3 AUC: 0.751440
Trial 15 - Mean AUC: 0.751474 (±0.001259)

[I 2025-11-28 10:46:16,977] Trial 15 finished with value: 0.7514736659400567 and parameters: {'num_leaves': 45, 'max_depth': 5, 'learning_rate': 0.020241572916541595, 'feature_fraction': 0.8077006944020728, 'bagging_fraction': 0.8208435992012316, 'bagging_freq': 1, 'min_child_samples': 92, 'lambda_l1': 0.17960548177377253, 'lambda_l2': 0.6669757808743925, 'n_estimators': 860}. Best is trial 15 with value: 0.7514736659400567.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[833]	valid_0's auc: 0.747414
Trial 16 - Fold 1/3 AUC: 0.747414
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[833]	valid_0's auc: 0.748679
Trial 16 - Fold 2/3 AUC: 0.748679
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[833]	valid_0's auc: 0.747338


Best trial: 15. Best value: 0.751474:  85%|████████▌ | 17/20 [04:32<01:00, 20.17s/it, 272.63/7200 seconds]

Trial 16 - Fold 3/3 AUC: 0.747338
Trial 16 - Mean AUC: 0.747810 (±0.000615)

[I 2025-11-28 10:46:30,270] Trial 16 finished with value: 0.7478103840008737 and parameters: {'num_leaves': 40, 'max_depth': 3, 'learning_rate': 0.016604801870094624, 'feature_fraction': 0.8173990431898378, 'bagging_fraction': 0.8826874841198813, 'bagging_freq': 1, 'min_child_samples': 96, 'lambda_l1': 0.15870858486868894, 'lambda_l2': 0.0008038219149872795, 'n_estimators': 833}. Best is trial 15 with value: 0.7514736659400567.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[302]	valid_0's auc: 0.749423
Trial 17 - Fold 1/3 AUC: 0.749423
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[441]	valid_0's auc: 0.753309
Trial 17 - Fold 2/3 AUC: 0.753309
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[446]	valid_0's auc: 0.751181


Best trial: 15. Best value: 0.751474:  90%|█████████ | 18/20 [04:45<00:35, 18.00s/it, 285.56/7200 seconds]

Trial 17 - Fold 3/3 AUC: 0.751181
Trial 17 - Mean AUC: 0.751305 (±0.001589)

[I 2025-11-28 10:46:43,202] Trial 17 finished with value: 0.7513045415484623 and parameters: {'num_leaves': 67, 'max_depth': 5, 'learning_rate': 0.04006245359310065, 'feature_fraction': 0.9003377348752277, 'bagging_fraction': 0.8130987912299135, 'bagging_freq': 1, 'min_child_samples': 88, 'lambda_l1': 0.0024912417260024742, 'lambda_l2': 0.2402582766512846, 'n_estimators': 662}. Best is trial 15 with value: 0.7514736659400567.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[388]	valid_0's auc: 0.749944
Trial 18 - Fold 1/3 AUC: 0.749944
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[488]	valid_0's auc: 0.753108
Trial 18 - Fold 2/3 AUC: 0.753108
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[503]	valid_0's auc: 0.750707


Best trial: 15. Best value: 0.751474:  95%|█████████▌| 19/20 [04:56<00:15, 15.94s/it, 296.72/7200 seconds]

Trial 18 - Fold 3/3 AUC: 0.750707
Trial 18 - Mean AUC: 0.751253 (±0.001348)

[I 2025-11-28 10:46:54,362] Trial 18 finished with value: 0.7512529183278366 and parameters: {'num_leaves': 72, 'max_depth': 4, 'learning_rate': 0.043312407226116824, 'feature_fraction': 0.903362077422144, 'bagging_fraction': 0.7129602548622942, 'bagging_freq': 1, 'min_child_samples': 86, 'lambda_l1': 0.0023491614730919144, 'lambda_l2': 0.1844177110368866, 'n_estimators': 690}. Best is trial 15 with value: 0.7514736659400567.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[600]	valid_0's auc: 0.74722
Trial 19 - Fold 1/3 AUC: 0.747220
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[601]	valid_0's auc: 0.750156
Trial 19 - Fold 2/3 AUC: 0.750156
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[599]	valid_0's auc: 0.747994


Best trial: 15. Best value: 0.751474: 100%|██████████| 20/20 [05:15<00:00, 15.78s/it, 315.69/7200 seconds]

Trial 19 - Fold 3/3 AUC: 0.747994
Trial 19 - Mean AUC: 0.748457 (±0.001242)

[I 2025-11-28 10:47:13,327] Trial 19 finished with value: 0.748456972665258 and parameters: {'num_leaves': 70, 'max_depth': 5, 'learning_rate': 0.014036581759563604, 'feature_fraction': 0.9182611788035248, 'bagging_fraction': 0.9239582443530352, 'bagging_freq': 1, 'min_child_samples': 90, 'lambda_l1': 0.0012937206175193438, 'lambda_l2': 0.07359974431492625, 'n_estimators': 601}. Best is trial 15 with value: 0.7514736659400567.

Best Mean AUC: 0.7514736659400567
Best hyperparameters:
 {'num_leaves': 45, 'max_depth': 5, 'learning_rate': 0.020241572916541595, 'feature_fraction': 0.8077006944020728, 'bagging_fraction': 0.8208435992012316, 'bagging_freq': 1, 'min_child_samples': 92, 'lambda_l1': 0.17960548177377253, 'lambda_l2': 0.6669757808743925, 'n_estimators': 860}






Final Model Performance:
Train Score: 0.8174869064166779
Valid Score: 0.7516898276959596


# lightGBM cause seem like this is the best for the old model
import lightgbm as lgb
# optuna para 
hyperpara = {'num_leaves': 57, 'max_depth': 4, 'learning_rate': 0.03277842323048595, 'feature_fraction': 0.9569717768433371, 'bagging_fraction': 0.8506767379185446, 'bagging_freq': 1, 'min_child_samples': 48, 'lambda_l1': 0.17636584028657937, 'lambda_l2': 0.025244809309038312, 'n_estimators': 2486}

lgbm = lgb.LGBMClassifier(**hyperpara)
lgbm.fit(X_train, y_train, eval_set= [(X_valid, y_valid)])

lgbm_train_pred = lgbm.predict_proba(X_train)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid)[:, 1]

print(f"Train Score: {roc_auc_score(y_train, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgbm_valid_pred)}")

### under this is the output part


In [26]:
# Make predictions for the test data
# Change model name if needed
pred = lgbm.predict_proba(X_test_std)[:, 1]



In [27]:
# Put the prediction into the format of submission
sample_sub['TARGET'] = pred
sample_sub

Unnamed: 0,SK_ID_CURR,TARGET
0,171202,0.027374
1,171203,0.195480
2,171204,0.112429
3,171205,0.122038
4,171206,0.141326
...,...,...
61495,232697,0.157011
61496,232698,0.046755
61497,232699,0.031213
61498,232700,0.076052


In [28]:
# Create the "output" directory if it doesn't exist
output_dir = Path.cwd() / "output"
os.makedirs(output_dir, exist_ok=True)

# Specify the new output file path
output_file = output_dir / "submission.csv"

# Save the CSV file to the "output" directory
sample_sub.to_csv(output_file, index=False)