In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [4]:
%run features_testing.ipynb

In [5]:
# Split the data into explanatory and target variables
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test.values

In [6]:
#from sklearn.preprocessing import PowerTransformer
#
#pt = PowerTransformer(method='yeo-johnson')
#X_pt = pt.fit_transform(X)
#X_test_pt = pt.transform(X_test)

In [7]:
# Standardization
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
X_test_std = sc.transform(X_test)

In [8]:
# Split the original data into the training data and the validation data
X_train, X_valid, y_train, y_valid = train_test_split(X_std, y, test_size=0.3, stratify=y, random_state=0)

In [9]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",

        "num_leaves": trial.suggest_int("num_leaves", 16, 128),  # Reduced from 256
        "max_depth": trial.suggest_int("max_depth", 3, 8),  # Reduced from -1 to 12
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),  # Narrowed range

        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),  # Narrowed
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),  # Narrowed
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Reduced from 10

        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),  # Narrowed
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-4, 1.0, log=True),  # Narrowed
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 1.0, log=True),  # Narrowed

        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),  # Reduced from 3000
        "verbose": -1,
        
        # GPU parameters
        "device": "gpu" ,
        "gpu_platform_id": 0 ,
        "gpu_device_id": 0 ,
    }

    # Initialize k-fold cross-validation
    # Reduced to 3 folds for faster training
    n_splits = 3
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    auc_scores = []
    
    # Convert X_std to DataFrame to avoid feature names warning
    # Create feature names if they don't exist
    if isinstance(X_std, np.ndarray):
        feature_names = [f'feature_{i}' for i in range(X_std.shape[1])]
        x_train_df = pd.DataFrame(X_std, columns=feature_names)
    else:
        x_train_df = X_std
    
    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_df, y)):
        x_train_fold = x_train_df.iloc[train_idx]
        y_train_fold = y[train_idx]
        x_val_fold = x_train_df.iloc[val_idx]
        y_val_fold = y[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        
        model.fit(
            x_train_fold,
            y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),  # Reduced from 100
                lgb.log_evaluation(period=0)  # Silent training
            ]
        )
        
        preds = model.predict_proba(x_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds)
        auc_scores.append(fold_auc)
        
        print(f"Trial {trial.number} - Fold {fold + 1}/{n_splits} AUC: {fold_auc:.6f}")
    
    # Return mean AUC across all folds
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"Trial {trial.number} - Mean AUC: {mean_auc:.6f} (±{std_auc:.6f})\n")
    
    return mean_auc


# Create and run the study
# n_jobs=1 when using GPU (GPU doesn't benefit from parallel trials)
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_optimization_kfold",
    sampler=optuna.samplers.TPESampler(seed=42),  # For reproducibility
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)  # Prune bad trials early
)

n_trials = 20 

study.optimize(
    objective, 
    n_trials=n_trials,
    n_jobs=1,  # LightGBM GPU doesn't support parallel training
    show_progress_bar=True,
    timeout=7200  # 2 hour timeout as safety measure
)

print("\nBest Mean AUC:", study.best_value)
print("Best hyperparameters:\n", study.best_params)

# Train final model with best parameters on full training data
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,

    "device": "gpu" ,
    "gpu_platform_id": 0 ,
    "gpu_device_id": 0 ,
})

# Split data for final validation
X_train_final, X_valid_final, y_train_final, y_valid_final = train_test_split(
    X_std, y, test_size=0.3, stratify=y, random_state=0
)

lgbm = lgb.LGBMClassifier(**best_params)
lgbm.fit(X_train_final, y_train_final, eval_set=[(X_valid_final, y_valid_final)])

lgbm_train_pred = lgbm.predict_proba(X_train_final)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid_final)[:, 1]

print(f"\nFinal Model Performance:")
print(f"Train Score: {roc_auc_score(y_train_final, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid_final, lgbm_valid_pred)}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-28 11:25:15,934] A new study created in memory with name: lgbm_optimization_kfold
  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[164]	valid_0's auc: 0.746762
Trial 0 - Fold 1/3 AUC: 0.746762
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[198]	valid_0's auc: 0.751247
Trial 0 - Fold 2/3 AUC: 0.751247
Training until validation scores don't improve for 30 rounds


Best trial: 0. Best value: 0.748468:   5%|▌         | 1/20 [00:15<04:59, 15.76s/it, 15.76/7200 seconds]

Early stopping, best iteration is:
[207]	valid_0's auc: 0.747394
Trial 0 - Fold 3/3 AUC: 0.747394
Trial 0 - Mean AUC: 0.748468 (±0.001982)

[I 2025-11-28 11:25:31,692] Trial 0 finished with value: 0.748467809316832 and parameters: {'num_leaves': 58, 'max_depth': 8, 'learning_rate': 0.05395030966670229, 'feature_fraction': 0.8795975452591109, 'bagging_fraction': 0.7468055921327309, 'bagging_freq': 1, 'min_child_samples': 24, 'lambda_l1': 0.29154431891537513, 'lambda_l2': 0.02537815508265665, 'n_estimators': 737}. Best is trial 0 with value: 0.748467809316832.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[173]	valid_0's auc: 0.747384
Trial 1 - Fold 1/3 AUC: 0.747384
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[257]	valid_0's auc: 0.751095
Trial 1 - Fold 2/3 AUC: 0.751095
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748847:  10%|█         | 2/20 [00:24<03:25, 11.41s/it, 24.12/7200 seconds]

Early stopping, best iteration is:
[169]	valid_0's auc: 0.748063
Trial 1 - Fold 3/3 AUC: 0.748063
Trial 1 - Mean AUC: 0.748847 (±0.001613)

[I 2025-11-28 11:25:40,059] Trial 1 finished with value: 0.7488472731671864 and parameters: {'num_leaves': 18, 'max_depth': 8, 'learning_rate': 0.06798962421591129, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'min_child_samples': 44, 'lambda_l1': 0.012561043700013555, 'lambda_l2': 0.005342937261279773, 'n_estimators': 362}. Best is trial 1 with value: 0.7488472731671864.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.731359
Trial 2 - Fold 1/3 AUC: 0.731359
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[140]	valid_0's auc: 0.728835
Trial 2 - Fold 2/3 AUC: 0.728835
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748847:  15%|█▌        | 3/20 [00:27<02:15,  7.95s/it, 27.96/7200 seconds]

Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.727149
Trial 2 - Fold 3/3 AUC: 0.727149
Trial 2 - Mean AUC: 0.729114 (±0.001730)

[I 2025-11-28 11:25:43,891] Trial 2 finished with value: 0.7291144852456698 and parameters: {'num_leaves': 85, 'max_depth': 3, 'learning_rate': 0.019594972058679168, 'feature_fraction': 0.8099085529881075, 'bagging_fraction': 0.8368209952651108, 'bagging_freq': 4, 'min_child_samples': 36, 'lambda_l1': 0.011400863701127324, 'lambda_l2': 0.0234238498471129, 'n_estimators': 141}. Best is trial 1 with value: 0.7488472731671864.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.742224
Trial 3 - Fold 1/3 AUC: 0.742224
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.74482
Trial 3 - Fold 2/3 AUC: 0.744820
Training until validation scores don't improve for 30 rounds
Did not meet ea

Best trial: 1. Best value: 0.748847:  20%|██        | 4/20 [00:43<02:53, 10.87s/it, 43.30/7200 seconds]

Trial 3 - Fold 3/3 AUC: 0.742958
Trial 3 - Mean AUC: 0.743334 (±0.001093)

[I 2025-11-28 11:25:59,233] Trial 3 finished with value: 0.7433340056642495 and parameters: {'num_leaves': 84, 'max_depth': 4, 'learning_rate': 0.011615865989246453, 'feature_fraction': 0.984665661176, 'bagging_fraction': 0.9896896099223678, 'bagging_freq': 5, 'min_child_samples': 44, 'lambda_l1': 0.00024586032763280086, 'lambda_l2': 0.054567254856014755, 'n_estimators': 496}. Best is trial 1 with value: 0.7488472731671864.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.738513
Trial 4 - Fold 1/3 AUC: 0.738513
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.739745
Trial 4 - Fold 2/3 AUC: 0.739745
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748847:  25%|██▌       | 5/20 [00:55<02:51, 11.43s/it, 55.72/7200 seconds]

Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.737199
Trial 4 - Fold 3/3 AUC: 0.737199
Trial 4 - Mean AUC: 0.738486 (±0.001040)

[I 2025-11-28 11:26:11,650] Trial 4 finished with value: 0.7384858661572219 and parameters: {'num_leaves': 29, 'max_depth': 5, 'learning_rate': 0.01082401838150096, 'feature_fraction': 0.9727961206236346, 'bagging_fraction': 0.777633994480005, 'bagging_freq': 4, 'min_child_samples': 45, 'lambda_l1': 0.012030178871154668, 'lambda_l2': 0.015375920235481757, 'n_estimators': 266}. Best is trial 1 with value: 0.7488472731671864.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[97]	valid_0's auc: 0.744304
Trial 5 - Fold 1/3 AUC: 0.744304
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[103]	valid_0's auc: 0.748266
Trial 5 - Fold 2/3 AUC: 0.748266
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.748847:  30%|███       | 6/20 [01:07<02:42, 11.64s/it, 67.77/7200 seconds]

Early stopping, best iteration is:
[113]	valid_0's auc: 0.74416
Trial 5 - Fold 3/3 AUC: 0.744160
Trial 5 - Mean AUC: 0.745577 (±0.001902)

[I 2025-11-28 11:26:23,707] Trial 5 finished with value: 0.7455765953868747 and parameters: {'num_leaves': 125, 'max_depth': 7, 'learning_rate': 0.08699593128513321, 'feature_fraction': 0.9684482051282947, 'bagging_fraction': 0.8793699936433255, 'bagging_freq': 5, 'min_child_samples': 27, 'lambda_l1': 0.0006080390190296605, 'lambda_l2': 0.00015167330688076205, 'n_estimators': 393}. Best is trial 1 with value: 0.7488472731671864.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[263]	valid_0's auc: 0.749635
Trial 6 - Fold 1/3 AUC: 0.749635
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.750911
Trial 6 - Fold 2/3 AUC: 0.750911
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.750361:  35%|███▌      | 7/20 [01:18<02:27, 11.32s/it, 78.42/7200 seconds]

Early stopping, best iteration is:
[336]	valid_0's auc: 0.750538
Trial 6 - Fold 3/3 AUC: 0.750538
Trial 6 - Mean AUC: 0.750361 (±0.000535)

[I 2025-11-28 11:26:34,358] Trial 6 finished with value: 0.7503613413595965 and parameters: {'num_leaves': 59, 'max_depth': 4, 'learning_rate': 0.0674120461070276, 'feature_fraction': 0.8070259980080767, 'bagging_fraction': 0.7842803529062142, 'bagging_freq': 3, 'min_child_samples': 31, 'lambda_l1': 0.16172900811143134, 'lambda_l2': 0.00019870215385428647, 'n_estimators': 989}. Best is trial 6 with value: 0.7503613413595965.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.730504
Trial 7 - Fold 1/3 AUC: 0.730504
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.729082
Trial 7 - Fold 2/3 AUC: 0.729082
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.750361:  40%|████      | 8/20 [01:25<01:59,  9.99s/it, 85.58/7200 seconds]

Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.726866
Trial 7 - Fold 3/3 AUC: 0.726866
Trial 7 - Mean AUC: 0.728817 (±0.001497)

[I 2025-11-28 11:26:41,515] Trial 7 finished with value: 0.728817231728431 and parameters: {'num_leaves': 103, 'max_depth': 4, 'learning_rate': 0.010127963257331486, 'feature_fraction': 0.9446384285364502, 'bagging_fraction': 0.9120572031542851, 'bagging_freq': 4, 'min_child_samples': 82, 'lambda_l1': 0.00019777828512462724, 'lambda_l2': 0.00271558195528294, 'n_estimators': 204}. Best is trial 6 with value: 0.7503613413595965.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[522]	valid_0's auc: 0.749389
Trial 8 - Fold 1/3 AUC: 0.749389
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[510]	valid_0's auc: 0.751602
Trial 8 - Fold 2/3 AUC: 0.751602
Training until validation scores don't improve for 30 rounds
Early stopp

Best trial: 6. Best value: 0.750361:  45%|████▌     | 9/20 [01:55<02:59, 16.35s/it, 115.91/7200 seconds]

Trial 8 - Fold 3/3 AUC: 0.749899
Trial 8 - Mean AUC: 0.750297 (±0.000946)

[I 2025-11-28 11:27:11,843] Trial 8 finished with value: 0.750296961125248 and parameters: {'num_leaves': 113, 'max_depth': 6, 'learning_rate': 0.02142387495644906, 'feature_fraction': 0.7190675050858071, 'bagging_fraction': 0.7932946965146986, 'bagging_freq': 2, 'min_child_samples': 79, 'lambda_l1': 0.035500125258511595, 'lambda_l2': 0.35387588647792356, 'n_estimators': 525}. Best is trial 6 with value: 0.7503613413595965.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[149]	valid_0's auc: 0.747295
Trial 9 - Fold 1/3 AUC: 0.747295
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[192]	valid_0's auc: 0.750079
Trial 9 - Fold 2/3 AUC: 0.750079
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.750361:  50%|█████     | 10/20 [02:05<02:21, 14.15s/it, 125.15/7200 seconds]

Did not meet early stopping. Best iteration is:
[197]	valid_0's auc: 0.748602
Trial 9 - Fold 3/3 AUC: 0.748602
Trial 9 - Mean AUC: 0.748659 (±0.001137)

[I 2025-11-28 11:27:21,079] Trial 9 finished with value: 0.7486588162939188 and parameters: {'num_leaves': 29, 'max_depth': 7, 'learning_rate': 0.057648106701146694, 'feature_fraction': 0.8683831592708489, 'bagging_fraction': 0.9312901539863683, 'bagging_freq': 3, 'min_child_samples': 62, 'lambda_l1': 0.005130551760589831, 'lambda_l2': 0.00012637946338082883, 'n_estimators': 197}. Best is trial 6 with value: 0.7503613413595965.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[657]	valid_0's auc: 0.75052
Trial 10 - Fold 1/3 AUC: 0.750520
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[749]	valid_0's auc: 0.752481
Trial 10 - Fold 2/3 AUC: 0.752481
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:


Best trial: 10. Best value: 0.751453:  55%|█████▌    | 11/20 [02:23<02:20, 15.57s/it, 143.93/7200 seconds]

Trial 10 - Fold 3/3 AUC: 0.751358
Trial 10 - Mean AUC: 0.751453 (±0.000803)

[I 2025-11-28 11:27:39,869] Trial 10 finished with value: 0.751453168536918 and parameters: {'num_leaves': 59, 'max_depth': 3, 'learning_rate': 0.03783138462055962, 'feature_fraction': 0.8040552674219865, 'bagging_fraction': 0.7066392625569425, 'bagging_freq': 2, 'min_child_samples': 97, 'lambda_l1': 0.7795612091139277, 'lambda_l2': 0.0006934125446843669, 'n_estimators': 994}. Best is trial 10 with value: 0.751453168536918.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[907]	valid_0's auc: 0.751569
Trial 11 - Fold 1/3 AUC: 0.751569
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[822]	valid_0's auc: 0.752982
Trial 11 - Fold 2/3 AUC: 0.752982
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[734]	valid_0's auc: 0.75108


Best trial: 11. Best value: 0.751877:  60%|██████    | 12/20 [02:45<02:18, 17.34s/it, 165.33/7200 seconds]

Trial 11 - Fold 3/3 AUC: 0.751080
Trial 11 - Mean AUC: 0.751877 (±0.000806)

[I 2025-11-28 11:28:01,263] Trial 11 finished with value: 0.7518768652645011 and parameters: {'num_leaves': 56, 'max_depth': 3, 'learning_rate': 0.03704127704221621, 'feature_fraction': 0.8111460167977234, 'bagging_fraction': 0.7053937642080603, 'bagging_freq': 2, 'min_child_samples': 100, 'lambda_l1': 0.9901967052454858, 'lambda_l2': 0.0007300358083644771, 'n_estimators': 976}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[812]	valid_0's auc: 0.751025
Trial 12 - Fold 1/3 AUC: 0.751025
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[822]	valid_0's auc: 0.753039
Trial 12 - Fold 2/3 AUC: 0.753039
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[712]	valid_0's auc: 0.751318


Best trial: 11. Best value: 0.751877:  65%|██████▌   | 13/20 [03:06<02:09, 18.45s/it, 186.32/7200 seconds]

Trial 12 - Fold 3/3 AUC: 0.751318
Trial 12 - Mean AUC: 0.751794 (±0.000888)

[I 2025-11-28 11:28:22,250] Trial 12 finished with value: 0.751793550292769 and parameters: {'num_leaves': 54, 'max_depth': 3, 'learning_rate': 0.037524496913365166, 'feature_fraction': 0.8137188026058412, 'bagging_fraction': 0.7045504544754168, 'bagging_freq': 2, 'min_child_samples': 100, 'lambda_l1': 0.9639757903159535, 'lambda_l2': 0.0009976642826646919, 'n_estimators': 997}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[718]	valid_0's auc: 0.750711
Trial 13 - Fold 1/3 AUC: 0.750711
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[806]	valid_0's auc: 0.752557
Trial 13 - Fold 2/3 AUC: 0.752557
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[770]	valid_0's auc: 0.750822


Best trial: 11. Best value: 0.751877:  70%|███████   | 14/20 [03:27<01:54, 19.13s/it, 207.02/7200 seconds]

Trial 13 - Fold 3/3 AUC: 0.750822
Trial 13 - Mean AUC: 0.751363 (±0.000845)

[I 2025-11-28 11:28:42,953] Trial 13 finished with value: 0.7513631321139354 and parameters: {'num_leaves': 45, 'max_depth': 3, 'learning_rate': 0.03428441285554138, 'feature_fraction': 0.8990219285835948, 'bagging_fraction': 0.7144988113114171, 'bagging_freq': 2, 'min_child_samples': 99, 'lambda_l1': 0.7266943511144078, 'lambda_l2': 0.0015946316827111769, 'n_estimators': 806}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[576]	valid_0's auc: 0.750213
Trial 14 - Fold 1/3 AUC: 0.750213
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[679]	valid_0's auc: 0.753138
Trial 14 - Fold 2/3 AUC: 0.753138
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[618]	valid_0's auc: 0.750976


Best trial: 11. Best value: 0.751877:  75%|███████▌  | 15/20 [03:56<01:51, 22.24s/it, 236.48/7200 seconds]

Trial 14 - Fold 3/3 AUC: 0.750976
Trial 14 - Mean AUC: 0.751442 (±0.001239)

[I 2025-11-28 11:29:12,420] Trial 14 finished with value: 0.7514420802146357 and parameters: {'num_leaves': 75, 'max_depth': 5, 'learning_rate': 0.02601396887955182, 'feature_fraction': 0.7496668844156134, 'bagging_fraction': 0.8356978938632825, 'bagging_freq': 2, 'min_child_samples': 84, 'lambda_l1': 0.08861339536715615, 'lambda_l2': 0.0006709551581928013, 'n_estimators': 803}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[684]	valid_0's auc: 0.751383
Trial 15 - Fold 1/3 AUC: 0.751383
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[699]	valid_0's auc: 0.752236
Trial 15 - Fold 2/3 AUC: 0.752236
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[698]	valid_0's auc: 0.750746


Best trial: 11. Best value: 0.751877:  80%|████████  | 16/20 [04:08<01:16, 19.03s/it, 248.05/7200 seconds]

Trial 15 - Fold 3/3 AUC: 0.750746
Trial 15 - Mean AUC: 0.751455 (±0.000610)

[I 2025-11-28 11:29:23,984] Trial 15 finished with value: 0.7514549849987423 and parameters: {'num_leaves': 45, 'max_depth': 3, 'learning_rate': 0.04066866873014908, 'feature_fraction': 0.8424019717573158, 'bagging_fraction': 0.7014220358959268, 'bagging_freq': 1, 'min_child_samples': 73, 'lambda_l1': 0.8852837166182764, 'lambda_l2': 0.00078632255712794, 'n_estimators': 701}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[899]	valid_0's auc: 0.749813
Trial 16 - Fold 1/3 AUC: 0.749813
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[898]	valid_0's auc: 0.751049
Trial 16 - Fold 2/3 AUC: 0.751049
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[899]	valid_0's auc: 0.750328


Best trial: 11. Best value: 0.751877:  85%|████████▌ | 17/20 [04:34<01:03, 21.18s/it, 274.24/7200 seconds]

Trial 16 - Fold 3/3 AUC: 0.750328
Trial 16 - Mean AUC: 0.750396 (±0.000507)

[I 2025-11-28 11:29:50,170] Trial 16 finished with value: 0.7503964454131813 and parameters: {'num_leaves': 46, 'max_depth': 4, 'learning_rate': 0.01624839426010113, 'feature_fraction': 0.8346975091183737, 'bagging_fraction': 0.7437905882663138, 'bagging_freq': 3, 'min_child_samples': 92, 'lambda_l1': 0.0021687686030076835, 'lambda_l2': 0.005309699778041148, 'n_estimators': 899}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[307]	valid_0's auc: 0.748802
Trial 17 - Fold 1/3 AUC: 0.748802
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[433]	valid_0's auc: 0.751749
Trial 17 - Fold 2/3 AUC: 0.751749
Training until validation scores don't improve for 30 rounds


Best trial: 11. Best value: 0.751877:  90%|█████████ | 18/20 [04:51<00:39, 19.95s/it, 291.33/7200 seconds]

Early stopping, best iteration is:
[298]	valid_0's auc: 0.750567
Trial 17 - Fold 3/3 AUC: 0.750567
Trial 17 - Mean AUC: 0.750373 (±0.001211)

[I 2025-11-28 11:30:07,265] Trial 17 finished with value: 0.7503729624083938 and parameters: {'num_leaves': 68, 'max_depth': 5, 'learning_rate': 0.045190117184934, 'feature_fraction': 0.7701555384645838, 'bagging_fraction': 0.8130987912299135, 'bagging_freq': 2, 'min_child_samples': 65, 'lambda_l1': 0.0472581081269157, 'lambda_l2': 0.0003840744634044172, 'n_estimators': 637}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[412]	valid_0's auc: 0.750051
Trial 18 - Fold 1/3 AUC: 0.750051
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[461]	valid_0's auc: 0.752038
Trial 18 - Fold 2/3 AUC: 0.752038
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[408]	valid_0'

Best trial: 11. Best value: 0.751877:  95%|█████████▌| 19/20 [05:12<00:20, 20.21s/it, 312.14/7200 seconds]

Trial 18 - Fold 3/3 AUC: 0.749281
Trial 18 - Mean AUC: 0.750457 (±0.001162)

[I 2025-11-28 11:30:28,080] Trial 18 finished with value: 0.7504568232105983 and parameters: {'num_leaves': 94, 'max_depth': 6, 'learning_rate': 0.027454807354694015, 'feature_fraction': 0.9062544098604814, 'bagging_fraction': 0.7408764477740766, 'bagging_freq': 1, 'min_child_samples': 90, 'lambda_l1': 0.24424121263724016, 'lambda_l2': 0.0024229129379055355, 'n_estimators': 882}. Best is trial 11 with value: 0.7518768652645011.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[395]	valid_0's auc: 0.750041
Trial 19 - Fold 1/3 AUC: 0.750041
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[394]	valid_0's auc: 0.752578
Trial 19 - Fold 2/3 AUC: 0.752578
Training until validation scores don't improve for 30 rounds


Best trial: 11. Best value: 0.751877: 100%|██████████| 20/20 [05:23<00:00, 16.16s/it, 323.12/7200 seconds]

Early stopping, best iteration is:
[306]	valid_0's auc: 0.750551
Trial 19 - Fold 3/3 AUC: 0.750551
Trial 19 - Mean AUC: 0.751057 (±0.001096)

[I 2025-11-28 11:30:39,053] Trial 19 finished with value: 0.7510565766791206 and parameters: {'num_leaves': 37, 'max_depth': 3, 'learning_rate': 0.09913462872002565, 'feature_fraction': 0.7034416690065798, 'bagging_fraction': 0.9784308604698617, 'bagging_freq': 2, 'min_child_samples': 70, 'lambda_l1': 0.3108634868900223, 'lambda_l2': 0.9448679984400115, 'n_estimators': 882}. Best is trial 11 with value: 0.7518768652645011.

Best Mean AUC: 0.7518768652645011
Best hyperparameters:
 {'num_leaves': 56, 'max_depth': 3, 'learning_rate': 0.03704127704221621, 'feature_fraction': 0.8111460167977234, 'bagging_fraction': 0.7053937642080603, 'bagging_freq': 2, 'min_child_samples': 100, 'lambda_l1': 0.9901967052454858, 'lambda_l2': 0.0007300358083644771, 'n_estimators': 976}






Final Model Performance:
Train Score: 0.7854364996098675
Valid Score: 0.7524678934040427


# lightGBM cause seem like this is the best for the old model
import lightgbm as lgb
# optuna para 
hyperpara = {'num_leaves': 57, 'max_depth': 4, 'learning_rate': 0.03277842323048595, 'feature_fraction': 0.9569717768433371, 'bagging_fraction': 0.8506767379185446, 'bagging_freq': 1, 'min_child_samples': 48, 'lambda_l1': 0.17636584028657937, 'lambda_l2': 0.025244809309038312, 'n_estimators': 2486}

lgbm = lgb.LGBMClassifier(**hyperpara)
lgbm.fit(X_train, y_train, eval_set= [(X_valid, y_valid)])

lgbm_train_pred = lgbm.predict_proba(X_train)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid)[:, 1]

print(f"Train Score: {roc_auc_score(y_train, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgbm_valid_pred)}")

### under this is the output part


In [10]:
# Make predictions for the test data
# Change model name if needed
pred = lgbm.predict_proba(X_test_std)[:, 1]



In [11]:
# Put the prediction into the format of submission
sample_sub['TARGET'] = pred
sample_sub

Unnamed: 0,SK_ID_CURR,TARGET
0,171202,0.028925
1,171203,0.174882
2,171204,0.113239
3,171205,0.115243
4,171206,0.129844
...,...,...
61495,232697,0.133867
61496,232698,0.050011
61497,232699,0.037502
61498,232700,0.101828


In [12]:
# Create the "output" directory if it doesn't exist
output_dir = Path.cwd() / "output"
os.makedirs(output_dir, exist_ok=True)

# Specify the new output file path
output_file = output_dir / "submission.csv"

# Save the CSV file to the "output" directory
sample_sub.to_csv(output_file, index=False)