In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [4]:
%run features_testing.ipynb

In [5]:
# Split the data into explanatory and target variables
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test.values

In [6]:
#from sklearn.preprocessing import PowerTransformer
#
#pt = PowerTransformer(method='yeo-johnson')
#X_pt = pt.fit_transform(X)
#X_test_pt = pt.transform(X_test)

In [7]:
# Standardization
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
X_test_std = sc.transform(X_test)

In [8]:
# Split the original data into the training data and the validation data
X_train, X_valid, y_train, y_valid = train_test_split(X_std, y, test_size=0.3, stratify=y, random_state=0)

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",

        "num_leaves": trial.suggest_int("num_leaves", 16, 128),  # Reduced from 256
        "max_depth": trial.suggest_int("max_depth", 3, 8),  # Reduced from -1 to 12
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),  # Narrowed range

        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),  # Narrowed
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),  # Narrowed
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Reduced from 10

        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),  # Narrowed
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-4, 1.0, log=True),  # Narrowed
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 1.0, log=True),  # Narrowed

        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),  # Reduced from 3000
        "verbose": -1,
        
        # GPU parameters
        "device": "gpu" ,
        "gpu_platform_id": 0 ,
        "gpu_device_id": 0 ,
    }

    # Initialize k-fold cross-validation
    # Reduced to 3 folds for faster training
    n_splits = 3
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    auc_scores = []
    
    # Convert X_std to DataFrame to avoid feature names warning
    # Create feature names if they don't exist
    if isinstance(X_std, np.ndarray):
        feature_names = [f'feature_{i}' for i in range(X_std.shape[1])]
        x_train_df = pd.DataFrame(X_std, columns=feature_names)
    else:
        x_train_df = X_std
    
    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_df, y)):
        x_train_fold = x_train_df.iloc[train_idx]
        y_train_fold = y[train_idx]
        x_val_fold = x_train_df.iloc[val_idx]
        y_val_fold = y[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        
        model.fit(
            x_train_fold,
            y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),  # Reduced from 100
                lgb.log_evaluation(period=0)  # Silent training
            ]
        )
        
        preds = model.predict_proba(x_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds)
        auc_scores.append(fold_auc)
        
        print(f"Trial {trial.number} - Fold {fold + 1}/{n_splits} AUC: {fold_auc:.6f}")
    
    # Return mean AUC across all folds
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"Trial {trial.number} - Mean AUC: {mean_auc:.6f} (±{std_auc:.6f})\n")
    
    return mean_auc


# Create and run the study
# n_jobs=1 when using GPU (GPU doesn't benefit from parallel trials)
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_optimization_kfold",
    sampler=optuna.samplers.TPESampler(seed=42),  # For reproducibility
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)  # Prune bad trials early
)

n_trials = 20 

study.optimize(
    objective, 
    n_trials=n_trials,
    n_jobs=1,  # LightGBM GPU doesn't support parallel training
    show_progress_bar=True,
    timeout=7200  # 2 hour timeout as safety measure
)

print("\nBest Mean AUC:", study.best_value)
print("Best hyperparameters:\n", study.best_params)

# Train final model with best parameters on full training data
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,

    "device": "gpu" ,
    "gpu_platform_id": 0 ,
    "gpu_device_id": 0 ,
})

# Split data for final validation
X_train_final, X_valid_final, y_train_final, y_valid_final = train_test_split(
    X_std, y, test_size=0.3, stratify=y, random_state=0
)

lgbm = lgb.LGBMClassifier(**best_params)
lgbm.fit(X_train_final, y_train_final, eval_set=[(X_valid_final, y_valid_final)])

lgbm_train_pred = lgbm.predict_proba(X_train_final)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid_final)[:, 1]

print(f"\nFinal Model Performance:")
print(f"Train Score: {roc_auc_score(y_train_final, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid_final, lgbm_valid_pred)}")

[I 2025-11-28 01:42:32,979] A new study created in memory with name: lgbm_optimization_kfold
  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[155]	valid_0's auc: 0.743425
Trial 0 - Fold 1/3 AUC: 0.743425
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[195]	valid_0's auc: 0.746374
Trial 0 - Fold 2/3 AUC: 0.746374
Training until validation scores don't improve for 30 rounds


Best trial: 0. Best value: 0.744456:   5%|▌         | 1/20 [00:11<03:42, 11.69s/it, 11.69/7200 seconds]

Early stopping, best iteration is:
[145]	valid_0's auc: 0.743569
Trial 0 - Fold 3/3 AUC: 0.743569
Trial 0 - Mean AUC: 0.744456 (±0.001357)

[I 2025-11-28 01:42:44,672] Trial 0 finished with value: 0.7444558708452923 and parameters: {'num_leaves': 58, 'max_depth': 8, 'learning_rate': 0.05395030966670229, 'feature_fraction': 0.8795975452591109, 'bagging_fraction': 0.7468055921327309, 'bagging_freq': 1, 'min_child_samples': 24, 'lambda_l1': 0.29154431891537513, 'lambda_l2': 0.02537815508265665, 'n_estimators': 737}. Best is trial 0 with value: 0.7444558708452923.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[176]	valid_0's auc: 0.745232
Trial 1 - Fold 1/3 AUC: 0.745232
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[204]	valid_0's auc: 0.748726
Trial 1 - Fold 2/3 AUC: 0.748726
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.746754:  10%|█         | 2/20 [00:18<02:36,  8.72s/it, 18.33/7200 seconds]

Early stopping, best iteration is:
[239]	valid_0's auc: 0.746304
Trial 1 - Fold 3/3 AUC: 0.746304
Trial 1 - Mean AUC: 0.746754 (±0.001461)

[I 2025-11-28 01:42:51,305] Trial 1 finished with value: 0.7467538785086787 and parameters: {'num_leaves': 18, 'max_depth': 8, 'learning_rate': 0.06798962421591129, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'min_child_samples': 44, 'lambda_l1': 0.012561043700013555, 'lambda_l2': 0.005342937261279773, 'n_estimators': 362}. Best is trial 1 with value: 0.7467538785086787.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.731071
Trial 2 - Fold 1/3 AUC: 0.731071
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.729138
Trial 2 - Fold 2/3 AUC: 0.729138
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.746754:  15%|█▌        | 3/20 [00:21<01:46,  6.29s/it, 21.73/7200 seconds]

Did not meet early stopping. Best iteration is:
[141]	valid_0's auc: 0.726358
Trial 2 - Fold 3/3 AUC: 0.726358
Trial 2 - Mean AUC: 0.728856 (±0.001935)

[I 2025-11-28 01:42:54,712] Trial 2 finished with value: 0.7288555220572212 and parameters: {'num_leaves': 85, 'max_depth': 3, 'learning_rate': 0.019594972058679168, 'feature_fraction': 0.8099085529881075, 'bagging_fraction': 0.8368209952651108, 'bagging_freq': 4, 'min_child_samples': 36, 'lambda_l1': 0.011400863701127324, 'lambda_l2': 0.0234238498471129, 'n_estimators': 141}. Best is trial 1 with value: 0.7467538785086787.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.741578
Trial 3 - Fold 1/3 AUC: 0.741578
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.744064
Trial 3 - Fold 2/3 AUC: 0.744064
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.746754:  20%|██        | 4/20 [00:34<02:20,  8.80s/it, 34.37/7200 seconds]

Did not meet early stopping. Best iteration is:
[496]	valid_0's auc: 0.742027
Trial 3 - Fold 3/3 AUC: 0.742027
Trial 3 - Mean AUC: 0.742556 (±0.001082)

[I 2025-11-28 01:43:07,352] Trial 3 finished with value: 0.7425563830354798 and parameters: {'num_leaves': 84, 'max_depth': 4, 'learning_rate': 0.011615865989246453, 'feature_fraction': 0.984665661176, 'bagging_fraction': 0.9896896099223678, 'bagging_freq': 5, 'min_child_samples': 44, 'lambda_l1': 0.00024586032763280086, 'lambda_l2': 0.054567254856014755, 'n_estimators': 496}. Best is trial 1 with value: 0.7467538785086787.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.73828
Trial 4 - Fold 1/3 AUC: 0.738280
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[264]	valid_0's auc: 0.740076
Trial 4 - Fold 2/3 AUC: 0.740076
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.746754:  25%|██▌       | 5/20 [00:44<02:20,  9.36s/it, 44.74/7200 seconds]

Did not meet early stopping. Best iteration is:
[266]	valid_0's auc: 0.736901
Trial 4 - Fold 3/3 AUC: 0.736901
Trial 4 - Mean AUC: 0.738419 (±0.001300)

[I 2025-11-28 01:43:17,720] Trial 4 finished with value: 0.738419066003154 and parameters: {'num_leaves': 29, 'max_depth': 5, 'learning_rate': 0.01082401838150096, 'feature_fraction': 0.9727961206236346, 'bagging_fraction': 0.777633994480005, 'bagging_freq': 4, 'min_child_samples': 45, 'lambda_l1': 0.012030178871154668, 'lambda_l2': 0.015375920235481757, 'n_estimators': 266}. Best is trial 1 with value: 0.7467538785086787.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.742111
Trial 5 - Fold 1/3 AUC: 0.742111
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[60]	valid_0's auc: 0.744712
Trial 5 - Fold 2/3 AUC: 0.744712
Training until validation scores don't improve for 30 rounds


Best trial: 1. Best value: 0.746754:  30%|███       | 6/20 [00:53<02:10,  9.30s/it, 53.90/7200 seconds]

Early stopping, best iteration is:
[102]	valid_0's auc: 0.74313
Trial 5 - Fold 3/3 AUC: 0.743130
Trial 5 - Mean AUC: 0.743318 (±0.001070)

[I 2025-11-28 01:43:26,883] Trial 5 finished with value: 0.7433176640548536 and parameters: {'num_leaves': 125, 'max_depth': 7, 'learning_rate': 0.08699593128513321, 'feature_fraction': 0.9684482051282947, 'bagging_fraction': 0.8793699936433255, 'bagging_freq': 5, 'min_child_samples': 27, 'lambda_l1': 0.0006080390190296605, 'lambda_l2': 0.00015167330688076205, 'n_estimators': 393}. Best is trial 1 with value: 0.7467538785086787.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[187]	valid_0's auc: 0.746227
Trial 6 - Fold 1/3 AUC: 0.746227
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[373]	valid_0's auc: 0.749628
Trial 6 - Fold 2/3 AUC: 0.749628
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.747506:  35%|███▌      | 7/20 [01:01<01:55,  8.90s/it, 61.99/7200 seconds]

Early stopping, best iteration is:
[228]	valid_0's auc: 0.746664
Trial 6 - Fold 3/3 AUC: 0.746664
Trial 6 - Mean AUC: 0.747506 (±0.001511)

[I 2025-11-28 01:43:34,972] Trial 6 finished with value: 0.7475062454789164 and parameters: {'num_leaves': 59, 'max_depth': 4, 'learning_rate': 0.0674120461070276, 'feature_fraction': 0.8070259980080767, 'bagging_fraction': 0.7842803529062142, 'bagging_freq': 3, 'min_child_samples': 31, 'lambda_l1': 0.16172900811143134, 'lambda_l2': 0.00019870215385428647, 'n_estimators': 989}. Best is trial 6 with value: 0.7475062454789164.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.730555
Trial 7 - Fold 1/3 AUC: 0.730555
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.730044
Trial 7 - Fold 2/3 AUC: 0.730044
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.747506:  40%|████      | 8/20 [01:07<01:34,  7.89s/it, 67.72/7200 seconds]

Did not meet early stopping. Best iteration is:
[204]	valid_0's auc: 0.726153
Trial 7 - Fold 3/3 AUC: 0.726153
Trial 7 - Mean AUC: 0.728917 (±0.001966)

[I 2025-11-28 01:43:40,702] Trial 7 finished with value: 0.7289174162660919 and parameters: {'num_leaves': 103, 'max_depth': 4, 'learning_rate': 0.010127963257331486, 'feature_fraction': 0.9446384285364502, 'bagging_fraction': 0.9120572031542851, 'bagging_freq': 4, 'min_child_samples': 82, 'lambda_l1': 0.00019777828512462724, 'lambda_l2': 0.00271558195528294, 'n_estimators': 204}. Best is trial 6 with value: 0.7475062454789164.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[312]	valid_0's auc: 0.745591
Trial 8 - Fold 1/3 AUC: 0.745591
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[492]	valid_0's auc: 0.749438
Trial 8 - Fold 2/3 AUC: 0.749438
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[

Best trial: 6. Best value: 0.747506:  45%|████▌     | 9/20 [01:32<02:24, 13.12s/it, 92.35/7200 seconds]

Trial 8 - Fold 3/3 AUC: 0.746215
Trial 8 - Mean AUC: 0.747081 (±0.001686)

[I 2025-11-28 01:44:05,331] Trial 8 finished with value: 0.7470814689616198 and parameters: {'num_leaves': 113, 'max_depth': 6, 'learning_rate': 0.02142387495644906, 'feature_fraction': 0.7190675050858071, 'bagging_fraction': 0.7932946965146986, 'bagging_freq': 2, 'min_child_samples': 79, 'lambda_l1': 0.035500125258511595, 'lambda_l2': 0.35387588647792356, 'n_estimators': 525}. Best is trial 6 with value: 0.7475062454789164.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[179]	valid_0's auc: 0.745478
Trial 9 - Fold 1/3 AUC: 0.745478
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[183]	valid_0's auc: 0.748523
Trial 9 - Fold 2/3 AUC: 0.748523
Training until validation scores don't improve for 30 rounds


Best trial: 6. Best value: 0.747506:  50%|█████     | 10/20 [01:41<01:57, 11.79s/it, 101.15/7200 seconds]

Did not meet early stopping. Best iteration is:
[192]	valid_0's auc: 0.746054
Trial 9 - Fold 3/3 AUC: 0.746054
Trial 9 - Mean AUC: 0.746685 (±0.001320)

[I 2025-11-28 01:44:14,128] Trial 9 finished with value: 0.7466852381956096 and parameters: {'num_leaves': 29, 'max_depth': 7, 'learning_rate': 0.057648106701146694, 'feature_fraction': 0.8683831592708489, 'bagging_fraction': 0.9312901539863683, 'bagging_freq': 3, 'min_child_samples': 62, 'lambda_l1': 0.005130551760589831, 'lambda_l2': 0.00012637946338082883, 'n_estimators': 197}. Best is trial 6 with value: 0.7475062454789164.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[660]	valid_0's auc: 0.747034
Trial 10 - Fold 1/3 AUC: 0.747034
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[701]	valid_0's auc: 0.750585
Trial 10 - Fold 2/3 AUC: 0.750585
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:

Best trial: 10. Best value: 0.748532:  55%|█████▌    | 11/20 [01:58<02:00, 13.36s/it, 118.08/7200 seconds]

Trial 10 - Fold 3/3 AUC: 0.747979
Trial 10 - Mean AUC: 0.748532 (±0.001502)

[I 2025-11-28 01:44:31,064] Trial 10 finished with value: 0.7485324983347446 and parameters: {'num_leaves': 59, 'max_depth': 3, 'learning_rate': 0.03783138462055962, 'feature_fraction': 0.8040552674219865, 'bagging_fraction': 0.7066392625569425, 'bagging_freq': 2, 'min_child_samples': 97, 'lambda_l1': 0.7795612091139277, 'lambda_l2': 0.0006934125446843669, 'n_estimators': 994}. Best is trial 10 with value: 0.7485324983347446.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[622]	valid_0's auc: 0.747551
Trial 11 - Fold 1/3 AUC: 0.747551
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[698]	valid_0's auc: 0.75057
Trial 11 - Fold 2/3 AUC: 0.750570
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[661]	valid_0's auc: 0.747643


Best trial: 11. Best value: 0.748588:  60%|██████    | 12/20 [02:14<01:54, 14.27s/it, 134.44/7200 seconds]

Trial 11 - Fold 3/3 AUC: 0.747643
Trial 11 - Mean AUC: 0.748588 (±0.001402)

[I 2025-11-28 01:44:47,421] Trial 11 finished with value: 0.7485879671111954 and parameters: {'num_leaves': 56, 'max_depth': 3, 'learning_rate': 0.03704127704221621, 'feature_fraction': 0.8111460167977234, 'bagging_fraction': 0.7053937642080603, 'bagging_freq': 2, 'min_child_samples': 100, 'lambda_l1': 0.9901967052454858, 'lambda_l2': 0.0007300358083644771, 'n_estimators': 976}. Best is trial 11 with value: 0.7485879671111954.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[657]	valid_0's auc: 0.747209
Trial 12 - Fold 1/3 AUC: 0.747209
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[590]	valid_0's auc: 0.749977
Trial 12 - Fold 2/3 AUC: 0.749977
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[760]	valid_0's auc: 0.748148


Best trial: 11. Best value: 0.748588:  65%|██████▌   | 13/20 [02:30<01:43, 14.85s/it, 150.63/7200 seconds]

Trial 12 - Fold 3/3 AUC: 0.748148
Trial 12 - Mean AUC: 0.748445 (±0.001150)

[I 2025-11-28 01:45:03,608] Trial 12 finished with value: 0.748444656342358 and parameters: {'num_leaves': 54, 'max_depth': 3, 'learning_rate': 0.037524496913365166, 'feature_fraction': 0.8137188026058412, 'bagging_fraction': 0.7045504544754168, 'bagging_freq': 2, 'min_child_samples': 100, 'lambda_l1': 0.9639757903159535, 'lambda_l2': 0.0009976642826646919, 'n_estimators': 997}. Best is trial 11 with value: 0.7485879671111954.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[555]	valid_0's auc: 0.746945
Trial 13 - Fold 1/3 AUC: 0.746945
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[779]	valid_0's auc: 0.750359
Trial 13 - Fold 2/3 AUC: 0.750359
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[734]	valid_0's auc: 0.748462


Best trial: 13. Best value: 0.748589:  70%|███████   | 14/20 [02:47<01:32, 15.36s/it, 167.16/7200 seconds]

Trial 13 - Fold 3/3 AUC: 0.748462
Trial 13 - Mean AUC: 0.748589 (±0.001397)

[I 2025-11-28 01:45:20,141] Trial 13 finished with value: 0.7485888440012993 and parameters: {'num_leaves': 47, 'max_depth': 3, 'learning_rate': 0.03390699310533449, 'feature_fraction': 0.7517282120243132, 'bagging_fraction': 0.7144904822730566, 'bagging_freq': 2, 'min_child_samples': 99, 'lambda_l1': 0.7266943511144078, 'lambda_l2': 0.0005508571077477058, 'n_estimators': 788}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[544]	valid_0's auc: 0.747167
Trial 14 - Fold 1/3 AUC: 0.747167
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[597]	valid_0's auc: 0.750468
Trial 14 - Fold 2/3 AUC: 0.750468
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[452]	valid_0's auc: 0.747018


Best trial: 13. Best value: 0.748589:  75%|███████▌  | 15/20 [03:08<01:26, 17.22s/it, 188.69/7200 seconds]

Trial 14 - Fold 3/3 AUC: 0.747018
Trial 14 - Mean AUC: 0.748218 (±0.001593)

[I 2025-11-28 01:45:41,669] Trial 14 finished with value: 0.7482176252641745 and parameters: {'num_leaves': 37, 'max_depth': 5, 'learning_rate': 0.02601396887955182, 'feature_fraction': 0.7322566372376089, 'bagging_fraction': 0.8356978938632825, 'bagging_freq': 2, 'min_child_samples': 83, 'lambda_l1': 0.11149912088964545, 'lambda_l2': 0.0008804123039539076, 'n_estimators': 782}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[503]	valid_0's auc: 0.747026
Trial 15 - Fold 1/3 AUC: 0.747026
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[608]	valid_0's auc: 0.749337
Trial 15 - Fold 2/3 AUC: 0.749337
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[638]	valid_0's auc: 0.747398


Best trial: 13. Best value: 0.748589:  80%|████████  | 16/20 [03:17<00:58, 14.67s/it, 197.44/7200 seconds]

Trial 15 - Fold 3/3 AUC: 0.747398
Trial 15 - Mean AUC: 0.747921 (±0.001013)

[I 2025-11-28 01:45:50,424] Trial 15 finished with value: 0.7479205830532002 and parameters: {'num_leaves': 45, 'max_depth': 3, 'learning_rate': 0.04066866873014908, 'feature_fraction': 0.7676680036452747, 'bagging_fraction': 0.7284918108351282, 'bagging_freq': 1, 'min_child_samples': 69, 'lambda_l1': 0.055823792081035384, 'lambda_l2': 0.002050884892193022, 'n_estimators': 776}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[828]	valid_0's auc: 0.747418
Trial 16 - Fold 1/3 AUC: 0.747418
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[865]	valid_0's auc: 0.750091
Trial 16 - Fold 2/3 AUC: 0.750091
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[867]	valid_0's auc: 0.74704


Best trial: 13. Best value: 0.748589:  85%|████████▌ | 17/20 [03:38<00:49, 16.60s/it, 218.54/7200 seconds]

Trial 16 - Fold 3/3 AUC: 0.747040
Trial 16 - Mean AUC: 0.748183 (±0.001358)

[I 2025-11-28 01:46:11,520] Trial 16 finished with value: 0.7481830880991523 and parameters: {'num_leaves': 75, 'max_depth': 4, 'learning_rate': 0.01624839426010113, 'feature_fraction': 0.9009312770749308, 'bagging_fraction': 0.7014913855583648, 'bagging_freq': 3, 'min_child_samples': 92, 'lambda_l1': 0.4007803446404775, 'lambda_l2': 0.0003307121973944748, 'n_estimators': 867}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[368]	valid_0's auc: 0.746726
Trial 17 - Fold 1/3 AUC: 0.746726
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[507]	valid_0's auc: 0.750263
Trial 17 - Fold 2/3 AUC: 0.750263
Training until validation scores don't improve for 30 rounds


Best trial: 13. Best value: 0.748589:  90%|█████████ | 18/20 [03:56<00:34, 17.06s/it, 236.67/7200 seconds]

Early stopping, best iteration is:
[404]	valid_0's auc: 0.746953
Trial 17 - Fold 3/3 AUC: 0.746953
Trial 17 - Mean AUC: 0.747981 (±0.001616)

[I 2025-11-28 01:46:29,652] Trial 17 finished with value: 0.7479805325797568 and parameters: {'num_leaves': 74, 'max_depth': 5, 'learning_rate': 0.027380228456921795, 'feature_fraction': 0.7637887938170701, 'bagging_fraction': 0.8130987912299135, 'bagging_freq': 2, 'min_child_samples': 89, 'lambda_l1': 0.0024912417260024742, 'lambda_l2': 0.1773569364077961, 'n_estimators': 654}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[166]	valid_0's auc: 0.744201
Trial 18 - Fold 1/3 AUC: 0.744201
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[284]	valid_0's auc: 0.748523
Trial 18 - Fold 2/3 AUC: 0.748523
Training until validation scores don't improve for 30 rounds


Best trial: 13. Best value: 0.748589:  95%|█████████▌| 19/20 [04:06<00:15, 15.02s/it, 246.93/7200 seconds]

Early stopping, best iteration is:
[182]	valid_0's auc: 0.746276
Trial 18 - Fold 3/3 AUC: 0.746276
Trial 18 - Mean AUC: 0.746333 (±0.001765)

[I 2025-11-28 01:46:39,911] Trial 18 finished with value: 0.7463334117890722 and parameters: {'num_leaves': 46, 'max_depth': 6, 'learning_rate': 0.048704928050618324, 'feature_fraction': 0.8425971289701821, 'bagging_fraction': 0.7454170828821509, 'bagging_freq': 1, 'min_child_samples': 73, 'lambda_l1': 0.07131187670025274, 'lambda_l2': 0.004705614323574566, 'n_estimators': 882}. Best is trial 13 with value: 0.7485888440012993.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[632]	valid_0's auc: 0.746831
Trial 19 - Fold 1/3 AUC: 0.746831
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[636]	valid_0's auc: 0.748701
Trial 19 - Fold 2/3 AUC: 0.748701
Training until validation scores don't improve for 30 rounds
Did not meet early s

Best trial: 13. Best value: 0.748589: 100%|██████████| 20/20 [04:21<00:00, 13.10s/it, 261.94/7200 seconds]

Trial 19 - Fold 3/3 AUC: 0.747246
Trial 19 - Mean AUC: 0.747592 (±0.000802)

[I 2025-11-28 01:46:54,916] Trial 19 finished with value: 0.7475924426343478 and parameters: {'num_leaves': 86, 'max_depth': 3, 'learning_rate': 0.03150878204091636, 'feature_fraction': 0.7011567750760258, 'bagging_fraction': 0.871517558129842, 'bagging_freq': 2, 'min_child_samples': 90, 'lambda_l1': 0.3108634868900223, 'lambda_l2': 0.000611150344765429, 'n_estimators': 636}. Best is trial 13 with value: 0.7485888440012993.

Best Mean AUC: 0.7485888440012993
Best hyperparameters:
 {'num_leaves': 47, 'max_depth': 3, 'learning_rate': 0.03390699310533449, 'feature_fraction': 0.7517282120243132, 'bagging_fraction': 0.7144904822730566, 'bagging_freq': 2, 'min_child_samples': 99, 'lambda_l1': 0.7266943511144078, 'lambda_l2': 0.0005508571077477058, 'n_estimators': 788}






Final Model Performance:
Train Score: 0.7730214638271273
Valid Score: 0.7483696910689517




# lightGBM cause seem like this is the best for the old model
import lightgbm as lgb
# optuna para 
hyperpara = {'num_leaves': 57, 'max_depth': 4, 'learning_rate': 0.03277842323048595, 'feature_fraction': 0.9569717768433371, 'bagging_fraction': 0.8506767379185446, 'bagging_freq': 1, 'min_child_samples': 48, 'lambda_l1': 0.17636584028657937, 'lambda_l2': 0.025244809309038312, 'n_estimators': 2486}

lgbm = lgb.LGBMClassifier(**hyperpara)
lgbm.fit(X_train, y_train, eval_set= [(X_valid, y_valid)])

lgbm_train_pred = lgbm.predict_proba(X_train)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid)[:, 1]

print(f"Train Score: {roc_auc_score(y_train, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgbm_valid_pred)}")

### under this is the output part


In [13]:
# Make predictions for the test data
# Change model name if needed
pred = lgbm.predict_proba(X_test_std)[:, 1]



In [14]:
# Put the prediction into the format of submission
sample_sub['TARGET'] = pred
sample_sub

Unnamed: 0,SK_ID_CURR,TARGET
0,171202,0.026395
1,171203,0.132098
2,171204,0.153422
3,171205,0.090435
4,171206,0.139855
...,...,...
61495,232697,0.195401
61496,232698,0.056935
61497,232699,0.045301
61498,232700,0.110547


In [15]:
# Create the "output" directory if it doesn't exist
output_dir = Path.cwd() / "output"
os.makedirs(output_dir, exist_ok=True)

# Specify the new output file path
output_file = output_dir / "submission.csv"

# Save the CSV file to the "output" directory
sample_sub.to_csv(output_file, index=False)