In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Run `features_testing.ipynb` before any of the cells below.
# NOTE(review): `train`, `test` and `sample_sub` are used by later cells but are never
# defined in this notebook, so they are presumably created by this run — confirm.
%run features_testing.ipynb

In [None]:
# Split the data into explanatory and target variables.
# The test matrix is built by selecting the training feature columns *by name*, so both
# matrices always share the same column order. A column missing from `test` raises a
# KeyError here instead of silently mis-aligning features for the scaler and the model.
feature_cols = train.columns.drop("TARGET")
X = train[feature_cols].values
y = train["TARGET"].values
X_test = test[feature_cols].values

In [None]:
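# Disabled experiment: Yeo-Johnson power transform of the features.
# X_pt / X_test_pt are not referenced by any cell shown below.
#from sklearn.preprocessing import PowerTransformer
#
#pt = PowerTransformer(method='yeo-johnson')
#X_pt = pt.fit_transform(X)
#X_test_pt = pt.transform(X_test)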

In [None]:
# Standardization: fit the scaler on X, then apply the same fitted scaler to both
# the training matrix and the test matrix.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/validation
# split performed in the next cell.
sc = StandardScaler().fit(X)
X_std, X_test_std = (sc.transform(matrix) for matrix in (X, X_test))

In [None]:
# Hold out 30% of the standardized rows as validation data. The split is stratified
# on y and uses a fixed random_state so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_std,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
# Hand-picked LightGBM settings, grouped by purpose.
# NOTE(review): no cell shown in this notebook reads `lgb_params`.
lgb_params = dict(
    # Task definition
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    learning_rate=0.03,
    # Tree shape
    num_leaves=16,
    max_depth=4,
    min_child_samples=40,
    min_child_weight=1e-3,
    # Row / column sampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.7,
    # Regularisation
    reg_alpha=0.4,
    reg_lambda=0.6,
    # Boosting schedule and misc
    n_estimators=2000,
    early_stopping_rounds=200,
    random_state=42,
    verbose=-1,
)

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

def objective(trial):
    """Optuna objective: mean stratified 10-fold validation AUC of a LightGBM classifier.

    Samples one hyperparameter set from ``trial``, trains one early-stopped
    ``LGBMClassifier`` per fold of the notebook-level ``X_std`` / ``y`` arrays and
    scores each model on its held-out fold.

    Why the mean is returned: Optuna compares trials on a single scalar, so the
    per-fold AUCs are averaged into one number per hyperparameter set.

    Besides the return value, per-fold details are stored on the trial as user
    attributes (``fold_aucs``, ``std_auc``, ``best_iterations``,
    ``mean_best_iteration``). With early stopping the scored model may use fewer
    trees than the sampled ``n_estimators``; recording the best iteration keeps
    the tree count that actually produced the reported AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC across the folds (the study maximizes it).
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",

        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),

        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),

        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),

        "n_estimators": trial.suggest_int("n_estimators", 200, 3000),
        "verbose": -1,

        # Training runs on GPU platform 0 / device 0.
        # NOTE(review): presumably this fails on a machine without a usable GPU —
        # switch "device" to "cpu" there.
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
    }

    n_splits = 10
    early_stopping_rounds = 100

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    auc_scores = []
    best_iterations = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_std, y), start=1):
        X_val_fold, y_val_fold = X_std[val_idx], y[val_idx]

        model = lgb.LGBMClassifier(**params)

        # NOTE(review): the held-out fold drives early stopping and is also the fold
        # that is scored, so the fold AUC is slightly optimistic.
        model.fit(
            X_std[train_idx],
            y[train_idx],
            eval_set=[(X_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                lgb.log_evaluation(period=0),
            ],
        )

        preds = model.predict_proba(X_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds)
        auc_scores.append(fold_auc)

        # Fall back to the sampled tree budget if no best iteration was recorded.
        best_iterations.append(int(model.best_iteration_ or params["n_estimators"]))

        print(f"Fold {fold} AUC: {fold_auc:.6f}")

    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"Mean AUC: {mean_auc:.6f} (+/- {std_auc:.6f})\n")

    # Plain Python numbers keep the attributes serializable by any Optuna storage.
    trial.set_user_attr("fold_aucs", [float(score) for score in auc_scores])
    trial.set_user_attr("std_auc", float(std_auc))
    trial.set_user_attr("best_iterations", best_iterations)
    trial.set_user_attr("mean_best_iteration", int(round(np.mean(best_iterations))))

    return mean_auc


# Create and run the study.
# The sampler is seeded so that re-running the notebook proposes the same sequence of
# hyperparameter sets, as long as the objective itself returns the same values.
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_optimization_kfold",
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=50)

print("\nBest Mean AUC:", study.best_value)
print("Best hyperparameters:\n", study.best_params)

In [None]:
# LightGBM, because it seemed to be the best choice for the old model.
import lightgbm as lgb

# Hyperparameters obtained from Optuna.
hyperpara = {
    'num_leaves': 57,
    'max_depth': 4,
    'learning_rate': 0.03277842323048595,
    'feature_fraction': 0.9569717768433371,
    'bagging_fraction': 0.8506767379185446,
    'bagging_freq': 1,
    'min_child_samples': 48,
    'lambda_l1': 0.17636584028657937,
    'lambda_l2': 0.025244809309038312,
    'n_estimators': 2486,
}

# Fit with the same objective, metric and early-stopping rule as the tuning loop.
# Passing eval_set without an early-stopping callback would keep all n_estimators trees,
# whereas the tuning loop scored early-stopped models.
# NOTE(review): X_valid now drives early stopping, so the "Valid Score" printed below is
# slightly optimistic.
lgbm = lgb.LGBMClassifier(objective="binary", metric="auc", verbose=-1, **hyperpara)
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=0),
    ],
)

# Predicted probabilities (column index 1) on both splits, scored with ROC AUC.
lgbm_train_pred = lgbm.predict_proba(X_train)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid)[:, 1]

print(f"Train Score: {roc_auc_score(y_train, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgbm_valid_pred)}")

### Everything below this point builds the submission output


In [None]:
# Predict on the standardized test matrix and keep probability column 1 as the prediction.
# Swap `lgbm` for another fitted model here if needed.
test_proba = lgbm.predict_proba(X_test_std)
pred = test_proba[:, 1]

In [None]:
# Write the predictions into the TARGET column of the submission frame,
# then show the frame as the cell output for a quick visual check.
sample_sub["TARGET"] = pred
sample_sub

In [None]:
# Make sure an "output" folder exists under the current working directory
# (nothing happens if it is already there).
output_dir = Path.cwd() / "output"
output_dir.mkdir(parents=True, exist_ok=True)

# Write the submission frame to output/submission.csv without the index column.
output_file = output_dir / "submission.csv"
sample_sub.to_csv(output_file, index=False)