In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Execute the companion notebook before anything else in this one.
# NOTE(review): `train`, `test` and `sample_sub` are used below but are not
# defined anywhere in this notebook, so they are presumably created by
# features_testing.ipynb — TODO confirm.
%run features_testing.ipynb

In [None]:
# Split the labelled data into explanatory variables (X) and the target (y).
# The test matrix is built from the same feature columns, selected by name and
# in the training order, so every column position means the same thing in X
# and X_test. A feature missing from `test` raises a KeyError here instead of
# silently producing a misaligned matrix.
feature_columns = train.columns.drop("TARGET")
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test[feature_columns].values

In [None]:
# Sanity check: print the number of feature columns of the training matrix,
# then of the test matrix (the two numbers should match).
for matrix in (X, X_test):
    print(len(matrix[0]))

In [None]:
#from sklearn.preprocessing import PowerTransformer
#
#pt = PowerTransformer(method='yeo-johnson')
#X_pt = pt.fit_transform(X)
#X_test_pt = pt.transform(X_test)

In [None]:
# Standardization: fit the scaler on the labelled rows and apply the same
# scaling to the labelled and the test matrices.
# NOTE: the scaler is fitted on every labelled row, i.e. before any
# train/validation split is made further down.
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_test_std = sc.transform(X_test)

In [None]:
# Hold out 30% of the standardized labelled rows for validation, keeping the
# target distribution the same in both parts (stratified, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_std,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def objective(trial):
    """Optuna objective: mean stratified 3-fold CV AUC of a LightGBM classifier.

    Samples one set of hyperparameters from ``trial``, trains an
    ``LGBMClassifier`` on each fold of the module-level ``X_std`` / ``y`` with
    early stopping against the held-out fold, and scores that same fold with
    ROC AUC. Because the held-out fold drives early stopping and is also the
    fold being scored, the reported AUC is slightly optimistic.

    After every fold the running mean AUC is reported to the trial so that a
    pruner attached to the study can act on it. A pruner whose warm-up is not
    shorter than the number of folds will never stop a trial.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        The mean ROC AUC over all folds.

    Raises:
        optuna.TrialPruned: If the study's pruner asks to stop the trial
            before the last fold has been trained.
    """
    # NOTE: the order of the suggest_* calls is kept stable so that a seeded
    # sampler draws the same values for the same trial number.
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",

        "num_leaves": trial.suggest_int("num_leaves", 16, 128),  # Reduced from 256
        "max_depth": trial.suggest_int("max_depth", 3, 8),  # Reduced from -1 to 12
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),  # Narrowed range

        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),  # Narrowed
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),  # Narrowed
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Reduced from 10

        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),  # Narrowed
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-4, 1.0, log=True),  # Narrowed
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 1.0, log=True),  # Narrowed

        # Upper bound on boosting rounds; early stopping may use fewer.
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),  # Reduced from 3000
        "verbose": -1,

        # GPU parameters
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
    }

    # 3 folds keep each trial reasonably fast.
    n_splits = 3
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Wrap a raw array in a DataFrame with synthetic column names so LightGBM
    # does not warn about missing feature names.
    if isinstance(X_std, np.ndarray):
        feature_names = [f'feature_{i}' for i in range(X_std.shape[1])]
        x_train_df = pd.DataFrame(X_std, columns=feature_names)
    else:
        x_train_df = X_std

    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_df, y)):
        x_train_fold = x_train_df.iloc[train_idx]
        y_train_fold = y[train_idx]
        x_val_fold = x_train_df.iloc[val_idx]
        y_val_fold = y[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_train_fold,
            y_train_fold,
            eval_set=[(x_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),  # Reduced from 100
                lgb.log_evaluation(period=0),  # Silent training
            ],
        )

        preds = model.predict_proba(x_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds)
        auc_scores.append(fold_auc)

        print(f"Trial {trial.number} - Fold {fold + 1}/{n_splits} AUC: {fold_auc:.6f}")

        # Expose the running mean to the study's pruner. Pruning is only
        # checked while folds remain: after the last fold all the work is
        # already done, so the final value is always returned.
        trial.report(float(np.mean(auc_scores)), step=fold)
        if fold < n_splits - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    # \u00b1 is the plus-minus sign; written as an escape so the printed text
    # does not depend on the encoding the notebook was saved with.
    print(f"Trial {trial.number} - Mean AUC: {mean_auc:.6f} (\u00b1{std_auc:.6f})\n")

    return mean_auc


# Hyperparameter search: maximize the objective's mean CV AUC.
# The TPE sampler is seeded for reproducibility.
# NOTE: MedianPruner only acts on values passed to trial.report(), and with
# n_warmup_steps=5 pruning is disabled for reported steps below 5.
n_trials = 20

study = optuna.create_study(
    study_name="lgbm_optimization_kfold",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5),
)

# Trials run one at a time (n_jobs=1) because each one trains on the GPU;
# the 2-hour timeout is a safety net in case trials are slow.
study.optimize(
    objective,
    n_trials=n_trials,
    timeout=7200,
    n_jobs=1,
    show_progress_bar=True,
)

print("\nBest Mean AUC:", study.best_value)
print("Best hyperparameters:\n", study.best_params)

# Refit a model with the best hyperparameters found by the study.
# NOTE: the model is fitted on a 70% stratified split, not on all labelled
# rows; the remaining 30% drives early stopping and the validation score.
# Those validation rows were also part of the cross-validation data used for
# tuning, so the validation score below is not a clean hold-out estimate.
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbose": -1,

    # GPU parameters (same as during tuning)
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
})

# Split data for final validation
X_train_final, X_valid_final, y_train_final, y_valid_final = train_test_split(
    X_std, y, test_size=0.3, stratify=y, random_state=0
)

# During tuning every model was trained with early stopping (30 rounds), so
# the tuned n_estimators is only an upper bound on the number of trees. The
# same callbacks are used here so the final model is trained the way the
# winning trial was scored, instead of always running all n_estimators rounds.
lgbm = lgb.LGBMClassifier(**best_params)
lgbm.fit(
    X_train_final,
    y_train_final,
    eval_set=[(X_valid_final, y_valid_final)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=0),
    ],
)

lgbm_train_pred = lgbm.predict_proba(X_train_final)[:, 1]
lgbm_valid_pred = lgbm.predict_proba(X_valid_final)[:, 1]

print("\nFinal Model Performance:")
print(f"Best iteration: {lgbm.best_iteration_}")
print(f"Train Score: {roc_auc_score(y_train_final, lgbm_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid_final, lgbm_valid_pred)}")

# The cells below produce the submission output.


In [None]:
# Score the standardized test rows with the fitted model and keep the
# probability column at index 1.
# Swap `lgbm` for another fitted model here if needed.
test_proba = lgbm.predict_proba(X_test_std)
pred = test_proba[:, 1]

In [None]:
# Write the predicted probabilities into the TARGET column of the submission
# frame (this modifies `sample_sub` in place), then display it.
sample_sub["TARGET"] = pred
sample_sub

In [None]:
# Save the submission as <current working directory>/output/submission.csv,
# creating the "output" folder first if it is not there yet.
output_dir = Path.cwd() / "output"
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / "submission.csv"

# The index is left out so the file contains only the submission columns.
sample_sub.to_csv(output_file, index=False)