#### Preprocess

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# --- Load the competition splits and the external augmentation data ---------
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_aug = pd.read_csv('data/bank-full.csv', sep=';')

# Encode the augmentation target as 0/1, then relabel its columns positionally
# with the training schema (every training column except 'id').
# NOTE(review): this assumes bank-full.csv has the same column order as
# train.csv minus 'id' -- confirm against the raw files.
df_aug['y'] = df_aug['y'].map({'yes': 1, 'no': 0})
df_aug.columns = df_train.columns.drop('id')

# Set identifiers and targets aside before they are removed from the features.
df_train_ids = df_train['id']
df_test_ids = df_test['id']
df_train_y = df_train['y']
df_aug_y = df_aug['y']

# Drop unnecessary columns before the column transformation.
df_train = df_train.drop(['id', 'y'], axis=1)
df_test = df_test.drop(['id'], axis=1)
df_aug = df_aug.drop(['y'], axis=1)

# One-hot encode all three frames together so they end up with an identical
# set of dummy columns, then slice them apart again by row position.
df_all = pd.concat([df_train, df_test, df_aug], ignore_index=True)
categorical_cols = list(df_all.select_dtypes(include='object').columns)

df_all_transformed = pd.get_dummies(df_all, columns=categorical_cols, dummy_na=False)

train_end = len(df_train)
test_end = train_end + len(df_test)
df_train_transformed = df_all_transformed.iloc[:train_end]
df_test_transformed = df_all_transformed.iloc[train_end:test_end]
df_aug_transformed = df_all_transformed.iloc[test_end:]

# Labelled pool = competition training rows + augmentation rows.
x_all = pd.concat([df_train_transformed, df_aug_transformed])
y_all = pd.concat([df_train_y, df_aug_y])

# Hold out 20% of the labelled pool for local validation.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# RF + LightGBM stack with a logistic-regression meta-learner, scored on the
# local hold-out split (x_test / y_test) with ROC-AUC.
# The hyperparameters are presumably the result of the Optuna search further
# down in this notebook -- confirm before changing them.
# Every estimator is seeded so the reported score is reproducible across
# kernel restarts (same seed as the train/test split and the Optuna cell).

rf_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=17,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)

# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0
# (the default is 0), so the value below is likely a no-op -- confirm.
lgbm_classifier = LGBMClassifier(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.014754226812497719,
    subsample=0.6194448318329648,
    colsample_bytree=0.7659390034131854,
    reg_alpha=0.296203120040839,
    reg_lambda=0.6646730180280181,
    random_state=42,
)

stack_model = StackingClassifier(
    estimators=[('rf', rf_classifier), ('lgbm', lgbm_classifier)],
    final_estimator=LogisticRegression(
        C=6.457275468008391,
        solver='lbfgs',
        random_state=42,
    ),
    # The meta-learner is trained on the base models' class probabilities only
    # (passthrough=False keeps the raw features out of the final estimator).
    stack_method='predict_proba',
    n_jobs=-1,
    passthrough=False
)

stack_model.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
pred = stack_model.predict_proba(x_test)[:, 1]
pred_score = roc_auc_score(y_test, pred)
print("======================================")
print("ROC-AUC Score:",pred_score)
print("======================================")


In [10]:
# Score the competition test set and write the submission file.
#
# Uses `stack_model`, the stack fitted in the cell directly above, so the
# notebook runs top to bottom on a fresh kernel. `stackModel` (the XGB+LGBM
# stack) is only created in a later section; to submit that model instead,
# run this cell after that section and swap the name.
submission_path = 'submission1.csv'

# Column 1 of predict_proba is the probability of the positive class.
final = stack_model.predict_proba(df_test_transformed)[:, 1]
submission = pd.DataFrame({
    'id' : df_test_ids,
    'y' : final
})

submission.to_csv(submission_path, index=False)
# Report the file that was actually written.
print(f"{submission_path} file has been created.")



submission.csv file has been created.


#### Optuna code

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def objective(trial):
    """Optuna objective for the RF + LightGBM stacking classifier.

    Samples hyperparameters for the random forest, the LightGBM model and the
    logistic-regression meta-learner from ``trial``, builds the stack, and
    scores it with 5-fold cross-validation on the notebook-level ``x_train`` /
    ``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean ROC-AUC across the five cross-validation folds.
    """
    # Keyword arguments are evaluated left to right, so parameters are sampled
    # in the order: random forest, LightGBM, logistic regression.
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('rf_n_estimators', 100, 500, step=100),
        max_depth=trial.suggest_int('rf_max_depth', 5, 30),
        min_samples_split=trial.suggest_int('rf_min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('rf_min_samples_leaf', 1, 4),
        max_features=trial.suggest_categorical('rf_max_features', ['sqrt', 'log2']),
        random_state=42,
        n_jobs=-1,
    )

    booster = LGBMClassifier(
        n_estimators=trial.suggest_int('lgbm_n_estimators', 100, 1000, step=100),
        learning_rate=trial.suggest_float('lgbm_learning_rate', 0.01, 0.2, log=True),
        max_depth=trial.suggest_int('lgbm_max_depth', 3, 15),
        subsample=trial.suggest_float('lgbm_subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('lgbm_colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('lgbm_reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('lgbm_reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1,
    )

    meta_learner = LogisticRegression(
        C=trial.suggest_float('logreg_C', 0.01, 10.0, log=True),
        solver=trial.suggest_categorical('logreg_solver', ['lbfgs', 'liblinear']),
        random_state=42,
    )

    # The meta-learner sees only the base models' class probabilities.
    stacked = StackingClassifier(
        estimators=[('rf', forest), ('lgbm', booster)],
        final_estimator=meta_learner,
        stack_method='predict_proba',
        n_jobs=-1,
        passthrough=False,
    )

    fold_scores = cross_val_score(
        stacked,
        x_train,
        y_train,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Create and optimize the study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Output results
print("Best trial:")
trial = study.best_trial
print(f"Best roc_auc: {trial.value:.4f}")
print("Best hyperparameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Retrain with best hyperparameters
best_rf = RandomForestClassifier(
    n_estimators=trial.params['rf_n_estimators'],
    max_depth=trial.params['rf_max_depth'],
    min_samples_split=trial.params['rf_min_samples_split'],
    min_samples_leaf=trial.params['rf_min_samples_leaf'],
    max_features=trial.params['rf_max_features'],
    random_state=42,
    n_jobs=-1
)

best_lgbm = LGBMClassifier(
    n_estimators=trial.params['lgbm_n_estimators'],
    learning_rate=trial.params['lgbm_learning_rate'],
    max_depth=trial.params['lgbm_max_depth'],
    subsample=trial.params['lgbm_subsample'],
    colsample_bytree=trial.params['lgbm_colsample_bytree'],
    reg_alpha=trial.params['lgbm_reg_alpha'],
    reg_lambda=trial.params['lgbm_reg_lambda'],
    random_state=42,
    n_jobs=-1
)

best_logreg = LogisticRegression(
    C=trial.params['logreg_C'],
    solver=trial.params['logreg_solver'],
    random_state=42
)

best_stack = StackingClassifier(
    estimators=[('rf', best_rf), ('lgbm', best_lgbm)],
    final_estimator=best_logreg,
    stack_method='predict_proba',
    n_jobs=-1,
    passthrough=False
)

best_stack.fit(x_train, y_train)
test_accuracy = best_stack.predict_proba(x_test, y_test)
print(f"Test accuracy with optimized model: {test_accuracy:.4f}")

#### XGB+LGBM

In [6]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# XGBoost + LightGBM stack with a logistic-regression meta-learner, scored on
# the local hold-out split (x_test / y_test) with ROC-AUC.

lgbm_params = dict(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.014754226812497719,
    subsample=0.6194448318329648,
    colsample_bytree=0.7659390034131854,
    reg_alpha=0.296203120040839,
    reg_lambda=0.6646730180280181,
)
lgbm_classifier = LGBMClassifier(**lgbm_params)

# Plain boosted trees with a small learning rate; early stopping is not used.
xgb_classifier = XGBClassifier(n_estimators=1000, learning_rate=0.01)

base_learners = [
    ('lgbm', lgbm_classifier),
    ('xgb', xgb_classifier),
]
meta_learner = LogisticRegression(C=6.457275468008391, solver='lbfgs')

# The meta-learner is trained on the base models' class probabilities only.
stackModel = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    n_jobs=-1,
    passthrough=False,
)

stackModel.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
pred = stackModel.predict_proba(x_test)[:, 1]
pred_score = roc_auc_score(y_test, pred)

banner = "======================================"
print(banner)
print("ROC-AUC Score:", pred_score)
print(banner)

[LightGBM] [Info] Number of positive: 76565, number of negative: 559603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 636168, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120353 -> initscore=-1.989088
[LightGBM] [Info] Start training from score -1.989088
[LightGBM] [Info] Number of positive: 61252, number of negative: 447683
[LightGBM] [Info] Number of positive: 61252, number of negative: 447683
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1043
[LightGBM] [Info] Number of data poi

In [None]:
import gc
# Force a full garbage-collection pass. As the last expression of the cell,
# the value returned by gc.collect() is what the notebook displays as output.
gc.collect()

5070