In [None]:
import pandas as pd
import optuna
import numpy as np
import lightgbm as lgb
import pickle

# Competition train / test files.
df = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
final = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')

# The full bank-marketing file is ';'-delimited and stores the target as
# 'yes'/'no' text; recode it to 1/0 so means of `y` can be taken later.
original = pd.read_csv(
    '/kaggle/input/bank-marketing-dataset-full/bank-full.csv',
    sep=';',
)
original['y'] = original['y'].map({'yes': 1, 'no': 0})

def simplify_contact(x):
    """Collapse a contact value to 'unknown' or 'known_contact'.

    Only the exact string 'unknown' is kept; every other value (including
    missing values) is reported as 'known_contact'.
    """
    return 'unknown' if x == 'unknown' else 'known_contact'

def preprocessing(df):
    """Return a model-ready copy of a raw bank-marketing frame.

    The input frame is left untouched; all work happens on new frames.

    Transformations, in the order the resulting columns are appended:
      * ``job`` x ``education`` interaction, one-hot encoded as ``job_edu_*``
        (the two source columns are dropped);
      * ``balance_log``: ``log1p`` of the balance with negatives clipped to 0;
      * ``marital`` one-hot encoded;
      * ``default`` / ``housing`` / ``loan`` mapped from 'yes'/'no' to 1/0
        (any other value becomes NaN);
      * ``contact`` reduced to unknown vs. known and one-hot encoded as
        ``contact_*`` (the source column is dropped);
      * ``age^2``;
      * ``prev_camp``: 1 where ``pdays`` is not -1; ``pdays`` itself has the
        -1 sentinel replaced by 999;
      * ``poutcome`` one-hot encoded;
      * sine/cosine of ``duration`` on a period of 400;
      * sine/cosine of the month number on a period of 12, plus ``month``
        one-hot encoded.

    Args:
        df: Frame holding the columns job, education, balance, marital,
            default, housing, loan, contact, age, pdays, poutcome, duration
            and month. Other columns pass through unchanged.

    Returns:
        A new DataFrame with the engineered columns.

    Raises:
        KeyError: If a required column is missing.
        ValueError / pandas casting error: If ``month`` contains a value
            outside the three-letter lowercase month names, because the
            mapped NaN cannot be cast to int.
    """
    # `assign` builds a new frame, so the interaction column never leaks
    # into the frame owned by the caller.
    df = df.assign(job_edu=df['job'].astype(str) + "_" + df['education'].astype(str))
    df = pd.get_dummies(df, columns=['job_edu'], prefix='job_edu')
    df = df.drop(columns=['job', 'education'])

    df['balance_log'] = np.log1p(df['balance'].clip(lower=0))

    df = pd.get_dummies(df, columns=['marital'], prefix='marital')
    yes_no = {'yes': 1, 'no': 0}
    for flag in ('default', 'housing', 'loan'):
        df[flag] = df[flag].map(yes_no)

    # Vectorised equivalent of mapping every value through
    # "'unknown' stays, everything else -> 'known_contact'".
    df['contact_simple'] = np.where(df['contact'] == 'unknown', 'unknown', 'known_contact')
    df = pd.get_dummies(df, columns=['contact_simple'], prefix='contact')
    df = df.drop(columns=['contact'])

    df['age^2'] = df['age'] ** 2

    # The flag must be derived before the -1 sentinel is overwritten.
    df['prev_camp'] = (df['pdays'] != -1).astype(int)
    df['pdays'] = df['pdays'].replace(-1, 999)
    # 'unknown' is kept as its own level (original author's note: it may
    # mean the previous campaign is still ongoing).
    df = pd.get_dummies(df, columns=['poutcome'], prefix='poutcome')

    duration_angle = 2 * np.pi * df['duration'] / 400
    df['duration_sin'] = np.sin(duration_angle)
    df['duration_cos'] = np.cos(duration_angle)

    month_map = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
        'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
        'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }
    month_num = df['month'].map(month_map).astype('int')
    month_angle = 2 * np.pi * month_num / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    df = pd.get_dummies(df, columns=['month'], prefix='month')

    return df

def FE(X, test, c, original, target='y'):
    """Add count and target-mean encodings of column ``c`` taken from ``original``.

    Two columns are written in place to both ``X`` and ``test``:
      * ``{c}_count``: how often the row's value of ``c`` occurs in
        ``original`` (0 when the value never occurs there);
      * ``{c}_mean_target_orig``: mean of ``target`` in ``original`` for that
        value (the overall mean of ``target`` when the value never occurs).

    Args:
        X: Training frame; modified in place.
        test: Test frame; modified in place.
        c: Name of the column to encode; must exist in all three frames.
        original: Reference frame providing the statistics.
        target: Name of the target column in ``original``.

    Returns:
        The same ``X`` and ``test`` objects that were passed in.
    """
    occurrences = original[c].value_counts()
    mean_by_value = original.groupby(c)[target].mean()
    overall_mean = original[target].mean()

    count_col = f"{c}_count"
    mean_col = f"{c}_mean_target_orig"

    for frame in (X, test):
        values = frame[c]
        frame[count_col] = values.map(occurrences).fillna(0)
        frame[mean_col] = values.map(mean_by_value).fillna(overall_mean)

    return X, test

def extra(train, test):
    """Add frequency features for "how many of three fields match a value".

    For each row, counts how many of education / contact / poutcome equal
    'unknown', and how many of default / housing / loan equal 'no'. Each row
    then receives the number of rows, across ``train`` and ``test``
    together, that share its match count:

      * ``unknown_score_freq`` for the 'unknown' count;
      * ``many_no_score_freq`` for the 'no' count.

    Earlier revisions relabelled the match counts 0/1/2/3 as tier scores
    0/3/7/21 with a per-row Python function before taking frequencies. That
    relabelling is one-to-one, so the frequencies of the raw counts are the
    same values, and counting matches column-wise avoids a Python call per
    row.

    Args:
        train: Training frame containing the six raw string columns;
            modified in place.
        test: Test frame containing the same columns; modified in place.

    Returns:
        The same ``train`` and ``test`` objects that were passed in.
    """
    def _match_count_freq(columns, value):
        # Only the columns that are inspected are stacked, not whole frames.
        stacked = pd.concat([train[columns], test[columns]], axis=0, ignore_index=True)
        matches = stacked.eq(value).sum(axis=1)
        return matches.map(matches.value_counts()).to_numpy()

    n_train = len(train)
    specs = (
        ('unknown_score_freq', ['education', 'contact', 'poutcome'], 'unknown'),
        ('many_no_score_freq', ['default', 'housing', 'loan'], 'no'),
    )
    for new_col, columns, value in specs:
        freq = _match_count_freq(columns, value)
        # Positional split: the first n_train stacked rows came from `train`.
        train[new_col] = freq[:n_train]
        test[new_col] = freq[n_train:]

    return train, test

# Raw input columns that each receive a count / target-mean encoding
# computed from the original dataset.
COLS = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome',]

# The row id is not a feature; keep the test ids for the submission file.
df.drop('id', axis=1, inplace=True)
final_id = final['id']
final.drop('id', axis=1, inplace=True)

# FE and extra read the raw string columns, so both must run before
# preprocessing() drops or re-encodes those columns.
for c in COLS:
    df, final = FE(df, final, c, original)

df, final = extra(df, final)

df = preprocessing(df)
final = preprocessing(final)

y = df['y']
X = df.drop('y', axis=1)

# Train and test are one-hot encoded independently, so a category level that
# occurs in only one of them would give the two frames different columns.
# Force the test frame onto the training layout: levels unseen in test become
# all-zero columns, and levels unseen in train are dropped.
final = final.reindex(columns=X.columns, fill_value=0)

train_data = lgb.Dataset(X, label=y)

# Settings shared by every cross-validation trial and by the final fit, so the
# model that is submitted is trained exactly like the one that was scored.
FIXED_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    # LightGBM ignores `subsample` (bagging_fraction) unless the bagging
    # frequency is non-zero; without this the tuned `subsample` is a no-op.
    'subsample_freq': 1,
    'verbose': -1,
    # The same lgb.Dataset is reused while min_child_samples varies between
    # trials, so feature pre-filtering is switched off.
    'feature_pre_filter': False,
}


def objective(trial):
    """Score one hyper-parameter draw with 5-fold stratified CV.

    Samples the tree/regularisation parameters from ``trial``, runs
    ``lgb.cv`` on the module-level ``train_data`` with early stopping
    (patience 100, at most 4000 rounds), and stores the number of boosting
    rounds kept by the CV run in the trial's ``best_rounds`` user attribute.

    Args:
        trial: The Optuna trial used to sample parameters.

    Returns:
        The best mean validation AUC across the boosting rounds.
    """
    params = {
        **FIXED_PARAMS,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 150),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }

    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=4000,
        nfold=5,
        stratified=True,
        shuffle=True,
        seed=42,
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    auc_curve = cv_results['valid auc-mean']
    best_auc = max(auc_curve)
    best_rounds = len(auc_curve)

    trial.set_user_attr("best_rounds", best_rounds)
    return best_auc


# NOTE: objective() never reports intermediate values to the trial, so this
# pruner currently never stops a trial early.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=1)
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, timeout=60*60*8)  # search budget: 8 hours

best_params = study.best_params
best_rounds = study.best_trial.user_attrs["best_rounds"]

# Refit on the full training set with the winning draw and its CV round count.
model_lgb = lgb.train(
    {**best_params, **FIXED_PARAMS},
    train_data,
    num_boost_round=best_rounds,
)

preds = model_lgb.predict(final)

submission = pd.DataFrame({
    "id": final_id,
    "y": preds
})

submission.to_csv("submission.csv", index=False)

with open("lgb_model.pkl", "wb") as f:
    pickle.dump(model_lgb, f)

[I 2025-08-11 23:30:25,759] A new study created in memory with name: no-name-7182ee16-fd44-4ead-8ebb-f123f8d91623


Training until validation scores don't improve for 100 rounds


[I 2025-08-11 23:36:11,334] Trial 0 finished with value: 0.9737589397366427 and parameters: {'learning_rate': 0.07039908814401957, 'num_leaves': 114, 'max_depth': 10, 'min_child_samples': 105, 'subsample': 0.783583022041211, 'colsample_bytree': 0.6196182243461509, 'reg_alpha': 3.3001243336160755, 'reg_lambda': 1.7605123659053645}. Best is trial 0 with value: 0.9737589397366427.


Early stopping, best iteration is:
[752]	cv_agg's valid auc: 0.973759 + 0.000503667
Training until validation scores don't improve for 100 rounds


[I 2025-08-11 23:53:48,470] Trial 1 finished with value: 0.9703400926633441 and parameters: {'learning_rate': 0.006850033973741573, 'num_leaves': 280, 'max_depth': 4, 'min_child_samples': 113, 'subsample': 0.9201880384114736, 'colsample_bytree': 0.7035050810358845, 'reg_alpha': 0.16539793153851123, 'reg_lambda': 2.8751219748852384}. Best is trial 0 with value: 0.9737589397366427.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.97034 + 0.000391886
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 00:21:50,748] Trial 2 finished with value: 0.9734254030332373 and parameters: {'learning_rate': 0.0060342273630518365, 'num_leaves': 94, 'max_depth': 9, 'min_child_samples': 143, 'subsample': 0.7651015834757406, 'colsample_bytree': 0.6415525844180954, 'reg_alpha': 3.0247230059263117, 'reg_lambda': 7.2343622836428}. Best is trial 0 with value: 0.9737589397366427.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.973425 + 0.000446705
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 00:47:39,481] Trial 3 finished with value: 0.9737449650474937 and parameters: {'learning_rate': 0.010950394504109062, 'num_leaves': 114, 'max_depth': 11, 'min_child_samples': 141, 'subsample': 0.7299185803714718, 'colsample_bytree': 0.8932551574947689, 'reg_alpha': 2.5504833691480644, 'reg_lambda': 3.322807935302251}. Best is trial 0 with value: 0.9737589397366427.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.973745 + 0.000479279
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 01:10:22,912] Trial 4 finished with value: 0.9722537498720856 and parameters: {'learning_rate': 0.005492283572873466, 'num_leaves': 34, 'max_depth': 9, 'min_child_samples': 136, 'subsample': 0.9904356742733796, 'colsample_bytree': 0.9646602474470659, 'reg_alpha': 2.248230995912034, 'reg_lambda': 9.389540341491925}. Best is trial 0 with value: 0.9737589397366427.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.972254 + 0.000412553
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 01:47:23,732] Trial 5 finished with value: 0.973926490920458 and parameters: {'learning_rate': 0.005424923577628177, 'num_leaves': 251, 'max_depth': 12, 'min_child_samples': 74, 'subsample': 0.854094922784768, 'colsample_bytree': 0.6661723482840918, 'reg_alpha': 6.856029645002502, 'reg_lambda': 3.9599111880593174}. Best is trial 5 with value: 0.973926490920458.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.973926 + 0.000468796
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 02:11:35,893] Trial 6 finished with value: 0.9739785563971367 and parameters: {'learning_rate': 0.029839446670373276, 'num_leaves': 200, 'max_depth': 6, 'min_child_samples': 108, 'subsample': 0.8666630165078928, 'colsample_bytree': 0.5611423701973145, 'reg_alpha': 5.833874655431098, 'reg_lambda': 9.92853614033092}. Best is trial 6 with value: 0.9739785563971367.


Did not meet early stopping. Best iteration is:
[3995]	cv_agg's valid auc: 0.973979 + 0.000471636
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 02:20:16,209] Trial 7 finished with value: 0.9734394727166423 and parameters: {'learning_rate': 0.07802560610368349, 'num_leaves': 129, 'max_depth': 5, 'min_child_samples': 21, 'subsample': 0.5308248058612526, 'colsample_bytree': 0.8060881700049494, 'reg_alpha': 0.8943971185909971, 'reg_lambda': 3.508716993482258}. Best is trial 6 with value: 0.9739785563971367.


Early stopping, best iteration is:
[1812]	cv_agg's valid auc: 0.973439 + 0.000462728
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 02:44:12,064] Trial 8 finished with value: 0.9724158921028014 and parameters: {'learning_rate': 0.010746677501226572, 'num_leaves': 293, 'max_depth': 5, 'min_child_samples': 107, 'subsample': 0.9624420451056517, 'colsample_bytree': 0.526247220647452, 'reg_alpha': 6.223672036245955, 'reg_lambda': 9.171998170436199}. Best is trial 6 with value: 0.9739785563971367.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.972416 + 0.000370387
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 03:04:08,443] Trial 9 finished with value: 0.9711797803814003 and parameters: {'learning_rate': 0.005793807090413483, 'num_leaves': 287, 'max_depth': 5, 'min_child_samples': 119, 'subsample': 0.8251859199504324, 'colsample_bytree': 0.9438600393262486, 'reg_alpha': 5.61925545223462, 'reg_lambda': 1.4584082966354495}. Best is trial 6 with value: 0.9739785563971367.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.97118 + 0.000366589
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 03:22:37,752] Trial 10 finished with value: 0.9740975178283252 and parameters: {'learning_rate': 0.03669370169821907, 'num_leaves': 207, 'max_depth': 7, 'min_child_samples': 68, 'subsample': 0.6471964086706348, 'colsample_bytree': 0.420935902619139, 'reg_alpha': 9.956606234010714, 'reg_lambda': 6.313508631709841}. Best is trial 10 with value: 0.9740975178283252.


Early stopping, best iteration is:
[2486]	cv_agg's valid auc: 0.974098 + 0.000485803
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 03:41:15,362] Trial 11 finished with value: 0.9740946815034256 and parameters: {'learning_rate': 0.0358769525313863, 'num_leaves': 212, 'max_depth': 7, 'min_child_samples': 62, 'subsample': 0.6470790056888375, 'colsample_bytree': 0.46508423858543796, 'reg_alpha': 9.895307012707033, 'reg_lambda': 5.9915408138092685}. Best is trial 10 with value: 0.9740975178283252.


Early stopping, best iteration is:
[2493]	cv_agg's valid auc: 0.974095 + 0.000487915
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 03:58:25,243] Trial 12 finished with value: 0.9741184852300165 and parameters: {'learning_rate': 0.03835012002296815, 'num_leaves': 199, 'max_depth': 7, 'min_child_samples': 58, 'subsample': 0.6403304919032008, 'colsample_bytree': 0.430290075351875, 'reg_alpha': 9.948649237742083, 'reg_lambda': 6.048773402871248}. Best is trial 12 with value: 0.9741184852300165.


Early stopping, best iteration is:
[2299]	cv_agg's valid auc: 0.974118 + 0.000471609
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 04:10:52,782] Trial 13 finished with value: 0.9741263356758287 and parameters: {'learning_rate': 0.04432690720043594, 'num_leaves': 183, 'max_depth': 8, 'min_child_samples': 42, 'subsample': 0.6240347662748922, 'colsample_bytree': 0.40315475008011503, 'reg_alpha': 9.607167511519927, 'reg_lambda': 6.403795101665761}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[1512]	cv_agg's valid auc: 0.974126 + 0.000461446
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 04:20:25,769] Trial 14 finished with value: 0.974027794254876 and parameters: {'learning_rate': 0.0575675007018041, 'num_leaves': 167, 'max_depth': 8, 'min_child_samples': 36, 'subsample': 0.571510213197008, 'colsample_bytree': 0.40316757338123715, 'reg_alpha': 8.215690990203184, 'reg_lambda': 7.697318477885231}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[1142]	cv_agg's valid auc: 0.974028 + 0.00051358
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 04:46:25,215] Trial 15 finished with value: 0.9740710301892467 and parameters: {'learning_rate': 0.019874282660708422, 'num_leaves': 161, 'max_depth': 8, 'min_child_samples': 47, 'subsample': 0.6503590864914939, 'colsample_bytree': 0.5011513318803428, 'reg_alpha': 8.161040920480541, 'reg_lambda': 5.0297483873843785}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[3272]	cv_agg's valid auc: 0.974071 + 0.000501341
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 05:01:37,706] Trial 16 finished with value: 0.972721764205976 and parameters: {'learning_rate': 0.052856369429635765, 'num_leaves': 237, 'max_depth': 3, 'min_child_samples': 17, 'subsample': 0.6980911019882756, 'colsample_bytree': 0.762371040534709, 'reg_alpha': 8.145636355521622, 'reg_lambda': 8.091004043875966}. Best is trial 13 with value: 0.9741263356758287.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.972722 + 0.000411083
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 05:06:13,135] Trial 17 finished with value: 0.973804654762281 and parameters: {'learning_rate': 0.09884969579289775, 'num_leaves': 175, 'max_depth': 9, 'min_child_samples': 50, 'subsample': 0.5034955333143757, 'colsample_bytree': 0.5664560853777345, 'reg_alpha': 9.004431304800706, 'reg_lambda': 5.696467161309728}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[512]	cv_agg's valid auc: 0.973805 + 0.000465589
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 05:32:17,199] Trial 18 finished with value: 0.9738492684653022 and parameters: {'learning_rate': 0.020650454659009915, 'num_leaves': 77, 'max_depth': 6, 'min_child_samples': 87, 'subsample': 0.5891864674294066, 'colsample_bytree': 0.4502271490344506, 'reg_alpha': 4.571026231039094, 'reg_lambda': 4.652300776370144}. Best is trial 13 with value: 0.9741263356758287.


Did not meet early stopping. Best iteration is:
[4000]	cv_agg's valid auc: 0.973849 + 0.000450329
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 05:42:47,561] Trial 19 finished with value: 0.974078102644282 and parameters: {'learning_rate': 0.045126643898150645, 'num_leaves': 142, 'max_depth': 10, 'min_child_samples': 32, 'subsample': 0.5993624693977592, 'colsample_bytree': 0.49694414185209734, 'reg_alpha': 6.906373617969525, 'reg_lambda': 6.8402167024870995}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[1229]	cv_agg's valid auc: 0.974078 + 0.000485441
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 06:06:20,127] Trial 20 finished with value: 0.9739692513640371 and parameters: {'learning_rate': 0.02604940408493602, 'num_leaves': 244, 'max_depth': 6, 'min_child_samples': 88, 'subsample': 0.7008858495598326, 'colsample_bytree': 0.5866672430977311, 'reg_alpha': 8.967036465798234, 'reg_lambda': 8.586620412428601}. Best is trial 13 with value: 0.9741263356758287.


Did not meet early stopping. Best iteration is:
[3985]	cv_agg's valid auc: 0.973969 + 0.000454084
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 06:23:16,695] Trial 21 finished with value: 0.9741096493949147 and parameters: {'learning_rate': 0.03830272448851989, 'num_leaves': 199, 'max_depth': 7, 'min_child_samples': 5, 'subsample': 0.6475395740200202, 'colsample_bytree': 0.4050210197968639, 'reg_alpha': 9.993910159582265, 'reg_lambda': 6.400487640450486}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[2259]	cv_agg's valid auc: 0.97411 + 0.000498359
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 06:52:02,506] Trial 22 finished with value: 0.9740778882089561 and parameters: {'learning_rate': 0.015480322990321252, 'num_leaves': 188, 'max_depth': 7, 'min_child_samples': 5, 'subsample': 0.6287942239790978, 'colsample_bytree': 0.4084849684950726, 'reg_alpha': 9.089117598906832, 'reg_lambda': 0.0017469842615360776}. Best is trial 13 with value: 0.9741263356758287.


Did not meet early stopping. Best iteration is:
[3998]	cv_agg's valid auc: 0.974078 + 0.00047469
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 07:05:05,680] Trial 23 finished with value: 0.9740315259488492 and parameters: {'learning_rate': 0.04182106223755985, 'num_leaves': 216, 'max_depth': 8, 'min_child_samples': 55, 'subsample': 0.6916476428388789, 'colsample_bytree': 0.46743593699141356, 'reg_alpha': 7.209129457862365, 'reg_lambda': 5.181976869651331}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[1564]	cv_agg's valid auc: 0.974032 + 0.00052086
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 07:29:06,419] Trial 24 finished with value: 0.9740731563553437 and parameters: {'learning_rate': 0.02908759704645533, 'num_leaves': 230, 'max_depth': 7, 'min_child_samples': 5, 'subsample': 0.5331021520389398, 'colsample_bytree': 0.5262748744969131, 'reg_alpha': 9.956174235593256, 'reg_lambda': 6.747943462592171}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[3243]	cv_agg's valid auc: 0.974073 + 0.000457806
Training until validation scores don't improve for 100 rounds


[I 2025-08-12 07:46:41,882] Trial 25 finished with value: 0.97398659362023 and parameters: {'learning_rate': 0.04930625150578331, 'num_leaves': 143, 'max_depth': 6, 'min_child_samples': 31, 'subsample': 0.5981874776806715, 'colsample_bytree': 0.44996159808699215, 'reg_alpha': 7.800093017031082, 'reg_lambda': 4.415053800517421}. Best is trial 13 with value: 0.9741263356758287.


Early stopping, best iteration is:
[2544]	cv_agg's valid auc: 0.973987 + 0.000473944
