In [None]:
import pandas as pd
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import cupy as cp

In [None]:
# Train and test CSVs, read with pandas' default comma separator.
df, final = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e8/train.csv',
        '/kaggle/input/playground-series-s5e8/test.csv',
    )
)

# The bank-full.csv file is semicolon-delimited.
original = pd.read_csv(
    '/kaggle/input/bank-marketing-dataset-full/bank-full.csv',
    sep=';',
)

# Encode the yes/no target as 1/0 so it can be averaged later.
original['y'] = original['y'].map({'no': 0, 'yes': 1})

def simplify_contact(x):
    """Collapse a contact value into ``'unknown'`` vs ``'known_contact'``.

    Only the exact string ``'unknown'`` maps to ``'unknown'``; every other
    value is reported as ``'known_contact'``.
    """
    return 'unknown' if x == 'unknown' else 'known_contact'

In [None]:
def preprocessing(df):
    """Turn a raw feature frame into the numeric/one-hot matrix fed to the model.

    The input frame is copied first, so the caller's DataFrame is never
    modified (previously a ``job_edu`` column leaked into it).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``job``, ``education``, ``balance``, ``marital``,
        ``default``, ``housing``, ``loan``, ``contact``, ``age``, ``pdays``,
        ``poutcome``, ``duration``, ``month``, ``campaign``,
        ``unknown_score_freq`` and ``many_no_score_freq``. Any other columns
        are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.

    Notes
    -----
    One-hot columns are derived from the categories present in ``df`` itself,
    so two frames processed separately may end up with different columns.
    """
    # Work on a copy so the column assignments below never touch the caller's frame.
    df = df.copy()

    # job x education interaction, one-hot encoded; the raw columns are dropped.
    df['job_edu'] = df['job'].astype(str) + "_" + df['education'].astype(str)
    df = pd.get_dummies(df, columns=['job_edu'], prefix='job_edu')
    df = df.drop(columns=['job', 'education'])

    # Negative balances are clipped to 0 before the log transform.
    df['balance_log'] = np.log1p(df['balance'].clip(lower=0))

    df = pd.get_dummies(df, columns=['marital'], prefix='marital')

    # yes/no flags -> 1/0 (any other value becomes NaN).
    for flag in ('default', 'housing', 'loan'):
        df[flag] = df[flag].map({'yes': 1, 'no': 0})

    # Contact is reduced to "unknown" vs anything else, then one-hot encoded.
    df['contact_simple'] = np.where(
        df['contact'] == 'unknown', 'unknown', 'known_contact'
    )
    df = pd.get_dummies(df, columns=['contact_simple'], prefix='contact')
    df = df.drop(columns=['contact'])

    df['age^2'] = df["age"] ** 2

    # pdays == -1 is flagged as "no previous campaign" and recoded to 999.
    df['prev_camp'] = (df['pdays'] != -1).astype(int)
    df['pdays'] = df['pdays'].replace(-1, 999)
    # poutcome has an 'unknown' level that is kept as its own dummy
    # (original author's note: it means the campaign is still ongoing).
    df = pd.get_dummies(df, columns=['poutcome'], prefix='poutcome')

    # Cyclic encoding of call duration with a period of 400.
    duration_angle = 2 * np.pi * df['duration'] / 400
    df['duration_sin'] = np.sin(duration_angle)
    df['duration_cos'] = np.cos(duration_angle)

    month_map = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
        'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
        'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    # astype('int') fails on an unmapped month rather than silently producing NaN.
    df['month_num'] = df['month'].map(month_map).astype('int')

    # Cyclic encoding of the month with a period of 12.
    df['month_sin'] = np.sin(2 * np.pi * df['month_num'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month_num'] / 12)

    df = df.drop(columns=['month_num'])

    # The month is additionally kept as one-hot columns.
    df = pd.get_dummies(df, columns=['month'], prefix='month')

    # The two frequency scores are treated as categories, not magnitudes.
    df = pd.get_dummies(df, columns=['unknown_score_freq'], prefix='unknown_score_freq')
    df = pd.get_dummies(df, columns=['many_no_score_freq'], prefix='many_no_score_freq')

    df['balance_per_age'] = df['balance'] / df['age']
    # +1 keeps the denominator non-zero when campaign is 0.
    df['duration_per_campaign'] = df['duration'] / (df['campaign'] + 1)

    return df

def FE(X, test, c, original, target='y'):
    """Add count and target-mean encodings of column ``c`` learned from ``original``.

    Two columns are written in place to both ``X`` and ``test``:

    * ``{c}_count`` - how often the row's value of ``c`` occurs in
      ``original`` (0 when the value never occurs there);
    * ``{c}_mean_target_orig`` - mean of ``target`` in ``original`` for that
      value (the overall mean of ``target`` when the value never occurs).

    Returns the same ``X`` and ``test`` objects that were passed in.
    """
    count_col = f"{c}_count"
    rate_col = f"{c}_mean_target_orig"

    # All statistics come from `original` only, never from X or test.
    rate_by_value = original.groupby(c)[target].mean()
    freq_by_value = original[c].value_counts()
    fallback_rate = original[target].mean()

    for frame in (X, test):
        frame[count_col] = frame[c].map(freq_by_value).fillna(0)

    for frame in (X, test):
        frame[rate_col] = frame[c].map(rate_by_value).fillna(fallback_rate)

    return X, test

def extra(train, test):
    """Add frequency-encoded "unknown" and "no" scores to ``train`` and ``test``.

    For every row two tiered scores are computed:

    * unknown score - how many of ``education``, ``contact`` and ``poutcome``
      equal ``'unknown'``;
    * many-no score - how many of ``default``, ``housing`` and ``loan`` equal
      ``'no'``.

    Zero, one, two or three hits score 0, 3, 7 or 21 respectively. Each score
    is then replaced by the number of rows sharing it across train and test
    combined, and stored as ``unknown_score_freq`` / ``many_no_score_freq``.

    Both frames are modified in place and also returned.

    The scores are computed with vectorised column comparisons instead of a
    Python function applied per row; the resulting values are identical.
    """
    combined = pd.concat([train, test], axis=0, ignore_index=True)

    # Number of matching columns -> tiered score.
    tier_points = {0: 0, 1: 3, 2: 7, 3: 21}

    def _score_frequency(columns, flag_value):
        # Per-row count of columns equal to flag_value (missing values never match).
        hits = sum((combined[col] == flag_value).astype(int) for col in columns)
        score = hits.map(tier_points)
        # Replace each score by how many rows in train+test share it.
        return score.map(score.value_counts().to_dict()).values

    unknown_freq = _score_frequency(['education', 'contact', 'poutcome'], 'unknown')
    many_no_freq = _score_frequency(['default', 'housing', 'loan'], 'no')

    # `combined` stacks train first, so the first len(train) entries are train's.
    train_len = len(train)
    train['unknown_score_freq'] = unknown_freq[:train_len]
    test['unknown_score_freq'] = unknown_freq[train_len:]

    train['many_no_score_freq'] = many_no_freq[:train_len]
    test['many_no_score_freq'] = many_no_freq[train_len:]

    return train, test

In [None]:
# Raw input columns that receive count / target-mean encodings from `original`.
COLS = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome',]

# The id is not a feature; keep the test ids aside for the submission file.
df.drop('id', axis=1, inplace=True)
final_id = final['id']
final.drop('id', axis=1, inplace=True)

for c in COLS:
    # FE writes its columns in place and returns the same frames; rebind
    # `final` itself rather than a stray `test` name.
    df, final = FE(df, final, c, original)

df, final = extra(df, final)

df = preprocessing(df)
final = preprocessing(final)

# preprocessing() one-hot encodes each frame on its own, so a category present
# in only one of them yields mismatched columns. Align the test frame to the
# training features: dummies missing from test become all-zero columns, and
# columns the model was never trained on are dropped.
feature_cols = [col for col in df.columns if col != 'y']
final = final.reindex(columns=feature_cols, fill_value=0)

In [None]:
# Separate the processed training frame into feature matrix and target.
X = df.drop(columns=['y'])
y = df['y']

In [None]:
# Baseline model fitted on all of X, used only to rank the features.
baseline_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 200,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb1 = XGBClassifier(**baseline_params)
xgb1.fit(X, y)

# Column names ordered from highest to lowest importance.
importances = xgb1.feature_importances_
ascending_order = np.argsort(importances)
feature_ranking = ascending_order[::-1]
sorted_features = X.columns[feature_ranking]

In [None]:
def objective_xgb(trial):
    """Optuna objective: mean ROC AUC of an XGBoost model over 5 stratified folds.

    The trial chooses how many of the top-ranked features to keep
    (``top_n_features``) together with the booster hyper-parameters.

    After each fold the running mean AUC is reported to the trial, so that a
    pruner attached to the study can stop an unpromising trial early. Without
    these reports a pruner never has anything to act on.

    Returns
    -------
    float
        Mean validation AUC over the folds.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner asks for the trial to be stopped.
    """
    n_top_features = trial.suggest_int("top_n_features", 70, 120)
    selected = sorted_features[:n_top_features]
    X_sel = X[selected]

    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1200, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'verbosity': 1,
        'n_jobs': 1,
        'device': 'cuda',
        'tree_method': 'hist',
    }

    aucs = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sel, y)):
        X_train, X_val = X_sel.iloc[train_idx], X_sel.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBClassifier(**param)
        # No eval_set: with no early stopping configured it would only add a
        # per-round metric computation whose result is never read.
        model.fit(X_train, y_train, verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, preds))

        # Expose the intermediate score so the study's pruner can act on it.
        trial.report(float(np.mean(aucs)), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(aucs)

# Maximise the objective for up to eight hours; a MedianPruner is attached to the study.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=1)
study_xgb = optuna.create_study(pruner=pruner, direction='maximize')
study_xgb.optimize(objective_xgb, timeout=8 * 60 * 60)

# best_params holds the tuned values only: take the feature count out of it
# and add back the fixed settings the objective used.
best_params_xgb = dict(study_xgb.best_params)
best_n_features = best_params_xgb.pop("top_n_features")
best_params_xgb = {
    **best_params_xgb,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'verbosity': 1,
    'n_jobs': 1,
    'device': 'cuda',
    'tree_method': 'hist',
}

# Keep the same top-ranked features for both the training and the test matrix.
selected_features = sorted_features[:best_n_features]

X_sel = X[selected_features]
final_sel = final[selected_features]

In [None]:
# Refit on the whole training set with the tuned parameters and selected features.
xgb = XGBClassifier(**best_params_xgb)
xgb.fit(X_sel, y)

# Keep column 1 of the predicted class probabilities for the test rows.
test_class_proba = xgb.predict_proba(final_sel)
y_test_proba = test_class_proba[:, 1]

In [None]:
# Persist the fitted model together with the feature list it was trained on.
model_artifact = (xgb, selected_features)
with open("xgb_model.pkl", "wb") as model_file:
    pickle.dump(model_artifact, model_file)

In [None]:
# Two-column submission: the test ids and the predicted probability for each.
submission = pd.DataFrame({"id": final_id})
submission["y"] = y_test_proba

submission.to_csv("submission.csv", index=False)