In [110]:
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, make_scorer
import numpy as np
import joblib

In [101]:
# Load the raw training table; every later cell derives its data from `df`.
# (A bare `df.columns` used to follow here, but because it was not the last
# expression of the cell it never rendered anything, so it was dropped.)
df = pd.read_csv('train.csv')

def simplify_contact(x):
    """Collapse a contact value into two levels.

    Returns 'unknown' when ``x`` equals the string 'unknown'; any other
    value (including missing values) is labelled 'known_contact'.
    """
    return 'unknown' if x == 'unknown' else 'known_contact'

In [None]:

# The row identifier is not used as a feature.
df = df.drop(columns=['id'])

# One-hot encode job / marital / education in a single pass. The dummy
# columns are appended in the listed order ('job' and 'education' include
# an 'unknown' level).
df = pd.get_dummies(
    df,
    columns=['job', 'marital', 'education'],
    prefix=['job', 'marital', 'education'],
)

# yes/no flags become 1/0; the columns keep their original positions.
df = df.assign(**{
    flag: df[flag].map({'yes': 1, 'no': 0})
    for flag in ('default', 'housing', 'loan')
})

# Reduce 'contact' to unknown vs. known, then one-hot encode the reduced
# column in place of the raw one.
df = pd.get_dummies(
    df.drop(columns=['contact']).assign(
        contact_simple=df['contact'].apply(simplify_contact)
    ),
    columns=['contact_simple'],
    prefix='contact',
)

# Day of month is discarded; month is one-hot encoded.
df = pd.get_dummies(df.drop(columns=['day']), columns=['month'], prefix='month')

# pdays == -1 is treated as "no previous campaign": record that as an
# explicit 0/1 flag, then swap the sentinel for 999.
df['prev_camp'] = (df['pdays'] != -1).astype(int)
df['pdays'] = df['pdays'].replace({-1: 999})
df = pd.get_dummies(df, columns=['poutcome'], prefix='poutcome')  # has unknown

# Cap balance at its 99th percentile (computed over the whole frame).
df = df.assign(balance=df['balance'].clip(upper=df['balance'].quantile(0.99)))

In [103]:
# Number of columns after preprocessing (features plus the target 'y').
print(df.shape[1])



48


In [104]:
# Separate the feature matrix from the target column 'y'.
X = df.drop(columns=['y'])
y = df['y']

In [None]:
# Stage 1: hold out a stratified 20% of all rows as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=1,
)

# Stage 2: split the remaining rows again (stratified, 20%) to get a
# validation set alongside the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    stratify=y_trainval,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Baseline booster with default hyper-parameters; here it only supplies the
# feature importances that drive the selection step below.
xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42
)

# Use scikit-learn's built-in ROC AUC scorer. The previous
# make_scorer(roc_auc_score, needs_proba=True) relies on an argument that was
# deprecated in scikit-learn 1.4 and removed in 1.6. The name `roc_auc` is
# kept so anything that passes it as a `scoring=` value keeps working.
roc_auc = 'roc_auc'

xgb.fit(X_train, y_train)

# prefit=True reuses the already-fitted booster; threshold='mean' keeps the
# features whose importance is at least the mean importance.
selector = SelectFromModel(xgb, threshold='mean', prefit=True)

X_train_sel = selector.transform(X_train)
X_val_sel = selector.transform(X_val)

# 3 * 2 * 2 * 2 * 2 = 48 candidate configurations.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the selector was fitted on all of X_train before the CV loop,
# so the per-fold CV scores may be slightly optimistic. The validation score
# printed below comes from rows not used for selection or tuning.
grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring=roc_auc,
    cv=cv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_sel, y_train)

print(f"Best params: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Column 1 of predict_proba is the positive-class probability.
y_val_pred = best_model.predict_proba(X_val_sel)[:, 1]
print(f"Validation ROC AUC: {roc_auc_score(y_val, y_val_pred):.4f}")



Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Validation ROC AUC: 0.9546


In [None]:
# Apply the same feature mask to the held-out test split.
X_test_sel = selector.transform(X_test)

# Column 1 of predict_proba is the positive-class probability.
y_test_pred_proba = best_model.predict_proba(X_test_sel)[:, 1]

test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)
# Label corrected: this score is computed on the test split (X_test / y_test),
# not on the training data.
print(f"Test ROC AUC: {test_roc_auc:.4f}")




Training ROC AUC: 0.9557


In [112]:
# Positions (within X's columns) of the features kept by the selector,
# derived from its boolean support mask.
selected_indices = np.flatnonzero(selector.get_support())
print(selected_indices)

[ 3  4  5 10 28 30 36 37 40 41 45 46]
