In [1]:
import numpy as np
from sklearn import metrics


def balanced_log_loss(y_true, y_pred):
    """Log loss in which every class carries the same total weight.

    Each sample is weighted by the inverse of the number of samples sharing
    its label, so a class with few samples influences the score as much as a
    class with many.

    Parameters
    ----------
    y_true : array-like
        Non-negative integer class labels. Integer-valued floats are accepted
        and cast to ``int`` before counting; the input is flattened to 1-D.
    y_pred : array-like
        Predicted probabilities, forwarded to ``sklearn.metrics.log_loss``
        after clipping to ``[1e-15, 1 - 1e-15]``.

    Returns
    -------
    float
        The sample-weighted log loss returned by ``metrics.log_loss``.
    """
    # np.bincount only accepts integer input; labels may arrive as floats
    # (e.g. when this function is used as a callable evaluation metric).
    y_true = np.asarray(y_true).ravel().astype(np.int64)
    nc = np.bincount(y_true)
    # Index first, divide second: label values that never occur have a zero
    # count, and dividing the whole count vector would emit divide-by-zero
    # warnings for them.
    weights = 1.0 / nc[y_true]
    # The `eps` argument of log_loss was deprecated in scikit-learn 1.3 and
    # removed in 1.5, so the 1e-15 clipping is applied here instead.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)
    return metrics.log_loss(y_true, y_pred, sample_weight=weights)

In [2]:
from xgboost import XGBClassifier

In [3]:
import pandas as pd
from pathlib import Path

# Input data is read from ./data; the ./output directory is created on
# first run and reused afterwards.
path, output = Path('./data'), Path('./output')
output.mkdir(exist_ok=True)

In [4]:
# Load the training table, using the 'Id' column as the row index.
df = pd.read_csv(path.joinpath('train.csv'), index_col='Id')

In [7]:
# Summary statistics of the GH column (inspected before a binned feature is
# derived from it).
df.loc[:, "GH"].describe()

count    617.000000
mean      31.489716
std        9.864239
min        9.432735
25%       25.034888
50%       30.608946
75%       36.863947
max       81.210825
Name: GH, dtype: float64

In [9]:
# Decile feature derived from GH: 1 = lowest tenth, 10 = highest tenth.
# labels=False makes qcut return plain integer bin codes (0-9) rather than a
# pandas Categorical; a 'category' column is rejected by XGBClassifier unless
# enable_categorical=True is set, and this column is part of X later on.
# Missing GH values, if any, stay NaN instead of raising.
df["GH_binned"] = pd.qcut(df["GH"], q=10, labels=False) + 1

In [10]:
from sklearn.model_selection import train_test_split

# Target column, and the columns left out of the feature matrix.
dep_vars = ["Class"]
drop_cols = ["EJ"]

# Features: every column except the excluded ones and the target.
X = df.drop(columns=drop_cols + dep_vars)
# Target flattened to a 1-D NumPy array.
y = df[dep_vars].to_numpy().ravel()

In [11]:
# Hold out 40% of the rows for the final evaluation; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=33,
    test_size=0.4,
)

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer

# Single-step pipeline: KNN imputation (10 neighbours, distance-weighted).
preprocessing_pipeline = make_pipeline(KNNImputer(weights="distance", n_neighbors=10))
# Ask the pipeline to return pandas objects from transform.
preprocessing_pipeline = preprocessing_pipeline.set_output(transform="pandas")

In [18]:
def resample(X, y, random_state=None):
    """Resample ``X`` and ``y`` with a ``SMOTE`` sampler.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``SMOTE.fit_resample``.
    y : array-like
        Labels passed to ``SMOTE.fit_resample``.
    random_state : optional
        Forwarded to ``SMOTE(random_state=...)``. The default ``None`` keeps
        the previous unseeded behaviour; pass an int for reproducible output.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    # NOTE(review): SMOTE is not imported in any visible cell — presumably
    # `from imblearn.over_sampling import SMOTE`; confirm it is imported
    # before this cell runs on a fresh kernel.
    sampler = SMOTE(random_state=random_state)
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

In [17]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter used for cross-validation.
cv = StratifiedKFold(n_splits=5)

# Gradient-boosted classifier; keyword arguments are grouped by purpose.
xgm_clf = XGBClassifier(
    # learning task and evaluation
    objective="binary:logistic",
    eval_metric=balanced_log_loss,
    scale_pos_weight=5.5,
    # boosting schedule
    n_estimators=650,
    learning_rate=0.0344,
    tree_method="hist",
    # tree complexity
    max_depth=3,
    min_child_weight=0.5,
    gamma=0.6,
    # row / column subsampling
    subsample=0.6,
    colsample_bytree=1.0,
    colsample_bylevel=0.3,
    colsample_bynode=0.7,
    # regularisation
    reg_alpha=0.0,
    reg_lambda=0.0,
)


# Balanced log loss on the held-out part of each CV fold.
bal_log_losses = []

# y_train comes out of train_test_split as a NumPy array (y was built with
# .values.ravel()), so it has no .iloc / .values; index it positionally.
y_train_arr = np.asarray(y_train).ravel()

for train_idx, val_idx in cv.split(X_train, y_train_arr):
    X_t, y_t = X_train.iloc[train_idx], y_train_arr[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train_arr[val_idx]

    # Fit the imputer on the fold's training rows only, then apply it to the
    # validation rows, so no validation information reaches the model and the
    # resampler receives data without missing values.
    X_t = preprocessing_pipeline.fit_transform(X_t)
    X_val = preprocessing_pipeline.transform(X_val)

    # Oversample the training part of the fold only; the validation rows
    # must keep their original class balance.
    X_res, y_res = resample(X_t, y_t)

    # Train on the resampled fold — not on the full X / y, which would include
    # the validation (and test) rows being scored.
    xgm_clf.fit(X_res, y_res)

    # Score validation predictions against the validation labels and keep
    # the result for later aggregation.
    y_pred = xgm_clf.predict_proba(X_val)
    loss = balanced_log_loss(y_val, y_pred)
    bal_log_losses.append(loss)
    

(493,) (124,)
(493,) (124,)
(494,) (123,)
(494,) (123,)
(494,) (123,)


In [None]:
# Held-out evaluation: (balanced log loss, accuracy of the arg-max class).
# NOTE(review): p_eval is not defined in any visible cell — presumably the
# predicted class probabilities for X_test; confirm before running.
(
    balanced_log_loss(y_test, p_eval),
    metrics.accuracy_score(y_test, p_eval.argmax(axis=1)),
)

In [None]:
from visualize.results import plot_results
# Plot the test labels against column 1 of p_eval using the project-local
# helper imported above.
# NOTE(review): p_eval is not defined in any visible cell, and column 1 is
# presumably the positive-class probability — confirm both.
plot_results(y_test, p_eval[:, 1])