In [37]:
from pathlib import Path

path = Path("./data")
output_path = Path("./submission")


In [38]:
import numpy as np
from sklearn import metrics


def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)


In [39]:
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import compose, impute, pipeline, preprocessing


def get_preprocess_pipeline(df, cont_cols, cat_cols, drop_cols):
    """
    Returns a pipeline that performs the following transformations:
    * Standard scaling
    * Log transformation
    * Reciprocal transformation
    * Box-Cox transformation
    * Yeo-Johnson transformation
    * Categorical imputing
    * Semi-constant feature binarization

    Based on the EDA from https://www.kaggle.com/code/mateuszk013/icr-eda-balanced-learning-with-lgbm-xgb/notebook

    :param df: The dataframe to be transformed.
    :param cont_cols: The names of the continuous variables.
    :param drop_cols: The names of the dependent variables.
    """

    # Identify columns that doesn't follow a normal distribution
    # find an appropriate transformation for them to follow a normal distribution
    r2_scores = defaultdict(tuple)

    for feature in cont_cols:
        orig = df[feature].dropna()
        _, (*_, R_orig) = stats.probplot(orig, rvalue=True)
        _, (*_, R_log) = stats.probplot(np.log(orig), rvalue=True)
        _, (*_, R_sqrt) = stats.probplot(np.sqrt(orig), rvalue=True)
        _, (*_, R_reci) = stats.probplot(np.reciprocal(orig), rvalue=True)
        _, (*_, R_boxcox) = stats.probplot(stats.boxcox(orig)[0], rvalue=True)
        _, (*_, R_yeojohn) = stats.probplot(stats.yeojohnson(orig)[0], rvalue=True)
        r2_scores[feature] = (
            R_orig * R_orig,
            R_log * R_log,
            R_sqrt * R_sqrt,
            R_reci * R_reci,
            R_boxcox * R_boxcox,
            R_yeojohn * R_yeojohn,
        )

    r2_scores = pd.DataFrame(
        r2_scores,
        index=("Original", "Log", "Sqrt", "Reciprocal", "BoxCox", "YeoJohnson"),
    ).T

    r2_scores["Winner"] = r2_scores.idxmax(axis=1)

    # Identify columns to be transformed
    no_transform_cols = r2_scores.query("Winner == 'Original'").index
    log_transform_cols = r2_scores.query("Winner == 'Log'").index
    reciprocal_transform_cols = r2_scores.query("Winner == 'Reciprocal'").index
    boxcox_transform_cols = r2_scores.query("Winner == 'BoxCox'").index
    yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson'").index

    # Identify columns that are constant or semi-constant
    numeric_descr = df.drop(columns=drop_cols).describe().T
    semi_constant_mask = np.isclose(numeric_descr["min"], numeric_descr["50%"])
    semi_constant_descr = numeric_descr[semi_constant_mask]
    semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()

    # List of columns to be transformed
    semi_const_cols = semi_const_cols_thresholds.keys()
    no_transform_cols = no_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    log_transform_cols = log_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    reciprocal_transform_cols = reciprocal_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    boxcox_transform_cols = boxcox_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    yeojohnson_transform_cols = yeojohnson_transform_cols.drop(semi_const_cols, errors="ignore").to_list()

    # Transformations
    standard_scaling = (
        preprocessing.StandardScaler(),
        no_transform_cols,
    )
    log_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        log_transform_cols,
    )
    reciprocal_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        reciprocal_transform_cols,
    )
    boxcox_transform = (
        preprocessing.PowerTransformer(method="box-cox", standardize=True),
        boxcox_transform_cols,
    )
    yeojohnson_transform = (
        preprocessing.PowerTransformer(method="yeo-johnson", standardize=True),
        yeojohnson_transform_cols,
    )

    # Other transformations
    categorical_imputing = (
        pipeline.make_pipeline(
            impute.SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        cat_cols,  # type: ignore
    )
    semi_const_transforms = [
        (
            pipeline.make_pipeline(
                impute.SimpleImputer(strategy="median"),
                preprocessing.Binarizer(threshold=thresh),
            ),
            [col],
        )
        for col, thresh in semi_const_cols_thresholds.items()
    ]

    return pipeline.make_pipeline(
        compose.make_column_transformer(
            standard_scaling,
            log_transform,
            reciprocal_transform,
            boxcox_transform,
            yeojohnson_transform,
            categorical_imputing,
            *semi_const_transforms,
            remainder="drop",
            verbose_feature_names_out=False,
        ),
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")


In [40]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y):
    sampler = SMOTE()
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

In [41]:
import pandas as pd
from fastai.tabular.core import cont_cat_split
from sklearn import model_selection

train_df = pd.read_csv(path / "train.csv", index_col="Id")

drop_cols = ["EJ"]
dep_vars = ["Class"]

untrainable_cols = drop_cols + dep_vars

# Drops the dep_vars before splitting categorical and continuous variables
cont_names, cat_names = cont_cat_split(train_df, dep_var=untrainable_cols)

train_df, test_df = model_selection.train_test_split(train_df, test_size=0.4, random_state=33)
train_df.shape, test_df.shape


((370, 57), (247, 57))

In [42]:
preprocessor = get_preprocess_pipeline(train_df, cont_names, cat_names, untrainable_cols)
X_pre = preprocessor.fit_transform(train_df.drop(columns=untrainable_cols))
train_df = pd.merge(X_pre, train_df[untrainable_cols], left_index=True, right_index=True)


In [43]:
from xgboost import XGBClassifier

model = XGBClassifier(
    objective="binary:logistic",
    colsample_bylevel=0.3,
    colsample_bynode=0.7,
    colsample_bytree=1.0,
    gamma=0.6,
    learning_rate=0.0344,
    max_depth=3,
    min_child_weight=0.5,
    n_estimators=650,
    reg_alpha=0.0,
    reg_lambda=0.0,
    scale_pos_weight=5.5,
    subsample=0.6,
    tree_method="hist",
    eval_metric=balanced_log_loss,
)

X = train_df.drop(columns=untrainable_cols, errors="ignore")
y = train_df[dep_vars]

kfold = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=4)

for idx in kfold.split(X, y):
    train_idx, _ = idx
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]

    X_res, y_res = resample(X_train, y_train)

    model.fit(X_res, y_res)

In [44]:
pred_probs = model.predict_proba(X)
y_true = y.values.ravel()

y_pred = pred_probs.argmax(axis=1)

log_loss_val = metrics.log_loss(y_true, pred_probs)
balanced_log_loss_val = balanced_log_loss(y_true, pred_probs)

accuracy_val = metrics.accuracy_score(y_true, y_pred)
kappa_val = metrics.cohen_kappa_score(y_true, y_pred)
f1_val = metrics.f1_score(y_true, y_pred)

print(f"Log loss: {log_loss_val:.4f}")
print(f"Balanced log loss: {balanced_log_loss_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Kappa: {kappa_val:.4f}")
print(f"F1: {f1_val:.4f}")

Log loss: 0.0406
Balanced log loss: 0.0558
Accuracy: 0.9892
Kappa: 0.9579
F1: 0.9643


In [45]:
X_test = preprocessor.transform(test_df.drop(columns=untrainable_cols, errors="ignore"))
y_test = test_df[dep_vars]

In [46]:
pred_probs = model.predict_proba(X_test)
y_true = y_test.values.ravel()

y_pred = pred_probs.argmax(axis=1)

log_loss_val = metrics.log_loss(y_true, pred_probs)
balanced_log_loss_val = balanced_log_loss(y_true, pred_probs)

accuracy_val = metrics.accuracy_score(y_true, y_pred)
kappa_val = metrics.cohen_kappa_score(y_true, y_pred)
f1_val = metrics.f1_score(y_true, y_pred)

print(f"Log loss: {log_loss_val:.4f}")
print(f"Balanced log loss: {balanced_log_loss_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Kappa: {kappa_val:.4f}")
print(f"F1: {f1_val:.4f}")

Log loss: 0.1902
Balanced log loss: 0.2166
Accuracy: 0.9271
Kappa: 0.7897
F1: 0.8364
