In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Directory that holds the input CSV files; resolved relative to the current
# working directory.
data_path = Path("..") / "data"

In [2]:
# --- Load ---------------------------------------------------------------
train_df = pd.read_csv(data_path / "train.csv")
test_df = pd.read_csv(data_path / "test.csv")
test_ids = test_df["Id"]  # kept aside for the submission file

# --- Split features / target ---------------------------------------------
# Every column except the first and the last is treated as a feature.
# NOTE(review): assumes the first column is the row id and the last one is
# the "Class" target, and that test.csv carries the same feature columns
# -- confirm against the CSV headers.
feature_columns = train_df.columns[1:-1]
X_train_raw, y_train = train_df[feature_columns], train_df["Class"]
X_test_raw = test_df[feature_columns]

# --- One-hot encode the categorical column "EJ" --------------------------
# The encoder is fitted on the training rows only; with
# handle_unknown="ignore" a category that appears only in the test set is
# encoded as an all-zero row instead of raising.
oh_encoder = OneHotEncoder(handle_unknown="ignore")
X_train_encoded = oh_encoder.fit_transform(X_train_raw[["EJ"]])
X_test_encoded = oh_encoder.transform(X_test_raw[["EJ"]])

# --- Assemble dense matrices ---------------------------------------------
# The encoder already returns a sparse matrix, so it is densified directly
# (no extra csr_matrix wrapper) and appended after the remaining columns.
X_train_dense = np.concatenate(
    [X_train_raw.drop(columns="EJ"), X_train_encoded.toarray()], axis=1
)
X_test_dense = np.concatenate(
    [X_test_raw.drop(columns="EJ"), X_test_encoded.toarray()], axis=1
)

# --- Impute missing values -----------------------------------------------
# Column means are learned from the training matrix only and then reused
# for the test matrix, so no test-set statistics enter the fit.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_dense)
X_test = imputer.transform(X_test_dense)

In [3]:
def balanced_log_loss(y_true, pred_probs):
    """Return the class-balanced log loss of binary probability predictions.

    The mean negative log-likelihood is computed separately for the samples
    of class 0 and of class 1; the two means are then combined as a weighted
    average whose weights are the inverse class frequencies
    (``len(y_true) / n_k``).

    Args:
        y_true: 1-D array-like of integer labels, each 0 or 1 (list, ndarray
            or pandas Series).
        pred_probs: 2-D array-like with one row per sample, where column 0 is
            the predicted probability of class 0 and column 1 that of class 1.

    Returns:
        The balanced log loss as a float.

    Raises:
        ValueError: if ``y_true`` contains labels other than 0 and 1, or if
            either class has no samples (the per-class mean is undefined).
    """
    # Coerce to arrays so the element-wise comparisons below also work for
    # plain Python lists (``[0, 1] == 0`` is the scalar ``False``, which would
    # otherwise zero out the loss silently).
    y_true = np.asarray(y_true)
    pred_probs = np.asarray(pred_probs)

    # Keep probabilities away from 0 and 1 so log() stays finite, then
    # renormalise each row to sum to one.
    pred_probs = np.clip(pred_probs, 1e-15, 1 - 1e-15)
    pred_probs = pred_probs / np.sum(pred_probs, axis=1)[:, None]

    class_counts = np.bincount(y_true, minlength=2)
    if class_counts.size != 2 or not class_counts.all():
        raise ValueError(
            "balanced_log_loss requires y_true to contain only the labels 0 "
            "and 1, with at least one sample of each"
        )
    n0, n1 = class_counts

    # Inverse class-frequency weights.
    w0 = 1 / (n0 / len(y_true))
    w1 = 1 / (n1 / len(y_true))

    # Weighted mean negative log-likelihood of each class, evaluated only on
    # the samples that belong to that class.
    l0 = -w0 / n0 * np.sum((y_true == 0) * np.log(pred_probs[:, 0]))
    l1 = -w1 / n1 * np.sum((y_true == 1) * np.log(pred_probs[:, 1]))

    return (l0 + l1) / (w0 + w1)

In [4]:
# Cross-validation setup. One seed drives both the fold shuffling and the
# forest so that a fresh run reproduces the same numbers.
n_splits = 5
seed = 42
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
model = RandomForestClassifier(n_estimators=100, random_state=seed)

# Running average of the per-fold test predictions (one column per class).
test_probs = np.zeros((len(test_df), 2))

# kf.split yields positional row indices. Index the target through a plain
# array so the lookup is positional too, instead of relying on the Series
# index labels happening to equal the row positions.
y_train_values = np.asarray(y_train)

for train_idx, val_idx in kf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train[train_idx], y_train_values[train_idx]
    X_val_fold, y_val_fold = X_train[val_idx], y_train_values[val_idx]

    # fit() retrains the same estimator object from scratch on each fold.
    model.fit(X_train_fold, y_train_fold)
    val_probs = model.predict_proba(X_val_fold)

    # argmax gives the index of the most probable column; this is compared
    # with the labels directly.
    # NOTE(review): assumes the classes are encoded as 0 and 1 so that column
    # index == label -- confirm via model.classes_.
    acc = accuracy_score(y_val_fold, np.argmax(val_probs, axis=1))
    print(
        f"Validation accuracy: {acc:.4f}.",
        f"Balanced log loss: {balanced_log_loss(y_val_fold, val_probs):.4f}",
    )

    # Each fold's model contributes 1/n_splits of the final test prediction.
    test_fold_probs = model.predict_proba(X_test)
    test_probs += test_fold_probs / n_splits


# Write the fold-averaged class probabilities next to the test ids.
# NOTE(review): confirm the column header casing against the competition's
# sample submission file.
sub_df = pd.DataFrame(
    {"Id": test_ids, "Class_0": test_probs[:, 0], "Class_1": test_probs[:, 1]}
)
sub_df.to_csv("submission.csv", index=False)

Validation accuracy: 0.9113. Balanced log loss: 0.6173
Validation accuracy: 0.9274. Balanced log loss: 0.6783
Validation accuracy: 0.9024. Balanced log loss: 0.6268
Validation accuracy: 0.8862. Balanced log loss: 1.0075
Validation accuracy: 0.9593. Balanced log loss: 0.5217
