In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

In [2]:
# read data
data_path = Path("../data/")
train_df = pd.read_csv(data_path/"train.csv")
test_df = pd.read_csv(data_path/"test.csv")
test_ids = test_df["Id"]

# preprocess data
feature_columns = train_df.columns[1:-1]
X_train, y_train = train_df[feature_columns], train_df["Class"]
X_test = test_df[feature_columns]

# convert categorical values
oh_encoder = OneHotEncoder(handle_unknown="ignore")
X_train_encoded = oh_encoder.fit_transform(X_train[["EJ"]])
X_test_encoded = oh_encoder.transform(X_test[["EJ"]])

# concatenate the encoded features with the original features
X_train = np.concatenate(
    [X_train.drop("EJ", axis=1), csr_matrix(X_train_encoded).toarray()], axis=1
)
X_test = np.concatenate(
    [X_test.drop("EJ", axis=1), csr_matrix(X_test_encoded).toarray()], axis=1
)

# impute missing values with the mean of the column
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [3]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=10000, solver="saga")
test_probs = np.zeros((len(test_df), 2))

for train_idx, val_idx in kf.split(X_train):
    X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
    X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

    model.fit(X_train_fold, y_train_fold)
    val_probs = model.predict_proba(X_val_fold)

    acc = accuracy_score(y_val_fold, np.argmax(val_probs, axis=1))
    print(f"Validation accuracy: {acc:.4f}.")

    # make predictions on the test set
    test_fold_probs = model.predict_proba(X_test)

    # add the probabilities to the test_probs array
    test_probs += test_fold_probs / n_splits


sub_df = pd.DataFrame(
    {"Id": test_ids, "Class_0": test_probs[:, 0], "Class_1": test_probs[:, 1]}
)
sub_df.to_csv("submission.csv", index=False)

Validation accuracy: 0.8306.
Validation accuracy: 0.8065.
Validation accuracy: 0.8780.
Validation accuracy: 0.8293.
Validation accuracy: 0.8537.
