In [60]:
import pandas as pd
import numpy as np

import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import scipy
import optuna

import os
import gc

In [117]:
def bulk_read(paths):
    """Read per-model prediction CSVs and join them column-wise on ``id``.

    Each file is read with its ``id`` column as the index. The ``class``
    column is turned into a categorical whose ``"e"``/``"p"`` categories are
    recoded to ``0``/``1``, and the three expected columns are prefixed with
    the file path so that several files can sit side by side:

    * ``class``        -> ``{path}_pred``
    * ``pred_proba_0`` -> ``{path}_p0``
    * ``pred_proba_1`` -> ``{path}_p1``

    Parameters
    ----------
    paths : iterable of str
        CSV files to load. Each must contain ``id`` and ``class`` columns.

    Returns
    -------
    pandas.DataFrame
        The frames of all files concatenated along the column axis, in the
        order the paths were given. An empty frame when ``paths`` is empty.
    """
    label_codes = {"e": 0, "p": 1}
    frames = []

    for p in paths:
        df = pd.read_csv(p, index_col="id")
        # Series.rename() relabels the *index*, not the values, so it cannot
        # be used to encode the labels. Recode the categories instead; any
        # category missing from the mapping is passed through unchanged.
        df["class"] = df["class"].astype("category").cat.rename_categories(label_codes)
        df = df.rename(columns={
            "class":f"{p}_pred",
            "pred_proba_0":f"{p}_p0",
            "pred_proba_1":f"{p}_p1"
        })
        frames.append(df)

    # Nothing to join: pd.concat would raise on an empty list.
    if not frames:
        return pd.DataFrame()

    # A single concat instead of re-copying the growing frame once per file.
    return pd.concat(frames, axis=1)

# os.listdir() returns entries in arbitrary order. The final cell relabels the
# train and test feature columns purely by position, so both listings are
# sorted to make the column order deterministic.
# NOTE(review): this only lines the two sets up if each model's file sorts to
# the same position in both directories (e.g. identical file names) — confirm.
train_paths = sorted(
    os.path.join("ensemble_data", name)
    for name in os.listdir("ensemble_data")
    if name.endswith(".csv")
)
test_paths = sorted(
    os.path.join("test_preds", name)
    for name in os.listdir("test_preds")
    if name.endswith(".csv")
)

# Per-model predictions on the training rows, indexed by id.
pred_df = bulk_read(train_paths)
# Ground-truth labels, read with the default positional index.
# NOTE(review): rows are presumably matched to pred_df by position, not by
# id — verify that data/train.csv is in the same row order as the prediction
# files.
true_df = pd.read_csv("data/train.csv")["class"]
# Per-model predictions on the test rows, indexed by id.
test_df = bulk_read(test_paths)
gc.collect()

48

In [50]:
# Encode the ground-truth labels as a categorical with "e" -> 0 and "p" -> 1.
true_df = true_df.astype("category").cat.rename_categories({"e":0, "p":1})

# Apply the same encoding to every hard-prediction column of the training
# predictions (names ending in "pred"); probability columns are skipped.
for c in pred_df.columns:
    if not c.endswith("pred"):
        continue
    encoded = pred_df[c].astype("category")
    pred_df[c] = encoded.cat.rename_categories({"e":0, "p":1})

In [111]:
def objective(trial):
    """Optuna objective: mean cross-validated MCC of the stacking model.

    Samples ``C`` from [1, 30] and ``l1_ratio`` from [0, 1], builds a pipeline
    that applies ``scipy.special.logit`` to the inputs and then fits an
    elastic-net logistic regression (``saga`` solver, 500 iterations max), and
    scores it with 10-fold stratified, shuffled cross-validation using the
    ``"matthews_corrcoef"`` scorer.

    The features are all columns of the module-level ``pred_df`` whose name
    does not end in ``"pred"``, clipped to [1e-15, 1 - 1e-15] so that the
    logit stays finite. The target is the module-level ``true_df``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # Hyper-parameters are sampled in a fixed order: C first, then l1_ratio.
    inverse_reg = trial.suggest_float("C", 1, 30)
    l1_share = trial.suggest_float("l1_ratio", 0, 1)

    stacker = skpl.make_pipeline(
        skpp.FunctionTransformer(scipy.special.logit),
        sklm.LogisticRegression(
            penalty="elasticnet",
            solver="saga",
            max_iter=500,
            C=inverse_reg,
            l1_ratio=l1_share,
            random_state=0,
            n_jobs=-1,
        ),
    )

    # Keep only the probability columns and move them off exact 0 and 1.
    proba_cols = [name for name in pred_df.columns if not name.endswith("pred")]
    features = pred_df[proba_cols].clip(1e-15, 1-1e-15)

    folds = skms.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    fold_scores = skms.cross_val_score(
        stacker,
        features,
        true_df,
        scoring="matthews_corrcoef",
        cv=folds,
        n_jobs=-1,
    )

    return np.mean(fold_scores)

In [None]:
# To discard earlier trials and start over, drop the stored study first:
# optuna.delete_study(study_name="logreg_ensemble", storage="sqlite:///optuna.sqlite3")

# Open the study persisted in the local SQLite file (it is created if it does
# not exist yet) and run 100 trials maximising the value of `objective`.
# Because the study is reloaded when present, trials accumulate across runs.
study = optuna.create_study(
    study_name="logreg_ensemble",
    storage="sqlite:///optuna.sqlite3",
    load_if_exists=True,
    direction="maximize",
)
study.optimize(objective, n_trials=100)
gc.collect()

In [119]:
# Encode the hard-prediction columns of the test predictions (names ending in
# "pred") as categoricals with "e" -> 0 and "p" -> 1; probability columns are
# skipped.
for c in test_df.columns:
    if not c.endswith("pred"):
        continue
    encoded = test_df[c].astype("category")
    test_df[c] = encoded.cat.rename_categories({"e":0, "p":1})

In [131]:
# Final stacker: logit transform followed by an elastic-net logistic
# regression, configured with the best hyper-parameters stored in the study.
best_model = skpl.make_pipeline(
    skpp.FunctionTransformer(scipy.special.logit),
    sklm.LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        max_iter=500,
        random_state=0,
        n_jobs=-1,
        **study.best_params
    ),
)

# Probability columns only (everything not ending in "pred"), clipped away
# from exact 0 and 1 so that the logit stays finite.
tmp1 = (
    pred_df[[name for name in pred_df.columns if not name.endswith("pred")]]
    .clip(1e-15, 1-1e-15)
)
tmp2 = (
    test_df[[name for name in test_df.columns if not name.endswith("pred")]]
    .clip(1e-15, 1-1e-15)
)

# Column names embed the source file path, which differs between the train
# and test directories, so both frames are relabelled 0..n-1 by position.
# NOTE(review): this assumes the i-th column of both frames comes from the
# same base model — confirm the two file listings are in the same order.
tmp1 = tmp1.rename(columns=dict(zip(tmp1.columns, np.arange(len(tmp1.columns)))))
tmp2 = tmp2.rename(columns=dict(zip(tmp2.columns, np.arange(len(tmp2.columns)))))

best_model.fit(tmp1, true_df)
test_preds = best_model.predict(tmp2)

# One "class" column, indexed like the test predictions.
# NOTE(review): the model is fitted on whatever labels true_df holds at this
# point; if those are the 0/1 codes, the file contains 0/1 rather than
# "e"/"p" — confirm this is the format the consumer of ensemble.csv expects.
out_df = pd.Series(test_preds, index=tmp2.index, name="class").to_frame()
out_df.to_csv("ensemble.csv")