In [1]:
import pandas as pd
import numpy as np

import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import scipy
import optuna

import os
import gc

In [2]:
def bulk_read(paths):
    """Read several prediction CSVs and join them side by side.

    Every file is loaded with its ``id`` column as the index. The ``class``
    column has the labels ``"e"``/``"p"`` mapped to ``0``/``1`` (values that
    are already numeric are left untouched) and is stored as a categorical.
    The ``class``, ``pp_0`` and ``pp_1`` columns are then renamed to
    ``<path>_class``, ``<path>_p0`` and ``<path>_p1`` so that columns coming
    from different files do not collide.

    Parameters
    ----------
    paths : iterable of str
        Paths of the CSV files to read. Each file must contain the columns
        ``id`` and ``class``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated along the column axis (aligned on the index),
        or an empty DataFrame when ``paths`` yields nothing.
    """
    frames = []

    for p in paths:
        df = pd.read_csv(p, index_col="id")
        # Series.rename() with a dict relabels the *index*, so it never touched
        # the label values; replace() is what maps "e"/"p" to 0/1.
        df["class"] = df["class"].replace({"e":0, "p":1}).astype("category")
        frames.append(df.rename(columns={
            "class":f"{p}_class",
            "pp_0":f"{p}_p0",
            "pp_1":f"{p}_p1"
        }))

    # pd.concat() raises on an empty list; keep returning an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the growing frame on every file.
    return pd.concat(frames, axis=1)

# os.listdir() returns entries in arbitrary order. A later cell matches the
# train and test feature columns purely by position, so the listing is sorted
# once to make the column order deterministic and identical across runs.
# NOTE(review): positions only line up per model if the train/test files of a
# model share the same name apart from "train"/"test" — confirm naming scheme.
pred_files = sorted(f for f in os.listdir("predictions/v2") if f.endswith(".csv"))
train_paths = [os.path.join("predictions/v2", f) for f in pred_files if "train" in f]
test_paths = [os.path.join("predictions/v2", f) for f in pred_files if "test" in f]

pred_df = bulk_read(train_paths)
# Only the target column is kept, so true_df is a Series with a default index.
# NOTE(review): it is later paired with pred_df row by row — presumably
# data/train.csv is in the same row order as the prediction files; verify.
true_df = pd.read_csv("data/train.csv")["class"]
test_df = bulk_read(test_paths)
gc.collect()

0

In [12]:
# Encode the target labels ("e" -> 0, "p" -> 1) and store them as uint8.
# replace() leaves already-encoded values alone, so this cell can be re-run.
label_codes = {"e":0, "p":1}
true_df = true_df.replace(label_codes).astype("uint8")

# Shrink both prediction frames in place: every column whose name contains
# "class" becomes uint8, every other column becomes float16.
for frame in (pred_df, test_df):
    for col in frame.columns:
        target_dtype = "uint8" if "class" in col else "float16"
        frame[col] = frame[col].astype(target_dtype)

In [5]:
def objective(trial):
    """Optuna objective for the elastic-net logistic-regression ensemble.

    Samples ``C`` from [1, 30] and ``l1_ratio`` from [0, 1], then scores the
    resulting model with shuffled 10-fold stratified cross-validation on the
    non-``class`` columns of the global ``pred_df`` against the global
    ``true_df``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Mean Matthews correlation coefficient over the folds.
    """
    folds = skms.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # Drawn in the same order as the search space is declared: C, then l1_ratio.
    inv_reg_strength = trial.suggest_float("C", 1, 30)
    l1_share = trial.suggest_float("l1_ratio", 0, 1)

    classifier = sklm.LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        max_iter=500,
        C=inv_reg_strength,
        l1_ratio=l1_share,
        random_state=0,
        n_jobs=-1,
    )
    estimator = skpl.make_pipeline(classifier)

    # Features are every column that is not a hard "class" prediction, clipped
    # away from exactly 0 and 1.
    proba_columns = [name for name in pred_df.columns if not name.endswith("class")]
    features = pred_df[proba_columns].clip(1e-15, 1-1e-15)

    fold_scores = skms.cross_val_score(
        estimator,
        features,
        true_df,
        cv=folds,
        scoring="matthews_corrcoef",
        n_jobs=-1,
        error_score="raise",  # surface fit/score failures instead of scoring NaN
    )

    return np.mean(fold_scores)

In [6]:
# Leftover reset helper; note that it names a different study and storage file
# than the ones used below.
# optuna.delete_study(study_name="logreg_ensemble", storage="sqlite:///optuna.sqlite3")

# SQLite creates the database file on demand but not its parent directory, so
# make sure "optuna/" exists before the storage is opened on a fresh checkout.
os.makedirs("optuna", exist_ok=True)

# load_if_exists=True resumes the stored study, so each run of this cell adds
# up to 30 more trials to it instead of starting over.
study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///optuna/optuna.sqlite3",
    study_name="logreg_ensemble_v2",
    load_if_exists=True
)
study.optimize(objective, n_trials=30)
gc.collect()

[I 2024-08-20 14:49:22,336] Using an existing study with name 'logreg_ensemble_v2' instead of creating a new one.
[I 2024-08-20 14:53:58,176] Trial 7 finished with value: 0.9860574430567277 and parameters: {'C': 10.265391496082636, 'l1_ratio': 0.622418027978618}. Best is trial 7 with value: 0.9860574430567277.
[I 2024-08-20 15:10:35,033] Trial 8 finished with value: 0.9860574443060977 and parameters: {'C': 27.938093096850427, 'l1_ratio': 0.9344840454640506}. Best is trial 8 with value: 0.9860574443060977.
[I 2024-08-20 15:11:36,110] Trial 9 finished with value: 0.9860580957830323 and parameters: {'C': 2.1321050667477075, 'l1_ratio': 0.534577852654605}. Best is trial 9 with value: 0.9860580957830323.
[I 2024-08-20 15:16:20,561] Trial 10 finished with value: 0.9860574430567277 and parameters: {'C': 24.93669055034484, 'l1_ratio': 0.0653780412178222}. Best is trial 9 with value: 0.9860580957830323.
[I 2024-08-20 15:16:57,623] Trial 11 finished with value: 0.9860593873056134 and parameters:

116

In [14]:
# Final estimator: the fixed settings used during the search plus the
# hyperparameters of the study's best trial, wrapped in a one-step pipeline.
best_model = skpl.make_pipeline(
    sklm.LogisticRegression(
        **study.best_params,
        penalty="elasticnet",
        solver="saga",
        max_iter=500,
        random_state=0,
        n_jobs=-1,
    )
)

# Feature matrices: every column that is not a hard "class" prediction,
# clipped away from exactly 0 and 1.
train_feature_cols = [name for name in pred_df.columns if not name.endswith("class")]
test_feature_cols = [name for name in test_df.columns if not name.endswith("class")]
tmp1 = pred_df[train_feature_cols].clip(1e-15, 1-1e-15)
tmp2 = test_df[test_feature_cols].clip(1e-15, 1-1e-15)

# Train and test columns carry different names (they embed the file path), so
# both are relabelled 0..n-1; features are therefore matched by position.
tmp1 = tmp1.rename(columns=dict(zip(tmp1.columns, np.arange(len(tmp1.columns)))))
tmp2 = tmp2.rename(columns=dict(zip(tmp2.columns, np.arange(len(tmp2.columns)))))

best_model.fit(tmp1, true_df)
test_preds = best_model.predict(tmp2)

# Decode 0/1 back to the original "e"/"p" labels and write the result,
# indexed like the test predictions.
code_to_label = {0:"e", 1:"p"}
out_df = pd.DataFrame({"class":test_preds}, index=tmp2.index)
out_df["class"] = out_df["class"].replace(code_to_label)
out_df.to_csv("ensemble_v2.csv")