In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before running each cell (presumably so edits to the local
# `imbalance` package are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import numpy as np
from imbalance.pipeline import Pipeline
from imbalance.data import gaussian_binary
from imbalance.viz import metric_balance
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
from matplotlib import pyplot as plt

In [3]:
# Build the dataset used by every pipeline below. `gaussian_binary` returns
# three values, unpacked here as x, y and groups (presumably features, binary
# labels and group ids -- confirm against imbalance.data.gaussian_binary).
# NOTE(review): no random seed is set in this notebook; if gaussian_binary
# samples randomly, the data will differ between runs -- confirm.
x, y, groups = gaussian_binary(mean_distance=1)

In [4]:
# Seed handed to every estimator so that repeated runs of the notebook fit the
# same models. Without it the random forests (bootstrap / feature sampling) and
# the SVCs (internal shuffling used for probability estimates) change from run
# to run, which makes the configurations hard to compare.
RANDOM_STATE = 0

# Number of trees used by every random-forest configuration.
N_ESTIMATORS = 25

# Classifier families to compare. Each family maps a configuration label to an
# estimator instance; the labels are reused later as legend entries.
classifiers = {
    "lr": {
        # random_state only matters for solvers that shuffle the data
        # (sag, saga, liblinear); it is passed for consistency.
        "default": LogisticRegression(class_weight=None, random_state=RANDOM_STATE),
        "balanced": LogisticRegression(class_weight="balanced", random_state=RANDOM_STATE),
    },
    "svm": {
        "default": SVC(probability=True, class_weight=None, random_state=RANDOM_STATE),
        "balanced": SVC(probability=True, class_weight="balanced", random_state=RANDOM_STATE),
    },
    "rf": {
        "default": RandomForestClassifier(
            n_estimators=N_ESTIMATORS, class_weight=None, random_state=RANDOM_STATE
        ),
        "balanced": RandomForestClassifier(
            n_estimators=N_ESTIMATORS, class_weight="balanced", random_state=RANDOM_STATE
        ),
        "balanced_subsample": RandomForestClassifier(
            n_estimators=N_ESTIMATORS, class_weight="balanced_subsample", random_state=RANDOM_STATE
        ),
        "min_weight_fraction_leaf=0.5": RandomForestClassifier(
            n_estimators=N_ESTIMATORS, min_weight_fraction_leaf=0.5, random_state=RANDOM_STATE
        ),
    },
}

In [None]:
def run_pipeline(name, x, y, groups, clf, n_permutations=100):
    """Build and evaluate one ``Pipeline`` and return it tagged with ``name``.

    Defined at module level so it can be dispatched through ``joblib.delayed``.

    Parameters
    ----------
    name
        Opaque label for this run. It is not inspected; it is returned
        unchanged so the caller can match results to configurations.
    x, y, groups
        Data forwarded positionally to ``Pipeline`` (presumably features,
        labels and group ids -- confirm against ``imbalance.pipeline``).
    clf
        Classifier handed to ``Pipeline`` through its ``classifiers`` argument.
    n_permutations : int, default 100
        Forwarded to ``Pipeline``. The default keeps the value that used to be
        hard-coded here, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(name, pipeline)`` where ``pipeline.evaluate()`` has been called.
    """
    pl = Pipeline(x, y, groups, classifiers=clf, n_permutations=n_permutations)
    pl.evaluate()
    return name, pl

# Flatten every (family, configuration) pair into one task list so that all
# pipelines are dispatched in a single parallel batch. Dispatching one batch
# per family would cap concurrency at that family's number of configurations
# and make the families wait on each other.
tasks = [
    (clf_name, name, clf)
    for clf_name, clf_list in classifiers.items()
    for name, clf in clf_list.items()
]
results = Parallel(n_jobs=-1)(
    delayed(run_pipeline)((clf_name, name), x, y, groups, clf)
    for clf_name, name, clf in tasks
)

# run_pipeline returns the key it was given unchanged, so the nested
# {family: {configuration: pipeline}} mapping can be rebuilt from the
# (family, configuration) keys. Families are pre-seeded so the key order (and
# any empty family) matches the `classifiers` dict.
pls = {clf_name: {} for clf_name in classifiers}
for (clf_name, name), pl in results:
    pls[clf_name][name] = pl

# Persist the evaluated pipelines so they can be reloaded without recomputing.
with open("hparams_pipelines.pkl", "wb") as f:
    pickle.dump(pls, f)

fitting classifiers:   6%|▌         | 14/250 [04:46<2:01:04, 30.78s/it, size=1, balance=0.567, classifier=SVC]

fitting classifiers:   6%|▌         | 15/250 [05:20<2:03:33, 31.55s/it, size=1, balance=0.6, classifier=SVC]  

In [None]:
# Reload the pipelines from "hparams_pipelines.pkl", the same file the compute
# cell writes (presumably so plotting can be redone without re-running it).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a pickle that this notebook itself produced.
with open("hparams_pipelines.pkl", "rb") as f:
    pls = pickle.load(f)

In [None]:
# Draw one figure per classifier family, overlaying all of its configurations
# on a shared set of axes.
for clf_name, family in pls.items():
    fig, ax = plt.subplots()
    lines = []
    for offset, pipeline in enumerate(family.values()):
        # The configuration's position in the family is passed as color_offset.
        curves = metric_balance(
            pipeline,
            classifier=clf_name,
            color_offset=offset,
            ax=ax,
            ignore_metrics=["roc_auc", "balanced_accuracy", "f1"],
            show=False,
            show_leg=False,
            enforce_ylim=False,
        )
        lines += curves
    # Label the collected handles with the configuration names of this family.
    ax.legend(lines, family.keys())