In [20]:
from itertools import combinations, product
import os
import pickle

import joblib
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import KernelPCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from custom_transformers.stability_selection import StabilitySelectionTransformer
from custom_transformers.standard_true_false import standard_true_false

In [14]:
# Global configuration: a fixed seed for reproducibility and the worker count
# used by the parallel grid search.
seed = 15
np.random.seed(seed)
# Leave one core free, but never drop below one worker: on a single-core
# machine ``cpu_count() - 1`` is 0, which joblib rejects as an n_jobs value.
n_jobs = max(1, joblib.cpu_count() - 1)

In [15]:
def read_data(data_path):
    """Load the pickled dataset stored in the directory ``data_path``.

    The file ``dataset.pkl`` is expected to hold a mapping with the keys
    ``"pheno"``, ``"X_gpa"``, ``"X_snps"`` and ``"X_genexp"``.

    Parameters
    ----------
    data_path : str
        Directory containing ``dataset.pkl``.

    Returns
    -------
    tuple
        ``(X_gpa, X_snps, X_genexp, Y)`` where ``Y`` is the ``"pheno"`` table
        without its first column.
    """
    dataset_file = os.path.join(data_path, "dataset.pkl")
    # pickle can execute arbitrary code while loading: only use trusted files.
    with open(dataset_file, "rb") as handle:
        dataset = pickle.load(handle)

    # The first phenotype column is skipped; every other column is a target.
    targets = dataset["pheno"].iloc[:, 1:]
    return dataset["X_gpa"], dataset["X_snps"], dataset["X_genexp"], targets


def _build_reg_pipeline(trans, idx):
    """Build the template pipeline for one block of regressor columns.

    Parameters
    ----------
    trans : transformer
        Preprocessing step applied to the selected columns.
    idx : array-like of int
        Positions of the columns this pipeline is responsible for; every other
        column is dropped.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Three steps named ``"trans"``, ``"dim_red"`` and ``"clf"``. The last two
        are placeholders (``"passthrough"`` and a ``DummyClassifier``) meant to
        be replaced through the hyper-parameter grid.
    """
    # ColumnTransformer expects a list of (name, transformer, columns) tuples;
    # a flat three-element list is not unpacked correctly when fitting.
    trans_ind = ColumnTransformer(transformers=[("ind", trans, idx)], remainder="drop")
    return Pipeline([("trans", trans_ind), ("dim_red", "passthrough"), ("clf", DummyClassifier())])


def get_voting_clf(X_gpa, X_snps, X_genexp):
    """Create a soft-voting ensemble with one sub-pipeline per feature block.

    Column positions are derived from the widths of the three inputs, laid out
    one after another in the order gpa, snps, genexp.

    Parameters
    ----------
    X_gpa, X_snps, X_genexp : array-like with a ``shape`` attribute
        Only ``shape[1]`` of each block is read.

    Returns
    -------
    sklearn.ensemble.VotingClassifier
        Estimators named ``"gpa"``, ``"snps"`` and ``"genexp"``, ``voting="soft"``.
    """
    # NOTE(review): each range covers shape[1] - 1 columns, i.e. one fewer than
    # the block width. This is only right if the matrix passed to fit() drops
    # the last column of every block before concatenation -- TODO confirm.
    gpa_cols = np.arange(X_gpa.shape[1] - 1)
    snps_cols = np.arange(X_snps.shape[1] - 1) + gpa_cols[-1] + 1
    genexp_cols = np.arange(X_genexp.shape[1] - 1) + snps_cols[-1] + 1

    members = [
        ("gpa", _build_reg_pipeline(standard_true_false, gpa_cols)),
        ("snps", _build_reg_pipeline(standard_true_false, snps_cols)),
        ("genexp", _build_reg_pipeline(StandardScaler(), genexp_cols)),
    ]
    return VotingClassifier(members, voting="soft")

In [16]:
X_gpa, X_snps, X_genexp, Y = read_data(".")

# Work on the first antibiotic listed in the phenotype table.
antibiotics = list(Y)
antibiotic = antibiotics[0]
y = Y[antibiotic].to_numpy()

# The regressors have no missing values but the target does: keep only the
# samples whose target is finite, in all three blocks and in the target itself.
mask = np.isfinite(y)
X_gpa, X_snps, X_genexp = (block[mask] for block in (X_gpa, X_snps, X_genexp))
y = y[mask].astype(int)

In [17]:
# Assemble the soft-voting ensemble over the three feature blocks.
clf = get_voting_clf(X_gpa, X_snps, X_genexp)

In [18]:
# Show the (still unfitted) ensemble as the cell output.
clf

In [37]:
def _get_stab_sel_trans(stab_sel_path):
    stab_sel_trans = None

    if os.path.exists(stab_sel_path):
        with open(stab_sel_path, "rb") as f:
            stability_scores = pickle.load(f)
        stab_sel_trans = StabilitySelectionTransformer(stability_scores=stability_scores)

    return stab_sel_trans


def _create_grid(roots, params):
    def add_to_grid(g, r, p):
        if len(p[0]) > 0:
            r = "__".join([r, p[0]])
        g[r] = p[1]
        for c in p[2]:
            add_to_grid(g, r, c)

    grids = []
    for comb in combinations(product(roots, params), len(roots)):
        valid = True
        grid = {}
        for root, param in comb:
            if root in grid:
                valid = False
                break
            else:
                add_to_grid(grid, root, param)
        if valid:
            grids.append(grid)
    return grids


def _merge_grids(grids):
    merged_grid = grids.pop()
    for grid in grids:
        merged_grid = [{**g1, **g2} for g1, g2 in product(merged_grid, grid)]
    return merged_grid


def _build_reg_grid(prefix, seed, stab_sel_path):
    """Build the hyper-parameter grids for one sub-pipeline of the ensemble.

    Parameters are described as nested ``(suffix, values, children)`` nodes and
    flattened by ``_create_grid``.

    Parameters
    ----------
    prefix : str
        Name of the sub-pipeline inside the ensemble; keys are emitted as
        ``<prefix>__dim_red...`` and ``<prefix>__clf...``.
    seed : int
        Random state passed to every estimator that accepts one.
    stab_sel_path : str
        Path of pickled stability scores. A stability-selection option is added
        to the dimensionality-reduction choices only when the file exists.

    Returns
    -------
    list of dict
        Every pairing of one dimensionality-reduction grid with one classifier
        grid.
    """
    dim_red_grid_roots = ["{}__dim_red".format(prefix)]
    dim_red_grid_params = [("", ["passthrough", ], []),
                           ("", [KernelPCA(random_state=seed), ],
                            [("kernel", ["linear", "poly", "rbf", "sigmoid"], []),
                             ("n_components", [64, 128, 256], [])])]
    stab_sel_trans = _get_stab_sel_trans(stab_sel_path)
    if stab_sel_trans is not None:
        dim_red_grid_params.append(("", [stab_sel_trans, ], [("threshold", np.linspace(.6, .9, 4), [])]))
    dim_red_grid = _create_grid(dim_red_grid_roots, dim_red_grid_params)

    # The grid is searched on a soft-voting ensemble, which averages the
    # predict_proba output of every member. Each candidate classifier therefore
    # has to expose predict_proba:
    #   * SGDClassifier only does so for the "log_loss" and "modified_huber"
    #     losses, so "hinge" is replaced by "modified_huber";
    #   * SVC only does so when built with probability=True.
    clf_grid_roots = ["{}__clf".format(prefix)]
    clf_grid_params = [("", [AdaBoostClassifier(random_state=seed), GradientBoostingClassifier(random_state=seed)],
                        [("learning_rate", np.logspace(-2, 0, 3), [])]),
                       ("", [RandomForestClassifier(class_weight="balanced", random_state=seed)],
                        [("n_estimators", [100, 300, 500], []), ("max_depth", [None, 10, 100], []),
                         ("max_features", ["sqrt", "log2"], [])]),
                       ("", [LogisticRegression(penalty="l1", solver="liblinear", class_weight="balanced",
                                                max_iter=1000, random_state=seed)],
                        [("C", np.logspace(-1, 1, 3), [])]),
                       ("", [SGDClassifier(penalty="l1", class_weight="balanced", random_state=seed)],
                        [("loss", ["modified_huber", "log_loss"], []), ("alpha", np.logspace(-5, -3, 3), [])]),
                       ("", [SVC(class_weight="balanced", max_iter=10000, probability=True, random_state=seed)],
                        [("C", np.logspace(-1, 1, 3), []), ("kernel", ["linear", "poly", "rbf", "sigmoid"], [])])]
    clf_grid = _create_grid(clf_grid_roots, clf_grid_params)

    return _merge_grids([dim_red_grid, clf_grid])


def build_hp_grid(clf, seed, n_jobs, stab_sel_path):
    """Create the grid search over all three sub-pipelines of the ensemble.

    Parameters
    ----------
    clf : estimator
        Ensemble whose members are named ``"gpa"``, ``"snps"`` and ``"genexp"``.
    seed : int
        Random state forwarded to the per-pipeline grids.
    n_jobs : int
        Number of parallel workers used by the grid search.
    stab_sel_path : str
        Path of pickled stability scores, forwarded to the per-pipeline grids.

    Returns
    -------
    tuple
        ``(cv_grid, final_grid)``: the unfitted ``GridSearchCV`` object and the
        list of parameter grids it searches.
    """
    # The prefixes must match the estimator names of the voting classifier
    # exactly, otherwise the parameter keys cannot be resolved on ``clf``.
    gpa_grid = _build_reg_grid("gpa", seed, stab_sel_path)
    snps_grid = _build_reg_grid("snps", seed, stab_sel_path)
    genexp_grid = _build_reg_grid("genexp", seed, stab_sel_path)

    # Cross product of the three per-pipeline grids.
    final_grid = _merge_grids([gpa_grid, snps_grid, genexp_grid])
    cv_grid = GridSearchCV(clf, final_grid, scoring="balanced_accuracy", n_jobs=n_jobs, verbose=2)

    return cv_grid, final_grid

In [41]:
# Count the hyper-parameter combinations in the full grid. The empty path means
# no stability-selection option is added.
len(ParameterGrid(build_hp_grid(clf, seed, n_jobs, "")[1]))

200201625