In [24]:
import pickle

import joblib
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from custom_transformers.stability_selection import StabilitySelection
from custom_transformers.standard_true_false import standard_true_false

In [2]:
seed = 15
np.random.seed(seed)
n_jobs = joblib.cpu_count() - 1

In [3]:
with open("dataset.pkl", "rb") as f:
    DATA = pickle.load(f)
    pheno = DATA["pheno"]
    X_gpa = DATA["X_gpa"]
    X_snps = DATA["X_snps"]
    X_genexp = DATA["X_genexp"]

In [4]:
antibiotic = "Tobramycin"

y = pheno[antibiotic].to_numpy()
cv_results = pd.read_csv("results/grid_search/cv_results__{}.csv".format(antibiotic))

In [5]:
mask = np.isfinite(y)
X_gpa = X_gpa[mask]
X_snps = X_snps[mask]
X_genexp = X_genexp[mask]
y = y[mask].astype(int)

In [6]:
gpa_idx = np.arange(0, X_gpa.shape[1] - 1)
snps_idx = np.arange(0, X_snps.shape[1] - 1) + gpa_idx[-1] + 1
genexp_idx = np.arange(0, X_genexp.shape[1] - 1) + snps_idx[-1] + 1

# Par fold

In [22]:
cv_results

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__learning_rate,param_dim_red,param_dim_red_ind__genexp,param_dim_red_ind__gpa,...,param_clf__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,11.586318,0.537423,0.000000,0.000000,AdaBoostClassifier(random_state=15),0.01,passthrough,drop,drop,...,,"{'clf': AdaBoostClassifier(random_state=15), '...",,,,,,,,5920
1,1,27.479746,1.378427,0.983673,0.026305,AdaBoostClassifier(random_state=15),0.01,passthrough,drop,drop,...,,"{'clf': AdaBoostClassifier(random_state=15), '...",0.807692,0.837063,0.808741,0.913986,0.625524,0.798601,0.094799,3894
2,2,23.276912,0.267877,0.490102,0.024449,AdaBoostClassifier(random_state=15),0.01,passthrough,drop,passthrough,...,,"{'clf': AdaBoostClassifier(random_state=15), '...",0.857830,0.817832,0.827972,0.962587,0.798601,0.852965,0.058060,2619
3,3,40.680767,0.219802,1.155546,0.028942,AdaBoostClassifier(random_state=15),0.01,passthrough,drop,passthrough,...,,"{'clf': AdaBoostClassifier(random_state=15), '...",0.857830,0.817832,0.827972,0.962587,0.798601,0.852965,0.058060,2619
4,4,13.031603,0.293586,0.374530,0.013007,AdaBoostClassifier(random_state=15),0.01,passthrough,passthrough,drop,...,,"{'clf': AdaBoostClassifier(random_state=15), '...",0.742445,0.721678,0.761189,0.885664,0.702448,0.762685,0.064569,4227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5915,5915,10.883300,0.199060,0.367277,0.015420,"SVC(class_weight='balanced', max_iter=10000, r...",,StabilitySelection(threshold=0.8),drop,passthrough,...,sigmoid,"{'clf': SVC(class_weight='balanced', max_iter=...",0.860577,0.819930,0.850350,0.898951,0.827972,0.851556,0.027877,2676
5916,5916,7.150990,0.347359,0.314693,0.013221,"SVC(class_weight='balanced', max_iter=10000, r...",,StabilitySelection(threshold=0.8),passthrough,drop,...,sigmoid,"{'clf': SVC(class_weight='balanced', max_iter=...",0.801511,0.790559,0.906993,0.917133,0.632867,0.809813,0.102675,3744
5917,5917,9.674779,0.206152,0.337452,0.013081,"SVC(class_weight='balanced', max_iter=10000, r...",,StabilitySelection(threshold=0.8),passthrough,drop,...,sigmoid,"{'clf': SVC(class_weight='balanced', max_iter=...",0.792582,0.782517,0.857343,0.898951,0.765385,0.819356,0.050528,3568
5918,5918,7.022861,0.905894,0.303505,0.025415,"SVC(class_weight='balanced', max_iter=10000, r...",,StabilitySelection(threshold=0.8),passthrough,passthrough,...,sigmoid,"{'clf': SVC(class_weight='balanced', max_iter=...",0.879808,0.790559,0.896853,0.886713,0.857343,0.862255,0.038125,2240


In [25]:
f4_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("snps", standard_true_false, snps_idx)],
                                                      remainder="drop")),
                         ("dim_red", KernelPCA(random_state=15, kernel="sigmoid", n_components=256)),
                         ("clf", SVC(class_weight="balanced", max_iter=10000, random_state=seed,
                                     kernel="sigmoid", C=10))])

In [26]:
f3_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("gpa", standard_true_false, gpa_idx),
                                                                   ("snps", standard_true_false, snps_idx),
                                                                   ("genexp", StandardScaler(), genexp_idx)],
                                                      remainder="drop")),
                         ("dim_red", StabilitySelection(random_state=15, threshold=.7)),
                         ("clf", RandomForestClassifier(class_weight="balanced", random_state=seed,
                                                        criterion="gini"))])

In [27]:
f2_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("snps", standard_true_false, snps_idx)],
                                                      remainder="drop")),
                         ("dim_red", StabilitySelection(random_state=15, threshold=.9)),
                         ("clf", LogisticRegression(class_weight="balanced", max_iter=1000, random_state=seed,
                                                    C=1))])

In [28]:
f1_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("gpa", standard_true_false, gpa_idx)],
                                                      remainder="drop")),
                         ("dim_red", KernelPCA(random_state=15, kernel="poly", n_components=128)),
                         ("clf", RandomForestClassifier(class_weight="balanced", random_state=seed,
                                                        criterion="gini"))])

In [29]:
f0_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("gpa", standard_true_false, gpa_idx),
                                                                   ("snps", standard_true_false, snps_idx)],
                                                      remainder="drop")),
                         ("dim_red", StabilitySelection(random_state=15, threshold=.8)),
                         ("clf", RandomForestClassifier(class_weight="balanced", random_state=seed,
                                                        criterion="gini"))])

In [30]:
clf = VotingClassifier([("f0", f0_pipeline), ("f1", f1_pipeline), ("f2", f2_pipeline),
                        ("f3", f3_pipeline), ("f4", f4_pipeline)],
                       voting="soft")

In [31]:
clf

In [32]:
res = cross_val_score(clf, np.concatenate([X_gpa, X_snps, X_genexp], axis=1), y, scoring="balanced_accuracy",
                      n_jobs=n_jobs)


KeyboardInterrupt



In [None]:
res.mean()

# Par régresseur

In [7]:
gpa_results = cv_results.loc[(cv_results["param_dim_red_ind__gpa"] == "passthrough") &
                             (cv_results["param_dim_red_ind__snps"] == "drop") &
                             (cv_results["param_dim_red_ind__genexp"] == "drop")]
best_gpa = gpa_results.sort_values("mean_test_score", ascending=False).iloc[0]

In [8]:
snps_results = cv_results.loc[(cv_results["param_dim_red_ind__gpa"] == "drop") &
                              (cv_results["param_dim_red_ind__snps"] == "passthrough") &
                              (cv_results["param_dim_red_ind__genexp"] == "drop")]
best_snps = snps_results.sort_values("mean_test_score", ascending=False).iloc[0]

In [9]:
genexp_results = cv_results.loc[(cv_results["param_dim_red_ind__gpa"] == "drop") &
                                (cv_results["param_dim_red_ind__snps"] == "drop") &
                                (cv_results["param_dim_red_ind__genexp"] == "passthrough")]
best_genexp = genexp_results.sort_values("mean_test_score", ascending=False).iloc[0]

In [10]:
gpa_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("gpa", standard_true_false, gpa_idx), ],
                                                      remainder="drop")),
                         ("dim_red", StabilitySelection(random_state=15, threshold=.9)),
                         ("clf", RandomForestClassifier(class_weight="balanced", random_state=seed,
                                                        criterion="gini"))])

In [11]:
snps_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("snps", standard_true_false, snps_idx), ],
                                                      remainder="drop")),
                          ("dim_red", KernelPCA(random_state=15, kernel="rbf", n_components=128)),
                          ("clf", LogisticRegression(class_weight="balanced", max_iter=1000, random_state=seed,
                                                     C=10))])

In [12]:
genexp_pipeline = Pipeline([("filter", ColumnTransformer(transformers=[("genexp", StandardScaler(), genexp_idx), ],
                                                         remainder="drop")),
                            ("dim_red", KernelPCA(random_state=15, kernel="rbf", n_components=256)),
                            ("clf", LogisticRegression(class_weight="balanced", max_iter=1000, random_state=seed,
                                                       C=10))])

In [13]:
clf = VotingClassifier([("gpa", gpa_pipeline), ("snps", snps_pipeline), ("genexp", genexp_pipeline)],
                       voting="soft", weights=[best_gpa["mean_test_score"],
                                               best_snps["mean_test_score"],
                                               best_genexp["mean_test_score"]])

In [14]:
clf

In [15]:
res = cross_val_score(clf, np.concatenate([X_gpa, X_snps, X_genexp], axis=1), y, scoring="balanced_accuracy",
                      n_jobs=n_jobs)

In [16]:
res.mean()

0.8913936063936063