In [1]:
# Build various ensemble models
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold

In [26]:
# Hyper-parameter search space for XGBClassifier; read as a module-level
# global by param_tuning(), which hands it to RandomizedSearchCV.
params = dict(
    n_estimators=[300, 400, 500, 600, 700],
    learning_rate=[0.01, 0.02, 0.03, 0.05, 0.07],
    gamma=[0.5, 1, 1.5, 2, 5],
    max_depth=[3, 4, 5, 6],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3, 4, 5],
)

def calc_results_simple(X, y, train_index, test_index, clf, ensemble=False):
    """Fit ``clf`` on one train split and score it on the matching test split.

    Parameters
    ----------
    X, y : objects exposing ``.to_numpy()`` (pandas DataFrame / Series in this
        notebook); ``y`` is cast to int64.
    train_index, test_index : positional row indices of the split.
    clf : estimator with ``fit`` / ``predict`` (and ``predict_proba`` when
        ``ensemble`` is false). It is fitted in place.
    ensemble : when true, the AUC column is skipped, so ``predict_proba`` is
        never called.

    Returns
    -------
    numpy array of shape (1, 5) or (1, 6) ordered as balanced accuracy,
    recall_0, precision_0, recall_1, precision_1 and, unless ``ensemble``,
    ROC AUC.
    """
    features = X.to_numpy()
    labels = y.to_numpy(dtype=np.int64)
    fit_X, held_out_X = features[train_index], features[test_index]
    fit_y, held_out_y = labels[train_index], labels[test_index]

    clf.fit(fit_X, fit_y)
    predicted = clf.predict(held_out_X)

    metrics = [
        balanced_accuracy_score(held_out_y, predicted),
        recall_score(held_out_y, predicted, pos_label=0),
        precision_score(held_out_y, predicted, pos_label=0),
        recall_score(held_out_y, predicted, pos_label=1),
        precision_score(held_out_y, predicted, pos_label=1),
    ]
    if not ensemble:
        # Second predict_proba column is used as the score for ROC AUC.
        positive_prob = clf.predict_proba(held_out_X)[:, 1]
        metrics.append(roc_auc_score(held_out_y, positive_prob))
    return np.array([metrics])

#cross_validation
def run_cross_val(X, y, params, n_folds=5, random_seed=42):
    """Cross-validate a fresh XGBClassifier built from ``params``.

    Uses a shuffled StratifiedKFold with ``n_folds`` splits seeded by
    ``random_seed``. The same classifier instance is refitted on every fold.

    Returns
    -------
    (res, clf) : ``res`` holds one row of six metrics per fold (see
        calc_results_simple); ``clf`` is left fitted on the last fold's
        training split.
    """
    clf = XGBClassifier(**params, n_jobs=8)
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    fold_rows = [
        calc_results_simple(X, y, fit_idx, eval_idx, clf)
        for fit_idx, eval_idx in splitter.split(X, y)
    ]
    # The empty (0, 6) seed keeps the result shape/dtype checks of the row-wise build.
    res = np.concatenate([np.empty(shape=[0, 6]), *fold_rows], axis=0)
    return res, clf

def run_cross_val_ens(clf, X, y, n_folds=5, random_seed=42, ens=True):
    """Cross-validate an already constructed estimator.

    ``clf`` is refitted in place on each fold of a shuffled StratifiedKFold
    (``n_folds`` splits, seeded by ``random_seed``).

    Parameters
    ----------
    ens : forwarded to calc_results_simple as ``ensemble``. When true the
        result has five metric columns (no AUC); otherwise six.

    Returns
    -------
    numpy array with one row of metrics per fold.
    """
    n_metrics = 5 if ens else 6
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    fold_rows = [
        calc_results_simple(X, y, fit_idx, eval_idx, clf, ensemble=ens)
        for fit_idx, eval_idx in splitter.split(X, y)
    ]
    return np.concatenate([np.empty(shape=[0, n_metrics]), *fold_rows], axis=0)

def print_score_comparison(raw_score, emb_score, target_feature="RFS",
                           header_1="Raw Score", header_2="Embedding Score"):
    """Print a two-column table comparing the mean metrics of two score sets.

    ``raw_score`` and ``emb_score`` are indexed by metric name and each entry
    must expose ``.mean()`` (e.g. the columns of a per-fold score DataFrame).
    Rows are printed in a fixed order and every value is shown as a percentage
    with three decimals.
    """
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")

    # (metric key, row template); tab counts differ per row to keep columns aligned.
    row_templates = (
        ("balanced_accuracy", "balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n"),
        ("precision_0", "precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n"),
        ("recall_0", "recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n"),
        ("precision_1", "precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n"),
        ("recall_1", "recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n"),
        ("auc", "auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n"),
    )
    for metric, template in row_templates:
        print(template.format(raw_score[metric].mean(), emb_score[metric].mean()))

def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since ``start_time``.

    Called without a (truthy) ``start_time`` it returns ``datetime.now()``.
    Called with a start time it prints the elapsed hours, minutes and seconds
    and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None

    return datetime.now()

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    """Randomised hyper-parameter search for an XGBClassifier.

    Samples ``param_comb`` combinations from the module-level ``params`` grid
    and scores each with shuffled stratified ``n_folds``-fold cross-validation.

    Parameters
    ----------
    X, y : training features and binary target.
    n_folds : number of CV splits.
    param_comb : number of parameter settings sampled (``n_iter``).
    scoring : scikit-learn scoring name used to rank candidates.
    jobs : number of parallel workers used by the search itself.

    Returns
    -------
    The fitted RandomizedSearchCV object. The best score and parameters are
    also printed, together with the elapsed time.
    """
    # `verbosity` / `n_jobs` replace the deprecated `silent` / `nthread` arguments;
    # each booster stays single-threaded because the search parallelises over `jobs`.
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                        verbosity=0, n_jobs=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Pass the splitter itself rather than a one-shot generator; with a fixed
    # random_state it yields the same folds every time it is asked to split.
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring,
                                     n_jobs=jobs, cv=skf, verbose=3, random_state=42)

    start_time = timer(None)  # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

In [3]:
def find_misclassified_patients(df, clf, X, y):
    """Return the ``patient_ID`` values of the rows that ``clf`` gets wrong.

    ``X`` and ``y`` are converted with ``.to_numpy()`` and ``clf.predict`` is
    run on all of ``X``. Rows are matched to ``df`` by position, so ``df`` is
    expected to be row-aligned with ``X`` and ``y``.

    Returns
    -------
    int64 numpy array of patient IDs, in row order.
    """
    actual = y.to_numpy()
    features = X.to_numpy()
    predicted = clf.predict(features)
    wrong_positions = np.where(actual != predicted)
    wrong_rows = df.iloc[wrong_positions]
    return wrong_rows["patient_ID"].to_numpy(dtype=np.int64)

def calc_overlap(a, b):
    """Return the common elements of ``a`` and ``b`` and their Jaccard overlap.

    Parameters
    ----------
    a, b : array-likes of IDs.

    Returns
    -------
    (intr, ratio) : ``intr`` is the sorted unique intersection and ``ratio`` is
        ``len(intersection) / len(union)``. When both inputs are empty the
        union is empty too; the ratio is then reported as 0.0 (nothing is
        shared) instead of raising ZeroDivisionError.
    """
    intr = np.intersect1d(a, b)
    union_size = len(np.union1d(a, b))
    if union_size == 0:
        return intr, 0.0
    return intr, (len(intr) / union_size)

def print_overlap(model1, model2, intr, perc):
    """Print how many patients both models misclassify and the overlap ratio.

    ``intr`` is the collection of shared patient IDs (only its length is
    used) and ``perc`` is a fraction, rendered as a percentage with one decimal.
    """
    template = "{0} patients misclassified by {1} and {2} - {3:.1%} overlap\n"
    shared_count = len(intr)
    print(template.format(shared_count, model1, model2, perc))

def write_misclassified(file_name, ls):
    """Write each item of ``ls`` on its own line to ``datasets/<file_name>.txt``.

    The path is relative to the current working directory and the file is
    overwritten if it already exists.
    """
    target_path = "datasets/" + file_name + ".txt"
    with open(target_path, "w") as out_file:
        out_file.writelines(str(patient) + "\n" for patient in ls)

In [4]:
# Gene-expression table and outcome table; they are joined on patient_ID below.
ge_df = pd.read_csv("datasets/merged-combat15.csv")
outcome_df = pd.read_csv("datasets/combat15outcomes.csv")

# Keep only patients with a known outcome and cast the label to int in one
# chain, so no column is assigned onto a frame derived from a slice
# (the SettingWithCopy pattern, which the global warnings filter would hide).
pos_outcome_df = (
    outcome_df[["patient_ID", "posOutcome"]]
    .dropna(axis=0, subset=["posOutcome"])
    .astype({"posOutcome": int})
)
ge_outcome_df = pd.merge(pos_outcome_df, ge_df, on="patient_ID")
ge_outcome_df.head()

Unnamed: 0,patient_ID,posOutcome,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,0,3.225199,4.367619,3.858262,3.044919,3.965333,6.604256,3.944029,4.259431,...,2.560791,7.325761,7.712692,7.505766,3.928333,4.595116,5.512795,7.291262,6.626688,8.059083
1,22450,0,3.908954,3.149993,2.204543,3.435342,3.540732,5.679883,3.436566,3.184032,...,4.008537,7.17522,6.823591,11.356938,3.950556,2.851631,4.666013,5.994875,5.619837,7.265902
2,22451,0,3.073704,3.259429,4.166477,3.493565,3.658963,5.981936,3.747488,3.070341,...,3.548279,6.017597,6.386842,8.191975,3.754025,3.212675,5.312667,6.97466,4.95344,5.237441
3,22452,0,4.005665,3.581221,3.324827,3.372396,3.126526,5.838507,4.781397,3.510481,...,4.037482,7.497329,6.848631,11.334273,3.573169,2.772652,5.661542,6.188046,6.635075,6.261023
4,22453,1,2.607424,3.83438,3.489463,3.765984,4.006046,6.410331,3.313725,3.7929,...,3.814779,7.470131,7.739834,7.775924,3.758116,2.837425,5.91806,6.168264,5.724966,7.322795


In [5]:
# Features are every column except the ID and the label. Index.difference
# returns the remaining names sorted, so X_outcome's columns are in
# alphabetical order rather than file order.
feature_columns = ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])
X_outcome = ge_outcome_df[feature_columns]
y_outcome = ge_outcome_df["posOutcome"]
X_outcome.head()

Unnamed: 0,A4GALT,AAAS,AACS,AADAC,AAK1,AAMP,AANAT,AARS,AARSD1,AASDHPPT,...,ZNHIT2,ZP2,ZPBP,ZSCAN2,ZW10,ZWINT,ZXDC,ZYX,ZZEF1,ZZZ3
0,3.394229,6.252874,12.002215,3.962642,6.752155,8.477521,3.180343,10.204728,4.996347,9.565111,...,3.928333,6.048665,3.143185,4.595116,5.528832,9.325127,5.512795,7.291262,6.626688,8.059083
1,3.577138,5.677761,6.663937,3.572545,6.524654,6.703023,2.758917,7.643839,4.750779,7.326442,...,3.950556,3.004729,3.346485,2.851631,3.23373,5.327934,4.666013,5.994875,5.619837,7.265902
2,4.198193,6.303634,6.93153,3.365196,4.758614,7.093338,3.387266,7.883357,5.036539,6.104904,...,3.754025,2.972612,3.101541,3.212675,4.713892,6.050628,5.312667,6.97466,4.95344,5.237441
3,3.087772,5.420661,6.172754,3.109747,7.184927,5.701223,5.813488,6.28704,4.266873,6.01075,...,3.573169,3.824069,2.804146,2.772652,4.261859,8.147456,5.661542,6.188046,6.635075,6.261023
4,3.649299,5.800899,6.978631,3.068513,4.538195,7.36027,2.8449,7.990621,5.58445,8.179148,...,3.758116,3.158452,2.944138,2.837425,4.406356,6.533104,5.91806,6.168264,5.724966,7.322795


In [6]:
#Load the models
def _load_xgb(path):
    """Create an XGBClassifier and load the saved model at ``path`` into it."""
    model = XGBClassifier()
    model.load_model(path)
    return model


clf_xg50 = _load_xgb("datasets/models/xgb50_raw.json")
clf_moses50 = _load_xgb("datasets/models/moses50_raw.json")
clf_raw = _load_xgb("datasets/models/raw_model.json")
clf_pam = _load_xgb("datasets/models/pam35_raw.json")
clf_gan = _load_xgb("datasets/models/infogan_model.json")


In [17]:
# ============== Voting Ensemble
# Two-member hard-voting ensemble (raw + xg50), scored with the default
# stratified 5-fold protocol of run_cross_val_ens.

from sklearn.ensemble import VotingClassifier
#Hard voting ensembles
# NOTE(review): run_cross_val_ens calls fit() on every fold, and VotingClassifier.fit
# trains clones of its estimators. The weights loaded from the JSON files are therefore
# presumably not what is evaluated here, and every member is trained on all columns of
# X_outcome rather than on its own gene subset - confirm this is intended.
# NOTE(review): with two voters a split vote is a tie; scikit-learn documents that ties
# are broken by ascending class label (i.e. class 0 here) - confirm that is acceptable.
xg50_raw_est = [('raw', clf_raw), ('xg50', clf_xg50)]
xg50_raw_ens = VotingClassifier(xg50_raw_est, voting='hard', n_jobs=4)

# One row per fold: balanced accuracy, recall_0, precision_0, recall_1, precision_1 (no AUC).
scores = run_cross_val_ens(xg50_raw_ens, X_outcome, y_outcome)

array([[0.66698125, 0.49700599, 0.6484375 , 0.83695652, 0.73333333],
       [0.77055152, 0.65662651, 0.77304965, 0.88447653, 0.81125828],
       [0.68805185, 0.54216867, 0.66176471, 0.83393502, 0.752443  ],
       [0.72210581, 0.59638554, 0.70212766, 0.84782609, 0.77740864],
       [0.73352104, 0.64457831, 0.68589744, 0.82246377, 0.79370629]])

In [19]:
# Label the per-fold metrics of the raw + xg50 ensemble and summarise them across folds.
# Relies on `scores` still holding the result of the raw + xg50 run (it is reused by later cells).
ens_xg50_raw_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_xg50_raw_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.716242,0.587353,0.694255,0.845132,0.77363
std,0.040323,0.067748,0.048722,0.023777,0.031249
min,0.666981,0.497006,0.648438,0.822464,0.733333
25%,0.688052,0.542169,0.661765,0.833935,0.752443
50%,0.722106,0.596386,0.685897,0.836957,0.777409
75%,0.733521,0.644578,0.702128,0.847826,0.793706
max,0.770552,0.656627,0.77305,0.884477,0.811258


In [20]:
# Hard-voting ensemble of raw + moses50, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
moses50_raw_est = [('raw', clf_raw), ('moses50', clf_moses50)]
moses50_raw_ens = VotingClassifier(moses50_raw_est, voting='hard', n_jobs=4)
scores = run_cross_val_ens(moses50_raw_ens, X_outcome, y_outcome)
ens_moses50_raw_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_moses50_raw_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.721134,0.602936,0.693611,0.839332,0.778886
std,0.027083,0.048571,0.038495,0.026883,0.021353
min,0.693467,0.542169,0.666667,0.811594,0.754839
25%,0.70535,0.562874,0.672956,0.815217,0.762215
50%,0.714838,0.614458,0.676692,0.844765,0.778547
75%,0.728086,0.644578,0.691176,0.847826,0.791519
max,0.763929,0.650602,0.760563,0.877256,0.807309


In [29]:
# Hard-voting ensemble of raw + pam35, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
pam_raw_est = [('raw', clf_raw), ('pam35', clf_pam)]
pam_raw_ens = VotingClassifier(pam_raw_est, voting='hard', n_jobs=6)
scores = run_cross_val_ens(pam_raw_ens, X_outcome, y_outcome)
ens_pam_raw_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_pam_raw_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.7202,0.590931,0.702273,0.849469,0.775943
std,0.035001,0.059922,0.045476,0.024904,0.027173
min,0.685638,0.526946,0.666667,0.826087,0.747604
25%,0.687386,0.53012,0.676923,0.84058,0.749196
50%,0.721495,0.60241,0.692308,0.841155,0.778523
75%,0.738345,0.644578,0.694444,0.847826,0.797203
max,0.768138,0.650602,0.781022,0.891697,0.80719


In [30]:
# Hard-voting ensemble of moses50 + pam35, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
moses50_pam_est = [('moses50', clf_moses50), ('pam35', clf_pam)]
moses50_pam_ens = VotingClassifier(moses50_pam_est, voting='hard', n_jobs=8)
scores = run_cross_val_ens(moses50_pam_ens, X_outcome, y_outcome)
ens_moses50_pam_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_moses50_pam_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.716196,0.590888,0.691681,0.841503,0.774444
std,0.031661,0.061565,0.03788,0.02574,0.025875
min,0.679005,0.506024,0.659864,0.811594,0.742138
25%,0.701589,0.562874,0.672,0.818841,0.762987
50%,0.707162,0.584337,0.675,0.851449,0.766102
75%,0.731098,0.650602,0.696296,0.851986,0.794326
max,0.762124,0.650602,0.755245,0.873646,0.806667


In [31]:
# Hard-voting ensemble of xg50 + pam35, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
xg50_pam_est = [('xgb50', clf_xg50), ('pam35', clf_pam)]
xg50_pam_ens = VotingClassifier(xg50_pam_est, voting='hard', n_jobs=8)
scores = run_cross_val_ens(xg50_pam_ens, X_outcome, y_outcome)
ens_xg50_pam_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_xg50_pam_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.714906,0.582512,0.695172,0.8473,0.772131
std,0.040751,0.07039,0.045861,0.021186,0.032077
min,0.674152,0.512048,0.651515,0.826087,0.73955
25%,0.678407,0.51497,0.664062,0.833333,0.742857
50%,0.714881,0.578313,0.690323,0.844765,0.770492
75%,0.735333,0.644578,0.70073,0.851449,0.794425
max,0.771759,0.662651,0.769231,0.880866,0.813333


In [32]:
# Hard-voting ensemble of xg50 + moses50, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
xg50_moses50_est = [('xg50', clf_xg50), ('moses50', clf_moses50)]
xg50_moses50_ens = VotingClassifier(xg50_moses50_est, voting='hard', n_jobs=8)
scores = run_cross_val_ens(xg50_moses50_ens, X_outcome, y_outcome)
ens_xg50_moses50_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
ens_xg50_moses50_df.describe()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1
count,5.0,5.0,5.0,5.0,5.0
mean,0.712963,0.587303,0.686045,0.838623,0.772239
std,0.025856,0.054685,0.024303,0.016304,0.022582
min,0.682626,0.524096,0.664122,0.818841,0.746795
25%,0.693374,0.538922,0.673469,0.826087,0.752412
50%,0.711236,0.596386,0.681818,0.841155,0.772881
75%,0.734721,0.626506,0.683544,0.847826,0.793333
max,0.742856,0.650602,0.727273,0.859206,0.795775


In [8]:
def generate_diff_df(df, models_dict, X, y):
    """Build the pairwise overlap matrix of the patients each model misclassifies.

    Parameters
    ----------
    df : frame with a ``patient_ID`` column, row-aligned with ``X`` and ``y``.
    models_dict : mapping of display name -> fitted classifier.
    X, y : features and true labels on which every model is evaluated.

    Returns
    -------
    (res_df, diff_matrix) : ``diff_matrix[i][j]`` is the overlap ratio returned
        by calc_overlap for models ``i`` and ``j`` (in ``models_dict`` order);
        ``res_df`` is the same matrix labelled with the model names.
    """
    headers = list(models_dict.keys())

    # Predict once per model instead of once per (row, column) pair: the
    # misclassified set of a model does not depend on what it is compared with.
    missed = [find_misclassified_patients(df, models_dict[name], X, y) for name in headers]

    size = len(headers)
    diff_matrix = np.empty(shape=[size, size])
    for i, row_miss in enumerate(missed):
        for j, col_miss in enumerate(missed):
            _, perc = calc_overlap(row_miss, col_miss)
            diff_matrix[i][j] = perc

    res_df = pd.DataFrame(diff_matrix, headers, headers)

    return res_df, diff_matrix



In [74]:
# Pairwise overlap (in percent) of the patients misclassified by the raw model
# and by each two-member voting ensemble, evaluated on the full X_outcome.
# NOTE(review): the *_ens objects are in whatever state their last fit inside
# run_cross_val_ens left them (the final CV fold), so most rows scored here were
# presumably part of their training data - treat these overlaps as in-sample.
models_dict_1 = {"raw": clf_raw ,"xgb50_raw": xg50_raw_ens, "xgb50_pam": xg50_pam_ens, "moses50_raw": moses50_raw_ens,
                  "moses50_pam": moses50_pam_ens, "moses50_xgb50": xg50_moses50_ens}

comp_diff, matrix = generate_diff_df(ge_outcome_df, models_dict_1, X_outcome, y_outcome)
# generate_diff_df returns fractions; scale to percentages for display.
comp_diff * 100

Unnamed: 0,raw,xgb50_raw,xgb50_pam,moses50_raw,moses50_pam,moses50_xgb50
raw,100.0,77.165354,65.925926,76.5625,67.910448,56.774194
xgb50_raw,77.165354,100.0,85.950413,81.6,76.5625,73.049645
xgb50_pam,65.925926,85.950413,100.0,75.193798,80.0,71.126761
moses50_raw,76.5625,81.6,75.193798,100.0,89.166667,73.758865
moses50_pam,67.910448,76.5625,80.0,89.166667,100.0,73.049645
moses50_xgb50,56.774194,73.049645,71.126761,73.758865,73.049645,100.0


In [24]:
def generate_scores_df(scores_dict, cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"]):
    """Collapse per-fold score frames into one table of mean metrics.

    Parameters
    ----------
    scores_dict : mapping of model name -> per-fold score frame; each frame is
        indexed by metric name and every entry must expose ``.mean()``.
    cols : metric names to include, in column order. The default list is only
        read, never modified.

    Returns
    -------
    DataFrame with one row per model (in ``scores_dict`` order) and one float
    column per metric.
    """
    row_headers = list(scores_dict.keys())
    fold_means = [
        [scores_dict[model][metric].mean() for metric in cols]
        for model in row_headers
    ]
    # Explicit reshape keeps the (n_models, n_metrics) float shape even when empty.
    matrix = np.array(fold_means, dtype=float).reshape(len(row_headers), len(cols))
    return pd.DataFrame(matrix, row_headers, cols)

# Mean cross-validated metrics (in percent) of every two-member voting ensemble.
# Requires the ens_*_df frames from the voting-ensemble cells to exist in the
# kernel; the NameError recorded below is what happens when those cells have
# not been run in the current session.
scores_dict_1 = {"xgb50_raw": ens_xg50_raw_df, "xgb50_pam": ens_xg50_pam_df, "moses50_raw": ens_moses50_raw_df,
                  "moses50_pam": ens_moses50_pam_df, "moses50_xgb50": ens_xg50_moses50_df}

scores_diff_1 = generate_scores_df(scores_dict_1)
scores_diff_1 * 100

NameError: name 'ens_xg50_raw_df' is not defined

In [None]:
## Three model ensembles
# Hard-voting ensemble of raw + moses50 + xgb50, evaluated with run_cross_val_ens
# (stratified 5-fold); describe() summarises the per-fold metrics.
moses_xgb_raw_est = [('raw', clf_raw), ('moses50', clf_moses50), ('xgb50', clf_xg50)]
moses_xgb_raw_ens = VotingClassifier(moses_xgb_raw_est, voting="hard", n_jobs=8)
scores = run_cross_val_ens(moses_xgb_raw_ens, X_outcome, y_outcome)
moses_xgb_raw_ens_df = pd.DataFrame(data=scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1"])
moses_xgb_raw_ens_df.describe()

In [9]:
# Mean of each metric across folds for the three-model ensemble. The frame is
# created by the three-model ensemble cell, which must be executed first.
moses_xgb_raw_ens_df.mean()

NameError: name 'moses_xgb_raw_ens_df' is not defined

In [7]:
def calc_score(clf, X, y):
    """Score an already fitted classifier on ``X`` / ``y`` without refitting.

    Returns
    -------
    numpy array of shape (1, 6): balanced accuracy, recall_0, precision_0,
    recall_1, precision_1 and ROC AUC (computed from the second
    ``predict_proba`` column).
    """
    predicted = clf.predict(X)
    metrics = [
        balanced_accuracy_score(y, predicted),
        recall_score(y, predicted, pos_label=0),
        precision_score(y, predicted, pos_label=0),
        recall_score(y, predicted, pos_label=1),
        precision_score(y, predicted, pos_label=1),
    ]
    positive_prob = clf.predict_proba(X)[:, 1]
    metrics.append(roc_auc_score(y, positive_prob))
    return np.array([metrics])




In [8]:
#====== Stacking Ensemble ============
# The stacking below is assembled by hand: base-model outputs become the input
# columns of a LogisticRegression meta-learner.

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
# NOTE(review): neither StackingClassifier nor `cv` is referenced in the cells
# visible below (run_cross_val_ens builds its own splitter) - possibly leftovers.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def prepare_stack_input(estimators, num_samples, y):
    """Assemble base-model outputs and the target into one meta-learner frame.

    Parameters
    ----------
    estimators : sequence of ``(name, values)`` pairs; ``values`` holds one
        entry per sample and becomes the column called ``name``.
    num_samples : number of rows of the resulting frame.
    y : target values, stored in the last column, ``posOutcome``.

    Returns
    -------
    DataFrame of floats with one column per estimator followed by
    ``posOutcome``.
    """
    names = [name for name, _ in estimators]
    stacked = np.empty(shape=[num_samples, len(names) + 1])

    for col_idx, (_, values) in enumerate(estimators):
        stacked[:, col_idx] = values
    stacked[:, -1] = y  # target always occupies the final column

    return pd.DataFrame(data=stacked, columns=names + ["posOutcome"])


In [9]:
# Gene symbols used as the feature columns of the moses50 model.
moses50_genes = ["PRND", "FRS3", "FCN3", "DSCR4", "BRCA2", "CXCL6", "LMX1B", "DLX5", "OMP", "ADH6", "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1", "FSTL4", "ASGR1", "ZNF135", "DLL3", "NPHS2", "ANGPT2", "GLP2R", "GRIA3", "HOXB8", "MSC", "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11", "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1", "CHGA", "SEZ6L", "UGT2A3", "PRDM16", "GALR2", "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3", "FHOD3", "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1"]
# Gene symbols used as the feature columns of the xg50 model.
xgb50_genes = ['CDX4','GLRA1', 'OR12D3', 'DSCR4', 'HOXB8', 'C9', 'MTNR1B', 'MOS', 'HSD17B3', 'FGF20', 'KCNH4', 'ATP4B', 'CPB2', 'CRYBB1', 'ANGPTL3', 'MYH8', 'GYS2', 'SLC25A21', 'TAS2R7', 'F11', 'GABRA6', 'MYT1L', 'DEFB126', 'RPL18', 'GABRQ', 'ZFP37', 'PIP5K1B', 'MCM5', 'PRKAA1', 'WDR76', 'CHRM4', 'RPS6KC1', 'EIF1AY', 'WNT1', 'SCN3B', 'NLGN4Y', 'MAGEB1', 'NUDC', 'HIGD1A', 'OXCT2', 'GALR2', 'EEF1B2', 'RXRG', 'CALCA', 'TEX13A', 'CST3', 'IGFBP4', 'CRYGA', 'ESR1', 'ZNF750']
# Gene symbols used as the feature columns of the pam35 model.
pam35_genes = ["BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDH3", "CENPF", "CEP55", "EGFR", "ERBB2", "ESR1", "EXO1", "FOXA1", "FOXC1",  "GRB7", "KIF2C", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MMP11", "MYBL2", "MYC", "PGR", "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C"]

# Build the meta-learner input: each loaded base model predicts a hard 0/1 label
# for every patient from its own feature subset.
y_pos_outcome = ge_outcome_df["posOutcome"]
X_moses50_outcome = ge_outcome_df[moses50_genes]
X_pam35_outcome  = ge_outcome_df[pam35_genes]
X_xgb50_outcome = ge_outcome_df[xgb50_genes]
# Index.difference returns the column names sorted, so the raw features are in alphabetical order.
X_raw_outcome = ge_outcome_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])]

# NOTE(review): these predictions cover the whole dataset. If the saved models were
# trained on (part of) the same patients, the columns below are in-sample predictions
# and the meta-learner's cross-validated scores will be optimistically biased -
# verify how the JSON models were trained, or use out-of-fold predictions instead.
moses50_pred = clf_moses50.predict(X_moses50_outcome)
pam35_pred = clf_pam.predict(X_pam35_outcome)
xg50_pred = clf_xg50.predict(X_xgb50_outcome)
raw_pred = clf_raw.predict(X_raw_outcome)

# One column per base model plus the true label in "posOutcome".
estimators = [("raw", raw_pred), ("pam35", pam35_pred), ("xg50", xg50_pred), ("moses50", moses50_pred)]
num_patients = ge_outcome_df.shape[0]
log_input_df = prepare_stack_input(estimators, num_patients, y_pos_outcome)

log_input_df.head()

Unnamed: 0,raw,pam35,xg50,moses50,posOutcome
0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,1.0,1.0,1.0,1.0,1.0


In [10]:
# Meta-learner on the hard predictions of all four base models (raw, pam35, xg50, moses50).
# ens=False makes run_cross_val_ens also compute ROC AUC, hence six metric columns.
X_log = log_input_df[log_input_df.columns.difference(["posOutcome"])]
clf_log = LogisticRegression()
log_scores = run_cross_val_ens(clf_log, X_log, y_pos_outcome, ens=False)
log_scores_df = pd.DataFrame(data=log_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_df.mean()

balanced_accuracy    0.941169
recall_0             0.912048
precision_0          0.938521
recall_1             0.970290
precision_1          0.952903
auc                  0.951648
dtype: float64

In [11]:
# Same meta-learner input as the hard-label version, but built from the second
# predict_proba column of each base model (the probability of class 1 for a 0/1 target).
# NOTE(review): like the hard labels, these are predictions on the full dataset;
# if the base models saw these patients during training the scores are in-sample.
moses50_pred_prob = clf_moses50.predict_proba(X_moses50_outcome)[:,1]
pam35_pred_prob = clf_pam.predict_proba(X_pam35_outcome)[:,1]
xg50_pred_prob = clf_xg50.predict_proba(X_xgb50_outcome)[:,1]
raw_pred_prob = clf_raw.predict_proba(X_raw_outcome)[:,1]

estimators_prob = [("raw", raw_pred_prob), ("pam35", pam35_pred_prob), ("xg50", xg50_pred_prob), ("moses50", moses50_pred_prob)]
log_input_df_prob = prepare_stack_input(estimators_prob, num_patients, y_pos_outcome)

log_input_df_prob.head()

Unnamed: 0,raw,pam35,xg50,moses50,posOutcome
0,0.041239,0.191396,0.470646,0.39491,0.0
1,0.430887,0.776116,0.465145,0.417613,0.0
2,0.045173,0.248015,0.28785,0.19552,0.0
3,0.07108,0.083141,0.222462,0.551764,0.0
4,0.967852,0.742542,0.744176,0.588487,1.0


In [12]:
# Meta-learner on the predicted probabilities of all four base models; mean of the
# six per-fold metrics (ens=False adds ROC AUC).
X_log_prob = log_input_df_prob[log_input_df_prob.columns.difference(["posOutcome"])]
clf_log_prob = LogisticRegression()
log_scores_prob = run_cross_val_ens(clf_log_prob, X_log_prob, y_pos_outcome, ens=False)
log_scores_df_prob = pd.DataFrame(data=log_scores_prob, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_df_prob.mean()

balanced_accuracy    0.944426
recall_0             0.915663
precision_0          0.944361
recall_1             0.973188
precision_1          0.954693
auc                  0.962952
dtype: float64

In [36]:
# Ablation: meta-learner without the raw model - features are the hard predictions
# of pam35, xg50 and moses50.
# NOTE(review): the recorded threshold metrics are identical to those of the
# pam35+moses50 and pam35+xg50 subsets (only AUC differs), which suggests the
# 0.5-threshold decision follows the pam35 column alone - verify.
X_log_wo_raw = log_input_df[log_input_df.columns.difference(["raw", "posOutcome"])]
clf_log_wo_raw = LogisticRegression()
log_scores_wo_raw = run_cross_val_ens(clf_log_wo_raw, X_log_wo_raw, y_pos_outcome, ens=False)
log_scores_wo_raw_df = pd.DataFrame(data=log_scores_wo_raw, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_wo_raw_df.mean()

balanced_accuracy    0.907815
recall_0             0.840993
precision_0          0.928740
recall_1             0.974638
precision_1          0.920073
auc                  0.924030
dtype: float64

In [37]:
# Ablation: meta-learner on the hard predictions of pam35 and moses50 only.
X_log_pam_moses = log_input_df[log_input_df.columns.difference(["raw", "posOutcome", "xg50"])]
clf_log_pam_moses = LogisticRegression()
log_scores_pam_moses = run_cross_val_ens(clf_log_pam_moses, X_log_pam_moses, y_pos_outcome, ens=False)
log_scores_pam_moses_df = pd.DataFrame(data=log_scores_pam_moses, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_pam_moses_df.mean()

balanced_accuracy    0.907815
recall_0             0.840993
precision_0          0.928740
recall_1             0.974638
precision_1          0.920073
auc                  0.926027
dtype: float64

In [38]:
# Ablation: meta-learner on the hard predictions of pam35 and xg50 only.
X_log_pam_xg50 = log_input_df[log_input_df.columns.difference(["raw", "posOutcome", "moses50"])]
clf_log_pam_xg50 = LogisticRegression()
log_scores_pam_xg50 = run_cross_val_ens(clf_log_pam_xg50, X_log_pam_xg50, y_pos_outcome, ens=False)
log_scores_pam_xg50_df = pd.DataFrame(data=log_scores_pam_xg50, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_pam_xg50_df.mean()


balanced_accuracy    0.907815
recall_0             0.840993
precision_0          0.928740
recall_1             0.974638
precision_1          0.920073
auc                  0.926891
dtype: float64

In [39]:
# Ablation: meta-learner on the hard predictions of raw and pam35 only.
# NOTE(review): every subset that contains `raw` records the same threshold metrics
# (only AUC differs), so the 0.5-threshold decision presumably follows the raw
# column alone - verify, and check whether raw's predictions are in-sample.
X_log_raw_pam = log_input_df[log_input_df.columns.difference(["xg50", "posOutcome", "moses50"])]
clf_log_raw_pam = LogisticRegression()
log_scores_raw_pam = run_cross_val_ens(clf_log_raw_pam, X_log_raw_pam, y_pos_outcome, ens=False)
log_scores_raw_pam_df = pd.DataFrame(data=log_scores_raw_pam, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_raw_pam_df.mean()

balanced_accuracy    0.941894
recall_0             0.912048
precision_0          0.940909
recall_1             0.971739
precision_1          0.952903
auc                  0.945102
dtype: float64

In [40]:
# Ablation: meta-learner on the hard predictions of raw and moses50 only.
X_log_raw_moses = log_input_df[log_input_df.columns.difference(["xg50", "posOutcome", "pam35"])]
clf_log_raw_moses = LogisticRegression()
log_scores_raw_moses = run_cross_val_ens(clf_log_raw_moses, X_log_raw_moses, y_pos_outcome, ens=False)
log_scores_raw_moses_df = pd.DataFrame(data=log_scores_raw_moses, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_raw_moses_df.mean()


balanced_accuracy    0.941894
recall_0             0.912048
precision_0          0.940909
recall_1             0.971739
precision_1          0.952903
auc                  0.950537
dtype: float64

In [42]:
# Ablation: meta-learner on the hard predictions of raw and xg50 only.
X_log_raw_xgb = log_input_df[log_input_df.columns.difference(["moses50", "posOutcome", "pam35"])]
clf_log_raw_xgb = LogisticRegression()
log_scores_raw_xgb = run_cross_val_ens(clf_log_raw_xgb, X_log_raw_xgb, y_pos_outcome, ens=False)
log_scores_raw_xgb_df = pd.DataFrame(data=log_scores_raw_xgb, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_raw_xgb_df.mean()

balanced_accuracy    0.941894
recall_0             0.912048
precision_0          0.940909
recall_1             0.971739
precision_1          0.952903
auc                  0.948858
dtype: float64

In [43]:
# Ablation: meta-learner on the hard predictions of moses50 and xg50 only
# (neither raw nor pam35).
X_log_moses_xgb = log_input_df[log_input_df.columns.difference(["raw", "posOutcome", "pam35"])]
clf_log_moses_xgb = LogisticRegression()
log_scores_moses_xgb = run_cross_val_ens(clf_log_moses_xgb, X_log_moses_xgb, y_pos_outcome, ens=False)
log_scores_moses_xgb_df = pd.DataFrame(data=log_scores_moses_xgb, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_moses_xgb_df.mean()

balanced_accuracy    0.774184
recall_0             0.618585
precision_0          0.843497
recall_1             0.929783
precision_1          0.802200
auc                  0.789481
dtype: float64

In [44]:
# Ablation: meta-learner without pam35 - features are the hard predictions of
# raw, xg50 and moses50.
X_log_wo_pam = log_input_df[log_input_df.columns.difference(["posOutcome", "pam35"])]
clf_log_wo_pam = LogisticRegression()
log_scores_wo_pam = run_cross_val_ens(clf_log_wo_pam, X_log_wo_pam, y_pos_outcome, ens=False)
log_scores_wo_pam_df = pd.DataFrame(data=log_scores_wo_pam, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
log_scores_wo_pam_df.mean()
#

balanced_accuracy    0.941894
recall_0             0.912048
precision_0          0.940909
recall_1             0.971739
precision_1          0.952903
auc                  0.952137
dtype: float64