In [3]:
### Experiment for comparing pam50 features to moses and xgboost selected features

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [4]:
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
def calc_results_simple(X, y, train_index, test_index, clf):
    X, y = X.to_numpy(), y.to_numpy(dtype=np.int64)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred  = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:,1]
    acc = balanced_accuracy_score(y_test, y_pred)

    recall_0 =  recall_score(y_test, y_pred, pos_label=0)
    recall_1 =  recall_score(y_test, y_pred, pos_label=1)
    prec_0 = precision_score(y_test, y_pred, pos_label=0)
    prec_1 = precision_score(y_test, y_pred, pos_label=1)
    auc = roc_auc_score(y_test, y_pred_prob)

    return np.array([[acc, recall_0, prec_0, recall_1, prec_1 ,auc]])

#cross_validation
def run_cross_val(X, y, params, n_folds=5, random_seed=42):
    res = np.empty(shape=[0, 6])
    clf = XGBClassifier(**params, n_jobs=8)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        res = np.append(res, calc_results_simple(X, y, train_index, test_index, clf), axis=0)
    return res, clf

def print_score_comparison(raw_score, emb_score, target_feature="RFS",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))


In [5]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

In [6]:
ge_df = pd.read_csv("datasets/merged-combat15.csv")
outcome_df = pd.read_csv("datasets/combat15outcomes.csv")
pos_outcome_df = outcome_df[["patient_ID", "posOutcome"]].dropna(axis=0, subset=["posOutcome"])
pos_outcome_df.posOutcome = pos_outcome_df.posOutcome.astype(int)
ge_outcome_df = pd.merge(pos_outcome_df, ge_df, on="patient_ID")

In [113]:
pam35_genes = ["BAG1", "BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDH3", "CENPF", "CEP55", "EGFR", "ERBB2", "ESR1", "EXO1", "FOXA1", "FOXC1",  "GRB7", "KIF2C", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MMP11", "MYBL2", "MYC", "PGR", "RRM2", "SFRP1", "SLC39A6", "TYMS", "UBE2C"]

X_pam35_outcome, y_pam35_outcome = ge_outcome_df[pam35_genes], ge_outcome_df["posOutcome"]

In [12]:
#parameter tuning
rand_search_pam = param_tuning(X_pam35_outcome, y_pam35_outcome, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 39.96 seconds.
Best Score: 71.466%
{'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 1.5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.7s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   37.1s finished


In [8]:
outcome_pam_params = {'subsample': 0.8,
 'n_estimators': 400,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 1.5,
 'colsample_bytree': 0.8}

In [114]:
outcome_pam_scores, clf_pam = run_cross_val(X_pam35_outcome, y_pam35_outcome, outcome_pam_params)
outcome_pam_scores_df = pd.DataFrame(data=outcome_pam_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_pam_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.629226,0.399524,0.630609,0.858928,0.704131,0.714662
std,0.027263,0.040595,0.05306,0.024174,0.017318,0.029603
min,0.584446,0.349398,0.537037,0.819495,0.677612,0.674851
25%,0.629976,0.379518,0.638889,0.858696,0.702312,0.695019
50%,0.636011,0.39521,0.65625,0.859206,0.705539,0.72121
75%,0.637179,0.415663,0.66,0.876812,0.709581,0.734329
max,0.658519,0.457831,0.66087,0.880435,0.72561,0.747901


In [10]:
xgb50_genes = ['CDX4','GLRA1', 'OR12D3', 'DSCR4', 'HOXB8', 'C9', 'MTNR1B', 'MOS', 'HSD17B3', 'FGF20', 'KCNH4', 'ATP4B', 'CPB2', 'CRYBB1', 'ANGPTL3', 'MYH8', 'GYS2', 'SLC25A21', 'TAS2R7', 'F11', 'GABRA6', 'MYT1L', 'DEFB126', 'RPL18', 'GABRQ', 'ZFP37', 'PIP5K1B', 'MCM5', 'PRKAA1', 'WDR76', 'CHRM4', 'RPS6KC1', 'EIF1AY', 'WNT1', 'SCN3B', 'NLGN4Y', 'MAGEB1', 'NUDC', 'HIGD1A', 'OXCT2', 'GALR2', 'EEF1B2', 'RXRG', 'CALCA', 'TEX13A', 'CST3', 'IGFBP4', 'CRYGA', 'ESR1', 'ZNF750']

X_xgb50_outcome, y_xg50_outcome = ge_outcome_df[xgb50_genes], ge_outcome_df["posOutcome"]
X_xgb50_outcome.head()

Unnamed: 0,CDX4,GLRA1,OR12D3,DSCR4,HOXB8,C9,MTNR1B,MOS,HSD17B3,FGF20,...,GALR2,EEF1B2,RXRG,CALCA,TEX13A,CST3,IGFBP4,CRYGA,ESR1,ZNF750
0,4.393932,4.756301,3.668209,3.81314,3.149279,4.091114,3.7782,4.149525,2.630322,3.38648,...,3.574844,12.278915,3.75311,3.510817,3.60462,6.963822,7.292461,3.493727,-0.820615,2.940893
1,3.735445,3.453197,3.008127,2.500197,3.025658,3.26571,2.90913,2.990024,3.326884,3.172031,...,3.377374,10.201279,3.083013,5.915624,3.28008,7.745012,7.281524,3.310479,7.530269,3.216666
2,3.504602,3.591334,3.487448,2.710443,2.786988,3.904477,2.879539,3.585594,2.765269,2.99288,...,3.700416,10.47336,2.816874,3.125646,3.403129,8.240169,8.71551,3.38855,6.099955,5.146286
3,2.862134,3.326514,3.346279,3.676626,4.426359,3.111246,3.447916,3.153298,2.95836,2.845923,...,3.326836,11.048594,2.359179,2.740101,2.607996,8.634744,6.339568,3.256417,7.552593,2.859658
4,3.706718,4.106301,3.579494,3.123646,3.254895,3.480252,3.673946,3.867726,2.990764,3.592019,...,3.360349,10.595206,3.5135,3.397031,3.078851,7.614526,7.386373,3.368019,5.59124,3.684376


In [23]:
rand_search_xg50 = param_tuning(X_xgb50_outcome, y_xg50_outcome, jobs=14)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 33.64 seconds.
Best Score: 77.627%
{'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.02, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.4s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   32.5s finished


In [11]:
outcome_xg50_params = {'subsample': 0.6,
 'n_estimators': 300,
 'min_child_weight': 4,
 'max_depth': 4,
 'learning_rate': 0.02,
 'gamma': 5,
 'colsample_bytree': 0.6}


In [12]:
outcome_xg50_scores, clf_xg50 = run_cross_val(X_xgb50_outcome, y_xg50_outcome, outcome_xg50_params)
outcome_xg50_scores_df = pd.DataFrame(data=outcome_xg50_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_xg50_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.689748,0.516283,0.694266,0.863214,0.748129,0.776271
std,0.036644,0.052831,0.060686,0.028094,0.024624,0.032882
min,0.647677,0.439759,0.637795,0.833333,0.718182,0.733164
25%,0.659182,0.48503,0.646018,0.855596,0.727848,0.749989
50%,0.69742,0.536145,0.695312,0.858696,0.754777,0.78944
75%,0.706456,0.554217,0.70229,0.858696,0.762058,0.80103
max,0.738006,0.566265,0.789916,0.909747,0.777778,0.807729


In [13]:
print_score_comparison(outcome_pam_scores_df, outcome_xg50_scores_df, target_feature="posOutcome",
                       header_1="PAM 35 Genes", header_2="Xgboost 50 genes")

		posOutcome
			PAM 35 Genes		Xgboost 50 genes
		-----------------------------------------------
balanced_accuracy:	62.923%			68.975%

precision_0:		63.061%			69.427%

recall_0:		39.952%			51.628%

precision_1:		70.413%			74.813%

recall_1:		85.893%			86.321%

auc:			71.466%			77.627%



In [14]:
xgb35_genes = xgb50_genes[:35]
X_xgb35_outcome, y_xgb35_outcome = ge_outcome_df[xgb35_genes], ge_outcome_df["posOutcome"]
X_xgb35_outcome.head()

Unnamed: 0,CDX4,GLRA1,OR12D3,DSCR4,HOXB8,C9,MTNR1B,MOS,HSD17B3,FGF20,...,ZFP37,PIP5K1B,MCM5,PRKAA1,WDR76,CHRM4,RPS6KC1,EIF1AY,WNT1,SCN3B
0,4.393932,4.756301,3.668209,3.81314,3.149279,4.091114,3.7782,4.149525,2.630322,3.38648,...,4.266388,4.478,6.639885,6.337999,4.08227,3.610011,6.330836,3.669253,3.403754,3.691055
1,3.735445,3.453197,3.008127,2.500197,3.025658,3.26571,2.90913,2.990024,3.326884,3.172031,...,3.681765,4.716056,5.286568,3.482599,3.29806,3.580365,6.961319,1.742157,3.142625,3.321853
2,3.504602,3.591334,3.487448,2.710443,2.786988,3.904477,2.879539,3.585594,2.765269,2.99288,...,3.082227,4.550145,4.929561,3.436104,2.67104,3.857673,5.554245,2.026433,3.682513,3.448714
3,2.862134,3.326514,3.346279,3.676626,4.426359,3.111246,3.447916,3.153298,2.95836,2.845923,...,3.472678,6.27744,5.076439,2.769396,3.757192,3.345011,6.121111,3.841874,3.525772,3.155237
4,3.706718,4.106301,3.579494,3.123646,3.254895,3.480252,3.673946,3.867726,2.990764,3.592019,...,4.11504,4.646138,5.217917,4.615567,3.544825,3.525125,4.614673,2.987348,3.058495,3.435803


In [31]:
rand_search_xg35 = param_tuning(X_xgb35_outcome, y_xgb35_outcome, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 22.88 seconds.
Best Score: 76.133%
{'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.02, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    2.6s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   22.1s finished


In [15]:
outcome_xgb35_params = {'subsample': 0.6,
 'n_estimators': 300,
 'min_child_weight': 4,
 'max_depth': 4,
 'learning_rate': 0.02,
 'gamma': 5,
 'colsample_bytree': 0.6}


In [16]:
outcome_xgb35_scores, clf_xg35 = run_cross_val(X_xgb35_outcome, y_xgb35_outcome, outcome_xgb35_params)
outcome_xgb35_scores_df = pd.DataFrame(data=outcome_xgb35_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_xgb35_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.694818,0.523526,0.699927,0.866109,0.7521,0.761328
std,0.044159,0.075587,0.056553,0.024183,0.031818,0.038002
min,0.644654,0.415663,0.626984,0.82971,0.713864,0.711349
25%,0.651382,0.473054,0.663462,0.858696,0.722397,0.730895
50%,0.71248,0.566265,0.706767,0.873188,0.76699,0.781561
75%,0.722739,0.572289,0.730769,0.873646,0.772436,0.783285
max,0.742834,0.590361,0.771654,0.895307,0.78481,0.799552


In [17]:
print_score_comparison(outcome_pam_scores_df, outcome_xgb35_scores_df, target_feature="posOutcome",
                       header_1="Pam 35 genes", header_2="Xgboost 35 genes")

		posOutcome
			Pam 35 genes		Xgboost 35 genes
		-----------------------------------------------
balanced_accuracy:	62.923%			69.482%

precision_0:		63.061%			69.993%

recall_0:		39.952%			52.353%

precision_1:		70.413%			75.210%

recall_1:		85.893%			86.611%

auc:			71.466%			76.133%



In [18]:
print_score_comparison(outcome_xg50_scores_df, outcome_xgb35_scores_df, target_feature="posOutcome",
                       header_1="Xgboost 50 genes", header_2="Xgboost 35 genes")


		posOutcome
			Xgboost 50 genes		Xgboost 35 genes
		-----------------------------------------------
balanced_accuracy:	68.975%			69.482%

precision_0:		69.427%			69.993%

recall_0:		51.628%			52.353%

precision_1:		74.813%			75.210%

recall_1:		86.321%			86.611%

auc:			77.627%			76.133%



In [19]:
moses50_genes = ["PRND", "FRS3", "FCN3", "DSCR4", "BRCA2", "CXCL6", "LMX1B", "DLX5", "OMP", "ADH6", "PGAP1", "ART3", "BCHE", "FGB", "IL1RAPL1", "FSTL4", "ASGR1", "ZNF135", "DLL3", "NPHS2", "ANGPT2", "GLP2R", "GRIA3", "HOXB8", "MSC", "PLA2R1", "CYP2F1", "TAS2R7", "NKX6-1", "WNT11", "CHST11", "CLCA4", "ENPEP", "PAH", "WFDC1", "CHGA", "SEZ6L", "UGT2A3", "PRDM16", "GALR2", "GUCA1A", "CASQ1", "NOS1AP", "CACNA2D3", "FHOD3", "SRGAP3", "TMOD2", "ATOH1", "SLC6A1", "HAS1"]

X_moses50_outcome, y_moses50_outcome = ge_outcome_df[moses50_genes], ge_outcome_df["posOutcome"]
X_moses50_outcome.head()

Unnamed: 0,PRND,FRS3,FCN3,DSCR4,BRCA2,CXCL6,LMX1B,DLX5,OMP,ADH6,...,GUCA1A,CASQ1,NOS1AP,CACNA2D3,FHOD3,SRGAP3,TMOD2,ATOH1,SLC6A1,HAS1
0,4.773548,3.792942,3.591425,3.81314,3.842011,3.301166,3.155381,4.092754,3.111448,2.856136,...,3.724464,2.883942,3.117741,3.498742,3.748926,4.446531,3.557905,3.581241,3.48003,2.783949
1,4.050956,3.596728,3.603971,2.500197,2.761469,3.679678,3.406322,3.698481,3.193991,2.712495,...,2.476936,3.623965,3.833023,3.873612,4.377711,3.648161,3.525747,2.640557,2.87355,4.170882
2,5.213503,3.892048,3.655383,2.710443,2.561722,3.748453,3.964545,4.12564,3.167802,2.590382,...,3.99857,2.609817,3.518557,3.537423,4.209063,4.159658,3.506952,2.705826,3.200665,2.850185
3,3.443242,3.713757,3.370449,3.676626,3.947755,2.890541,2.987402,3.91909,2.765266,2.959009,...,3.494966,3.760962,4.401083,3.483488,3.805772,4.018151,3.360345,2.667166,3.14359,2.925482
4,4.237601,3.800724,3.259677,3.123646,3.354961,3.029855,3.116395,3.882619,2.893702,2.585563,...,3.124846,3.164092,3.699545,3.572111,3.775504,3.915325,3.653177,3.274062,3.415785,3.517647


In [39]:
rand_search_moses50 = param_tuning(X_moses50_outcome, y_moses50_outcome, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 31.46 seconds.
Best Score: 76.013%
{'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 5, 'colsample_bytree': 0.8}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    3.2s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   29.6s finished


In [20]:
outcome_moses50_params = {'subsample': 1.0,
 'n_estimators': 500,
 'min_child_weight': 2,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 5,
 'colsample_bytree': 0.8}


In [21]:
outcome_moses50_scores, clf_moses50 = run_cross_val(X_moses50_outcome, y_moses50_outcome, outcome_moses50_params)
outcome_moses50_scores_df = pd.DataFrame(data=outcome_moses50_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_moses50_scores_df.describe()




Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.686531,0.498247,0.703892,0.874816,0.743959,0.760132
std,0.033006,0.057399,0.041483,0.013969,0.023836,0.03633
min,0.648873,0.427711,0.655172,0.855072,0.717262,0.720049
25%,0.655081,0.45509,0.663551,0.870036,0.721713,0.722902
50%,0.695652,0.5,0.722222,0.873188,0.74772,0.772067
75%,0.710691,0.548193,0.734513,0.884477,0.762658,0.790918
max,0.722359,0.560241,0.744,0.891304,0.77044,0.794722


In [22]:
print_score_comparison(outcome_pam_scores_df, outcome_moses50_scores_df, target_feature="posOutcome",
                       header_1="Pam 35 genes", header_2="MOSES 50 genes")

		posOutcome
			Pam 35 genes		MOSES 50 genes
		-----------------------------------------------
balanced_accuracy:	62.923%			68.653%

precision_0:		63.061%			70.389%

recall_0:		39.952%			49.825%

precision_1:		70.413%			74.396%

recall_1:		85.893%			87.482%

auc:			71.466%			76.013%



In [23]:
print_score_comparison(outcome_moses50_scores_df, outcome_xg50_scores_df, target_feature="posOutcome",
                       header_1="MOSES 50 genes", header_2="Xgboost 50 genes")


		posOutcome
			MOSES 50 genes		Xgboost 50 genes
		-----------------------------------------------
balanced_accuracy:	68.653%			68.975%

precision_0:		70.389%			69.427%

recall_0:		49.825%			51.628%

precision_1:		74.396%			74.813%

recall_1:		87.482%			86.321%

auc:			76.013%			77.627%



In [24]:
moses35_genes = moses50_genes[:35]
X_moses35_outcome, y_moses35_outcome = ge_outcome_df[moses35_genes], ge_outcome_df["posOutcome"]
X_moses35_outcome.head()


Unnamed: 0,PRND,FRS3,FCN3,DSCR4,BRCA2,CXCL6,LMX1B,DLX5,OMP,ADH6,...,PLA2R1,CYP2F1,TAS2R7,NKX6-1,WNT11,CHST11,CLCA4,ENPEP,PAH,WFDC1
0,4.773548,3.792942,3.591425,3.81314,3.842011,3.301166,3.155381,4.092754,3.111448,2.856136,...,4.349527,2.56844,3.132241,3.33901,3.293749,5.930842,3.169706,3.952931,2.730984,4.149481
1,4.050956,3.596728,3.603971,2.500197,2.761469,3.679678,3.406322,3.698481,3.193991,2.712495,...,3.249435,2.217734,3.448062,2.672134,4.365613,3.796243,4.13892,4.84309,3.667639,3.075831
2,5.213503,3.892048,3.655383,2.710443,2.561722,3.748453,3.964545,4.12564,3.167802,2.590382,...,3.839366,2.930259,3.568875,2.999942,2.969958,4.158683,3.730554,5.330162,2.930534,2.853928
3,3.443242,3.713757,3.370449,3.676626,3.947755,2.890541,2.987402,3.91909,2.765266,2.959009,...,2.391109,2.84186,3.020353,3.042732,3.62029,5.339365,3.449172,2.952919,2.687195,2.849884
4,4.237601,3.800724,3.259677,3.123646,3.354961,3.029855,3.116395,3.882619,2.893702,2.585563,...,3.479267,2.604748,3.252572,3.306837,3.795611,3.384719,3.347518,3.371804,1.857471,3.254181


In [52]:
rand_search_moses35 = param_tuning(X_moses35_outcome, y_moses35_outcome, jobs=14)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 24.12 seconds.
Best Score: 76.128%
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:    2.5s
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed:   22.6s finished


In [25]:
outcome_moses35_params = {'subsample': 1.0,
 'n_estimators': 400,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.01,
 'gamma': 1,
 'colsample_bytree': 0.6}

In [26]:
outcome_moses35_scores, clf_moses35 = run_cross_val(X_moses35_outcome, y_moses35_outcome, outcome_moses35_params)
outcome_moses35_scores_df = pd.DataFrame(data=outcome_moses35_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_moses35_scores_df.describe()




Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.686159,0.505469,0.694317,0.86685,0.744851,0.761281
std,0.031924,0.051701,0.041836,0.015549,0.022954,0.035843
min,0.650829,0.451807,0.636364,0.84058,0.720497,0.72172
25%,0.659117,0.461078,0.669643,0.866426,0.725076,0.727861
50%,0.686594,0.5,0.70339,0.873188,0.743827,0.764296
75%,0.707679,0.542169,0.72,0.873188,0.760252,0.796163
max,0.726578,0.572289,0.742188,0.880866,0.774603,0.796366


In [27]:
print_score_comparison(outcome_pam_scores_df, outcome_moses35_scores_df, target_feature="posOutcome",
                       header_1="PAM 35 Genes", header_2="MOSES 35 genes")


		posOutcome
			PAM 35 Genes		MOSES 35 genes
		-----------------------------------------------
balanced_accuracy:	62.923%			68.616%

precision_0:		63.061%			69.432%

recall_0:		39.952%			50.547%

precision_1:		70.413%			74.485%

recall_1:		85.893%			86.685%

auc:			71.466%			76.128%



In [28]:
print_score_comparison(outcome_moses50_scores_df, outcome_moses35_scores_df, target_feature="posOutcome",
                       header_1="MOSES 50 Genes", header_2="MOSES 35 genes")


		posOutcome
			MOSES 50 Genes		MOSES 35 genes
		-----------------------------------------------
balanced_accuracy:	68.653%			68.616%

precision_0:		70.389%			69.432%

recall_0:		49.825%			50.547%

precision_1:		74.396%			74.485%

recall_1:		87.482%			86.685%

auc:			76.013%			76.128%



In [29]:
print_score_comparison(outcome_xgb35_scores_df, outcome_moses35_scores_df, target_feature="posOutcome",
                       header_1="Xgboost 35 Genes", header_2="MOSES 35 genes")


		posOutcome
			Xgboost 35 Genes		MOSES 35 genes
		-----------------------------------------------
balanced_accuracy:	69.482%			68.616%

precision_0:		69.993%			69.432%

recall_0:		52.353%			50.547%

precision_1:		75.210%			74.485%

recall_1:		86.611%			86.685%

auc:			76.133%			76.128%



In [27]:
def find_misclassified_patients(df, clf, X, y):
    y_test = y.to_numpy()
    X_test = X.to_numpy()
    miss = np.where(y_test != clf.predict(X_test))
    return df.iloc[miss]["patient_ID"].to_numpy(dtype=np.int64)

def calc_overlap(a, b):
    intr = np.intersect1d(a, b)
    union = np.union1d(a, b)
    return intr, (len(intr) / len(union))

def print_overlap(model1, model2, intr, perc):
    print("{0} patients misclassified by {1} and {2} - {3:.1%} overlap\n".format(len(intr) ,model1, model2, perc))

In [28]:
xg50_miss = find_misclassified_patients(ge_outcome_df, clf_xg50, X_xgb50_outcome, y_pos_outcome)
moses50_miss = find_misclassified_patients(ge_outcome_df, clf_moses50, X_moses50_outcome, y_pos_outcome)
pam_miss = find_misclassified_patients(ge_outcome_df, clf_pam, X_pos_outcome, y_pos_outcome)

In [29]:
xg_moses_intr, perc = calc_overlap(xg50_miss, moses50_miss)
print_overlap("Xg50", "Moses50", xg_moses_intr, perc)

352 patients misclassified by Xg50 and Moses50 - 68.0% overlap



In [30]:
xg_pam_intr, perc = calc_overlap(xg50_miss, pam_miss)
print_overlap("Xg50", "PAM35", xg_pam_intr, perc)

102 patients misclassified by Xg50 and PAM35 - 21.3% overlap



In [31]:
moses_pam_intr, perc = calc_overlap(moses50_miss, pam_miss)
print_overlap("Moses50", "PAM35", moses_pam_intr, perc)

103 patients misclassified by Moses50 and PAM35 - 19.8% overlap



In [35]:
X_raw_outcome, y_raw_outcome = ge_outcome_df[ge_outcome_df.columns.difference(["patient_ID", "posOutcome"])], ge_outcome_df["posOutcome"]

In [36]:
outcome_raw_params = {'subsample': 0.6,
 'n_estimators': 700,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 0.5,
 'colsample_bytree': 0.8}

In [37]:
outcome_scores, clf_outcome = run_cross_val(X_raw_outcome, y_raw_outcome, outcome_raw_params)
outcome_scores_df = pd.DataFrame(data=outcome_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.713982,0.545213,0.737919,0.88275,0.763555,0.805828
std,0.038643,0.052892,0.071197,0.034638,0.025463,0.027738
min,0.678556,0.473054,0.693548,0.858696,0.73494,0.774538
25%,0.690444,0.518072,0.704545,0.862816,0.749216,0.780349
50%,0.709468,0.560241,0.711712,0.865942,0.764516,0.816243
75%,0.713091,0.560241,0.715385,0.884058,0.766026,0.817466
max,0.778348,0.614458,0.864407,0.942238,0.803077,0.840546


In [38]:
raw_miss = find_misclassified_patients(ge_outcome_df, clf_outcome, X_raw_outcome, y_pos_outcome)

In [39]:
xg50_raw_intr, perc = calc_overlap(xg50_miss, raw_miss)
print_overlap("Xgb50", "Raw", xg50_raw_intr, perc)

84 patients misclassified by Xgb50 and Raw - 19.0% overlap



In [40]:
moses50_raw_intr, perc = calc_overlap(moses50_miss, raw_miss)
print_overlap("Moses50", "Raw", moses50_raw_intr, perc)


76 patients misclassified by Moses50 and Raw - 15.4% overlap



In [41]:
raw_pam_intr, perc = calc_overlap(raw_miss, pam_miss)
print_overlap("Raw", "Pam35", raw_pam_intr, perc)

82 patients misclassified by Raw and Pam35 - 41.6% overlap



In [45]:
overlap_xg_moses_pam, perc = calc_overlap(moses_pam_intr, xg_pam_intr)
print_overlap("Pam", "Xgboost and MOSES", overlap_xg_moses_pam, perc)

93 patients misclassified by Pam and Xgboost and MOSES - 83.0% overlap



In [43]:
def write_misclassified(file_name, ls):
    with open("datasets/" + file_name + ".txt", "w") as f:
        for p in ls:
            f.write(str(p) + "\n")

In [44]:
write_misclassified("xg50_misclassified", xg50_miss)
write_misclassified("moses50_misclassified", moses50_miss)
write_misclassified("pam35_misclassified", pam_miss)
write_misclassified("raw_genes_misclassified", raw_miss)

In [78]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_raw_outcome, y_raw_outcome, test_size=0.3, random_state=42)
est_raw_moses = [('raw', clf_outcome), ('moses50', clf_moses50)]
eclf_raw_moses = VotingClassifier(est_raw_moses, voting="soft", n_jobs=8)

eclf_raw_moses.fit(X_train, y_train)

VotingClassifier(estimators=[('raw',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.8, gamma=0.5,
                                            gpu_id=-1, importance_type='gain',
                                            interaction_constraints='',
                                            learning_rate=0.03,
                                            max_delta_step=0, max_depth=5,
                                            min_child_weight=5, missing=nan,
                                            monotone_constraints='()',
                                            n_estimators=700, n_jobs=8,
                                            num_par...
                                            gpu_id=-1, importance_type='gain',
                                   

In [79]:
from sklearn.metrics import roc_auc_score
def calc_scores(clf, y_true, y_pred):
    recall_0, recall_1 = recall_score(y_true, y_pred, pos_label=0), recall_score(y_true, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_true, y_pred, pos_label=0), precision_score(y_true, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_true, y_pred)
    auc_score = roc_auc_score(y_true, clf.predict_proba(X_test)[:,1])
    return np.array([[acc, recall_0, precision_0, recall_1, precision_1, auc_score]])

y_pred_moses50 = eclf_raw_moses.predict(X_test)
eclf_raw_moses_scores = calc_scores(eclf_raw_moses, y_test, y_pred_moses50)
ensemble_raw_moses_df = pd.DataFrame(data=eclf_raw_moses_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
ensemble_raw_moses_df.head()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
0,0.701382,0.514403,0.726744,0.888361,0.760163,0.804493


In [81]:
est_raw_xgb = [('raw', clf_outcome), ('xg50', clf_xg50)]
eclf_raw_xgb = VotingClassifier(est_raw_xgb, voting="soft", n_jobs=8)

eclf_raw_xgb.fit(X_train, y_train)

y_pred_xg50 = eclf_raw_xgb.predict(X_test)
eclf_raw_xgb_scores = calc_scores(eclf_raw_xgb, y_test, y_pred_xg50)
ensemble_raw_xgb_df = pd.DataFrame(data=eclf_raw_xgb_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
ensemble_raw_xgb_df.head()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
0,0.709378,0.518519,0.75,0.900238,0.764113,0.810133


In [83]:
est_moses_xgb = [('moses50', clf_moses50), ('xg50', clf_xg50)]
eclf_moses_xgb = VotingClassifier(est_moses_xgb, voting="soft", n_jobs=12)
eclf_moses_xgb.fit(X_train, y_train)

y_pred_moses_xgb = eclf_moses_xgb.predict(X_test)
eclf_moses_xgb_scores = calc_scores(eclf_moses_xgb, y_test, y_pred_moses_xgb)
ensemble_moses_xgb_df = pd.DataFrame(data=eclf_moses_xgb_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
ensemble_moses_xgb_df.head()

Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
0,0.699325,0.510288,0.725146,0.888361,0.758621,0.804043


In [84]:
moses_xgb_miss = find_misclassified_patients(ge_outcome_df, eclf_moses_xgb, X_raw_outcome, y_pos_outcome)
moses_xgb_raw_intr, perc = calc_overlap(raw_miss, moses_xgb_miss)
print_overlap("Raw Model", "Moses + Xgb Ensemble", moses_xgb_raw_intr, perc)

25 patients misclassified by Raw Model and Moses + Xgb Ensemble - 9.0% overlap



In [86]:
ec_moses_raw_miss = find_misclassified_patients(ge_outcome_df, eclf_raw_moses, X_raw_outcome, y_pos_outcome)
ec_moses_raw_intr, perc = calc_overlap(raw_miss, ec_moses_raw_miss)
print_overlap("Raw Model", "Moses + Raw Ensemble", ec_moses_raw_intr, perc)

24 patients misclassified by Raw Model and Moses + Raw Ensemble - 9.5% overlap



In [87]:
ec_raw_xgb_miss = find_misclassified_patients(ge_outcome_df, eclf_raw_xgb, X_raw_outcome, y_pos_outcome)
ec_raw_xgb_intr, perc = calc_overlap(raw_miss, ec_raw_xgb_miss)
print_overlap("Raw Model", "Xgboost + Raw Ensemble", ec_raw_xgb_intr, perc)

23 patients misclassified by Raw Model and Xgboost + Raw Ensemble - 9.3% overlap



In [90]:
moses_xg_ec_intr, perc = calc_overlap(ec_raw_xgb_miss, ec_moses_raw_miss)
print_overlap("Xgboost + Raw Ensemble", "Moses + Raw Ensemble", moses_xg_ec_intr, perc)

151 patients misclassified by Xgboost + Raw Ensemble and Moses + Raw Ensemble - 87.3% overlap



In [94]:
moses_raw_ec_intr, perc = calc_overlap(ec_moses_raw_miss, moses_xgb_miss)
print_overlap("Moses + Raw Ensemble", "Moses + Xgboost Ensemble", moses_raw_ec_intr, perc)

151 patients misclassified by Moses + Raw Ensemble and Moses + Xgboost Ensemble - 74.0% overlap



In [95]:
xg_raw_ec_intr, perc = calc_overlap(ec_raw_xgb_miss, moses_xgb_miss)
print_overlap("Moses + Xgboost", "Xgboost + Raw", xg_raw_ec_intr, perc)

148 patients misclassified by Moses + Xgboost and Xgboost + Raw - 73.6% overlap



In [101]:
#Infogan 48 vector embedding
gan_df = pd.read_csv("datasets/embedding-vectors/codes_48_infogan_vector.csv")

gan_outcome_df = pd.merge(pos_outcome_df, gan_df, on="patient_ID")
X_gan_outcome, y_gan_outcome = gan_outcome_df[gan_outcome_df.columns.difference(["patient_ID", "posOutcome"])], gan_outcome_df["posOutcome"]
gan_outcome_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,38,39,40,41,42,43,44,45,46,47
0,22449,0,0.085418,-0.111649,0.018824,0.0624,-0.010372,-0.128282,0.076703,-0.009947,...,-0.047461,0.032504,0.113524,-0.01213,-0.081734,0.028249,0.008334,0.02047,0.031202,-0.00257
1,22450,0,-0.004098,-0.004369,0.014671,-0.028279,-0.065785,-0.159276,0.07694,0.073731,...,-0.061511,0.031983,-0.026282,0.110665,0.018675,-0.016672,-0.009339,-0.113558,0.009888,-0.055047
2,22451,0,0.018527,-0.012835,-0.109397,0.069133,0.05233,-0.050934,0.068238,-0.037115,...,-0.040069,0.040046,0.013243,0.043978,-0.015119,0.009637,0.053755,0.028593,0.036472,0.036024
3,22452,0,-2.2e-05,0.025746,0.005182,0.058444,0.019806,-0.071137,0.053569,0.007164,...,-0.035705,-0.041495,0.083693,-0.002261,-0.04184,-0.024654,-0.04477,0.040501,-0.12504,-0.07201
4,22453,1,0.012372,0.018869,-0.014388,-0.02291,-0.011056,-0.05358,0.055017,0.072292,...,-0.059993,-0.015497,0.004891,0.070426,0.002443,0.023149,-0.033846,-0.044504,0.041614,-0.071776


In [102]:
outcome_gan_params = {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.02, 'gamma': 5, 'colsample_bytree': 0.6}

In [103]:
outcome_gan_scores, clf_outcome_gan = run_cross_val(X_gan_outcome, y_gan_outcome, outcome_gan_params)
outcome_gan_scores_df  = pd.DataFrame(data=outcome_gan_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_gan_scores_df.to_csv("datasets/results/outcome_scores_gan_48.csv")
outcome_gan_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.567647,0.250357,0.570676,0.884937,0.662562,0.647394
std,0.020807,0.039079,0.064086,0.032365,0.012077,0.021228
min,0.547449,0.203593,0.505495,0.836957,0.649077,0.626158
25%,0.557032,0.216867,0.53125,0.870036,0.65812,0.63165
50%,0.564536,0.259036,0.544304,0.891304,0.661458,0.644666
75%,0.566917,0.277108,0.610169,0.90942,0.662088,0.654843
max,0.602301,0.295181,0.662162,0.916968,0.682065,0.679653


In [105]:
gan_miss = find_misclassified_patients(ge_outcome_df, clf_outcome_gan, X_gan_outcome, y_gan_outcome)
raw_gan_intr, perc = calc_overlap(raw_miss, gan_miss)
print_overlap("Raw", "Infogan", raw_gan_intr, perc)

65 patients misclassified by Raw and Infogan - 12.0% overlap



In [107]:
moses_gan_intr, perc = calc_overlap(moses50_miss, gan_miss)
print_overlap("Moses", "Infogan", moses_gan_intr, perc)

233 patients misclassified by Moses and Infogan - 32.5% overlap



In [108]:
xgb_gan_intr, perc = calc_overlap(xg50_miss, gan_miss)
print_overlap("Xgboost", "Infogan", xgb_gan_intr, perc)

207 patients misclassified by Xgboost and Infogan - 29.5% overlap



In [111]:
pam_gan_intr, perc = calc_overlap(pam_miss, gan_miss)
print_overlap("Pam35", "Infogan", pam_gan_intr, perc)

94 patients misclassified by Pam35 and Infogan - 16.5% overlap



In [110]:
write_misclassified("infogan_misclassified", gan_miss)

In [115]:
#save the models
clf_moses50.save_model("datasets/models/moses50_raw.json")
clf_xg50.save_model("datasets/models/xgb50_raw.json")
clf_pam.save_model("datasets/models/pam35_raw.json")
clf_outcome.save_model("datasets/models/raw_model.json")
clf_outcome_gan.save_model("datasets/models/infogan_model.json")