In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
def calc_results_simple(X, y, train_index, test_index, clf):
    X, y = X.to_numpy(), y.to_numpy(dtype=np.int64)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred  = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:,1]
    acc = balanced_accuracy_score(y_test, y_pred)

    recall_0 =  recall_score(y_test, y_pred, pos_label=0)
    recall_1 =  recall_score(y_test, y_pred, pos_label=1)
    prec_0 = precision_score(y_test, y_pred, pos_label=0)
    prec_1 = precision_score(y_test, y_pred, pos_label=1)
    auc = roc_auc_score(y_test, y_pred_prob)

    return np.array([[acc, recall_0, prec_0, recall_1, prec_1 ,auc]])

#cross_validation
def run_cross_val(X, y, params, n_folds=5, random_seed=42):
    res = np.empty(shape=[0, 6])
    clf = XGBClassifier(**params, n_jobs=8)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        res = np.append(res, calc_results_simple(X, y, train_index, test_index, clf), axis=0)
    return res, clf

def print_score_comparison(raw_score, emb_score, target_feature="RFS",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-----------------------------------------------")
    print("balanced_accuracy:\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("precision_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_0:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("recall_1:\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("auc:\t\t\t{0:.3%}\t\t\t{1:.3%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def find_misclassified_patients(df, clf, X, y):
    y_test = y.to_numpy()
    X_test = X.to_numpy()
    miss = np.where(y_test != clf.predict(X_test))
    return df.iloc[miss]["patient_ID"].to_numpy(dtype=np.int64)

def calc_overlap(a, b):
    intr = np.intersect1d(a, b)
    union = np.union1d(a, b)
    return intr, (len(intr) / len(union))

def print_overlap(model1, model2, intr, perc):
    print("{0} patients misclassified by {1} and {2} - {3:.1%} overlap\n".format(len(intr) ,model1, model2, perc))

def write_misclassified(file_name, ls):
    with open("datasets/" + file_name + ".txt", "w") as f:
        for p in ls:
            f.write(str(p) + "\n")

In [4]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

In [5]:
ge_df = pd.read_csv("datasets/merged-combat15.csv")
outcome_df = pd.read_csv("datasets/combat15outcomes.csv")
pos_outcome_df = outcome_df[["patient_ID", "posOutcome"]].dropna(axis=0, subset=["posOutcome"])
pos_outcome_df.posOutcome = pos_outcome_df.posOutcome.astype(int)
ge_outcome_df = pd.merge(pos_outcome_df, ge_df, on="patient_ID")


In [5]:
emb_moses50_df = pd.read_csv("datasets/embedding-vectors/property_vector_moses50_withoutpatientsdata_2021-01-09.csv", sep="\t")
pos_outcome_moses50_emb_df = pd.merge(pos_outcome_df, emb_moses50_df, on="patient_ID")
X_moses50_emb, y_moses50_emb = pos_outcome_moses50_emb_df[pos_outcome_moses50_emb_df.columns.difference(["patient_ID", "posOutcome"])], pos_outcome_moses50_emb_df["posOutcome"]
pos_outcome_moses50_emb_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2208,2209,2210,2211,2212,2213,2214,2215,2216,2217
0,22449,0,0.004935,-0.067173,-0.081524,-0.074297,0.133399,-0.146837,-0.039405,0.084736,...,-5e-06,-5.316138e-07,-5e-06,6.259005e-07,9e-06,-9e-06,-2e-06,-3e-06,-4.132974e-07,6.042526e-07
1,22450,0,-0.107115,-0.066373,-0.186839,-0.073233,-0.008075,-0.106068,-0.16427,-0.01401,...,7e-06,-1.119379e-05,-2e-06,5.344275e-06,-1.2e-05,-2e-06,6e-06,-5e-06,2.132956e-06,-4.973269e-07
2,22451,0,-0.066539,0.002395,-0.151372,-0.038813,0.174195,0.00867,0.011472,-0.045705,...,-8e-06,8.546053e-06,5e-06,1.459251e-05,9e-06,2e-06,-2e-06,1e-06,1.236459e-06,-1.160475e-06
3,22452,0,-0.214922,-0.091833,-0.104011,-0.064794,0.010849,-0.079871,0.034703,-0.017574,...,-7e-06,-1.03577e-05,1e-06,4.595439e-08,3e-06,-2e-06,3e-06,-3e-06,-3.997437e-06,-1.386818e-06
4,22453,1,0.035155,-0.040247,-0.083622,-0.007588,0.112287,-0.065557,-0.020941,0.039357,...,-1.1e-05,-1.192846e-05,-1e-06,1.611686e-06,-4e-06,1.4e-05,4e-06,-3e-06,-4.503628e-06,-1.632821e-06


In [6]:
rand_search_moses50 = param_tuning(X_moses50_emb, y_moses50_emb, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 44 minutes and 7.99 seconds.
Best Score: 73.853%
{'subsample': 0.8, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:  2.9min
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed: 40.0min finished


In [9]:
outcome_moses50_params = {'subsample': 0.8,
 'n_estimators': 600,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 5,
 'colsample_bytree': 0.6}

In [10]:
outcome_moses50_scores, clf_moses50 = run_cross_val(X_moses50_emb, y_moses50_emb, outcome_moses50_params)
outcome_moses50_df = pd.DataFrame(data=outcome_moses50_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_moses50_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.663773,0.451273,0.687119,0.876273,0.726522,0.738526
std,0.017982,0.02874,0.032991,0.015369,0.01219,0.027625
min,0.643458,0.427711,0.645455,0.859206,0.714715,0.703667
25%,0.656339,0.433735,0.672727,0.869565,0.720721,0.713812
50%,0.660686,0.443114,0.675676,0.869565,0.725076,0.753645
75%,0.666143,0.451807,0.72,0.884477,0.725146,0.758534
max,0.692238,0.5,0.721739,0.898551,0.746951,0.762972


In [12]:
emb_xg50_df = pd.read_csv("datasets/embedding-vectors/property_vector_xgb50_withoutpatientsdata_2021-01-09.csv", sep="\t")
pos_outcome_xg50_emb_df = pd.merge(pos_outcome_df, emb_xg50_df, on="patient_ID")
X_xg50_emb, y_xg50_emb = pos_outcome_xg50_emb_df[pos_outcome_xg50_emb_df.columns.difference(["patient_ID", "posOutcome"])], pos_outcome_xg50_emb_df["posOutcome"]
pos_outcome_xg50_emb_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2226,2227,2228,2229,2230,2231,2232,2233,2234,2235
0,22449,0,0.274354,0.036449,0.077523,-0.137187,-0.007095,0.216389,-0.088171,0.001713,...,-1.569298e-05,7e-06,5.8e-05,2.834218e-05,8e-06,3.252772e-07,-6.143507e-05,2e-05,1.9e-05,8e-06
1,22450,0,-0.145367,0.023137,-0.086416,-0.021058,-0.166154,0.133504,-0.013096,0.133711,...,-6.102851e-06,3e-06,2e-06,-1.542109e-05,1.1e-05,5.416132e-06,1.445249e-05,2e-06,-4e-06,2e-06
2,22451,0,0.047539,0.04725,-0.089892,-0.065981,-0.05945,-0.005819,-0.056733,0.200612,...,8.083307e-07,3.7e-05,1.1e-05,-6.268849e-07,2.7e-05,1.037695e-05,1.657085e-05,5e-06,2e-06,-5e-06
3,22452,0,-0.236918,0.019434,-0.056465,-0.119143,-0.057772,0.194564,-0.086927,0.072039,...,9.679478e-06,1.8e-05,1.8e-05,-4.960172e-06,-2e-05,1.572499e-05,8.709695e-07,-2e-06,-1e-05,-2e-06
4,22453,1,0.070666,-0.002817,0.018118,-0.066882,0.061491,0.251767,-0.07889,0.099405,...,5.460626e-06,1.6e-05,-1.1e-05,1.93401e-05,-7e-06,-1.187702e-05,-3.165962e-06,-2e-06,1e-05,1.3e-05


In [13]:
rand_search_xg50 = param_tuning(X_xg50_emb, y_xg50_emb, jobs=14)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 44 minutes and 48.97 seconds.
Best Score: 75.437%
{'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 0.5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   4 tasks      | elapsed:  2.9min
[Parallel(n_jobs=14)]: Done 125 out of 125 | elapsed: 42.5min finished


In [15]:
outcome_xg50_params = {'subsample': 0.8,
 'n_estimators': 400,
 'min_child_weight': 2,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 0.5,
 'colsample_bytree': 0.6}

In [16]:
outcome_xg50_scores, clf_xg50 = run_cross_val(X_xg50_emb, y_xg50_emb, outcome_xg50_params)
outcome_xg50_df = pd.DataFrame(data=outcome_xg50_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_xg50_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.669084,0.466994,0.684265,0.871174,0.731261,0.754367
std,0.03966,0.058133,0.063254,0.023423,0.026726,0.034789
min,0.617895,0.39521,0.6,0.84058,0.696697,0.698299
25%,0.653691,0.433735,0.672566,0.865942,0.720238,0.750098
50%,0.661887,0.457831,0.672897,0.869565,0.726444,0.763249
75%,0.687795,0.506024,0.7,0.873646,0.745342,0.768334
max,0.724153,0.542169,0.775862,0.906137,0.767584,0.791853


In [17]:
print_score_comparison(outcome_moses50_df, outcome_xg50_df, target_feature="posOutcome", header_1="Moses50", header_2="Xgb50")

		posOutcome
			Moses50		Xgb50
		-----------------------------------------------
balanced_accuracy:	66.377%			66.908%

precision_0:		68.712%			68.427%

recall_0:		45.127%			46.699%

precision_1:		72.652%			73.126%

recall_1:		87.627%			87.117%

auc:			73.853%			75.437%



In [18]:
#with patient data(pd)
emb_moses50_all_df = pd.read_csv("datasets/embedding-vectors/property_vector_moses50-all_2021-01-09.csv", sep="\t")
pos_outcome_emb_all_df = pd.merge(pos_outcome_df, emb_moses50_df, on="patient_ID")
X_moses50_all_emb, y_moses50_all_emb = pos_outcome_emb_all_df[pos_outcome_emb_all_df.columns.difference(["patient_ID", "posOutcome"])], pos_outcome_emb_all_df["posOutcome"]
pos_outcome_emb_all_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2208,2209,2210,2211,2212,2213,2214,2215,2216,2217
0,22449,0,0.004935,-0.067173,-0.081524,-0.074297,0.133399,-0.146837,-0.039405,0.084736,...,-5e-06,-5.316138e-07,-5e-06,6.259005e-07,9e-06,-9e-06,-2e-06,-3e-06,-4.132974e-07,6.042526e-07
1,22450,0,-0.107115,-0.066373,-0.186839,-0.073233,-0.008075,-0.106068,-0.16427,-0.01401,...,7e-06,-1.119379e-05,-2e-06,5.344275e-06,-1.2e-05,-2e-06,6e-06,-5e-06,2.132956e-06,-4.973269e-07
2,22451,0,-0.066539,0.002395,-0.151372,-0.038813,0.174195,0.00867,0.011472,-0.045705,...,-8e-06,8.546053e-06,5e-06,1.459251e-05,9e-06,2e-06,-2e-06,1e-06,1.236459e-06,-1.160475e-06
3,22452,0,-0.214922,-0.091833,-0.104011,-0.064794,0.010849,-0.079871,0.034703,-0.017574,...,-7e-06,-1.03577e-05,1e-06,4.595439e-08,3e-06,-2e-06,3e-06,-3e-06,-3.997437e-06,-1.386818e-06
4,22453,1,0.035155,-0.040247,-0.083622,-0.007588,0.112287,-0.065557,-0.020941,0.039357,...,-1.1e-05,-1.192846e-05,-1e-06,1.611686e-06,-4e-06,1.4e-05,4e-06,-3e-06,-4.503628e-06,-1.632821e-06


In [19]:
rand_search_moses50_all = param_tuning(X_moses50_all_emb, y_moses50_all_emb, jobs=8)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 1 hours 0 minutes and 36.21 seconds.
Best Score: 73.853%
{'subsample': 0.8, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  7.3min
[Parallel(n_jobs=8)]: Done 125 out of 125 | elapsed: 56.5min finished


In [32]:
moses50_all_params = {'subsample': 0.8,
 'n_estimators': 600,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 5,
 'colsample_bytree': 0.6}

In [22]:
moses50_all_scores, clf_moses50_all = run_cross_val(X_moses50_emb, y_moses50_all_emb, moses50_all_params)
moses50_all_scores_df = pd.DataFrame(data=moses50_all_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
moses50_all_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.663773,0.451273,0.687119,0.876273,0.726522,0.738526
std,0.017982,0.02874,0.032991,0.015369,0.01219,0.027625
min,0.643458,0.427711,0.645455,0.859206,0.714715,0.703667
25%,0.656339,0.433735,0.672727,0.869565,0.720721,0.713812
50%,0.660686,0.443114,0.675676,0.869565,0.725076,0.753645
75%,0.666143,0.451807,0.72,0.884477,0.725146,0.758534
max,0.692238,0.5,0.721739,0.898551,0.746951,0.762972


In [23]:
emb_xg50_all_df = pd.read_csv("datasets/embedding-vectors/property_vector_xgb50-all_2021-01-11.csv", sep="\t")
xg50_pos_outcome_df = pd.merge(pos_outcome_df, emb_xg50_all_df, on="patient_ID")
X_xg50_all_emb, y_xg50_all_emb = xg50_pos_outcome_df[xg50_pos_outcome_df.columns.difference(["patient_ID", "posOutcome"])], xg50_pos_outcome_df["posOutcome"]
xg50_pos_outcome_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2225,2226,2227,2228,2229,2230,2231,2232,2233,2234
0,22449,0,0.197541,-0.09416,-0.004222,-0.176551,-0.052261,-0.063544,-0.156669,0.059774,...,-1.676209e-06,-4.591032e-07,-6.689761e-08,3e-06,1.1e-05,4e-06,-1.6e-05,-2.599262e-07,1.383866e-06,2e-06
1,22450,0,-0.133166,-0.011311,-0.049556,0.034698,-0.169372,-0.006558,-0.127243,0.101508,...,-7.07683e-07,1.61417e-05,-8.300375e-06,-4e-06,2e-06,4e-06,-8e-06,-6.256857e-06,-1.734795e-06,5e-06
2,22451,0,0.049879,-0.117226,-0.051329,-0.003347,-0.112412,0.043881,-0.036681,-0.029698,...,1.436614e-05,-7.75189e-06,-4.470416e-07,5e-06,-1e-05,-2.1e-05,-3e-06,6.783345e-06,5.289614e-07,3e-06
3,22452,0,-0.19568,-0.004821,-0.015542,-0.054139,-0.168551,0.080153,-0.140853,0.111006,...,-4.741827e-06,5.878677e-06,6.551436e-06,-6e-06,-5e-06,4e-06,-3e-06,1.941135e-06,-3.06709e-06,1e-06
4,22453,1,0.031236,-0.004747,-0.012087,-0.098707,-0.029835,0.02316,-0.196286,0.149748,...,1.097292e-05,-1.188961e-08,5.684494e-06,-2e-06,9e-06,4e-06,6e-06,-4.144589e-06,3.42314e-06,2e-06


In [None]:
rand_search_xg50_all = param_tuning(X_xg50_all_emb, y_xg50_all_emb, jobs=10)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:  6.2min


In [26]:
xg50_all_params = {'subsample': 0.8,
 'n_estimators': 600,
 'min_child_weight': 3,
 'max_depth': 6,
 'learning_rate': 0.01,
 'gamma': 5,
 'colsample_bytree': 0.6}

In [27]:
xg50_all_scores, clf_xg50_all = run_cross_val(X_xg50_all_emb, y_xg50_all_emb, xg50_all_params)
xg50_all_scores_df = pd.DataFrame(data=xg50_all_scores, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
xg50_all_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.673925,0.473039,0.692726,0.87481,0.734574,0.763098
std,0.034857,0.063115,0.046416,0.016747,0.024699,0.020166
min,0.627029,0.377246,0.649485,0.859206,0.699422,0.740758
25%,0.649482,0.439759,0.651786,0.865942,0.719033,0.742899
50%,0.688995,0.512048,0.696721,0.869565,0.746875,0.769301
75%,0.690807,0.512048,0.702479,0.876812,0.747664,0.778025
max,0.713312,0.524096,0.763158,0.902527,0.759878,0.784508


In [28]:
print_score_comparison(moses50_all_scores_df, xg50_all_scores_df, target_feature="posOutcome",
                       header_1="Moses All Emb", header_2="Xgboost All Emb")

		posOutcome
			Moses All Emb		Xgboost All Emb
		-----------------------------------------------
balanced_accuracy:	66.377%			67.392%

precision_0:		68.712%			69.273%

recall_0:		45.127%			47.304%

precision_1:		72.652%			73.457%

recall_1:		87.627%			87.481%

auc:			73.853%			76.310%



In [30]:
print_score_comparison(outcome_moses50_df, moses50_all_scores_df, target_feature="posOutcome",
                       header_1="Moses W/o Patient Data", header_2="Moses All Emb")

		posOutcome
			Moses W/o Patient Data		Moses All Emb
		-----------------------------------------------
balanced_accuracy:	66.377%			66.377%

precision_0:		68.712%			68.712%

recall_0:		45.127%			45.127%

precision_1:		72.652%			72.652%

recall_1:		87.627%			87.627%

auc:			73.853%			73.853%



In [33]:
print_score_comparison(outcome_xg50_df, xg50_all_scores_df, target_feature="posOutcome",
                       header_1="Xgboost W/o Patient Data", header_2="Xgboost All Emb")

		posOutcome
			Xgboost W/o Patient Data		Xgboost All Emb
		-----------------------------------------------
balanced_accuracy:	66.908%			67.392%

precision_0:		68.427%			69.273%

recall_0:		46.699%			47.304%

precision_1:		73.126%			73.457%

recall_1:		87.117%			87.481%

auc:			75.437%			76.310%



In [6]:
emb_moses50_moses_norm = pd.read_csv("datasets/embedding-vectors/property_vector_normalized_moses50-all_2021-01-09.csv", sep="\t")
moses50_norm_outcome_df = pd.merge(pos_outcome_df, emb_moses50_moses_norm, on="patient_ID")
X_moses50_emb_norm, y_moses50_emb_norm = moses50_norm_outcome_df[moses50_norm_outcome_df.columns.difference(["patient_ID", "posOutcome"])],moses50_norm_outcome_df["posOutcome"]

moses50_norm_outcome_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2225,2226,2227,2228,2229,2230,2231,2232,2233,2234
0,22449,0,-0.0612,-0.011763,-0.09705,-0.144627,-0.004909,0.069013,-0.145238,0.037271,...,-3e-06,-1e-06,5.244921e-06,-1.1e-05,3e-06,-4e-06,1.4e-05,4.997154e-07,-4.934e-06,2.056817e-06
1,22450,0,0.032036,-0.042084,-0.165602,-0.071555,-0.089342,-0.099114,-0.162727,-0.097703,...,7e-06,-3e-06,1.976495e-05,-1.3e-05,6e-06,9e-06,3e-06,8.917825e-06,-7.999823e-07,2.685009e-07
2,22451,0,0.005751,0.074967,-0.126259,0.061286,-0.012363,0.023697,-0.199545,0.043533,...,-1.1e-05,-6e-06,6.491019e-07,1.9e-05,-1e-06,1e-05,3e-06,1.434503e-06,-3.332053e-06,7.659706e-06
3,22452,0,0.132013,-0.048426,-0.16112,-0.06345,-0.062068,-0.011423,-0.104246,-0.029692,...,-1.1e-05,-5e-06,-2.715835e-06,-4e-06,1.4e-05,1.2e-05,3e-06,1.636455e-06,-1.16934e-06,-8.510721e-06
4,22453,1,-0.0477,-0.04643,-0.065408,-0.000439,-0.028872,-0.016403,-0.20544,0.005162,...,-2.3e-05,3e-06,-6.232265e-06,-6e-06,-6e-06,5e-06,7e-06,5.138666e-06,-2.546916e-06,8.56246e-07


In [15]:
rand_search_moses50_norm = param_tuning(X_moses50_emb_norm, y_moses50_emb_norm, jobs=12)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 56 minutes and 11.07 seconds.
Best Score: 76.151%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  3.2min
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed: 54.5min finished


In [7]:
moses50_norm_params = {'subsample': 0.8,
 'n_estimators': 300,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 2,
 'colsample_bytree': 0.6}
outcome_moses50_norm_scores, clf_moses50_norm = run_cross_val(X_moses50_emb_norm, y_moses50_emb_norm, moses50_norm_params)
outcome_moses50_norm_scores_df = pd.DataFrame(data=outcome_moses50_norm_scores,  columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_moses50_norm_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.678477,0.470572,0.713941,0.886381,0.735895,0.761508
std,0.026525,0.042283,0.047181,0.021459,0.018217,0.017672
min,0.640111,0.42515,0.63964,0.855072,0.710843,0.742573
25%,0.663923,0.439759,0.701923,0.876812,0.725664,0.742667
50%,0.684826,0.463855,0.721311,0.888087,0.737463,0.769076
75%,0.700057,0.493976,0.747573,0.905797,0.749254,0.773289
max,0.703466,0.53012,0.759259,0.906137,0.75625,0.779936


In [9]:
emb_moses50_moses_norm_v2 = pd.read_csv("datasets/embedding-vectors/property_vector_normalized-V2_moses50-all_2021-01-09.csv", sep="\t")
moses50_norm_outcome_v2_df = pd.merge(pos_outcome_df, emb_moses50_moses_norm_v2, on="patient_ID")
X_moses50_emb_norm_v2, y_moses50_emb_norm_v2 = moses50_norm_outcome_v2_df[moses50_norm_outcome_v2_df.columns.difference(["patient_ID", "posOutcome"])],moses50_norm_outcome_v2_df["posOutcome"]

moses50_norm_outcome_v2_df.head()

Unnamed: 0,patient_ID,posOutcome,0,1,2,3,4,5,6,7,...,2225,2226,2227,2228,2229,2230,2231,2232,2233,2234
0,22449,0,-0.0612,-0.011763,-0.09705,-0.144627,-0.004909,0.069013,-0.145238,0.037271,...,-3e-06,-1e-06,5.244921e-06,-1.1e-05,3e-06,-4e-06,1.4e-05,4.997154e-07,-4.934e-06,2.056817e-06
1,22450,0,0.032036,-0.042084,-0.165602,-0.071555,-0.089342,-0.099114,-0.162727,-0.097703,...,7e-06,-3e-06,1.976495e-05,-1.3e-05,6e-06,9e-06,3e-06,8.917825e-06,-7.999823e-07,2.685009e-07
2,22451,0,0.005751,0.074967,-0.126259,0.061286,-0.012363,0.023697,-0.199545,0.043533,...,-1.1e-05,-6e-06,6.491019e-07,1.9e-05,-1e-06,1e-05,3e-06,1.434503e-06,-3.332053e-06,7.659706e-06
3,22452,0,0.132013,-0.048426,-0.16112,-0.06345,-0.062068,-0.011423,-0.104246,-0.029692,...,-1.1e-05,-5e-06,-2.715835e-06,-4e-06,1.4e-05,1.2e-05,3e-06,1.636455e-06,-1.16934e-06,-8.510721e-06
4,22453,1,-0.0477,-0.04643,-0.065408,-0.000439,-0.028872,-0.016403,-0.20544,0.005162,...,-2.3e-05,3e-06,-6.232265e-06,-6e-06,-6e-06,5e-06,7e-06,5.138666e-06,-2.546916e-06,8.56246e-07


In [19]:
rand_search_moses50_norm_v2 = param_tuning(X_moses50_emb_norm_v2, y_moses50_emb_norm_v2, jobs=12)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 56 minutes and 52.13 seconds.
Best Score: 76.151%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  3.3min
[Parallel(n_jobs=12)]: Done 125 out of 125 | elapsed: 55.2min finished


In [8]:
moses50_norm_v2_params = {'subsample': 0.8,
 'n_estimators': 300,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.03,
 'gamma': 2,
 'colsample_bytree': 0.6}

In [10]:
outcome_moses50_norm_v2_scores, clf_moses50_norm_v2 = run_cross_val(X_moses50_emb_norm, y_moses50_emb_norm, moses50_norm_v2_params)
outcome_moses50_norm_v2_scores_df = pd.DataFrame(data=outcome_moses50_norm_v2_scores,  columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
outcome_moses50_norm_v2_scores_df.describe()



Unnamed: 0,balanced_accuracy,recall_0,precision_0,recall_1,precision_1,auc
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.678477,0.470572,0.713941,0.886381,0.735895,0.761508
std,0.026525,0.042283,0.047181,0.021459,0.018217,0.017672
min,0.640111,0.42515,0.63964,0.855072,0.710843,0.742573
25%,0.663923,0.439759,0.701923,0.876812,0.725664,0.742667
50%,0.684826,0.463855,0.721311,0.888087,0.737463,0.769076
75%,0.700057,0.493976,0.747573,0.905797,0.749254,0.773289
max,0.703466,0.53012,0.759259,0.906137,0.75625,0.779936


In [22]:
print_score_comparison(outcome_moses50_norm_scores_df, outcome_moses50_norm_v2_scores_df,
                       target_feature="posOutcome", header_1="Moses Norm V1", header_2="Moses Norm V2")

		posOutcome
			Moses Norm V1		Moses Norm V2
		-----------------------------------------------
balanced_accuracy:	67.848%			67.848%

precision_0:		71.394%			71.394%

recall_0:		47.057%			47.057%

precision_1:		73.589%			73.589%

recall_1:		88.638%			88.638%

auc:			76.151%			76.151%



In [21]:
X_moses50_emb_norm.compare(X_moses50_emb_norm_v2)

Unnamed: 0_level_0,0,0,1,1,10,10,100,100,1000,1000,...,995,995,996,996,997,997,998,998,999,999
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
0,-0.061200,-0.061200,-0.011763,-0.011763,0.094227,0.094227,0.005540,0.005540,0.000319,0.000319,...,-0.010020,-0.010020,0.013695,0.013695,-0.003760,-0.003760,-0.003787,-0.003787,0.006454,0.006454
1,0.032036,0.032036,-0.042084,-0.042084,0.072747,0.072747,0.005865,0.005865,0.003959,0.003959,...,-0.007732,-0.007732,0.006574,0.006574,0.007644,0.007644,0.007729,0.007729,-0.010231,-0.010231
2,0.005751,0.005751,0.074967,0.074967,0.015623,0.015623,-0.011421,-0.011421,-0.015053,-0.015053,...,0.010022,0.010022,0.002491,0.002491,0.021103,0.021103,-0.019438,-0.019438,0.007085,0.007085
3,0.132013,0.132013,-0.048426,-0.048426,-0.053173,-0.053173,0.041595,0.041595,0.004710,0.004710,...,0.005073,0.005073,0.006257,0.006257,-0.005478,-0.005478,-0.016335,-0.016335,0.001481,0.001481
4,-0.047700,-0.047700,-0.046430,-0.046430,0.047473,0.047473,0.036165,0.036165,-0.017594,-0.017594,...,-0.021378,-0.021378,-0.002560,-0.002560,-0.020363,-0.020363,0.006564,0.006564,0.001492,0.001492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2208,-0.022632,-0.022632,-0.093210,-0.093210,0.155161,0.155161,-0.007396,-0.007396,0.005517,0.005517,...,0.004287,0.004287,0.002754,0.002754,0.002424,0.002424,0.001547,0.001547,-0.004010,-0.004010
2209,-0.053661,-0.053661,-0.033823,-0.033823,-0.021585,-0.021585,0.039583,0.039583,-0.015704,-0.015704,...,-0.004854,-0.004854,-0.005991,-0.005991,0.012206,0.012206,0.023468,0.023468,0.000368,0.000368
2210,,,-0.019663,-0.019663,0.045071,0.045071,0.001040,0.001040,-0.018900,-0.018900,...,0.007928,0.007928,0.016473,0.016473,-0.017683,-0.017683,-0.000785,-0.000785,-0.001125,-0.001125
2211,-0.083596,-0.083596,-0.004946,-0.004946,0.061202,0.061202,0.031859,0.031859,-0.008947,-0.008947,...,0.008434,0.008434,-0.006127,-0.006127,-0.026915,-0.026915,-0.009631,-0.009631,0.022162,0.022162


(2213, 2235)