In [8]:
import numpy as np
import pandas as pd
#import torch
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

In [4]:
rna_seq = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_rnaseq_genes_145subjects_17072genes.npy")
acgh = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_1mb_aCGH.npy")
clinical = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_clinical_data_for_fit.npy")

stages = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_stages.npy")
stages = np.array(stages).flatten().astype(str)
#stages

y = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_outputs.npy")
mergeable = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/clinical_data_for_fit.csv", index_col=0)
clinical_HR = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/info_patients_clinical.csv", 
                          index_col=0)['high_risk2']
clinical_HR.sort_index()
HR = clinical_HR.loc[mergeable.index].values
HR[np.where(HR=='HR')] = '1'
HR = HR.astype(int)

# benchmark single dataset

## Prediction with RNA-Seq

In [5]:
y = y[:,1]

In [6]:
columns = ['mean_score_tr', 'std_score_tr', 'mean_score_ts', 'std_score_ts', 
                                   'parameters', 'scores_tr', 'scores_ts', 'scores_CV']
df_results = pd.DataFrame(columns=columns)

### results with logistic regression

In [9]:
from sklearn.linear_model import LogisticRegressionCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = rna_seq[train,:]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    lr = LogisticRegressionCV(Cs=np.logspace(-10,10,50))
    lr.fit(x_tr, y_tr)
    parameters.append(lr.C_)
    val_scores.append(lr.scores_)
    tr_scores.append(balanced_accuracy_score(y_tr, lr.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, lr.predict(x_ts)))

In [10]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts, parameters)

0.9985294117647058 0.006739081904346827 0.8618181818181818 0.11094215230030825 [array([0.00086851]), array([0.00086851]), array([7543.12006335]), array([0.00086851]), array([0.00033932]), array([26.82695795]), array([0.00013257]), array([0.002223]), array([0.00568987]), array([0.00033932]), array([0.002223]), array([0.00013257]), array([0.00033932]), array([0.00033932]), array([0.09540955]), array([0.002223]), array([175.75106249]), array([0.00033932]), array([0.002223]), array([1151.39539933]), array([0.00033932]), array([449.8432669]), array([4.09491506]), array([0.00033932]), array([0.00568987]), array([0.03727594]), array([1151.39539933]), array([0.00033932]), array([0.03727594]), array([0.01456348]), array([0.00568987]), array([0.00086851]), array([0.00086851]), array([1.59985872]), array([0.00033932]), array([0.002223]), array([0.002223]), array([0.002223]), array([0.01456348]), array([0.00086851]), array([0.03727594]), array([0.62505519]), array([0.002223]), array([0.00033932]),

In [11]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['Logisticregression'])
df_results = df_results.append(res)

In [12]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998529,0.006739,0.861818,0.110942,"[[0.000868511373751352], [0.000868511373751352...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941...","[0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...


### results with Ridge

In [13]:
from sklearn.linear_model import RidgeClassifierCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = rna_seq[train,:]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = RidgeClassifierCV(alphas=np.logspace(-10,10,50), store_cv_values=True)
    model.fit(x_tr, y_tr)
    parameters.append(model.alpha_)
    val_scores.append(model.cv_values_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [14]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

1.0 0.0 0.8432954545454545 0.10921270426432665


In [15]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RidgeClassifier'])
df_results = df_results.append(res)

In [16]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998529,0.006739,0.861818,0.110942,"[[0.000868511373751352], [0.000868511373751352...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941...","[0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.843295,0.109213,"[19306.977288832535, 19306.977288832535, 7543....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.875, 0.7840909090909092, 0.8295454545454546...",[[[[0.00699393 0.00699393 0.00699393 0.0069939...


### results with SVM

In [17]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = rna_seq[train,:]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = LinearSVC()
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    









In [18]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9926378676470589 0.013803601851135171 0.8463636363636365 0.1016201401916076


In [19]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l2'])
df_results = df_results.append(es)

In [20]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998529,0.006739,0.861818,0.110942,"[[0.000868511373751352], [0.000868511373751352...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941...","[0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.843295,0.109213,"[19306.977288832535, 19306.977288832535, 7543....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.875, 0.7840909090909092, 0.8295454545454546...",[[[[0.00699393 0.00699393 0.00699393 0.0069939...
SVM-l2,0.992638,0.013804,0.846364,0.10162,"[2.0235896477251556e-05, 5.1794746792312125e-0...","[0.9852941176470589, 1.0, 0.9852941176470589, ...","[0.8295454545454546, 0.9545454545454546, 0.875...","[{'mean_fit_time': [0.04174025853474935, 0.033..."


## SVM with l1

In [23]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = rna_seq[train,:]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = LinearSVC(penalty='l1', loss='squared_hinge', dual=False)
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [24]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9995067401960784 0.0037289189326224033 0.7442045454545453 0.12403027781170847


In [25]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l1'])
df_results = df_results.append(es)
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998529,0.006739,0.861818,0.110942,"[[0.000868511373751352], [0.000868511373751352...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941...","[0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.843295,0.109213,"[19306.977288832535, 19306.977288832535, 7543....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.875, 0.7840909090909092, 0.8295454545454546...",[[[[0.00699393 0.00699393 0.00699393 0.0069939...
SVM-l2,0.992638,0.013804,0.846364,0.10162,"[2.0235896477251556e-05, 5.1794746792312125e-0...","[0.9852941176470589, 1.0, 0.9852941176470589, ...","[0.8295454545454546, 0.9545454545454546, 0.875...","[{'mean_fit_time': [0.04174025853474935, 0.033..."
SVM-l1,0.999507,0.003729,0.744205,0.12403,"[19306.977288832535, 7543.120063354608, 449.84...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.8295454545454546, 0.75, 0.5795454545454546,...","[{'mean_fit_time': [0.07293558120727539, 0.057..."


### results with Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = rna_seq[train,:]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = RandomForestClassifier()
    gscv = GridSearchCV(model, {'n_estimators':np.array(np.linspace(1,100,50)).astype(int)}, cv=3)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['n_estimators'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))









In [27]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9955422794117648 0.014362265075859757 0.7382954545454545 0.1233617333376364


In [28]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RF'])
df_results = df_results.append(res)

In [29]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998529,0.006739,0.861818,0.110942,"[[0.000868511373751352], [0.000868511373751352...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941...","[0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.843295,0.109213,"[19306.977288832535, 19306.977288832535, 7543....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.875, 0.7840909090909092, 0.8295454545454546...",[[[[0.00699393 0.00699393 0.00699393 0.0069939...
SVM-l2,0.992638,0.013804,0.846364,0.10162,"[2.0235896477251556e-05, 5.1794746792312125e-0...","[0.9852941176470589, 1.0, 0.9852941176470589, ...","[0.8295454545454546, 0.9545454545454546, 0.875...","[{'mean_fit_time': [0.04174025853474935, 0.033..."
SVM-l1,0.999507,0.003729,0.744205,0.12403,"[19306.977288832535, 7543.120063354608, 449.84...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.8295454545454546, 0.75, 0.5795454545454546,...","[{'mean_fit_time': [0.07293558120727539, 0.057..."
RF,0.995542,0.014362,0.738295,0.123362,"[79, 23, 11, 21, 43, 19, 31, 33, 41, 59, 39, 3...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.5795454545454546, 0.75, 0.75, 0.75, 0...","[{'mean_fit_time': [0.011264642079671225, 0.01..."


In [31]:
df_results.to_latex()

"\\begin{tabular}{lrrrrllll}\n\\toprule\n{} &  mean\\_score\\_tr &  std\\_score\\_tr &  mean\\_score\\_ts &  std\\_score\\_ts &                                         parameters &                                          scores\\_tr &                                          scores\\_ts &                                          scores\\_CV \\\\\n\\midrule\nLogisticregression &       0.998529 &      0.006739 &       0.861818 &      0.110942 &  [[0.000868511373751352], [0.000868511373751352... &  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9705882352941... &  [0.875, 1.0, 0.8295454545454546, 1.0, 0.625, 0... &  [\\{1: [[0.72727273 0.72727273 0.72727273 0.7272... \\\\\nRidgeClassifier    &       1.000000 &      0.000000 &       0.843295 &      0.109213 &  [19306.977288832535, 19306.977288832535, 7543.... &  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ... &  [0.875, 0.7840909090909092, 0.8295454545454546... &  [[[[0.00699393 0.00699393 0.00699393 0.0069939... \\\\\nSVM-l2             &       0.99

In [30]:
df_results.to_pickle("../../../projects/neuroblastoma/data_integration/results/Benchmark_rnaSeq.pkl")

## ELAstic net

In [65]:
df_results_new = df_results.iloc[[0,1,4,5], :]

In [66]:
df_results_new

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
SVM-l2,0.994654,0.011555,0.841591,0.110652,"[5.1794746792312125e-05, 7.906043210907702e-06...","[1.0, 0.9411764705882353, 0.9852941176470589, ...","[0.6931818181818181, 0.8295454545454546, 0.909...","[{'mean_fit_time': [0.03692324956258138, 0.031..."
