In [1]:
import numpy as np
import pandas as pd
#import torch
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

In [2]:
rna_seq = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_rnaseq_genes_145subjects_17072genes.npy")
acgh = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_1mb_aCGH.npy")
clinical = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_clinical_data_for_fit.npy")

stages = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_stages.npy")
stages = np.array(stages).flatten().astype(str)
#stages

y = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_outputs.npy")
mergeable = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/clinical_data_for_fit.csv", index_col=0)
clinical_HR = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/info_patients_clinical.csv", 
                          index_col=0)['high_risk2']
clinical_HR.sort_index()
HR = clinical_HR.loc[mergeable.index].values
HR[np.where(HR=='HR')] = '1'
HR = HR.astype(int)

# benchmark single dataset

## Prediction with RNA-Seq

In [3]:
y = y[:,1]
X = clinical

In [4]:
X[np.where(np.isnan(X))] =0

In [5]:
columns = ['mean_score_tr', 'std_score_tr', 'mean_score_ts', 'std_score_ts', 
                                   'parameters', 'scores_tr', 'scores_ts', 'scores_CV']
df_results = pd.DataFrame(columns=columns)

### results with logistic regression

In [7]:
from sklearn.linear_model import LogisticRegressionCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    x_ts = X[test, :]
    y_ts = y[test]
    lr = LogisticRegressionCV(Cs=np.logspace(-10,10,50))
    lr.fit(x_tr, y_tr)
    parameters.append(lr.C_)
    val_scores.append(lr.scores_)
    tr_scores.append(balanced_accuracy_score(y_tr, lr.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, lr.predict(x_ts)))

In [8]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts, parameters)

0.725235906862745 0.025570494421303217 0.7114772727272727 0.13074499461051073 [array([3.0888436e-06]), array([0.62505519]), array([5.17947468e-05]), array([0.24420531]), array([3.0888436e-06]), array([0.09540955]), array([0.24420531]), array([3.0888436e-06]), array([0.00013257]), array([0.09540955]), array([0.62505519]), array([2.02358965e-05]), array([0.00013257]), array([4.09491506]), array([1.59985872]), array([1.84206997e-07]), array([2.02358965e-05]), array([0.24420531]), array([7.90604321e-06]), array([3.0888436e-06]), array([3.0888436e-06]), array([0.62505519]), array([3.0888436e-06]), array([3.0888436e-06]), array([0.09540955]), array([7.90604321e-06]), array([1.84206997e-07]), array([3.0888436e-06]), array([5.17947468e-05]), array([1.20679264e-06]), array([4.71486636e-07]), array([0.24420531]), array([0.24420531]), array([0.62505519]), array([1.20679264e-06]), array([0.62505519]), array([0.00033932]), array([3.0888436e-06]), array([4.71486636e-07]), array([3.0888436e-06]), arr

In [9]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['Logisticregression'])
df_results = df_results.append(res)

In [10]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.725236,0.02557,0.711477,0.130745,"[[3.088843596477485e-06], [0.6250551925273976]...","[0.7334558823529411, 0.7377450980392157, 0.762...","[0.7045454545454546, 0.625, 0.5340909090909092...",[{1: [[0.29545455 0.72727273 0.72727273 0.7272...


### results with Ridge

In [13]:
from sklearn.linear_model import RidgeClassifierCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    x_ts = X[test, :]
    y_ts = y[test]
    model = RidgeClassifierCV(alphas=np.logspace(-10,10,50), store_cv_values=True)
    model.fit(x_tr, y_tr)
    parameters.append(model.alpha_)
    val_scores.append(model.cv_values_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [14]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.6732352941176472 0.05007627814142258 0.6496590909090908 0.13962996219642548


In [15]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RidgeClassifier'])
df_results = df_results.append(res)

In [16]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.725236,0.02557,0.711477,0.130745,"[[3.088843596477485e-06], [0.6250551925273976]...","[0.7334558823529411, 0.7377450980392157, 0.762...","[0.7045454545454546, 0.625, 0.5340909090909092...",[{1: [[0.29545455 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.673235,0.050076,0.649659,0.13963,"[0.2442053094548655, 0.6250551925273976, 0.625...","[0.6884191176470589, 0.678921568627451, 0.6789...","[0.625, 0.5, 0.7045454545454546, 0.45454545454...",[[[[1.55986185 1.55986185 1.55986185 1.5598618...


### results with SVM

In [17]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    x_ts = X[test, :]
    y_ts = y[test]
    model = LinearSVC()
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [18]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.5691023284313725 0.11098487884580141 0.5605681818181819 0.1200040353143824


In [19]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l2'])
df_results = df_results.append(es)

In [20]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.725236,0.02557,0.711477,0.130745,"[[3.088843596477485e-06], [0.6250551925273976]...","[0.7334558823529411, 0.7377450980392157, 0.762...","[0.7045454545454546, 0.625, 0.5340909090909092...",[{1: [[0.29545455 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.673235,0.050076,0.649659,0.13963,"[0.2442053094548655, 0.6250551925273976, 0.625...","[0.6884191176470589, 0.678921568627451, 0.6789...","[0.625, 0.5, 0.7045454545454546, 0.45454545454...",[[[[1.55986185 1.55986185 1.55986185 1.5598618...
SVM-l2,0.569102,0.110985,0.560568,0.120004,"[126485.52168552957, 0.09540954763499963, 3556...","[0.5, 0.6072303921568627, 0.5, 0.5, 0.5, 0.5, ...","[0.5, 0.875, 0.5, 0.5, 0.5, 0.5, 0.5, 0.545454...","[{'mean_fit_time': [0.0007049242655436198, 0.0..."


## SVM with l1

In [25]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    x_ts = X[test, :]
    y_ts = y[test]
    model = LinearSVC(penalty='l1', loss='squared_hinge', dual=False)
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [26]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.7249019607843138 0.023963413407557033 0.7160227272727272 0.13098773482106527


In [27]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l1'])
df_results = df_results.append(es)
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.725236,0.02557,0.711477,0.130745,"[[3.088843596477485e-06], [0.6250551925273976]...","[0.7334558823529411, 0.7377450980392157, 0.762...","[0.7045454545454546, 0.625, 0.5340909090909092...",[{1: [[0.29545455 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.673235,0.050076,0.649659,0.13963,"[0.2442053094548655, 0.6250551925273976, 0.625...","[0.6884191176470589, 0.678921568627451, 0.6789...","[0.625, 0.5, 0.7045454545454546, 0.45454545454...",[[[[1.55986185 1.55986185 1.55986185 1.5598618...
SVM-l2,0.569102,0.110985,0.560568,0.120004,"[126485.52168552957, 0.09540954763499963, 3556...","[0.5, 0.6072303921568627, 0.5, 0.5, 0.5, 0.5, ...","[0.5, 0.875, 0.5, 0.5, 0.5, 0.5, 0.5, 0.545454...","[{'mean_fit_time': [0.0007049242655436198, 0.0..."
SVM-l1,0.724902,0.023963,0.716023,0.130988,"[0.09540954763499963, 0.2442053094548655, 0.09...","[0.7282475490196079, 0.6841299019607843, 0.768...","[0.5795454545454546, 0.875, 0.4886363636363636...","[{'mean_fit_time': [0.0008358955383300781, 0.0..."


### results with Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    x_ts = X[test, :]
    y_ts = y[test]
    model = RandomForestClassifier()
    gscv = GridSearchCV(model, {'n_estimators':np.array(np.linspace(1,100,50)).astype(int)}, cv=3)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['n_estimators'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))









In [29]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9379810049019608 0.041129308139457504 0.6819318181818181 0.12414474980827292


In [30]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RF'])
df_results = df_results.append(res)

In [31]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.725236,0.02557,0.711477,0.130745,"[[3.088843596477485e-06], [0.6250551925273976]...","[0.7334558823529411, 0.7377450980392157, 0.762...","[0.7045454545454546, 0.625, 0.5340909090909092...",[{1: [[0.29545455 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.673235,0.050076,0.649659,0.13963,"[0.2442053094548655, 0.6250551925273976, 0.625...","[0.6884191176470589, 0.678921568627451, 0.6789...","[0.625, 0.5, 0.7045454545454546, 0.45454545454...",[[[[1.55986185 1.55986185 1.55986185 1.5598618...
SVM-l2,0.569102,0.110985,0.560568,0.120004,"[126485.52168552957, 0.09540954763499963, 3556...","[0.5, 0.6072303921568627, 0.5, 0.5, 0.5, 0.5, ...","[0.5, 0.875, 0.5, 0.5, 0.5, 0.5, 0.5, 0.545454...","[{'mean_fit_time': [0.0007049242655436198, 0.0..."
SVM-l1,0.724902,0.023963,0.716023,0.130988,"[0.09540954763499963, 0.2442053094548655, 0.09...","[0.7282475490196079, 0.6841299019607843, 0.768...","[0.5795454545454546, 0.875, 0.4886363636363636...","[{'mean_fit_time': [0.0008358955383300781, 0.0..."
RF,0.937981,0.041129,0.681932,0.124145,"[11, 1, 1, 11, 5, 9, 3, 1, 11, 5, 3, 1, 7, 9, ...","[0.960171568627451, 0.8866421568627451, 0.9436...","[0.7840909090909092, 0.45454545454545453, 0.69...","[{'mean_fit_time': [0.001749118169148763, 0.00..."


In [32]:
df_results.to_latex()

"\\begin{tabular}{lrrrrllll}\n\\toprule\n{} &  mean\\_score\\_tr &  std\\_score\\_tr &  mean\\_score\\_ts &  std\\_score\\_ts &                                         parameters &                                          scores\\_tr &                                          scores\\_ts &                                          scores\\_CV \\\\\n\\midrule\nLogisticregression &       0.725236 &      0.025570 &       0.711477 &      0.130745 &  [[3.088843596477485e-06], [0.6250551925273976]... &  [0.7334558823529411, 0.7377450980392157, 0.762... &  [0.7045454545454546, 0.625, 0.5340909090909092... &  [\\{1: [[0.29545455 0.72727273 0.72727273 0.7272... \\\\\nRidgeClassifier    &       0.673235 &      0.050076 &       0.649659 &      0.139630 &  [0.2442053094548655, 0.6250551925273976, 0.625... &  [0.6884191176470589, 0.678921568627451, 0.6789... &  [0.625, 0.5, 0.7045454545454546, 0.45454545454... &  [[[[1.55986185 1.55986185 1.55986185 1.5598618... \\\\\nSVM-l2             &       0.56

In [30]:
df_results.to_pickle("../../../projects/neuroblastoma/data_integration/results/Benchmark_acgh.pkl")

## ELAstic net

In [65]:
df_results_new = df_results.iloc[[0,1,4,5], :]

In [66]:
df_results_new

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
SVM-l2,0.994654,0.011555,0.841591,0.110652,"[5.1794746792312125e-05, 7.906043210907702e-06...","[1.0, 0.9411764705882353, 0.9852941176470589, ...","[0.6931818181818181, 0.8295454545454546, 0.909...","[{'mean_fit_time': [0.03692324956258138, 0.031..."
