In [8]:
import numpy as np
import pandas as pd
#import torch
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

In [9]:
rna_seq = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_rnaseq_genes_145subjects_17072genes.npy")
acgh = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_1mb_aCGH.npy")
clinical = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_clinical_data_for_fit.npy")

stages = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_stages.npy")
stages = np.array(stages).flatten().astype(str)
#stages

y = np.load("/home/vero/projects/camda/NB/data_cleaned/dataset_outputs.npy")
mergeable = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/clinical_data_for_fit.csv", index_col=0)
clinical_HR = pd.read_csv("/home/vero/projects/camda/NB/data_cleaned/info_patients_clinical.csv", 
                          index_col=0)['high_risk2']
clinical_HR.sort_index()
HR = clinical_HR.loc[mergeable.index].values
HR[np.where(HR=='HR')] = '1'
HR = HR.astype(int)

# benchmark single dataset

## Prediction with RNA-Seq

In [10]:
y = y[:,1]
X = acgh
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)



In [4]:
columns = ['mean_score_tr', 'std_score_tr', 'mean_score_ts', 'std_score_ts', 
                                   'parameters', 'scores_tr', 'scores_ts', 'scores_CV']
df_results = pd.DataFrame(columns=columns)

### results with logistic regression

In [11]:
from sklearn.linear_model import LogisticRegressionCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    imp.fit(x_tr)
    x_tr = imp.transform(x_tr)
    x_ts = X[test, :]
    x_ts = imp.transform(x_ts)
    y_ts = y[test]
    lr = LogisticRegressionCV(Cs=np.logspace(-10,10,50))
    lr.fit(x_tr, y_tr)
    parameters.append(lr.C_)
    val_scores.append(lr.scores_)
    tr_scores.append(balanced_accuracy_score(y_tr, lr.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, lr.predict(x_ts)))

In [12]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts, parameters)

0.7811366421568627 0.20848416565839864 0.5320454545454545 0.09215593818169701 [array([0.24420531]), array([1.e-10]), array([1.59985872]), array([1.e-10]), array([1.e-10]), array([0.03727594]), array([0.24420531]), array([0.24420531]), array([7543.12006335]), array([0.62505519]), array([1.e-10]), array([1.e-10]), array([1.59985872]), array([175.75106249]), array([0.09540955]), array([0.62505519]), array([0.09540955]), array([68.6648845]), array([0.09540955]), array([1.59985872]), array([0.03727594]), array([1.e-10]), array([1.e-10]), array([0.01456348]), array([0.62505519]), array([1.e-10]), array([1151.39539933]), array([1.e-10]), array([0.24420531]), array([0.62505519]), array([0.24420531]), array([10.48113134]), array([0.62505519]), array([0.09540955]), array([4.09491506]), array([1.e-10]), array([0.24420531]), array([4.09491506]), array([1.e-10]), array([0.09540955]), array([0.09540955]), array([1151.39539933]), array([175.75106249]), array([0.09540955]), array([0.09540955]), array(

In [13]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['Logisticregression'])
df_results = df_results.append(res)

In [14]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.781137,0.208484,0.532045,0.092156,"[[0.2442053094548655], [1e-10], [1.59985871960...","[0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685...","[0.6590909090909092, 0.5, 0.5340909090909092, ...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...


### results with Ridge

In [15]:
from sklearn.linear_model import RidgeClassifierCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    imp.fit(x_tr)
    x_tr = imp.transform(x_tr)
    x_ts = X[test, :]
    x_ts = imp.transform(x_ts)
    y_ts = y[test]
    model = RidgeClassifierCV(alphas=np.logspace(-10,10,50), store_cv_values=True)
    model.fit(x_tr, y_tr)
    parameters.append(model.alpha_)
    val_scores.append(model.cv_values_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [16]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.6134620098039217 0.07743355373529485 0.5096590909090908 0.055748452708024504


In [17]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RidgeClassifier'])
df_results = df_results.append(res)

In [18]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.781137,0.208484,0.532045,0.092156,"[[0.2442053094548655], [1e-10], [1.59985871960...","[0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685...","[0.6590909090909092, 0.5, 0.5340909090909092, ...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.613462,0.077434,0.509659,0.055748,"[175.75106248547965, 175.75106248547965, 68.66...","[0.5588235294117647, 0.5925245098039216, 0.759...","[0.5, 0.45454545454545453, 0.625, 0.5, 0.625, ...",[[[[0.4587001 0.4587001 0.4587001 0.4587001...


### results with SVM

In [19]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    imp.fit(x_tr)
    x_tr = imp.transform(x_tr)
    x_ts = X[test, :]
    x_ts = imp.transform(x_ts)
    y_ts = y[test]
    model = LinearSVC()
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [20]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.7798100490196078 0.1707749197057689 0.5386363636363636 0.10191857874899393


In [21]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l2'])
df_results = df_results.append(es)

In [22]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.781137,0.208484,0.532045,0.092156,"[[0.2442053094548655], [1e-10], [1.59985871960...","[0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685...","[0.6590909090909092, 0.5, 0.5340909090909092, ...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.613462,0.077434,0.509659,0.055748,"[175.75106248547965, 175.75106248547965, 68.66...","[0.5588235294117647, 0.5925245098039216, 0.759...","[0.5, 0.45454545454545453, 0.625, 0.5, 0.625, ...",[[[[0.4587001 0.4587001 0.4587001 0.4587001...
SVM-l2,0.77981,0.170775,0.538636,0.101919,"[0.014563484775012445, 0.09540954763499963, 0....","[0.8771446078431373, 1.0, 0.9705882352941176, ...","[0.45454545454545453, 0.45454545454545453, 0.7...","[{'mean_fit_time': [0.007082462310791016, 0.00..."


## SVM with l1

In [23]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    imp.fit(x_tr)
    x_tr = imp.transform(x_tr)
    x_ts = X[test, :]
    x_ts = imp.transform(x_ts)
    y_ts = y[test]
    model = LinearSVC(penalty='l1', loss='squared_hinge', dual=False)
    gscv = GridSearchCV(model, {'C':np.logspace(-10,10,50)}, cv=3, iid=True)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [24]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.877735906862745 0.1582870342825176 0.5612499999999999 0.11925750621766414


In [25]:
es = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['SVM-l1'])
df_results = df_results.append(es)
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.781137,0.208484,0.532045,0.092156,"[[0.2442053094548655], [1e-10], [1.59985871960...","[0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685...","[0.6590909090909092, 0.5, 0.5340909090909092, ...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.613462,0.077434,0.509659,0.055748,"[175.75106248547965, 175.75106248547965, 68.66...","[0.5588235294117647, 0.5925245098039216, 0.759...","[0.5, 0.45454545454545453, 0.625, 0.5, 0.625, ...",[[[[0.4587001 0.4587001 0.4587001 0.4587001...
SVM-l2,0.77981,0.170775,0.538636,0.101919,"[0.014563484775012445, 0.09540954763499963, 0....","[0.8771446078431373, 1.0, 0.9705882352941176, ...","[0.45454545454545453, 0.45454545454545453, 0.7...","[{'mean_fit_time': [0.007082462310791016, 0.00..."
SVM-l1,0.877736,0.158287,0.56125,0.119258,"[0.2442053094548655, 1.5998587196060574, 1e-10...","[0.6807598039215685, 1.0, 0.5, 0.5, 0.87714460...","[0.5, 0.5681818181818181, 0.5, 0.5, 0.5, 0.409...","[{'mean_fit_time': [0.02530670166015625, 0.019..."


### results with Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

tr_scores = []
ts_scores = []
parameters = []
val_scores = []
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(X, y):
    sss_ = StratifiedKFold(n_splits=3)
    x_tr = X[train,:]
    y_tr = y[train]
    imp.fit(x_tr)
    x_tr = imp.transform(x_tr)
    x_ts = X[test, :]
    x_ts = imp.transform(x_ts)
    y_ts = y[test]
    model = RandomForestClassifier()
    gscv = GridSearchCV(model, {'n_estimators':np.array(np.linspace(1,100,50)).astype(int)}, cv=3)
    gscv.fit(x_tr, y_tr)
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['n_estimators'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))









In [27]:
means_tr = np.mean(tr_scores)
std_tr = np.std(tr_scores)
means_ts = np.mean(ts_scores)
std_ts = np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9948345588235294 0.0130189243121903 0.6348863636363636 0.12259731123944154


In [28]:
res = pd.DataFrame([[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
                   columns=columns, index=['RF'])
df_results = df_results.append(res)

In [29]:
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.781137,0.208484,0.532045,0.092156,"[[0.2442053094548655], [1e-10], [1.59985871960...","[0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685...","[0.6590909090909092, 0.5, 0.5340909090909092, ...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,0.613462,0.077434,0.509659,0.055748,"[175.75106248547965, 175.75106248547965, 68.66...","[0.5588235294117647, 0.5925245098039216, 0.759...","[0.5, 0.45454545454545453, 0.625, 0.5, 0.625, ...",[[[[0.4587001 0.4587001 0.4587001 0.4587001...
SVM-l2,0.77981,0.170775,0.538636,0.101919,"[0.014563484775012445, 0.09540954763499963, 0....","[0.8771446078431373, 1.0, 0.9705882352941176, ...","[0.45454545454545453, 0.45454545454545453, 0.7...","[{'mean_fit_time': [0.007082462310791016, 0.00..."
SVM-l1,0.877736,0.158287,0.56125,0.119258,"[0.2442053094548655, 1.5998587196060574, 1e-10...","[0.6807598039215685, 1.0, 0.5, 0.5, 0.87714460...","[0.5, 0.5681818181818181, 0.5, 0.5, 0.5, 0.409...","[{'mean_fit_time': [0.02530670166015625, 0.019..."
RF,0.994835,0.013019,0.634886,0.122597,"[19, 83, 69, 25, 77, 25, 47, 5, 29, 45, 9, 95,...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.9852941176470589, ...","[0.75, 0.5, 0.45454545454545453, 0.57954545454...","[{'mean_fit_time': [0.004931131998697917, 0.00..."


In [31]:
df_results.to_latex()

"\\begin{tabular}{lrrrrllll}\n\\toprule\n{} &  mean\\_score\\_tr &  std\\_score\\_tr &  mean\\_score\\_ts &  std\\_score\\_ts &                                         parameters &                                          scores\\_tr &                                          scores\\_ts &                                          scores\\_CV \\\\\n\\midrule\nLogisticregression &       0.781137 &      0.208484 &       0.532045 &      0.092156 &  [[0.2442053094548655], [1e-10], [1.59985871960... &  [0.9212622549019608, 0.5, 1.0, 0.5, 0.5, 0.685... &  [0.6590909090909092, 0.5, 0.5340909090909092, ... &  [\\{1: [[0.72727273 0.72727273 0.72727273 0.7272... \\\\\nRidgeClassifier    &       0.613462 &      0.077434 &       0.509659 &      0.055748 &  [175.75106248547965, 175.75106248547965, 68.66... &  [0.5588235294117647, 0.5925245098039216, 0.759... &  [0.5, 0.45454545454545453, 0.625, 0.5, 0.625, ... &  [[[[0.4587001  0.4587001  0.4587001  0.4587001... \\\\\nSVM-l2             &       0.77

In [30]:
df_results.to_pickle("../../../projects/neuroblastoma/data_integration/results/Benchmark_acgh.pkl")

## ELAstic net

In [65]:
df_results_new = df_results.iloc[[0,1,4,5], :]

In [66]:
df_results_new

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
SVM-l2,0.994654,0.011555,0.841591,0.110652,"[5.1794746792312125e-05, 7.906043210907702e-06...","[1.0, 0.9411764705882353, 0.9852941176470589, ...","[0.6931818181818181, 0.8295454545454546, 0.909...","[{'mean_fit_time': [0.03692324956258138, 0.031..."
