In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import balanced_accuracy_score

In [3]:
# Folder holding the cleaned CAMDA neuroblastoma arrays and tables.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
DATA_DIR = "/home/vero/projects/camda/NB/data_cleaned/"

# Arrays for the three data sources (RNA-Seq, aCGH, clinical).
rna_seq = np.load(DATA_DIR + "dataset_rnaseq_genes_145subjects_17072genes.npy")
acgh = np.load(DATA_DIR + "dataset_1mb_aCGH.npy")
clinical = np.load(DATA_DIR + "dataset_clinical_data_for_fit.npy")

# Stage labels, flattened to a 1-D array of strings.
stages = np.load(DATA_DIR + "dataset_stages.npy").flatten().astype(str)

# Outputs array; a later cell keeps only column 1 of it as the target.
y = np.load(DATA_DIR + "dataset_outputs.npy")
mergeable = pd.read_csv(DATA_DIR + "clinical_data_for_fit.csv", index_col=0)
clinical_HR = pd.read_csv(DATA_DIR + "info_patients_clinical.csv",
                          index_col=0)['high_risk2']

# The original called clinical_HR.sort_index() here and discarded the result,
# so it had no effect; the .loc lookup below already returns the rows in the
# order of mergeable.index.
HR = clinical_HR.loc[mergeable.index].values
# Recode the 'HR' label as '1' so the column can be cast to integers.
# NOTE(review): assumes every other value is already castable to int - confirm.
HR[HR == 'HR'] = '1'
HR = HR.astype(int)

# Benchmark on single datasets

## Prediction with RNA-Seq

In [8]:
# Keep only column 1 of the outputs array as the classification target.
# Guarded so that re-running this cell on an already 1-D `y` is a no-op
# instead of raising an IndexError.
if y.ndim > 1:
    y = y[:, 1]

In [36]:
# Column layout shared by every per-model summary row.
columns = [
    'mean_score_tr',
    'std_score_tr',
    'mean_score_ts',
    'std_score_ts',
    'parameters',
    'scores_tr',
    'scores_ts',
    'scores_CV',
]

# Empty table; each model section adds one row to it.
df_results = pd.DataFrame(columns=columns)

### results with logistic regression

In [23]:
from sklearn.linear_model import LogisticRegressionCV


# Repeated stratified hold-out evaluation of logistic regression on RNA-Seq.
# NOTE(review): no random_state is set, so the splits differ between runs.
tr_scores = []    # balanced accuracy on each training split
ts_scores = []    # balanced accuracy on each held-out split
parameters = []   # C value(s) chosen by the internal cross-validation
val_scores = []   # cross-validation scores reported by the estimator
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    # The original created an unused `sss_ = StratifiedKFold(n_splits=3)` here.
    # StratifiedKFold is not imported in the visible cells, so that line raised
    # NameError on a fresh kernel; it has been removed.
    x_tr = rna_seq[train, :]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    # LogisticRegressionCV selects C from the grid with its own internal CV.
    lr = LogisticRegressionCV(Cs=np.logspace(-10, 10, 50))
    lr.fit(x_tr, y_tr)
    parameters.append(lr.C_)
    val_scores.append(lr.scores_)
    tr_scores.append(balanced_accuracy_score(y_tr, lr.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, lr.predict(x_ts)))

In [24]:
# Mean and standard deviation of the per-split balanced accuracies
# (training first, then test), followed by the selected parameters.
means_tr, std_tr = np.mean(tr_scores), np.std(tr_scores)
means_ts, std_ts = np.mean(ts_scores), np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts, parameters)

0.9980882352941177 0.007961575108023044 0.8320454545454546 0.108575724803148 [array([0.01456348]), array([0.002223]), array([0.002223]), array([0.002223]), array([0.01456348]), array([0.00568987]), array([0.002223]), array([0.00086851]), array([68.6648845]), array([0.00033932]), array([0.00086851]), array([0.00568987]), array([0.03727594]), array([0.00033932]), array([0.00568987]), array([0.01456348]), array([0.01456348]), array([0.00013257]), array([0.00033932]), array([7543.12006335]), array([68.6648845]), array([0.03727594]), array([0.00086851]), array([0.002223]), array([0.09540955]), array([0.002223]), array([0.00033932]), array([175.75106249]), array([0.00086851]), array([0.00013257]), array([0.00086851]), array([2947.05170255]), array([175.75106249]), array([68.6648845]), array([0.00086851]), array([68.6648845]), array([0.01456348]), array([10.48113134]), array([0.00086851]), array([0.00086851]), array([0.00086851]), array([1151.39539933]), array([68.6648845]), array([0.00086851

In [37]:
# One summary row for logistic regression: aggregate statistics plus the raw
# per-split values.
res = pd.DataFrame(
    [[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
    columns=columns, index=['Logisticregression'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Re-running this cell still adds another row with the same label.
df_results = pd.concat([df_results, res])

### results with Ridge

In [40]:
from sklearn.linear_model import RidgeClassifierCV


# Repeated stratified hold-out evaluation of a ridge classifier on RNA-Seq.
# NOTE(review): no random_state is set, so the splits differ between runs.
tr_scores = []    # balanced accuracy on each training split
ts_scores = []    # balanced accuracy on each held-out split
parameters = []   # alpha chosen by the internal cross-validation
val_scores = []   # stored cross-validation values of the estimator
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    # The original created an unused `sss_ = StratifiedKFold(n_splits=3)` here.
    # StratifiedKFold is not imported in the visible cells, so that line raised
    # NameError on a fresh kernel; it has been removed.
    x_tr = rna_seq[train, :]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    # NOTE(review): newer scikit-learn releases rename store_cv_values /
    # cv_values_ - confirm against the installed version.
    model = RidgeClassifierCV(alphas=np.logspace(-10, 10, 50), store_cv_values=True)
    model.fit(x_tr, y_tr)
    parameters.append(model.alpha_)
    val_scores.append(model.cv_values_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [41]:
# Mean and standard deviation of the per-split balanced accuracies
# (training first, then test).
means_tr, std_tr = np.mean(tr_scores), np.std(tr_scores)
means_ts, std_ts = np.mean(ts_scores), np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

1.0 0.0 0.8345454545454547 0.10558181974732689


In [42]:
# One summary row for the ridge classifier: aggregate statistics plus the raw
# per-split values.
res = pd.DataFrame(
    [[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
    columns=columns, index=['RidgeClassifier'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Re-running this cell still adds another row with the same label.
df_results = pd.concat([df_results, res])

### results with SVM

In [60]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


# Repeated stratified hold-out evaluation of a linear SVM on RNA-Seq, with C
# tuned by a 3-fold grid search inside each training split.
# NOTE(review): no random_state is set, so the splits differ between runs.
tr_scores = []    # balanced accuracy on each training split
ts_scores = []    # balanced accuracy on each held-out split
parameters = []   # best C found by the grid search
val_scores = []   # full cv_results_ of each grid search
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    # The original created an unused `sss_ = StratifiedKFold(n_splits=3)` here.
    # StratifiedKFold is not imported in the visible cells, so that line raised
    # NameError on a fresh kernel; it has been removed.
    x_tr = rna_seq[train, :]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = LinearSVC()
    gscv = GridSearchCV(model, {'C': np.logspace(-10, 10, 50)}, cv=3)
    gscv.fit(x_tr, y_tr)
    # Score with the estimator configured with the best parameters.
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['C'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))
    

In [61]:
# Mean and standard deviation of the per-split balanced accuracies
# (training first, then test).
means_tr, std_tr = np.mean(tr_scores), np.std(tr_scores)
means_ts, std_ts = np.mean(ts_scores), np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9946537990196078 0.011555353563578683 0.8415909090909091 0.11065194668773398


In [62]:
# One summary row for the linear SVM: aggregate statistics plus the raw
# per-split values.
# NOTE(review): this cell binds the row to `es`, while the sibling cells use
# `res`; the name is kept so module-level names do not change.
es = pd.DataFrame(
    [[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
    columns=columns, index=['SVM-l2'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Re-running this cell still adds another row with the same label.
df_results = pd.concat([df_results, es])

In [64]:
# Show the accumulated results table.
# NOTE(review): the saved output lists some model labels more than once,
# i.e. the append cells were executed repeatedly in this session.
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
SVM-l2,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
SVM-l2,0.994654,0.011555,0.841591,0.110652,"[5.1794746792312125e-05, 7.906043210907702e-06...","[1.0, 0.9411764705882353, 0.9852941176470589, ...","[0.6931818181818181, 0.8295454545454546, 0.909...","[{'mean_fit_time': [0.03692324956258138, 0.031..."


### results with Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Repeated stratified hold-out evaluation of a random forest on RNA-Seq, with
# n_estimators tuned by a 3-fold grid search inside each training split.
# NOTE(review): no random_state is set on the splitter or the forest, so
# results differ between runs.
tr_scores = []    # balanced accuracy on each training split
ts_scores = []    # balanced accuracy on each held-out split
parameters = []   # best n_estimators found by the grid search
val_scores = []   # full cv_results_ of each grid search
sss = StratifiedShuffleSplit(n_splits=100)
for train, test in sss.split(rna_seq, y):
    # The original created an unused `sss_ = StratifiedKFold(n_splits=3)` here.
    # StratifiedKFold is not imported in the visible cells, so that line raised
    # NameError on a fresh kernel; it has been removed.
    x_tr = rna_seq[train, :]
    y_tr = y[train]
    x_ts = rna_seq[test, :]
    y_ts = y[test]
    model = RandomForestClassifier()
    # 50 integer candidates between 1 and 100 trees.
    gscv = GridSearchCV(model, {'n_estimators': np.linspace(1, 100, 50).astype(int)}, cv=3)
    gscv.fit(x_tr, y_tr)
    # Score with the estimator configured with the best parameters.
    model = gscv.best_estimator_
    parameters.append(gscv.best_params_['n_estimators'])
    val_scores.append(gscv.cv_results_)
    tr_scores.append(balanced_accuracy_score(y_tr, model.predict(x_tr)))
    ts_scores.append(balanced_accuracy_score(y_ts, model.predict(x_ts)))

In [54]:
# Mean and standard deviation of the per-split balanced accuracies
# (training first, then test).
means_tr, std_tr = np.mean(tr_scores), np.std(tr_scores)
means_ts, std_ts = np.mean(ts_scores), np.std(ts_scores)
print(means_tr, std_tr, means_ts, std_ts)

0.9977573529411764 0.009187487004596567 0.7534090909090909 0.10134044781643882


In [58]:
# One summary row for the random forest: aggregate statistics plus the raw
# per-split values.
res = pd.DataFrame(
    [[means_tr, std_tr, means_ts, std_ts, parameters, tr_scores, ts_scores, val_scores]],
    columns=columns, index=['RF'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Re-running this cell still adds another row with the same label.
df_results = pd.concat([df_results, res])

In [59]:
# Show the results table after the Random Forest row was added.
df_results

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
SVM-l2,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."


In [65]:
# Keep one row per model label. Repeated runs of the append cells leave
# duplicate labels in df_results; keeping the most recent row per label
# replaces the original positional pick iloc[[0,1,4,5]], which only matched
# one particular execution history and raises IndexError on a clean run.
df_results_new = df_results[~df_results.index.duplicated(keep='last')]

In [66]:
# Show the selected subset of result rows.
df_results_new

Unnamed: 0,mean_score_tr,std_score_tr,mean_score_ts,std_score_ts,parameters,scores_tr,scores_ts,scores_CV
Logisticregression,0.998088,0.007962,0.832045,0.108576,"[[0.014563484775012445], [0.002222996482526195...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.7840909090909092, 0.5795454545454546, 0.875...",[{1: [[0.72727273 0.72727273 0.72727273 0.7272...
RidgeClassifier,1.0,0.0,0.834545,0.105582,"[19306.977288832535, 7543.120063354608, 19306....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.75, 0.8295454545454546, 0.9545454545454546,...",[[[[0.0467198 0.0467198 0.0467198 0.0467198...
RF,0.997757,0.009187,0.753409,0.10134,"[77, 59, 31, 27, 87, 83, 41, 9, 9, 21, 25, 47,...","[1.0, 1.0, 1.0, 0.9852941176470589, 1.0, 1.0, ...","[0.8295454545454546, 0.875, 0.625, 0.784090909...","[{'mean_fit_time': [0.01038829485575358, 0.013..."
SVM-l2,0.994654,0.011555,0.841591,0.110652,"[5.1794746792312125e-05, 7.906043210907702e-06...","[1.0, 0.9411764705882353, 0.9852941176470589, ...","[0.6931818181818181, 0.8295454545454546, 0.909...","[{'mean_fit_time': [0.03692324956258138, 0.031..."


In [74]:
# Scratch: five evenly spaced values from 1 to 100, truncated to integers.
a = np.linspace(start=1, stop=100, num=5).astype(int)

In [77]:
# Scratch: a list holding four references to the same array `a`.
b = [a for _ in range(4)]

In [78]:
# Inspect b.
b

[array([  1,  25,  50,  75, 100]),
 array([  1,  25,  50,  75, 100]),
 array([  1,  25,  50,  75, 100]),
 array([  1,  25,  50,  75, 100])]

In [79]:
from sklearn.utils import check_random_state
# Scratch: random state object derived from the fixed seed 1.
rs = check_random_state(seed=1)


In [98]:
# Scratch: ten all-zero blocks, each of shape (1, 15). The original iterated
# over the ten values of np.linspace(10,40,10) without using them.
X = [np.zeros((15, 1)).transpose() for _ in range(10)]

In [87]:
# Scratch: one zero vector per block of X, with one entry per column of it.
w_i = [np.zeros(X_i.shape[1]) for X_i in X]

In [88]:
# Inspect w_i.
# NOTE(review): the saved output shows vectors of increasing length, which
# does not match the X defined in the In [98] cell (every block there has 15
# columns); this cell was presumably executed against an earlier X.
w_i

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([0., 0., 0., 0., 0.

In [99]:
# Stack the blocks into one array, take row 0 of each block, and show the shape.
np.asarray(X)[:, 0, :].shape

(10, 15)

In [106]:
# Scratch: all-zero column with 10 rows.
# NOTE(review): this rebinds `y`, the name the benchmark cells use for the
# label vector; re-running those cells afterwards would use this array.
y = np.zeros(shape=(10, 1))

In [102]:
from sklearn.utils import check_consistent_length

In [107]:
# Check that X and y have the same number of samples (first dimension);
# scikit-learn raises ValueError when they differ and returns nothing otherwise.
check_consistent_length(X, y)