In [22]:
from __future__ import absolute_import, division, print_function
import pathlib

import pandas as pd
import seaborn as sns
import numpy as np
import os
from sklearn.linear_model import ElasticNetCV
from sklearn import decomposition, datasets
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RepeatedKFold
from numpy import arange
from sklearn.preprocessing import StandardScaler
from numpy.random import normal
from numpy import hstack
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [23]:
#need to download these files everytime
gc_burden_path = "gene_clinical_burden.csv" # generate by merge_two_files
gc_data = pd.read_csv(gc_burden_path, dtype = 'object')

hyp_path = "hyperparemter_data.csv" # generate by merge_two_files
hyp_data = pd.read_csv(hyp_path, dtype = 'object')

In [24]:
gc_data=gc_data.apply(pd.to_numeric)


In [25]:
# variables for pan cancer analysis
#will be used in manipulation for specific cancer type analysis as well
mat_data = gc_data.values.astype(float)
x_label = gc_data.columns
n = mat_data.shape[0]
d = mat_data.shape[1]
x = mat_data[:,:-1]

In [26]:
#ecdf
def ecdf(data):
    """ Compute ECDF """
    x = np.sort(data)
    n = x.size
    y = np.arange(1, n+1) / n
    return(y)
count=ecdf(x[:,0])
x[:,0]=count
y = mat_data[:,-1]

In [27]:
#creating variable for specific cancer types
cancer_start = (np.where(x_label=='ACC'))[0][0]
cancer_end = np.where(x_label == 'UVM')[0][0]
cancer_names = x_label[cancer_start : cancer_end+1]
#cancer data is the data of each patient for each type of cancer
#ie does a patient have a certain cancer
cancer_data = x[:,cancer_start:cancer_end+1]

#nx produces the x dtaset for the cancer specific type by getting rid of the cancer classifications
nx = x
nx = np.delete(nx,list(range(cancer_start, cancer_end+1)), axis=1)
nx_label = x_label
nx_label = np.delete(nx_label, list(range(cancer_start, cancer_end+1)))

In [28]:
#hyperamatyer variable needed for the model
Alpha=hyp_data['alpha']
l1= hyp_data['l1_ratio']


In [29]:
#turn values into floats
l1=pd.to_numeric(l1, errors='coerce')
Alpha=pd.to_numeric(Alpha, errors='coerce')


In [30]:
for i in range(len(cancer_names)):
  x_=nx[np.where(cancer_data[:,i]==1)]
  y_=y[np.where(cancer_data[:,i]==1)] 
  print(cancer_names[i])
  print(x_.shape)

ACC
(41, 308)
BLCA
(385, 308)
BRCA
(844, 308)
CESC
(230, 308)
CHOL
(34, 308)
COAD
(268, 308)
ESCA
(159, 308)
GBM
(294, 308)
HNSC
(473, 308)
KICH
(42, 308)
KIRC
(308, 308)
KIRP
(198, 308)
LGG
(495, 308)
LIHC
(329, 308)
LUAD
(423, 308)
LUSC
(356, 308)
MESO
(67, 308)
OV
(372, 308)
PAAD
(151, 308)
PCPG
(83, 308)
PRAD
(97, 308)
READ
(76, 308)
SARC
(165, 308)
SKCM
(436, 308)
STAD
(358, 308)
TGCT
(69, 308)
THCA
(314, 308)
UCEC
(483, 308)
UCS
(56, 308)
UVM
(55, 308)


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 
from scipy import stats
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings("ignore")

mean_square_errors=[]
root_mean_square_errors =[]
mean_abs_errors =[]
r_squared =[]
for i in range(len(cancer_names)):
  x_=nx[np.where(cancer_data[:,i]==1)]
  y_=y[np.where(cancer_data[:,i]==1)] 
  X_train, X_test, y_train, y_test = train_test_split(x_, y_, test_size=0.25)
  enet_regr= ElasticNet( l1_ratio=l1[i],random_state=0, max_iter=10000, alpha=Alpha[i])
  model_=enet_regr.fit(X_train, y_train)
  y_pred = model_.predict(X_test)
  m_val= mean_squared_error(y_test, y_pred)
  mean_square_errors.append(m_val)
  rm_val=mean_squared_error(y_test, y_pred, squared=False)
  root_mean_square_errors.append(rm_val)
  ma_val=mean_absolute_error(y_test, y_pred)
  mean_abs_errors.append(ma_val)
  actual = y_test
  predict =y_pred
  correlation_matrix = np.corrcoef(actual, predict)
  correlation_xy = correlation_matrix[0,1]
  r_squared_val = correlation_xy**2
  r_squared.append(r_squared_val)

In [45]:
df=pd.DataFrame({'Cancer Type':cancer_names,'Mean Square Error': mean_square_errors,'Root Mean Square Error':root_mean_square_errors,'Mean Absolute Error': mean_abs_errors, 'R Squared': r_squared})


In [46]:
df

Unnamed: 0,Cancer Type,Mean Square Error,Root Mean Square Error,Mean Absolute Error,R Squared
0,ACC,0.007584,0.087087,0.060358,0.12405
1,BLCA,0.025095,0.158414,0.126137,0.025184
2,BRCA,0.019105,0.138219,0.107684,0.142171
3,CESC,0.016868,0.129878,0.107197,0.00671
4,CHOL,0.078509,0.280194,0.193103,0.017745
5,COAD,0.011041,0.105076,0.08785,0.235094
6,ESCA,0.009393,0.096917,0.078921,0.050003
7,GBM,0.016321,0.127755,0.10125,0.202543
8,HNSC,0.0145,0.120415,0.095422,0.122488
9,KICH,0.014211,0.11921,0.091097,0.050278


In [47]:
df.sort_values(by=['R Squared'])

Unnamed: 0,Cancer Type,Mean Square Error,Root Mean Square Error,Mean Absolute Error,R Squared
14,LUAD,0.016384,0.128001,0.107576,0.000592
17,OV,0.008927,0.094484,0.070755,0.003951
18,PAAD,0.02693,0.164105,0.137291,0.004428
10,KIRC,0.01375,0.117261,0.093599,0.004938
3,CESC,0.016868,0.129878,0.107197,0.00671
27,UCEC,0.012139,0.110178,0.08066,0.006898
21,READ,0.008364,0.091454,0.076997,0.008463
23,SKCM,0.036464,0.190954,0.164222,0.012384
4,CHOL,0.078509,0.280194,0.193103,0.017745
29,UVM,0.002848,0.053366,0.044037,0.020587


In [None]:
from google.colab import drive
drive.mount('drive')

In [None]:
df.to_csv('drive/MyDrive/Mount Sinai/metrics_data.csv')