In [8]:
"""General methyl data import using MSM package. General dictonary data, beta value methylation array df, survival metadata df"""

import sys
sys.path.insert(0, '/Users/zacksiegfried/Documents/methylspan')
import MSM

all_files = MSM.pullMethylMetaData('pancreas')

# not gaureteed to return same number of cases, (remove file_id from sample data if missing from meta data)
# missing values dropped anyways in next steps
sample_data = MSM.methylDataFormat(all_files, 100)
meta_data = MSM.metaDataFormat(sample_data, all_files)



Number of methylation files for all cases for primary site pancreas: 430
Final number of cases in sample dataset : 90


In [9]:
### FORMATTING FOR SURVIVAL + PCA

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# drops all CpG sites with at least 1 missing value
sample_data.index.name = 'id'
sample_data.dropna(axis=1, inplace=True)

# standardizes dataframe
scaled_df = sample_data.copy()
scaled_df = pd.DataFrame(StandardScaler().fit_transform(scaled_df), index=scaled_df.index, columns=scaled_df.columns)

# converts CpG sites to PCA components
pca = PCA(n_components=14)      # setting number of PCAs to 14
pca.fit(scaled_df)

output = pca.transform(scaled_df)
x = pd.DataFrame(output, index=scaled_df.index)

meta_data.index.name = 'id'

# merges dfs and formats 
full_data = pd.merge(meta_data, x, on = 'id')
full_data.replace(['Dead', 'Alive'],[0, 1], inplace=True)
full_data = full_data[full_data['days_to_death'].notna()]

print(full_data.shape)

(84, 16)
                                      days_to_death  vital_status           0  \
id                                                                              
cc335f46-8c10-49f8-a056-9c38a9a4b45d         1305.0             0  -58.174093   
290b29b7-65aa-4132-88aa-99881d8275aa          980.0             0 -119.038828   
4e7c3fa8-3a45-4e41-850d-160b593db2c2          902.0             0 -183.947086   
70c62c07-ca7e-44c6-8c1e-28b9bfba517e          902.0             0  -98.115197   
132c5f3c-d91c-41ec-84e4-2cf013b91389          860.0             0 -172.665588   
...                                             ...           ...         ...   
96fcf55e-af6f-4dcc-94b4-8709a19a8a5f           12.0             0  135.763640   
60613cce-978a-4f0f-8402-573dc6415296         1502.0             0  -69.907017   
a3871b6b-b5e8-4346-94d0-b0b4067fd45f          532.0             0  149.581465   
1917b8b2-cdbf-40f9-9d33-67a3f2442d72         2036.0             0  126.316215   
f8186e89-50ea-4807-

In [10]:
### SURVIVAL ANALYSIS EXECUTION & TESTING

from lifelines import CoxPHFitter

cph = CoxPHFitter()
cph.fit(full_data, duration_col='days_to_death', event_col='vital_status')

print(cph.confidence_intervals_)

           95% lower-bound  95% upper-bound
covariate                                  
0                -0.007215         0.007155
1                -0.005838         0.003801
2                -0.004677         0.009295
3                -0.008030         0.005117
4                -0.006275         0.004274
5                -0.005814         0.012265
6                -0.014173         0.004003
7                -0.007342         0.015431
8                -0.009551         0.006126
9                -0.026396         0.008842
10               -0.011478         0.009689
11               -0.014758         0.015510
12               -0.003803         0.053177
13               -0.014262         0.002988


  self.params_ = pd.Series(params_, index=pd.Index(X.columns, name="covariate"), name="coef")
