In [None]:
import pandas as pd
import numpy as np
# StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# CoxPHFitter
from sklearn.model_selection import train_test_split
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
# PCA
from sklearn.decomposition import PCA
from utils import SAkfold

In [None]:
# Load the biomarker table and print its column names for a quick look.
df_bio = pd.read_csv(
    'data/sustain_results_biomarkers_rmstage0_zscored.csv',
)
print(df_bio.columns)

In [None]:
# Column names of the biomarkers used as model features.
biomarkers = [
    'HbA1c', 'BMI', 'HDLc', 'TyG', 'HLS_Retic', 'Glucose',
    'RET', 'IRF', 'ApoA', 'ALT', 'TG', 'Urate',
    'WBC', 'CHOL', 'CRP', 'LYM', 'LDLc', 'Vitamin_D',
]

# Outcome names passed to SAkfold; result rows follow this order.
diseases = [
    'Alzheimers_disease', 'Anorexia_nervosa', 'Anxiety_disorder',
    'Bipolar_disorder', 'Depression_disorder', 'Epilepsy',
    'Multiple_sclerosis', 'Obsessive_compulsive_disorder', 'Parkinsons_disease',
    'Schizophrenia', 'Sleep_disorder', 'Stroke',
]

In [None]:
# Binary subtype indicators: 1 where the row carries the given subtype label
# and t2dm == 1, otherwise 0. The counts of each flag are printed as a check.
t2dm_mask = df_bio['t2dm'] == 1
for flag_col, subtype_label in (('Subtype1', 'Subtype 1'), ('Subtype2', 'Subtype 2')):
    df_bio[flag_col] = np.where((df_bio['Subtype'] == subtype_label) & t2dm_mask, 1, 0)
for flag_col in ('Subtype1', 'Subtype2'):
    print(df_bio[flag_col].value_counts())

In [None]:
# Prediction model 1 (baseline): the biomarkers are the only features.
# NOTE(review): kf=10 is presumably the number of cross-validation folds;
# SAkfold lives in utils, confirm there.
feats = biomarkers
res_baseline = SAkfold(
    diseases=diseases,
    df_bio=df_bio,
    kf=10,
    feats=feats,
)

In [None]:
# Row-wise (one row per disease) mean and standard deviation of the baseline c-index.
mean_c_index = res_baseline.mean(axis=1)
std_c_index = res_baseline.std(axis=1)
# Summary table, written without the index column.
df_res = pd.DataFrame({
    'disease': diseases,
    'mean_c_index': mean_c_index,
    'std_c_index': std_c_index,
})
df_res.to_csv('results/prediction/baseline_c_index.csv', index=False)

In [None]:
# Prediction model 2: biomarkers plus the Subtype1 indicator as features.
feats = [*biomarkers, 'Subtype1']
res_s1 = SAkfold(
    diseases=diseases,
    df_bio=df_bio,
    kf=10,
    feats=feats,
)

In [None]:
# Row-wise (one row per disease) mean and standard deviation of the Subtype 1 model's c-index.
mean_c_index = res_s1.mean(axis=1)
std_c_index = res_s1.std(axis=1)
# Summary table, written without the index column.
df_res_s1 = pd.DataFrame({
    'disease': diseases,
    'mean_c_index': mean_c_index,
    'std_c_index': std_c_index,
})
df_res_s1.to_csv('results/prediction/subtype1_c_index.csv', index=False)

In [None]:
# Prediction model 3: biomarkers plus the Subtype2 indicator as features.
feats = [*biomarkers, 'Subtype2']
res_s2 = SAkfold(
    diseases=diseases,
    df_bio=df_bio,
    kf=10,
    feats=feats,
)

In [None]:
# Row-wise (one row per disease) mean and standard deviation of the Subtype 2 model's c-index.
mean_c_index = res_s2.mean(axis=1)
std_c_index = res_s2.std(axis=1)
# Summary table, written without the index column.
df_res_s2 = pd.DataFrame({
    'disease': diseases,
    'mean_c_index': mean_c_index,
    'std_c_index': std_c_index,
})
df_res_s2.to_csv('results/prediction/subtype2_c_index.csv', index=False)

In [None]:
# Label each summary table with its model, stack the three tables,
# and put the 'group' column first.
cols = df_res.columns.tolist()  # column names captured before 'group' is added
for summary, label in (
    (df_res, 'Baseline'),
    (df_res_s1, 'Subtype 1'),
    (df_res_s2, 'Subtype 2'),
):
    summary['group'] = label
stacked = pd.concat([df_res, df_res_s1, df_res_s2])
df_res_all = stacked[['group', 'disease', 'mean_c_index', 'std_c_index']]

In [None]:
# Order rows by disease, then by group, and write the combined summary without the index.
sort_keys = ['disease', 'group']
df_res_all = df_res_all.sort_values(by=sort_keys)
df_res_all.to_csv(
    'results/prediction/all_c_index.csv',
    index=False,
)

In [None]:
# Per-disease p-values comparing the per-fold c-indices of the three models:
# Subtype 1 vs. baseline, Subtype 2 vs. baseline, and Subtype 1 vs. Subtype 2.
# ttest_ind is called with its default arguments (independent two-sample t-test).
# NOTE(review): if SAkfold uses the same fold splits for all three runs, a paired
# test (scipy.stats.ttest_rel) may be more appropriate; confirm against utils.SAkfold.
from scipy.stats import ttest_ind
pvalues_s1, pvalues_s2, pvalues_s12 = [], [], []
# enumerate gives the row position directly; the previous diseases.index(disease)
# lookups rescanned the list three times per iteration and would return the first
# match if a disease name were ever duplicated.
for row, disease in enumerate(diseases):
    c_index_control = res_baseline[row]
    c_index_s1 = res_s1[row]
    c_index_s2 = res_s2[row]
    # Read .pvalue instead of unpacking into `_`, which would shadow IPython's
    # last-output variable in the notebook namespace.
    pvalue_s1c = ttest_ind(c_index_s1, c_index_control).pvalue
    pvalue_s2c = ttest_ind(c_index_s2, c_index_control).pvalue
    pvalue_s12 = ttest_ind(c_index_s1, c_index_s2).pvalue
    pvalues_s1.append(pvalue_s1c)
    pvalues_s2.append(pvalue_s2c)
    pvalues_s12.append(pvalue_s12)

In [None]:
# Collect the per-disease p-values of the three comparisons into one table.
df_pvalues = pd.DataFrame({
    'disease': diseases,
    'pvalue_s1': pvalues_s1,
    'pvalue_s2': pvalues_s2,
    'pvalue_s12': pvalues_s12,
})

In [None]:
# Write the p-value table without the index column.
df_pvalues.to_csv(
    'results/prediction/pvalues_cindex.csv',
    index=False,
)

In [None]:
# Save the c-index of every fold in long format: one row per (disease, fold, model).
# The fold count is read from the result array so this cell stays in step with the
# `kf` passed to SAkfold; a hard-coded 10 here would raise a length-mismatch error
# as soon as `kf` is changed.
n_folds = res_baseline.shape[1]
# flatten() walks the array row by row, so each disease name is repeated n_folds
# times consecutively to line up with the flattened values.
diseases_rep = [disease for disease in diseases for _ in range(n_folds)]
df_res_baseline_af = pd.DataFrame({'disease': diseases_rep, 'c_index': res_baseline.flatten(), 'group': 'Baseline'})
df_res_s1_af = pd.DataFrame({'disease': diseases_rep, 'c_index': res_s1.flatten(), 'group': 'Subtype 1'})
df_res_s2_af = pd.DataFrame({'disease': diseases_rep, 'c_index': res_s2.flatten(), 'group': 'Subtype 2'})
# Stack the three long tables row-wise.
df_res_all_af = pd.concat([df_res_baseline_af, df_res_s1_af, df_res_s2_af], axis=0)
# Write without the index column.
df_res_all_af.to_csv('results/prediction/all_c_index_af.csv', index=False)

In [None]:
# Read the hypertension survival table and create an unfitted Cox proportional-hazards model.
df_sa = pd.read_csv(
    'data/subtype1/survival_data_hypertension.csv',
)
cph = CoxPHFitter()

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
df_train, df_test = train_test_split(
    df_sa,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the Cox model on the 'Subtype' covariate only.
# CoxPHFitter.fit treats every column other than duration_col/event_col as a
# covariate, while the prediction cell supplies only df_test[['Subtype']]. Narrowing
# the training frame to that covariate plus the duration and event columns keeps
# fit and predict consistent; any extra column in the CSV (the earlier commented-out
# drop of 'Eid' and 'Stage' suggests some may exist) would otherwise be fitted and
# then be missing at prediction time.
cph.fit(
    df_train[['Subtype', 'time', 'hypertension']],
    duration_col='time',
    event_col='hypertension',
)

In [None]:
# Predicted survival function for each test row, evaluated at t = 1, 2, ..., 15 (years).
yearly_times = list(range(1, 16))
survival = cph.predict_survival_function(
    df_test[['Subtype']],
    times=yearly_times,
)