In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, log_loss
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import quantile_transform
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectFdr
import matplotlib.pyplot as plt
import pickle
from sksurv.metrics import concordance_index_censored
import lifelines as ll
import glob
import os
import seaborn as sns
from scipy.stats import ttest_ind
# from lifelines.utils.sklearn_adapter import sklearn_adapter
# CoxRegression = sklearn_adapter(ll.CoxPHFitter, event_col = 'event')
import sys
sys.path.append('/odinn/users/thjodbjorge/Python_functions/')
import Predict_functions as pf
from Calculate_score import calculate_metrics, make_class_table
from R_functions import R_pROC,R_pROC_compareROC,R_pROC_compareROC_boot, R_pROC_AUC, R_timeROC, R_timeROC_CI, R_timeROC_pval, R_NRIbin,R_NRIcens,R_NRIcensipw, R_censROC, R_hoslem, R_Greenwood_Nam
from association_functions import group_assoc, associations

In [None]:
# raw_data = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/raw_with_info.csv',index_col = 'Barcode2d' )

# Probe table, indexed by its 'SeqId' column.
probe_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/probe_info.csv',
    index_col='SeqId',
)

# Per-sample table (event columns, covariates), indexed by its 'Barcode2d' column.
pn_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/pn_info_Mor/pn_info_Mor_event.csv',
    index_col='Barcode2d',
)

# Probes to exclude: the explicit skip list together with the non-protein
# probes that were included.
probes_to_skip = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/probes_to_skip.txt')['probe']
nopro = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/no_protein_probes.txt',
    header=None,
)[0]
probes_to_skip = set(probes_to_skip) | set(nopro)

In [None]:
# Root folder of the analysis and the sub-folder names that are concatenated
# onto it further down when reading predictions and writing results.
folder = '/odinn/users/thjodbjorge/Proteomics/Mortality2/'
feat_folder = 'Features2/'
pred_folder = 'Predictions3/'
plots = 'Plots2/'
# Export switch that is checked before the optional to_csv calls further down.
save_plot = False
# Endpoints handled by the loop below; the commented list shows the
# cause-specific alternatives that the loop also understands.
endpoints = ['death']
# endpoints = ['death','Cdeath','Gdeath','Ideath','Jdeath','Otherdeath']
# event_date = event_date_death
# Time-to-event and the 'no_death_before' column, both taken from pn_info.
time_to_event = pn_info.time_to_death
no_event_before = pn_info.no_death_before
# Build the event indicator for every requested endpoint and print its sum.
# Cause-specific endpoints keep only deaths whose ICD_group equals one letter;
# 'Otherdeath' keeps deaths whose ICD_group is none of C, G, I or J.
# `endpoint` and `use_event` keep the values of the last iteration and are
# reused by later cells.
icd_letter = {'Cdeath': 'C', 'Gdeath': 'G', 'Ideath': 'I', 'Jdeath': 'J'}
for endpoint in endpoints:
    if endpoint == 'death':
        use_event = pn_info.event_death
    elif endpoint in icd_letter:
        use_event = pn_info.event_death & (pn_info.ICD_group == icd_letter[endpoint])
    elif endpoint == 'Otherdeath':
        use_event = pn_info.event_death & ~pn_info.ICD_group.isin(['C', 'G', 'I', 'J'])
    else:
        # Unrecognised endpoint names are ignored: nothing is computed or printed.
        continue
    print(use_event.sum())

# One indicator per horizon 1..18: y[h-1] is True where the event occurred and
# time_to_event <= h (units are those of pn_info.time_to_death).
y = [use_event & (time_to_event <= horizon) for horizon in range(1, 19)]

# 10-fold splitter that keeps the samples in their existing order.
# `random_state` only has an effect together with shuffle=True, and recent
# scikit-learn releases raise a ValueError when it is passed with
# shuffle=False, so it is left out here. The resulting folds are unchanged.
kf = KFold(n_splits=10, shuffle=False)
# Reproducible 70/30 split of all sample ids into the main train and test sets.
I_train_main, I_test_main = train_test_split(pn_info.index, train_size=0.7, random_state = 10)
# I_val_main, I_test_main = train_test_split(I_test_main, train_size=0.5, random_state = 10)


# Load the dictionary of sample selections (one entry per dataset name).
# The context manager closes the file handle once the pickle has been read;
# the original left it open for the lifetime of the kernel.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(folder+"{}_keep_samples.pkl".format('Mor'),'rb') as file:
    keep_samples_dict = pickle.load(file)

# Show which dataset names are available.
print(keep_samples_dict.keys())

### Load Hera phenotypes

In [None]:
# Collect every quantitative HERA phenotype list and outer-merge them into one
# table indexed by the first column ('PN'); each value column is named after
# the file it was read from.
D = glob.glob('/odinn/data/phenotypes/pipeline/current_lists/quantitative//HERA*')
# D = D1+D2+D3+D4
print(len(D))

first_path = D[0]
HERA_qt = pd.read_csv(first_path, delim_whitespace=True, index_col=0,
                      names=['PN', os.path.basename(first_path)])

for path in D[1:]:
    dfnew = pd.read_csv(path, delim_whitespace=True, index_col=0,
                        names=['PN', os.path.basename(path)])
    HERA_qt = HERA_qt.merge(dfnew, how='outer', left_index=True, right_index=True)

In [None]:
# Collect every categorical HERA phenotype list and outer-merge them into one
# table indexed by the first column ('PN'); each value column is named after
# the file it was read from.
D = glob.glob('/odinn/data/phenotypes/pipeline/current_lists/categorical//HERA*')
# D = D1+D2+D3+D4
print(len(D))
HERA_cat = pd.read_csv(D[0], delim_whitespace=True, index_col=0, names=['PN',os.path.basename(D[0])])

for i in range(1,len(D)):
    try:
        dfnew = pd.read_csv(D[i], delim_whitespace=True, index_col=0, names=['PN',os.path.basename(D[i])])
        HERA_cat=pd.merge(HERA_cat,dfnew, how='outer', left_index=True, right_index=True)
    except Exception as err:
        # Still best effort: a list that cannot be read or merged is skipped.
        # Unlike the former bare `except`, this no longer swallows
        # KeyboardInterrupt/SystemExit, and the skipped file is reported
        # instead of disappearing silently.
        print('Skipping {}: {!r}'.format(D[i], err))
        continue

In [None]:
# Display the merged categorical HERA table as a visual check of the load above.
HERA_cat

### Prepare pn_info variables

In [None]:
# Covariate frame on the same index as pn_info.
X = pd.DataFrame(index=pn_info.index)
X['PN'] = pn_info['PN']
# Sex code shifted down by one (presumably 1/2 -> 0/1; TODO confirm coding).
X['sex'] = pn_info['sex'].values - 1
X['age'] = pn_info['Age_at_sample_collection_2'].values

# Polynomial, interaction and log terms of age.
X['age2'] = X['age'].pow(2)
X['age3'] = X['age'].pow(3)
X['agesex'] = X['sex'] * X['age']
X['age2sex'] = X['sex'] * X['age2']
X['lnage'] = np.log(X['age'])

# Column set used as the age/sex covariates throughout the notebook.
agesex = ['age', 'sex', 'agesex', 'age2', 'age2sex']

In [None]:
# Attach the quantitative and then the categorical HERA phenotypes to X,
# matching X['PN'] against the index of each phenotype table.
for hera_table in (HERA_qt, HERA_cat):
    X = X.merge(hera_table, how='left', left_on='PN', right_index=True)

In [None]:
# Names of the sample selections stored in keep_samples_dict.
dataset, HERA_dataset = 'Old_18105', 'HERA_18105'
all_dataset, new_dataset = 'All_18105', 'New_18105'

# Saved protein-model and baseline-model predictions for the current endpoint.
pred_dir = folder + pred_folder
pred_protein = pd.read_csv(
    pred_dir + '{}_{}_protein_prediction_all.csv'.format(endpoint, dataset),
    index_col='Barcode2d',
)
pred_baseline = pd.read_csv(
    pred_dir + '{}_{}_baseline_prediction_all.csv'.format(endpoint, dataset),
    index_col='Barcode2d',
)
# (An earlier version loaded "{}_{}_all_prediction.pkl" into pred_dict instead.)

# Index into `y` and suffix of the 'pred_y{}' prediction columns used below.
k_plot = 4
k = k_plot

plot_folder = '{}_{}/'.format(endpoint, dataset)

# Sample selections.
old_samples = keep_samples_dict[dataset]
hera_samples = keep_samples_dict[HERA_dataset]
all_samples = keep_samples_dict[all_dataset]
new_samples = keep_samples_dict[new_dataset]
keep_samples = old_samples

# Main train/test split restricted to the kept samples.
I_train = I_train_main.intersection(keep_samples)  # .intersection(have_prs)
I_test = I_test_main.intersection(keep_samples)  # .intersection(have_prs)

I_old = old_samples
I_use = hera_samples

# y[k] is the indicator for an event with time_to_event <= k+1.
y_k = y[k]
y_train, y_test, y_use = y_k[I_train], y_k[I_test], y_k[I_use]


In [None]:
# Build the 'pred*' and 'baseline*' columns of X for the samples in I_use.
# NOTE: I_use is whatever an earlier cell left behind (hera_samples when the
# notebook is run top to bottom).
k=k_plot
# pred = pred_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
pred = pred_protein['pred_y{}'.format(k)]

pred = pred.loc[I_use]

# First correction: residual of an OLS fit of the prediction on the age/sex
# terms, scaled to unit standard deviation. This value is overwritten by the
# spline-based correction below.
corr_col = agesex
pred_corr = pred.values - sm.OLS(pred,sm.add_constant(X.loc[I_use,corr_col])).fit().predict(sm.add_constant(X.loc[I_use,corr_col]))
pred_corr = pred_corr/pred_corr.std()

### experiment with correcting with splines
# I tried correcting for age with a smooth component to see if it affected the calculated correlations significantly.
# It seems that the better age correction results in higher absolute correlation to phenotypes in most cases.
# NOTE(review): import in the middle of the notebook; consider moving it to the import cell.
from statsmodels.gam.api import GLMGam, BSplines
agespline = BSplines(X.loc[I_use,['age','sex','agesex']],df=[5,5,5],degree=[3,2,3],include_intercept=True)
# GLMres = GLMGam(pred_corr, None,agespline).fit()
GLMres = GLMGam(pred, None,agespline).fit()
# Second correction (the one that is kept): residual of the spline fit, scaled
# to unit standard deviation.
pred_corr = pred.values- GLMres.predict()
pred_corr = pred_corr/pred_corr.std()

### ------------

# NOTE(review): ages and fitted values are sorted independently of each other,
# so the red line does not pair a sample's age with its own fitted value -
# confirm that this is the intended picture.
plt.plot(X.loc[I_use,'age'].sort_values(),np.sort(GLMres.predict()),'r')
plt.scatter(X.loc[I_use,'age'],pred_corr,alpha=0.2)
plt.title('Prediction after correction')
plt.xlabel('Age')
plt.ylabel('Prediction')
plt.legend(['Smoothly fitted age to corrected values','data points'])

# Store the raw, corrected, standardised and quantile-normalised predictions.
X.loc[I_use,'pred'] = pred.values
X.loc[I_use,'pred_corr'] = pred_corr
X.loc[I_use,'pred_std'] = ((pred-pred.mean())/pred.std()).values
X.loc[I_use,'pred_corr_norm'] = quantile_transform(np.array(pred_corr).reshape(-1,1), n_quantiles=40000, random_state=10, output_distribution = 'normal',copy=True)

# pred_as = pred_dict['{}_y{}_baseline_lr'.format(dataset,k)]
pred_as = pred_baseline['pred_y{}'.format(k)]


# Baseline predictions: only the OLS age/sex correction is applied here.
pred_as = pred_as.loc[I_use]
pred_as_corr = pred_as.values - sm.OLS(pred_as,sm.add_constant(X.loc[I_use,corr_col])).fit().predict(sm.add_constant(X.loc[I_use,corr_col]))
pred_as_corr = pred_as_corr/pred_as_corr.std()

X.loc[I_use,'baseline'] = pred_as.values
X.loc[I_use,'baseline_corr'] = pred_as_corr
X.loc[I_use,'baseline_std'] = ((pred_as-pred_as.mean())/pred_as.std()).values

In [None]:
# Build the 'old_pred*' and 'old_baseline*' columns of X for the samples in I_old.
k = k_plot
corr_col = agesex

# Protein model: raw prediction, OLS residual on the age/sex terms (scaled to
# unit standard deviation) and z-scored prediction.
# pred = pred_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
pred = pred_protein['pred_y{}'.format(k)].loc[I_old]
design_old = sm.add_constant(X.loc[I_old, corr_col])
pred_corr = pred.values - sm.OLS(pred, design_old).fit().predict(design_old)
pred_corr = pred_corr / pred_corr.std()

X.loc[I_old, 'old_pred'] = pred.values
X.loc[I_old, 'old_pred_corr'] = pred_corr
X.loc[I_old, 'old_pred_std'] = ((pred - pred.mean()) / pred.std()).values

# Baseline model: same three quantities.
# pred_as = pred_dict['{}_y{}_baseline_lr'.format(dataset,k)]
pred_as = pred_baseline['pred_y{}'.format(k)].loc[I_old]
design_old = sm.add_constant(X.loc[I_old, corr_col])
pred_as_corr = pred_as.values - sm.OLS(pred_as, design_old).fit().predict(design_old)
pred_as_corr = pred_as_corr / pred_as_corr.std()

X.loc[I_old, 'old_baseline'] = pred_as.values
X.loc[I_old, 'old_baseline_corr'] = pred_as_corr
X.loc[I_old, 'old_baseline_std'] = ((pred_as - pred_as.mean()) / pred_as.std()).values

In [None]:
# Build the 'all_pred*' and 'all_baseline*' columns of X for all_samples.
I_use = all_samples
k = k_plot
corr_col = agesex

# Protein model: raw prediction, OLS residual on the age/sex terms (scaled to
# unit standard deviation) and z-scored prediction.
# pred = pred_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
pred = pred_protein['pred_y{}'.format(k)].loc[I_use]
design_all = sm.add_constant(X.loc[I_use, corr_col])
pred_corr = pred.values - sm.OLS(pred, design_all).fit().predict(design_all)
pred_corr = pred_corr / pred_corr.std()

X.loc[I_use, 'all_pred'] = pred.values
X.loc[I_use, 'all_pred_corr'] = pred_corr
X.loc[I_use, 'all_pred_std'] = ((pred - pred.mean()) / pred.std()).values

# Baseline model: same three quantities.
# pred_as = pred_dict['{}_y{}_baseline_lr'.format(dataset,k)]
pred_as = pred_baseline['pred_y{}'.format(k)].loc[I_use]
design_all = sm.add_constant(X.loc[I_use, corr_col])
pred_as_corr = pred_as.values - sm.OLS(pred_as, design_all).fit().predict(design_all)
pred_as_corr = pred_as_corr / pred_as_corr.std()

X.loc[I_use, 'all_baseline'] = pred_as.values
X.loc[I_use, 'all_baseline_corr'] = pred_as_corr
X.loc[I_use, 'all_baseline_std'] = ((pred_as - pred_as.mean()) / pred_as.std()).values

In [None]:
# Build the 'new_pred*' and 'new_baseline*' columns of X for new_samples.
I_use = new_samples
k = k_plot
corr_col = agesex

# Protein model: raw prediction, OLS residual on the age/sex terms (scaled to
# unit standard deviation) and z-scored prediction.
# pred = pred_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
pred = pred_protein['pred_y{}'.format(k)].loc[I_use]
design_new = sm.add_constant(X.loc[I_use, corr_col])
pred_corr = pred.values - sm.OLS(pred, design_new).fit().predict(design_new)
pred_corr = pred_corr / pred_corr.std()

X.loc[I_use, 'new_pred'] = pred.values
X.loc[I_use, 'new_pred_corr'] = pred_corr
X.loc[I_use, 'new_pred_std'] = ((pred - pred.mean()) / pred.std()).values

# Baseline model: same three quantities.
# pred_as = pred_dict['{}_y{}_baseline_lr'.format(dataset,k)]
pred_as = pred_baseline['pred_y{}'.format(k)].loc[I_use]
design_new = sm.add_constant(X.loc[I_use, corr_col])
pred_as_corr = pred_as.values - sm.OLS(pred_as, design_new).fit().predict(design_new)
pred_as_corr = pred_as_corr / pred_as_corr.std()

X.loc[I_use, 'new_baseline'] = pred_as.values
X.loc[I_use, 'new_baseline_corr'] = pred_as_corr
X.loc[I_use, 'new_baseline_std'] = ((pred_as - pred_as.mean()) / pred_as.std()).values

### HERA quantitative

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
# Reset the working sample set to the HERA samples (the cells above left it at new_samples).
I_use = hera_samples

In [None]:
# Associations between the protein prediction columns and every quantitative
# HERA phenotype, using the age/sex terms as covariates.
# pred_res = (pred_res-pred_res.mean())/pred_res.std()
covariates = agesex
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
# print('Hera')
ass_df = associations(
    X=X,
    col_names=HERA_qt,
    covariates=covariates,
    I_use=I_use,
    pred_type=pred_type,
    check_age=True,
)

In [None]:
# Show the 20 rows with the smallest Pearson p-value (first 20 columns only).
ranked = ass_df.sort_values('pearson_pval')
display(ranked.iloc[:20, :20])
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_HERA_associations_quantitative_y{}.txt'.format(endpoint,dataset,k))
# NOTE(review): this copy is written on every run, regardless of save_plot.
ranked.to_csv(folder+plots+plot_folder+'tmp.csv')

#### Baseline

In [None]:
# Same quantitative HERA associations, now for the baseline prediction columns.
k = k_plot

# pred_res = (pred_res-pred_res.mean())/pred_res.std()
covariates = agesex
pred_type = dict(org='baseline', corr='baseline_corr', std='baseline_std')
# print('Hera')
ass_df = associations(
    X=X,
    col_names=HERA_qt,
    covariates=covariates,
    I_use=I_use,
    pred_type=pred_type,
    check_age=True,
)
ranked = ass_df.sort_values('pearson_pval')
display(ranked.head(20))
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_HERA_associations_quantitative_y{}_Baseline.txt'.format(endpoint,dataset,k))


In [None]:
# List the columns of the most recent association table.
ass_df.columns

### HERA categorical

In [None]:
# Use the HERA samples for the categorical associations below.
I_use = hera_samples

In [None]:
# Associations between the protein prediction columns and every categorical
# HERA phenotype, shown ordered by t-test p-value.
k = k_plot

covariates = agesex
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X,
    col_names=HERA_cat,
    covariates=covariates,
    I_use=I_use,
    pred_type=pred_type,
    categorical=True,
)
display(ass_df.sort_values('ttest_pval'))



In [None]:
# Optionally export the categorical association table, ordered by t-test p-value.
# display(ass_df.sort_values('pearson_pval').iloc[:20,])
if save_plot:
    out_path = folder+plots+plot_folder+'{}_{}_HERA_associations_Categorical_y{}.txt'.format(endpoint,dataset,k)
    ass_df.sort_values('ttest_pval').to_csv(out_path)


In [None]:
# Same categorical HERA associations, now for the baseline prediction columns.
k = k_plot

# pred_res = (pred_res-pred_res.mean())/pred_res.std()
covariates = agesex
pred_type = dict(org='baseline', corr='baseline_corr', std='baseline_std')
ass_df = associations(
    X=X,
    col_names=HERA_cat,
    covariates=covariates,
    I_use=I_use,
    pred_type=pred_type,
    categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_HERA_associations_Categorical_y{}_Baseline.txt'.format(endpoint,dataset,k))


### Other binary phenotypes

In [None]:
# Read the single 'Somatic_SNP_outliers_yes_vs_no' list into a one-column
# table named 'clonalh', indexed by the first column ('PN').
D = ['/odinn/data/phenotypes/pipeline/current_lists/categorical/Somatic_SNP_outliers_yes_vs_no_11092018.txt']
# D = D1+D2+D3+D4
print(len(D))
clonalh = pd.read_csv(
    D[0],
    delim_whitespace=True,
    index_col=0,
    names=['PN', 'clonalh'],
)

In [None]:
# Attach the 'clonalh' column to X by matching X['PN'] against the index of
# clonalh. Left join: rows without a match get NaN in 'clonalh'.
# NOTE(review): re-running this cell merges the column a second time.
X = X.merge(clonalh, how = 'left',left_on='PN', right_index=True)

In [None]:

# Binary phenotype columns derived from pn_info.
# CAD / MI are the negation of the corresponding 'no_..._before' columns.
X['CAD'] = (~pn_info['no_CAD_before']) * 1
X['MI'] = (~pn_info['no_MI_before']) * 1
X['cancer'] = pn_info['cancer_margin'] * 1
X['Smoker'] = pn_info['Smoker'].astype(int).values
X['diabetes'] = pn_info['T2D'].astype(int).values
X['statin'] = pn_info['statin_estimate_unsure'] * 1
X['HTN_treated'] = pn_info['HTN_treated'].astype(int).values

In [None]:
# Binary phenotype columns of X that are tested in the association cells below.
binphen = ['Smoker','diabetes','CAD','MI','statin','clonalh','cancer','HTN_treated']
# Age/sex terms used as covariates in those tests.
covariates = agesex

In [None]:
# Binary phenotypes vs. the protein prediction columns, HERA samples.
I_use = hera_samples
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Categorical_OtherHERA_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Binary phenotypes vs. the protein prediction columns, HERA samples that are
# also part of all_samples.
I_use = hera_samples.intersection(all_samples)
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Categorical_OtherHERAbf2019_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Binary phenotypes vs. the 'new_pred*' columns, new samples that are also
# part of all_samples.
I_use = new_samples.intersection(all_samples)
pred_type = dict(org='new_pred', corr='new_pred_corr', std='new_pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Categorical_OtherNewbf2019_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Binary phenotypes vs. the 'new_pred*' columns, all new samples.
I_use = new_samples
pred_type = dict(org='new_pred', corr='new_pred_corr', std='new_pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Categorical_OtherNew_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Binary phenotypes vs. the baseline prediction columns, HERA samples.
I_use = hera_samples
pred_type = dict(org='baseline', corr='baseline_corr', std='baseline_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=True,
)
display(ass_df.sort_values('ttest_pval'))

In [None]:
# Binary phenotypes vs. the 'old_pred*' columns, samples in I_old.
pred_type = dict(org='old_pred', corr='old_pred_corr', std='old_pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_old, pred_type=pred_type, categorical=True,
)
display(ass_df.sort_values('ttest_pval'))

In [None]:
# Binary phenotypes vs. the 'old_baseline*' columns, samples in I_old.
pred_type = dict(org='old_baseline', corr='old_baseline_corr', std='old_baseline_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=I_old, pred_type=pred_type, categorical=True,
)
display(ass_df.sort_values('ttest_pval'))

In [None]:
# Binary phenotypes vs. the 'all_pred*' columns, all_samples.
pred_type = dict(org='all_pred', corr='all_pred_corr', std='all_pred_std')
ass_df = associations(
    X=X, col_names=binphen, covariates=covariates,
    I_use=all_samples, pred_type=pred_type, categorical=True,
)
ranked = ass_df.sort_values('ttest_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Categorical_OtherAall_y{}.txt'.format(endpoint,dataset,k))

#### Figures

In [None]:
# Font sizes used for all figures below.
VERY_SMALL_SIZE, SMALL_SIZE = 12, 14
MEDIUM_SIZE, BIGGER_SIZE = 16, 16

# (rc group, settings) pairs, applied in order.
for rc_group, rc_settings in (
    ('font', {'size': SMALL_SIZE}),              # default text size
    ('axes', {'titlesize': BIGGER_SIZE}),        # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),        # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),        # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),        # y tick labels
    ('legend', {'fontsize': VERY_SMALL_SIZE}),   # legend
    ('figure', {'titlesize': BIGGER_SIZE}),      # figure title
):
    plt.rc(rc_group, **rc_settings)

In [None]:
# Accumulators filled by the group cells below: the groups to plot and their
# tick labels, kept separately for all_samples and for the HERA samples.
plot_list, name_list = [], []
plot_list_hera, name_list_hera = [], []

In [None]:
# Split on the 'statin' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['statin'].astype(bool), split_name='statin')
plot_list_hera += [gs, gnos]
name_list_hera += ['Statin', 'No Statin']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['statin'].astype(bool), split_name='statin')

# All samples.
# NOTE(review): unlike the other phenotype cells, this result is not added to plot_list.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['statin'].astype(bool), split_name='statin')

In [None]:
# Split on the 'Smoker' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['Smoker'].astype(bool), split_name='Smoking')
plot_list_hera += [gs, gnos]
name_list_hera += ['Smoking', 'No Smoking']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['Smoker'].astype(bool), split_name='Smoking')

# All samples.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['Smoker'].astype(bool), split_name='Smoking')
plot_list += [gs, gnos]
name_list += ['Smoker', 'NoSmoker']

In [None]:
# Split on the 'clonalh' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.
#
# 'clonalh' was attached with a left merge, so it is NaN for samples that are
# not in the phenotype list. NaN.astype(bool) is True, which previously put
# every sample of unknown status into the "clonalh" group. Each sample set is
# therefore restricted to rows with a known clonalh value before splitting.

# HERA samples, protein prediction.
I_plot = X.loc[hera_samples, 'clonalh'].dropna().index
gs, gnos ,gsc,gnosc = group_assoc(X = X.loc[I_plot],pred = X.loc[I_plot,'pred'].values, 
                                  split = X.loc[I_plot,'clonalh'].astype(bool) , split_name = 'clonalh' )
plot_list_hera.extend([gs,gnos])
name_list_hera.extend(['clonalh','No clonalh'])

# Old samples (result is not collected).
I_plot = X.loc[old_samples, 'clonalh'].dropna().index
gs, gnos ,gsc,gnosc = group_assoc(X = X.loc[I_plot],pred = X.loc[I_plot,'old_pred'].values,
                                  split = X.loc[I_plot,'clonalh'].astype(bool) , split_name = 'clonalh' )

# All samples.
I_plot = X.loc[all_samples, 'clonalh'].dropna().index
gs, gnos ,gsc,gnosc = group_assoc(X = X.loc[I_plot],pred = X.loc[I_plot,'all_pred'].values,
                                  split = X.loc[I_plot,'clonalh'].astype(bool) , split_name = 'clonalh' )
plot_list.extend([gs,gnos])
name_list.extend(['clonalh','No clonalh'])

In [None]:
# Split on the 'diabetes' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['diabetes'].astype(bool), split_name='diabetes')
plot_list_hera += [gs, gnos]
name_list_hera += ['Diabetes', 'No Diabetes']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['diabetes'].astype(bool), split_name='diabetes')

# All samples.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['diabetes'].astype(bool), split_name='diabetes')
plot_list += [gs, gnos]
name_list += ['Diabetes', 'No Diabetes']

In [None]:
# Split on the 'CAD' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['CAD'].astype(bool), split_name='CAD')
plot_list_hera += [gs, gnos]
name_list_hera += ['CAD', 'No CAD']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['CAD'].astype(bool), split_name='CAD')

# All samples.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['CAD'].astype(bool), split_name='CAD')
plot_list += [gs, gnos]
name_list += ['CAD', 'No CAD']

In [None]:
# Split on the 'MI' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['MI'].astype(bool), split_name='MI')
plot_list_hera += [gs, gnos]
name_list_hera += ['MI', 'No MI']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['MI'].astype(bool), split_name='MI')

# All samples.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['MI'].astype(bool), split_name='MI')
plot_list += [gs, gnos]
name_list += ['MI', 'No MI']

In [None]:
# Split on the 'cancer' column with the project helper group_assoc, which
# returns four values; only the first two (gs, gnos) are collected for plotting.

# HERA samples, protein prediction.
I_plot = hera_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['pred'].values,
                                   split=subset['cancer'].astype(bool), split_name='cancer')
plot_list_hera += [gs, gnos]
name_list_hera += ['Cancer', 'No Cancer']

# Old samples (result is not collected).
I_plot = old_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['old_pred'].values,
                                   split=subset['cancer'].astype(bool), split_name='cancer')

# All samples.
I_plot = all_samples
subset = X.loc[I_plot]
gs, gnos, gsc, gnosc = group_assoc(X=subset, pred=subset['all_pred'].values,
                                   split=subset['cancer'].astype(bool), split_name='cancer')
plot_list += [gs, gnos]
name_list += ['Cancer', 'No Cancer']

In [None]:
# Point plot of the groups collected in plot_list from position 4 onwards
# (the entries appended after the smoking and clonalh pairs when the cells
# above are run once, in order).
fig = plt.figure(figsize=[6,4])
ax1 = fig.add_subplot(1,1,1)
sns.pointplot(data=plot_list[4:],join=False,ax=ax1)
ax1.set_xticklabels(name_list[4:],rotation = 30)
ax1.set_ylabel('Predicted risk')
ax1.set_title('Age and sex matched yes/no groups')
plt.grid()
# Only write the figure when exports are enabled. The table exports in this
# notebook already honour save_plot; the figure was previously written on
# every run regardless of the flag.
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_correlation_disease_binary_y{}.png'.format(endpoint,dataset,k),bbox_inches="tight")

In [None]:
# Point plot of every group collected for the HERA samples.
fig = plt.figure(figsize=[9,4])
ax1 = fig.add_subplot(1,1,1)
sns.pointplot(data=plot_list_hera,join=False,ax=ax1)
ax1.set_xticklabels(name_list_hera, rotation=45)
ax1.set_ylabel('Predicted risk')
ax1.set_title('Age and sex matched yes/no groups HERA')
plt.grid()
# Only write the figure when exports are enabled. The table exports in this
# notebook already honour save_plot; the figure was previously written on
# every run regardless of the flag.
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_HERA_correlation binary_y{}.png'.format(endpoint,dataset,k),bbox_inches="tight")

In [None]:
# Point plot of the first four groups in plot_list (the smoking and clonalh
# pairs when the cells above are run once, in order).
fig = plt.figure(figsize=[4,4])
ax1 = fig.add_subplot(1,1,1)
sns.pointplot(data=plot_list[:4],join=False,ax=ax1)
ax1.set_xticklabels(name_list[:4],rotation = 30)
ax1.set_ylabel('Predicted risk')
ax1.set_title('Age and sex matched yes/no groups')
plt.grid()
# Only write the figure when exports are enabled. The table exports in this
# notebook already honour save_plot; the figure was previously written on
# every run regardless of the flag.
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_correlation binary_y{}.png'.format(endpoint,dataset,k),bbox_inches="tight")

### Other quantitative

In [None]:
# Quantitative phenotype columns copied from pn_info.
# nonHDL is total cholesterol minus HDL.
X['nonHDL'] = pn_info['TC'].values - pn_info['HDL'].values
for phen_col in ('TC', 'HDL', 'bmi', 'ApoB'):
    X[phen_col] = pn_info[phen_col].values
X['statin_potency'] = pn_info['statin_closest_potency'].values
# NOTE(review): this replaces the 'statin' column that an earlier cell filled
# from pn_info.statin_estimate_unsure, so cells using X['statin'] give
# different results depending on whether they run before or after this one.
X['statin'] = pn_info['statin']

In [None]:
# Quantitative phenotypes to test, and the names of their quantile-normalised
# counterparts (same names with a '_norm' suffix).
qtphen = ['HDL', 'TC', 'nonHDL', 'ApoB', 'bmi', 'nonHDL_corr', 'bmi_corr']
qtphen_norm = [name + '_norm' for name in qtphen]

covariates = agesex

In [None]:
# Use the HERA samples for the corrections and associations below.
I_use = hera_samples

In [None]:
# Residualise nonHDL and bmi for the samples in I_use. Each model is fitted on
# the rows where the phenotype is present and then predicted for every row.
k = k_plot
# pred = pred_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
# pred = pred.loc[I_use]

# nonHDL: age/sex terms plus statin potency.
corr_col = list(agesex) + ['statin_potency']
nonhdl_values = X.loc[I_use, 'nonHDL']
nonhdl_covars = X.loc[I_use, corr_col]
nonhdl_fit = sm.OLS(
    nonhdl_values.dropna(),
    sm.add_constant(nonhdl_covars[~nonhdl_values.isna()]),
).fit()
X.loc[I_use, 'nonHDL_corr'] = nonhdl_values - nonhdl_fit.predict(sm.add_constant(nonhdl_covars))

# bmi: age/sex terms only.
corr_col = list(agesex)
bmi_values = X.loc[I_use, 'bmi']
bmi_covars = X.loc[I_use, corr_col]
bmi_fit = sm.OLS(
    bmi_values.dropna(),
    sm.add_constant(bmi_covars[~bmi_values.isna()]),
).fit()
X.loc[I_use, 'bmi_corr'] = bmi_values - bmi_fit.predict(sm.add_constant(bmi_covars))
# pred_corr = pred_corr/pred_corr.std()
#
# X.loc[I_use,'pred_corr_statin'] = pred_corr


In [None]:
# Quantile-transform every column in qtphen to a normal output distribution
# (rows restricted to I_use).
out = quantile_transform(
    np.array(X.loc[I_use, qtphen]),
    n_quantiles=40000,
    random_state=10,
    output_distribution='normal',
    copy=True,
)
# X.loc[I_use][qtphen_norm]

In [None]:
# Attach the quantile-normalised phenotypes as the '*_norm' columns; rows
# outside I_use get NaN because of the left join on the index.
# NOTE(review): not idempotent - running this cell again merges the same
# column names a second time.
X = X.merge(pd.DataFrame(out,index = I_use,columns = qtphen_norm),how = 'left',left_index=True, right_index = True)
# Display X to inspect the result.
X

In [None]:
# Quantitative phenotypes (raw and normalised) vs. the protein prediction
# columns, samples in I_use.
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
ranked = ass_df.sort_values('pearson_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Cholesterol_HERA_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Quantitative phenotypes (raw and normalised) vs. the baseline prediction
# columns, samples in I_use.
pred_type = dict(org='baseline', corr='baseline_corr', std='baseline_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
display(ass_df.sort_values('pearson_pval'))

In [None]:
# Quantitative phenotypes (raw and normalised) vs. the 'old_pred*' columns,
# old samples.
I_use = old_samples
pred_type = dict(org='old_pred', corr='old_pred_corr', std='old_pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
display(ass_df.sort_values('pearson_pval'))

#### Use Statins

In [None]:
# HERA samples whose 'statin' column equals 1.
statin_rows = X[X['statin'] == 1].index
I_use = hera_samples.intersection(statin_rows)
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
ranked = ass_df.sort_values('pearson_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Cholesterol_HERAstatin_y{}.txt'.format(endpoint,dataset,k))


In [None]:
# I_use = hera_samples.intersection(X[(X['statin']==1)].index).intersection(all_samples)
# pred_type={'org': 'pred', 'corr': 'pred_18105','std':'pred_std'}
# ass_df = associations(X=X,col_names=qtphen+qtphen_norm,covariates=covariates, I_use=I_use, pred_type=pred_type,categorical = False)
# display(ass_df.sort_values('pearson_pval'))
# # ass_df.sort_values('pearson_pval').to_csv(folder+plots+plot_folder+'{}_{}_associations_Cholesterol_HERAstatin_y{}.txt'.format(endpoint,dataset,k))

In [None]:
# Old samples whose 'statin' column equals 1.
statin_rows = X[X['statin'] == 1].index
I_use = old_samples.intersection(statin_rows)
pred_type = dict(org='old_pred', corr='old_pred_corr', std='old_pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
display(ass_df.sort_values('pearson_pval'))

#### Don't use statins

In [None]:
# HERA samples whose 'statin' column equals 0.
no_statin_rows = X[X['statin'] == 0].index
I_use = hera_samples.intersection(no_statin_rows)
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
ranked = ass_df.sort_values('pearson_pval')
display(ranked)
if save_plot:
    ranked.to_csv(folder+plots+plot_folder+'{}_{}_associations_Cholesterol_HERAnostatin_y{}.txt'.format(endpoint,dataset,k))

# Old samples whose 'statin' column equals 0.
no_statin_rows = X[X['statin'] == 0].index
I_use = old_samples.intersection(no_statin_rows)
pred_type = dict(org='old_pred', corr='old_pred_corr', std='old_pred_std')
ass_df = associations(
    X=X, col_names=qtphen+qtphen_norm, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False,
)
display(ass_df.sort_values('pearson_pval'))

#### Without heart disease

In [None]:
# HERA samples whose 'CAD' column equals 0 (raw phenotypes only, no age check).
no_cad_rows = X[X['CAD'] == 0].index
I_use = hera_samples.intersection(no_cad_rows)
pred_type = dict(org='pred', corr='pred_corr', std='pred_std')
ass_df = associations(
    X=X, col_names=qtphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False, check_age=False,
)
display(ass_df.sort_values('pearson_pval'))

# Old samples whose 'CAD' column equals 0.
no_cad_rows = X[X['CAD'] == 0].index
I_use = old_samples.intersection(no_cad_rows)
pred_type = dict(org='old_pred', corr='old_pred_corr', std='old_pred_std')
ass_df = associations(
    X=X, col_names=qtphen, covariates=covariates,
    I_use=I_use, pred_type=pred_type, categorical=False, check_age=False,
)
display(ass_df.sort_values('pearson_pval'))

#### ApoB correlations

In [None]:
# Pairwise correlation between ApoB and nonHDL in the old samples.
X.loc[old_samples,['ApoB','nonHDL']].corr()

In [None]:
# Pairwise correlation between ApoB and nonHDL in the HERA samples.
X.loc[hera_samples,['ApoB','nonHDL']].corr()