In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, log_loss
from sklearn.metrics import precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
import pickle
from sklearn.utils import resample
from sksurv.metrics import concordance_index_censored
import lifelines as ll
import sys
sys.path.append('/odinn/users/thjodbjorge/Python_functions/')
from Calculate_score import calculate_metrics, make_class_table
from R_functions import R_pROC,R_pROC_compareROC,R_pROC_compareROC_boot, R_pROC_AUC, R_timeROC, R_timeROC_CI, R_timeROC_pval, R_NRIbin,R_NRIcens,R_NRIcensipw, R_censROC, R_hoslem, R_Greenwood_Nam


# Input tables: probe annotation (indexed by SeqId) and per-sample information
# (indexed by Barcode2d).
# raw_data = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/raw_with_info.csv',index_col = 'Barcode2d' )
probe_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/probe_info.csv',
    index_col='SeqId',
)
pn_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/pn_info_Mor/pn_info_Mor_event.csv',
    index_col='Barcode2d',
)

# Set of probes to skip: the explicit skip list united with the non-protein
# probes that were included.
nopro = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/no_protein_probes.txt', header=None)[0]
skip_listed = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/probes_to_skip.txt')['probe']
probes_to_skip = set(skip_listed) | set(nopro)

In [None]:
# --- Paths and run configuration ---------------------------------------------
folder = '/odinn/users/thjodbjorge/Proteomics/Mortality2/'
feat_folder = 'Features2/'
pred_folder = 'Predictions3/'
pred_folder_update = 'Predictions4/'
plots = 'Plots5_plots_in_paper/'
save_plot = False             # True -> figures are written below folder+plots
updated_predictions = True    # True -> predictions in pred_folder_update override pred_folder

# --- Endpoint definition -------------------------------------------------------
endpoints = ['death']
# endpoints = ['Neoplasm','Nervous','Circulatory','Respiratory','Other','death'] 
time_to_event = pn_info.time_to_death
no_event_before = pn_info.no_death_before

# Cause-specific endpoints are deaths whose Cause_of_death equals the endpoint name.
cause_specific_endpoints = ('Neoplasm', 'Nervous', 'Circulatory', 'Respiratory', 'Other')

# `endpoint` and `use_event` keep the values of the LAST entry in `endpoints`;
# the cells below read both.
for endpoint in endpoints:
    if endpoint == 'death':
        use_event = pn_info.event_death
    elif endpoint in cause_specific_endpoints:
        use_event = pn_info.event_death & (pn_info.Cause_of_death == endpoint)
    else:
        # Fail loudly instead of silently reusing a stale/undefined `use_event`.
        raise ValueError('Unknown endpoint: {!r}'.format(endpoint))

# y[i-1] marks an event within i years, for i = 1..18.
y = []
for i in range(1,19):
    y.append(use_event & (time_to_event <= i))

# random_state has no effect without shuffling, and recent scikit-learn versions
# raise a ValueError when it is combined with shuffle=False, so it is omitted.
kf = KFold(n_splits=10, shuffle=False)
I_train_main, I_test_main = train_test_split(pn_info.index, train_size=0.7, random_state = 10)
# I_val_main, I_test_main = train_test_split(I_test_main, train_size=0.5, random_state = 10)


# Samples to keep per dataset name; the context manager closes the file handle.
with open(folder+"{}_keep_samples.pkl".format('Mor'),'rb') as file:
    keep_samples_dict = pickle.load(file)

# print(keep_samples_dict.keys())

In [None]:
# Font sizes (points) shared by every figure in the notebook.
VERY_SMALL_SIZE = 12
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 16

# (rc group, settings) pairs applied in order.
for rc_group, rc_settings in [
    ('font',   {'size': SMALL_SIZE}),            # default text size
    ('axes',   {'titlesize': BIGGER_SIZE}),      # axes title
    ('axes',   {'labelsize': MEDIUM_SIZE}),      # x and y labels
    ('xtick',  {'labelsize': SMALL_SIZE}),       # x tick labels
    ('ytick',  {'labelsize': SMALL_SIZE}),       # y tick labels
    ('legend', {'fontsize': VERY_SMALL_SIZE}),   # legend entries
    ('figure', {'titlesize': BIGGER_SIZE}),      # figure title
]:
    plt.rc(rc_group, **rc_settings)

In [None]:
# Line styles: the four basic styles repeated four times (16 entries) so that
# negative indexing such as line_cycle[-i-1] stays valid for many curves.
line_cycle = ['-', '--', '-.', ':'] * 4

# Earlier palettes that were tried:
# color_cycle = ['#d73027','#fc8d59','#fee090','#e0f3f8','#91bfdb','#4575b4']
# color_cycle = ['#a6cee3','#1f78b4','#b2df8a','#33a02c']
# color_cycle = ['C0','C1','C2','C3','C4','C5','C6','C7','C8','C9']
# color_cycle =["#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7", "#F0E442"]
# color_cycle = [(0,0,0),(0.9,0.6,0),(0.35,0.7,0.9),(0,0.6,0.5)]
# color_cycle = ["#E69F00", "#56B4E9", "#009E73", "#F0E442","#0072B2", "#D55E00", "#CC79A7", "#000000"]

# Palette in use (eight colors, black first).
color_cycle = [
    "#000000",
    "#CC79A7",
    "#0072B2",
    "#D55E00",
    "#009E73",
    "#E69F00",
    "#56B4E9",
    "#F0E442",
]

In [None]:
# Visual check of the palette: one dot per color, placed on the diagonal.
for i in range(len(color_cycle)):
    c = color_cycle[i]
    plt.scatter(i, i, color=c)

In [None]:
from matplotlib.font_manager import findfont, FontProperties, findSystemFonts

# Print the font file that matplotlib resolves for the generic sans-serif family.
sans_serif_props = FontProperties(family=['sans-serif'])
font = findfont(sans_serif_props)
print(font)
# findSystemFonts()

### Select dataset and k


In [None]:
# Dataset name used in prediction keys and file names, and the horizon index
# used for single-horizon plots (k and k_plot start out equal).
dataset = 'Old_18105'
k = k_plot = 4
# Optional sub-folder (below folder+plots) for saved figures.
plot_folder = ''

## Load test predictions

In [None]:
# Load the pickled test-set predictions for the current endpoint/dataset,
# optionally overlay the updated predictions, and derive the train/test
# sample indices and labels for horizon index k.
pred_file_name = "{}_{}_test_prediction.pkl".format(endpoint,dataset)

try:
    with open(folder+pred_folder + pred_file_name,'rb') as file:
        pred_test_dict = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError) as load_error:
    # Only load failures are tolerated. Start from an empty dict so that a stale
    # `pred_test_dict` from an earlier run is never reused silently.
    print('No test predictions')
    print(repr(load_error))
    pred_test_dict = {}

if updated_predictions:
    print('Include updated predictiions')
    with open(folder+pred_folder_update + pred_file_name,'rb') as file:
        pred_test_dict_update = pickle.load(file)
    # Show one model's first predictions before and after the merge as a sanity check.
    check_key = '{}_y{}_baseline2_lr'.format(dataset,k)
    if check_key in pred_test_dict:
        print(pred_test_dict[check_key][:5])
    for key,value in pred_test_dict_update.items():
        print(key)
        pred_test_dict[key] = value
    if check_key in pred_test_dict:
        print(pred_test_dict[check_key][:5])
# print(pred_test_dict.keys())

keep_samples = keep_samples_dict[dataset]

# Restrict the global train/test split to the samples kept for this dataset.
I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

y_train = y[k][I_train]
y_test= y[k][I_test]

print(len(I_test))

## Metrics for different k

#### agesex, baseline, gdf15, protein

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
Cindex = np.zeros([len(K),4,4])
scores = np.zeros([len(K),4,8])

# Model suffixes of the prediction keys, in plotting order.
model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

    pred_key = []
    # Reference predictions for the comparison metrics: the baseline2 model (keys[1]).
    baseline = pred_test_dict[keys[1]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']

# One figure per plotted column of `scores`: (column, y-label, output file pattern).
for metric_col, metric_label, file_pattern in [(0,'AUC','{}_{}_AUC.png'),
                                               (4,'IDI','{}_{}_IDI.png')]:
    fig=plt.figure(figsize=[10,6])
    for i, key in enumerate(name_keys):
        plt.plot(K_true,scores[:,i,metric_col],linestyle = line_cycle[-i-1],color = color_cycle[i])
    plt.ylabel(metric_label)
    plt.xlabel('Event within years')
    plt.xticks(K_true)
    #     plt.title(dataset)

    plt.legend(name_keys)
    plt.grid()
    if save_plot:
        plt.savefig(folder+plots+plot_folder+file_pattern.format(endpoint,dataset),bbox_inches="tight")
    plt.show()



### Baseline and noGDF15

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
Cindex = np.zeros([len(K),8,4])
scores = np.zeros([len(K),8,8])

# Age+sex / baseline2 variants listed pairwise, in plotting order.
model_tags = ['agesex_lr','baseline2_lr',
              'agesexGDF15_lr','baseline2GDF15_lr',
              'agesexproteinnoGDF15_l1','baseline2proteinnoGDF15_l1',
              'agesexprotein_l1','baseline2protein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

    pred_key = []
    # Reference predictions for the comparison metrics: the age+sex model (keys[0]).
    baseline = pred_test_dict[keys[0]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Age+sex', 'Baseline','Age+sex+GDF15','Baseline +GDF15',
             'Age+sex+protein-GDF15','Baseline+protein-GDF15','Age+sex+protein','Baseline+protein']

fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    # Models come in (age+sex, baseline) pairs: the pair selects the line style,
    # the position within the pair selects the color.
    pair_idx, member_idx = divmod(i, 2)
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[pair_idx],color = color_cycle[member_idx])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_wbasewnoGDF15.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()



### Other protein predictors

#### Age+sex+ protein model

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 10 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),10,4])
scores = np.zeros([len(K),10,8])

model_tags = ['agesex_lr','agesexPAD2_lr','agesexGDF15_lr','agesexelife_lr',
              'agesexhopro_lr','agesexprotein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

#             'predy{}_{}_ageelife_cox'.format(k,dataset),  'predy{}_{}_agesexelife_cox'.format(k,dataset),
    pred_key = []
    # NOTE(review): keys[1] is the age+sex+PAD2 model here, not the plain age+sex
    # model (keys[0]) - confirm this is the intended reference for calculate_metrics.
    baseline = pred_test_dict[keys[1]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Age+Sex','AS+PAD','AS+GDF15','AS+7 proteins(Tanaka et al.) ','AS+10 proteins(Ho et al.)','AS+Multiple proteins']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_agesex_otherproteins.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()



#### Baseline + protein model

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 10 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),10,4])
scores = np.zeros([len(K),10,8])

model_tags = ['baseline2_lr','baseline2PAD2_lr','baseline2GDF15_lr',
              'baseline2elife_lr','baseline2hopro_lr','baseline2protein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

#             'predy{}_{}_ageelife_cox'.format(k,dataset),  'predy{}_{}_agesexelife_cox'.format(k,dataset),
    pred_key = []
    # NOTE(review): keys[1] is the baseline2+PAD2 model here, not the plain baseline2
    # model (keys[0]) - confirm this is the intended reference for calculate_metrics.
    baseline = pred_test_dict[keys[1]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Baseline','B+PAD','B+GDF15','B+7 proteins(Tanaka et al.)','B+10 proteins(Ho et al.)','B+Multiple proteins']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_baseline_otherproteins.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()




#### Age+sex+ protein model + suPAR

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 10 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),10,4])
scores = np.zeros([len(K),10,8])

model_tags = ['agesex_lr','agesexPAD2_lr','agesexPLAUR_lr','agesexGDF15_lr',
              'agesexelife_lr','agesexhopro_lr','agesexprotein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

#             'predy{}_{}_ageelife_cox'.format(k,dataset),  'predy{}_{}_agesexelife_cox'.format(k,dataset),
    pred_key = []
    # NOTE(review): keys[1] is the age+sex+PAD2 model here, not the plain age+sex
    # model (keys[0]) - confirm this is the intended reference for calculate_metrics.
    baseline = pred_test_dict[keys[1]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Age+Sex','AS+PAD','AS+PLAUR','AS+GDF15','AS+7 proteins(Tanaka et al.) ','AS+10 proteins(Ho et al.)','AS+Multiple proteins']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_agesex_otherproteins2.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()



#### Baseline + protein model + suPAR

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 10 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),10,4])
scores = np.zeros([len(K),10,8])

model_tags = ['baseline2_lr','baseline2PAD2_lr','baseline2PLAUR_lr','baseline2GDF15_lr',
              'baseline2elife_lr','baseline2hopro_lr','baseline2protein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

#             'predy{}_{}_ageelife_cox'.format(k,dataset),  'predy{}_{}_agesexelife_cox'.format(k,dataset),
    pred_key = []
    # NOTE(review): keys[1] is the baseline2+PAD2 model here, not the plain baseline2
    # model (keys[0]) - confirm this is the intended reference for calculate_metrics.
    baseline = pred_test_dict[keys[1]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:
name_keys = ['Baseline','B+PAD','B+PLAUR','B+GDF15','B+7 proteins(Tanaka et al.)','B+10 proteins(Ho et al.)','B+Multiple proteins']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_baseline_otherproteins2.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()




### ROC curve

In [None]:
# ROC and precision-recall curves on the test set for the four main models at
# horizon index k_plot.
k=k_plot
keys = ['{}_y{}_agesex_lr'.format(dataset,k),'{}_y{}_baseline2_lr'.format(dataset,k),
       '{}_y{}_agesexGDF15_lr'.format(dataset,k),'{}_y{}_agesexprotein_l1'.format(dataset,k)]

base_key = '{}_y{}_baseline2_lr'.format(dataset,k)

roc_key = []     # per model: [fpr, tpr, ROC thresholds]
prere_key = []   # per model: [precision, recall, precision-recall thresholds]
for key in keys:
    pred = pred_test_dict[key]
    pred = pd.DataFrame(pred,index=I_test)[0]
#     pred_base = pd.DataFrame( pred_test_dict[base_key][0], index=I_test)[0]
    
    # The two curves have their own threshold arrays. They are kept in separate
    # variables so that the precision-recall entry no longer ends up holding the
    # ROC thresholds.
    pre, re, th_pr = precision_recall_curve(y[k][I_test],pred)
    fpr,tpr,th =roc_curve(y[k][I_test],pred)
    print(th[1:].max())
    roc_key.append([fpr,tpr,th])
    prere_key.append([pre,re,th_pr])

In [None]:

name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']

# Metrics for the single horizon k_plot only (row 0 of `scores`).
K=[k_plot]
K_true = [horizon+1 for horizon in K]
pred_k = []
Cindex = np.zeros([len(K),4,4])
scores = np.zeros([len(K),4,8])

model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

    pred_key = []
    # Reference predictions for the comparison metrics: the age+sex model (keys[0]).
    baseline = pred_test_dict[keys[0]]
    for i,key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test])


In [None]:

# ROC curves of the four main models. Legend AUCs come from row 0 of `scores`
# (single-horizon metrics computed in the previous cell).
k=k_plot
fig = plt.figure(figsize=[6,6])
for i in [0,1,2,3]:
    fpr,tpr,th = roc_key[i]
    # Style is given by keywords only. The former 'C{i}' format string was
    # overridden by color= and made matplotlib warn about a redundant color.
    plt.plot(fpr,tpr,linestyle = line_cycle[-i-1],color = color_cycle[i])

#     plt.plot(th[1:],fpr[1:])
#     plt.plot(th[1:],tpr[1:])
#     print(scores[k,i,0])
plt.legend(['Age+sex(AUC = {:0.3f})'.format(scores[0,0,0]),
            'Baseline (AUC = {:0.3f})'.format(scores[0,1,0]),
            'Age+sex+GDF15 (AUC = {:0.3f})'.format(scores[0,2,0]),
           'Age+sex+Protein (AUC = {:0.3f})'.format(scores[0,3,0])])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# plt.title('ROC- curve')
# dim = int((ROC.shape[0]-1)/3)

if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_ROC_y{}.png'.format(endpoint,dataset,k))
# print(score[k,i])



### Calibration

In [None]:
# Calibration: per model, compare the mean predicted risk in 5%-wide risk groups
# with the Kaplan-Meier incidence (1 - survival) at k+1 years.
k = k_plot
# k=4
keys = ['{}_y{}_agesex_lr'.format(dataset,k),'{}_y{}_baseline2_lr'.format(dataset,k),
       '{}_y{}_agesexGDF15_lr'.format(dataset,k),'{}_y{}_agesexprotein_l1'.format(dataset,k)]

name_keys = ['Age+sex', 'Baseline','Age+sex+GDF15','Age+sex+protein']

# event_comp = pd.DataFrame(np.where(y[14][I_test], 1,np.where(y_cens[14][I_test],2,0)),index=I_test)

# 21 quantile edges -> risk groups 1..20.
quantile_edges = np.arange(0,1.01,0.05)

fig = plt.figure(figsize = [10,11])
for j,key in enumerate(keys):

    pred = pred_test_dict[key]

    # NOTE(review): with the default right=False, np.digitize assigns values equal
    # to the last edge to bin len(edges), so the sample(s) with the maximum
    # prediction fall outside groups 1..20 and are not used below - confirm intended.
    risk_bins =  np.digitize(pred,np.quantile(pred,quantile_edges))
    pred= pd.DataFrame(pred,index=I_test)

    fig.add_subplot(2,2,j+1)
    KM = [] 
    predicted = []
    CI = []
    for i in range(1,len(quantile_edges)):
        kmf =  ll.fitters.kaplan_meier_fitter.KaplanMeierFitter()
        ind = I_test[risk_bins==i]
        kmf.fit(time_to_event[ind],use_event[ind])
        KM.append(1-kmf.predict(k+1))
        # Mean of the single prediction column as a plain float. This does not
        # depend on how np.mean reduces a DataFrame in the installed pandas version.
        predicted.append(pred.loc[ind, 0].mean())
        
#         aj = ll.fitters.aalen_johansen_fitter.AalenJohansenFitter()
#         aj.fit(durations=tte_test[ind], event_observed=event_comp.loc[ind,0], event_of_interest=1)
#         CI.append(aj.predict(10))
#         CI.append(np.sum(X_test['event'][ind] & (tte_test[ind]<=10))/len(ind))
    # Size of the last risk group only (printed once per model).
    print('Number in group: ', len(ind)) 
    predicted = np.array(predicted)
    data_points = plt.scatter(KM,predicted, color = 'k')
#     plt.scatter(CI,predicted,color='r')
#     x = KM
    fit_x = np.unique(KM)
    fit_line, = plt.plot(fit_x, np.poly1d(np.polyfit(KM, predicted, 1))(fit_x),color = color_cycle[0])
    ideal_line, = plt.plot([0,1],[0,1],'--',color = color_cycle[1])
#     plt.plot(np.unique(CI), np.poly1d(np.polyfit(CI, predicted, 1))(np.unique(CI)),'r')
    plt.title(name_keys[j])
    plt.ylabel('Mean predicted risk')
    plt.xlabel('Incidence proportion')
    # Handles are passed explicitly so each label is bound to the right artist
    # regardless of the order in which matplotlib enumerates the axes' artists.
    plt.legend([fit_line, ideal_line, data_points],['Best-fit line','Ideal line','Data points'])
    plt.grid(True)
fig.subplots_adjust(hspace=0.3)
#     plt.axis([0,0.5,0,0.5])
if save_plot: 
    plt.savefig(folder+plots+plot_folder+'{}_{}_Calibration_y{}.png'.format(endpoint,dataset,k))

## Selected groups 

In [None]:
# dataset = 'Old_cardio_risk_18105'

In [None]:
# Test-set predictions of the age+sex+protein model and of the baseline2 model
# at the current horizon index k, as Series indexed by I_test.
pred, baseline = (
    pd.DataFrame(pred_test_dict['{}_y{}_{}'.format(dataset,k,model_tag)],index=I_test)[0]
    for model_tag in ('agesexprotein_l1','baseline2_lr')
)

In [None]:
# Disease-history and risk-factor flags for the test samples, all aligned with I_test.
test_info = pn_info.loc[I_test]

cancer = test_info['cancer']
# The "no_*_before" columns are negated to flag a prior diagnosis.
CAD = ~test_info['no_CAD_before']
MI = ~test_info['no_MI_before']
stroke = ~test_info['no_stroke_before']

statin = test_info['statin_estimate_unsure']
nosurg = test_info['no_CABG_PCI_before']

# None of cancer / CAD / MI / stroke.
no_diseases = ~cancer & ~CAD & ~MI & ~stroke
# No T2D, not a smoker, no statin, and HTN_treated equal to 0.
no_risk_factors = ~test_info['T2D'] & ~test_info['Smoker'] & ~statin & (test_info['HTN_treated']==0)
no_comor = no_diseases & no_risk_factors
# age = (pn_info.loc[I_test,'Age_at_sample_collection_2']>80) & (pn_info.loc[I_test,'Age_at_sample_collection_2']<=105)

print(cancer.sum(), CAD.sum(), MI.sum())

### (80-105 years old)

In [None]:
# Subgroup mask over I_test: age at sample collection in (80, 105].
age_test = pn_info.loc[I_test,'Age_at_sample_collection_2']
high_risk = (age_test>80) & (age_test<=105)
print(age_test[high_risk].mean())   # mean age in the subgroup
print(high_risk.sum())              # subgroup size

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 6 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),6,4])
scores = np.zeros([len(K),6,8])

model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']#,'baseline2protein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

    pred_key = []
    for i,key in enumerate(keys):
        # Predictions, reference (baseline2, keys[1]) and labels are all
        # restricted to the `high_risk` subgroup mask.
        pred = pred_test_dict[key][high_risk]
        baseline = pred_test_dict[keys[1]][high_risk]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test][high_risk])

In [None]:
name_keys = ['Age+sex', 'Baseline','Age+sex+GDF15','Age+sex+protein']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_80105.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()



#### Kaplan Meier

In [None]:
# Kaplan-Meier curves per predicted-risk group within the `high_risk` subgroup,
# one panel per model.
# k = k_plot
k=9
model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']
keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']

# Quantile edges of the risk groups and the years marked on selected curves.
quantile_levels = [0,0.2,0.5,0.8,1]
marked_years = (2,5,10)
subgroup_index = I_test[high_risk]

fig = plt.figure(figsize = [20,12])
for j,key in enumerate(keys):
#     print(keys[j])
    # key = 'predy{}_{}_tradstatproteinprs_coxelnet'.format(k,dataset)
    pred = pred_test_dict[key][high_risk]

    # NOTE(review): with the default right=False, np.digitize assigns values equal
    # to the last edge to bin len(edges), so the sample(s) with the maximum
    # prediction are in none of the groups 1..4 - confirm intended.
    risk_bins =  np.digitize(pred,np.quantile(pred,quantile_levels))
    pred= pd.DataFrame(pred,index=subgroup_index)

    fig.add_subplot(2,2,j+1)
    KMFs = []
    # Highest-risk group first, so the curves match the legend order below.
    for i in range(4,0,-1):
        kmf =  ll.fitters.kaplan_meier_fitter.KaplanMeierFitter()
        ind = subgroup_index[risk_bins==i]
        kmf.fit(time_to_event[ind],use_event[ind])
        KMFs.append(kmf)
        kmf.plot(loc=slice(0,16),color = color_cycle[i-1])
#         print(len(ind), np.mean(pred.loc[ind]))
#         print(kmf.event_table.loc[0,'at_risk'],1- kmf.predict(5),1-kmf.predict(10))
        if i in [4,1]:
            # Mark and annotate the survival estimate of the extreme groups.
            for marked_year in marked_years:
                plt.scatter(marked_year, kmf.predict(marked_year),color='r',zorder=10)
            for marked_year in marked_years:
                survival = kmf.predict(marked_year)
                plt.annotate('{:0.3f}'.format(survival),(marked_year, survival),(marked_year+0.2, survival))
    plt.legend(['80%-100%','50%-80%','20%-50%','0%-20%'])  
    # plt.legend(['0%-5%','5%-20%','20%-50%','50%-80%','80%-95%','95%-100%'])
    plt.axis([0,16,0,1.05])
    plt.title(name_keys[j])
    plt.ylabel('Survival')
    plt.xlabel('Time in years')
    plt.grid(True)
#     plt.show()
if save_plot: 
    plt.savefig(folder+plots+plot_folder+'{}_{}_KaplanMeier_20p_80105_y{}.png'.format(endpoint,dataset,k))


###  No diseases

In [None]:
# Reuse the `high_risk` mask name for the subgroup without prior diseases.
high_risk = no_diseases.copy()
subgroup_age = pn_info.loc[I_test,'Age_at_sample_collection_2'][high_risk]
print(subgroup_age.mean())   # mean age in the subgroup
print(high_risk.sum())       # subgroup size
# Number of events in the subgroup at the longest horizon and at k_plot.
for horizon in (-1, k_plot):
    print(y[horizon][I_test][high_risk].sum())

In [None]:
# Display the test-set index entries selected by the current `high_risk` mask.
I_test[high_risk]

In [None]:
k=k_plot

# Horizon indices into `y`; K_true holds the matching 1-based values used on the x-axis.
K = list(range(15))
K_true = [horizon+1 for horizon in K]
pred_k = []
# Arrays are allocated for 6 models; only the first len(model_tags) rows are filled.
Cindex = np.zeros([len(K),6,4])
scores = np.zeros([len(K),6,8])

model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']#,'baseline2protein_l1']
for j,k in enumerate(K):
    keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]

    pred_key = []
    for i,key in enumerate(keys):
        # Predictions, reference (baseline2, keys[1]) and labels are all
        # restricted to the `high_risk` subgroup mask.
        pred = pred_test_dict[key][high_risk]
        baseline = pred_test_dict[keys[1]][high_risk]
        scores[j,i] = calculate_metrics(pred,baseline,y[k][I_test][high_risk])

In [None]:
name_keys = ['Age+sex', 'Baseline','Age+sex+GDF15','Age+sex+protein']

# AUC (column 0 of `scores`) per horizon, one curve per model.
fig=plt.figure(figsize=[10,6])
for i, key in enumerate(name_keys):
    plt.plot(K_true,scores[:,i,0],linestyle = line_cycle[-i-1],color = color_cycle[i])
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)
# plt.title(dataset)

plt.legend(name_keys)
plt.grid()
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_AUC_no_diseases.png'.format(endpoint,dataset),bbox_inches="tight")
plt.show()


### Risk quantiles (exploration), 60-80 years old

In [None]:
# Subgroup mask over I_test: age at sample collection in [60, 80).
# NOTE(review): the 80-105 subgroup above uses a strict >80, so samples aged
# exactly 80 belong to neither subgroup - confirm intended.
collection_age = pn_info.loc[I_test,'Age_at_sample_collection_2']
age = (collection_age<80) & (collection_age>=60)
high_risk = age

In [None]:
# Kaplan-Meier curves per predicted-risk group (edges at the 5/20/80/95 percent
# quantiles) within the `high_risk` subgroup, one panel per model. The follow-up
# data of every group is kept in `split_groups` for later tests.
# k = k_plot
k = 9
keys = ['{}_y{}_agesex_lr'.format(dataset,k),'{}_y{}_baseline2_lr'.format(dataset,k),
       '{}_y{}_agesexGDF15_lr'.format(dataset,k),'{}_y{}_agesexprotein_l1'.format(dataset,k)]
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']
fig = plt.figure(figsize = [20,12])

# model key -> list of [time_to_event, use_event] per risk group, highest risk first.
split_groups = {}
for j,key in enumerate(keys):
#     print(keys[j])
    # key = 'predy{}_{}_tradstatproteinprs_coxelnet'.format(k,dataset)
    pred = pred_test_dict[key][high_risk]

    # NOTE(review): with the default right=False, np.digitize assigns values equal
    # to the last edge to bin len(edges), so the sample(s) with the maximum
    # prediction are in none of the groups 1..5 - confirm intended.
    risk_bins =  np.digitize(pred,np.quantile(pred,[0,0.05,0.2,0.8,0.95,1]))
    pred= pd.DataFrame(pred,index=I_test[high_risk])

    fig.add_subplot(2,2,j+1)
    KMFs = []
    split_group = []
    # Highest-risk group first, so the curves match the legend order below.
    for i in range(5,0,-1):
        kmf =  ll.fitters.kaplan_meier_fitter.KaplanMeierFitter()
        ind = I_test[high_risk][risk_bins==i]
        split_group.append([time_to_event[ind],use_event[ind]])
        kmf.fit(time_to_event[ind],use_event[ind])
        KMFs.append(kmf)
        kmf.plot(loc=slice(0,16),color=color_cycle[i-1])
#         print(kmf.event_table.loc[0,'at_risk'],1- kmf.predict(5),1-kmf.predict(10))
#         print(len(ind), np.mean(pred.loc[ind]))
#         print(kmf.event_table.loc[0,'at_risk'],1- kmf.predict(5),1-kmf.predict(10))
        if i == 5:
            # Mark and annotate 5- and 10-year survival of the highest-risk group.
            plt.scatter(5, kmf.predict(5),color='r',zorder=10)
            plt.scatter(10, kmf.predict(10),color='r',zorder=10)
            plt.annotate('{:0.3f}'.format(kmf.predict(5)),(5, kmf.predict(5)),(5+0.2, kmf.predict(5)))
            plt.annotate('{:0.3f}'.format(kmf.predict(10)),(10, kmf.predict(10)),(10+0.2, kmf.predict(10)))
    split_groups[key] = split_group
    plt.legend(['95%-100%','80%-95%','20%-80%','5%-20%','0%-5%'])  
    # plt.legend(['0%-5%','5%-20%','20%-50%','50%-80%','80%-95%','95%-100%'])
    plt.axis([0,16,0,1.05])
    plt.title(name_keys[j])
    plt.ylabel('Survival')
    plt.xlabel('Time in years')
    plt.grid(True)
    # plt.show()
if save_plot: 
    # The previous call had an unbalanced parenthesis (a SyntaxError for the whole cell).
    plt.savefig(folder+plots+plot_folder+'{}_{}_KaplanMeier_6080_5p_y{}_annotate.png'.format(endpoint,dataset,k),bbox_inches="tight")


In [None]:
print(keys)
split_groups[keys[0]][0][0]

# Log-rank test between the highest-risk groups (first entry of each list) of
# the baseline2 model (keys[1]) and the age+sex+protein model (keys[3]).
baseline_times, baseline_events = split_groups[keys[1]][0]
protein_times, protein_events = split_groups[keys[3]][0]
ll.statistics.logrank_test(baseline_times, protein_times,
                           baseline_events, protein_events)

In [None]:
# %%capture cap --no-stderr
# Ratio over time of Kaplan-Meier survival in the top 5% predicted-risk group to
# that in the bottom 5% group, within the `high_risk` subgroup; one curve per model.
# k = k_plot
k=9

model_tags = ['agesex_lr','baseline2_lr','agesexGDF15_lr','agesexprotein_l1']
keys = ['{}_y{}_{}'.format(dataset,k,tag) for tag in model_tags]
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']

# Common time grid on which every Kaplan-Meier fit is evaluated.
timeline = np.arange(0,16,0.1)
subgroup_index = I_test[high_risk]

fig = plt.figure(figsize = [10,6])
for j,key in enumerate(keys):
    print(key)
    pred = pred_test_dict[key][high_risk]

#     risk_bins =  np.digitize(pred,np.quantile(pred,[0,0.05,0.2,0.8,0.95,1]))
#     risk_bins =  np.digitize(pred,np.quantile(pred,[0,0.1,0.9,1]))
    risk_bins =  np.digitize(pred,np.quantile(pred,[0,0.05,0.95,1]))
    pred= pd.DataFrame(pred,index=subgroup_index)

#     fig.add_subplot(2,2,j+1)
    KMFs = []
    # Groups are fitted from highest (3) to lowest (1) predicted risk.
    for i in range(3,0,-1):
        kmf =  ll.fitters.kaplan_meier_fitter.KaplanMeierFitter()
        ind = subgroup_index[risk_bins==i]
        kmf.fit(time_to_event[ind],use_event[ind], timeline = timeline)
        KMFs.append(kmf)
        print(kmf.event_table.loc[0,'at_risk'],1- kmf.predict(5),1-kmf.predict(10))

    highest_group, lowest_group = KMFs[0], KMFs[-1]
    prob_high_risk = highest_group.cumulative_density_at_times(timeline)
    prob_low_risk =  lowest_group.cumulative_density_at_times(timeline)
    # 1 - cumulative density is the survival estimate.
    survival_ratio = (1-prob_high_risk)/(1-prob_low_risk)
    plt.plot(timeline,survival_ratio,linestyle = line_cycle[-j-1],color = color_cycle[j])
#     plt.plot(timeline,(1-prob_low_risk)/(1-prob_high_risk))
#     plt.plot(timeline,(prob_high_risk)/(prob_low_risk))
#     plt.plot(timeline,(prob_low_risk)/(prob_high_risk))
#     plt.plot(timeline,1-prob_high_risk )
plt.legend(name_keys)
plt.axis([0,16,0,1.05])
plt.ylabel('Survival of 5% at highest risk \n divided by \n survival of 5% at lowest risk')
plt.xlabel('Time in years')

plt.grid(True)
#     plt.show()
if save_plot: 
    plt.savefig(folder+plots+plot_folder+'{}_{}_KaplanMeier_highvslow_6080_5p_y{}.png'.format(endpoint,dataset,k),bbox_inches='tight')


#### Prediction for different death types, one predictor

In [None]:

# Box plots of predicted risk at horizon index k_plot for four models, grouped
# by outcome: all deaths, each cause of death, and samples without an event.
k=k_plot
print(k)
pred_as = pred_test_dict['{}_y{}_agesex_lr'.format(dataset,k)]
pred_baseline = pred_test_dict['{}_y{}_baseline2_lr'.format(dataset,k)]
# pred_as = pred_test_dict['{}_y{}_tradcancer_lr'.format(dataset,k)]
pred_gdf = pred_test_dict['{}_y{}_agesexGDF15_lr'.format(dataset,k)]
# pred_gdf = pred_test_dict['{}_y{}_tradcancerprotein_l1'.format(dataset,k)]
pred_pro = pred_test_dict['{}_y{}_agesexprotein_l1'.format(dataset,k)]
# pred_pro = pred_test_dict['{}_y{}_tradcancerbloodprotein_l1'.format(dataset,k)]
# pred_pro = pred_test_dict['{}_y{}_agesexprotein_l2'.format(dataset,k)]

# Event labels of the test samples; reused as the case mask throughout.
cases = y[k][I_test]
print(roc_auc_score(cases,pred_pro))

# groups = ['C','G','I','J']

test_info = pn_info.loc[I_test]
case_info = test_info[cases]

# First group: all cases, regardless of cause.
risk = pred_pro[cases]
risk_as = pred_as[cases]
risk_gdf = pred_gdf[cases]
risk_baseline = pred_baseline[cases]
group_list = [risk]
group_list_as = [risk_as]
group_list_baseline = [risk_baseline]
group_list_gdf = [risk_gdf]
pn_info_list = [case_info]

# One group per cause of death, selected among the cases.
groups = ['Neoplasm','Nervous','Circulatory','Respiratory','Other']
group_ind = []
for g in groups:
    ind = case_info['Cause_of_death'] == g
#     group_ind.append(ind)
    
# for g in groups:
#     ind = pn_info.loc[I_test][y[k][I_test]]['ICD_group'] == g
    risk = pred_pro[cases][ind]
    risk_as = pred_as[cases][ind]
    risk_gdf = pred_gdf[cases][ind]
    risk_baseline = pred_baseline[cases][ind]
    group_list.append(risk)
    group_list_as.append(risk_as)
    group_list_baseline.append(risk_baseline)
    group_list_gdf.append(risk_gdf)
    pn_info_list.append(case_info[ind])
    print(np.sum(ind))

# Last group: test samples without an event within the horizon.
non_cases = ~cases
group_list.append(pred_pro[non_cases])
group_list_as.append(pred_as[non_cases])
group_list_baseline.append(pred_baseline[non_cases])
group_list_gdf.append(pred_gdf[non_cases])
pn_info_list.append(test_info[non_cases])
print(np.sum(non_cases))
groups.append('Ctrl')


fig = plt.figure(figsize= [13,6])

# 'C{}'.format(i)
# Box outline color per model.
boxprops_pro = {'color': color_cycle[3], 'linewidth': 2}
boxprops_as = {'color': color_cycle[0], 'linewidth': 2}
boxprops_gdf = {'color': color_cycle[2], 'linewidth': 2}
boxprops_baseline = {'color': color_cycle[1], 'linewidth': 2}

# linestyle = line_cycle[-i-1],color = color_cycle[i]

# Each outcome group occupies 3 x-units; the four models sit at offsets 1, 1.6, 2.2 and 2.8.
x_end = 3*len(group_list)
bp1 = plt.boxplot(group_list, positions = np.arange(1,x_end,3), boxprops =boxprops_pro)
bp2 = plt.boxplot(group_list_as, positions = np.arange(2.8,x_end,3),boxprops=boxprops_as)
bp3 = plt.boxplot(group_list_gdf, positions = np.arange(1.6,x_end,3),boxprops=boxprops_gdf)
bp4 = plt.boxplot(group_list_baseline, positions = np.arange(2.2,x_end,3),boxprops=boxprops_baseline)
# plt.axvline(x=3, color = 'k', alpha=0.7)
# plt.axvline(x=3*len(group_list)-3, color = 'k', alpha=0.5)

legend_boxes = [bp1['boxes'][0],bp3['boxes'][0],bp4['boxes'][0],bp2['boxes'][0]]
plt.legend(legend_boxes,['Age+sex+Protein','Age+sex+GDF15','Baseline','Age+sex'],loc='upper right')

plt.ylabel('Predicted probability of \n death within {} years'.format(k+1))#,fontsize=14)
plt.xlabel('Cause of death \n# participants')#,fontsize=14)

locs, _ = plt.xticks()
# Tick labels: group name followed by the number of participants in the group.
labels =['All-cause \n','Neoplasms \n','Nervous s. \n','Circulatory s. \n','Respiratory s. \n','Other \n','Alive \n']
new_labels = []
for i, lab in enumerate(labels):
    new_labels.append('{}{}'.format(lab, len(pn_info_list[i])))
labels = new_labels
plt.xticks(np.arange(1,x_end,3)+0.75,labels = labels)
# plt.title('Predicted risk of death within {} years'.format(k+1), fontsize = 16)
plt.grid(axis='y')
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_{}_pred_ICD_4models_boxplot_wAllcause_y{}.png'.format(endpoint,dataset,k),bbox_inches = 'tight')
plt.show()

#### AUC

In [None]:

# AUC (with the two accompanying values returned by R_pROC_AUC) for four models,
# first for all-cause death and then separately for each cause of death.
k = k_plot
print(k)
cases = y[k][I_test]
controls = ~cases

# Keys of the compared models in pred_test_dict for horizon index k.
key_templates = ['{}_y{}_agesex_lr', '{}_y{}_baseline2_lr',
                 '{}_y{}_agesexGDF15_lr', '{}_y{}_agesexprotein_l1']
keys = [template.format(dataset, k) for template in key_templates]

groups = ['Neoplasm','Nervous','Circulatory','Respiratory','Other']

# For each cause: keep the test samples with that cause of death plus all controls.
# The masks do not depend on the model, so they are built once.
cause_of_death = pn_info.loc[I_test]['Cause_of_death']
cause_masks = [(cause_of_death == cause) | controls for cause in groups]

# Axis 0: all-cause followed by the causes; axis 1: model; axis 2: the three values
# returned by R_pROC_AUC (the plotting cell uses index 1 as the point estimate and
# indices 0 and 2 as the lower and upper bounds).
AUC_array = np.zeros([len(groups) + 1, len(keys), 3])
for model_idx, model_key in enumerate(keys):
    pred = pred_test_dict[model_key]
    AUC_array[0, model_idx, :] = R_pROC_AUC(cases, pred)
    for row, ind in enumerate(cause_masks, start=1):
        AUC_array[row, model_idx, :] = R_pROC_AUC(cases[ind], pred[ind])

# Row labels now match axis 0 of AUC_array.
groups = ['All-cause'] + groups

In [None]:

# Error-bar plot of the AUCs in AUC_array: one cluster per cause of death, one marker per model.
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+Protein']
fig = plt.figure(figsize=[10,6])

cluster_start = np.arange(0, len(groups), 1)
for model_idx in range(len(keys)):
    auc_mid = AUC_array[:, model_idx, 1]
    # Distances from the point estimate down to index 0 and up to index 2.
    err_below = auc_mid - AUC_array[:, model_idx, 0]
    err_above = AUC_array[:, model_idx, 2] - auc_mid
    # Models are shifted by 0.2 within each cluster so their markers do not overlap.
    plt.errorbar(cluster_start + 0.2*model_idx, auc_mid, yerr=[err_below, err_above],
                 marker='o', ls='', elinewidth=0.5, capsize=5, color=color_cycle[model_idx])
org_axis = plt.gca()
plt.legend(name_keys, loc='lower right')

# Thin vertical separators between neighbouring clusters.
for boundary in range(len(groups) - 1):
    plt.axvline(x=boundary + 0.8, color='k', alpha=1, linewidth=0.5)

labels = ['All-cause','Neoplasms','Nervous s.','Circulatory s.','Respiratory s.','Other']
plt.xticks(cluster_start + 0.3, labels, rotation=15)
plt.xlabel('Cause of death')
plt.ylabel('AUC')
plt.grid(axis='y')
# plt.axis([org_axis.get_xlim()[0],org_axis.get_xlim()[1],org_axis.get_ylim()[0],org_axis.get_ylim()[1]])
plt.title('{}-Year Mortality'.format(k+1))
if save_plot:
    figure_name = '{}_{}_pred_ICD_4models_AUC_wAllcause_y{}.png'.format(endpoint,dataset,k)
    plt.savefig(folder+plots+plot_folder+figure_name, bbox_inches='tight')
plt.show()

## Single proteins
    - See Mor_pred2_results_single_protein.ipynb for original code

In [None]:
# Load the test-set predictions of the single-protein models.
# If the file cannot be read, pred_test_dict is reset to an empty dict so that the
# predictions loaded in an earlier section cannot silently be reused here.
single_protein_path = folder+pred_folder + "{}_{}_test_prediction_single_protein.pkl".format(endpoint,dataset)
try:
    # The context manager closes the file handle even if unpickling fails.
    with open(single_protein_path,'rb') as file:
        pred_test_dict = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError):
    print('No test predictions')
    pred_test_dict = {}

if updated_predictions:
    print('Include updated predictiions')
    updated_path = folder+pred_folder_update + "{}_{}_test_prediction.pkl".format(endpoint,dataset)
    with open(updated_path,'rb') as file:
        pred_test_dict_update = pickle.load(file)
    # Show the first baseline predictions before and after the update as a sanity check.
    baseline_key = '{}_y{}_baseline2_lr'.format(dataset,k)
    if baseline_key in pred_test_dict:
        print(pred_test_dict[baseline_key][:5])
    # Entries from the updated file override entries with the same key.
    for key,value in pred_test_dict_update.items():
        pred_test_dict[key] = value
    print(pred_test_dict[baseline_key][:5])
print(pred_test_dict.keys())

keep_samples = keep_samples_dict[dataset]

# Restrict the main train/test split to the samples kept for this dataset.
I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

y_train = y[k][I_train]
y_test= y[k][I_test]



In [None]:
# Display names of the ten proteins; the single-protein plot legends are built from
# the first entries of this list.
protein_names = [
    'GDF15', 'WFDC2', 'THBS2', 'ANTXR2', 'RBL2',
    'SERPINA3', 'TNFRSF1A', 'ANGPT2', 'MMP12', 'SPON2',
]

#### AUC for categories

In [None]:

# AUC (with the two accompanying values returned by R_pROC_AUC) for the age+sex and
# baseline models and four single-protein models, for all-cause death and per cause of death.
k = k_plot
print(k)
cases = y[k][I_test]
controls = ~cases

# keys = ['{}_y{}_agesexGDF15_lr'.format(dataset,k),'{}_y{}_agesexproteinnoGDF15_l1'.format(dataset,k),'{}_y{}_agesexprotein_l1'.format(dataset,k),
#         '{}_y{}_baselineGDF15_lr'.format(dataset,k),'{}_y{}_baselineproteinnoGDF15_l1'.format(dataset,k),'{}_y{}_baselineprotein_l1'.format(dataset,k)]

# Keys of the compared models in pred_test_dict for horizon index k.
key_templates = ['{}_y{}_agesex_lr', '{}_y{}_baseline2_lr',
                 '{}_y{}_agesexGDF15_lr', '{}_y{}_agesexHE4_lr',
                 '{}_y{}_agesexTSP2_lr', '{}_y{}_agesexANTR2_lr']
keys = [template.format(dataset, k) for template in key_templates]

groups = ['Neoplasm','Nervous','Circulatory','Respiratory','Other']

# For each cause: keep the test samples with that cause of death plus all controls.
# The masks do not depend on the model, so they are built once.
cause_of_death = pn_info.loc[I_test]['Cause_of_death']
cause_masks = [(cause_of_death == cause) | controls for cause in groups]

# Axis 0: all-cause followed by the causes; axis 1: model; axis 2: the three values
# returned by R_pROC_AUC.
AUC_array = np.zeros([len(groups) + 1, len(keys), 3])
for model_idx, model_key in enumerate(keys):
    pred = pred_test_dict[model_key]
    AUC_array[0, model_idx, :] = R_pROC_AUC(cases, pred)
    for row, ind in enumerate(cause_masks, start=1):
        AUC_array[row, model_idx, :] = R_pROC_AUC(cases[ind], pred[ind])

# Row labels now match axis 0 of AUC_array.
groups = ['All-cause'] + groups

In [None]:

# Error-bar plot of the single-protein AUCs: one cluster per cause of death, one marker per model.
# The legend uses different protein names than the prediction keys (HE4, TSP2, ANTR2):
# name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+HE4','Age+sex+TSP2','Age+sex+ANTR2']
name_keys = ['Age+sex','Baseline','Age+sex+GDF15','Age+sex+WFDC2','Age+sex+THBS2','Age+sex+ANTXR2']
fig = plt.figure(figsize=[10,6])

cluster_start = np.arange(0, len(groups), 1)
for model_idx in range(len(keys)):
    auc_mid = AUC_array[:, model_idx, 1]
    # Distances from the point estimate down to index 0 and up to index 2.
    err_below = auc_mid - AUC_array[:, model_idx, 0]
    err_above = AUC_array[:, model_idx, 2] - auc_mid
    # Six models per cluster, hence the narrower 0.15 shift between markers.
    plt.errorbar(cluster_start + 0.15*model_idx, auc_mid, yerr=[err_below, err_above],
                 marker='o', ls='', elinewidth=0.5, capsize=5, color=color_cycle[model_idx])
org_axis = plt.gca()
plt.legend(name_keys, loc='lower right')

# Thin vertical separators between neighbouring clusters.
for boundary in range(len(groups) - 1):
    plt.axvline(x=boundary + 0.9, color='k', alpha=1, linewidth=0.5)

labels = ['All-cause','Neoplasms','Nervous s.','Circulatory s.','Respiratory s.','Other']
plt.xticks(cluster_start + 0.3, labels, rotation=15)
plt.xlabel('Cause of death')
plt.ylabel('AUC')
plt.grid(axis='y')
# plt.axis([org_axis.get_xlim()[0],org_axis.get_xlim()[1],org_axis.get_ylim()[0],org_axis.get_ylim()[1]])
plt.title('{}-Year Mortality'.format(k+1))
if save_plot:
    figure_name = '{}_{}_pred_ICD_6models_AUC_wAllcause_wbaseline_y{}.png'.format(endpoint,dataset,k)
    plt.savefig(folder+plots+plot_folder+figure_name, bbox_inches='tight')
plt.show()

#### Metrics for different k

In [None]:
# Evaluate the age+sex, baseline and four single-protein models at every horizon in K.
k = k_plot

# Horizon indices 0..14; K_true holds the matching number of years (index + 1).
K = list(range(15))
K_true = [horizon + 1 for horizon in K]
pred_k = []
Cindex = np.zeros([len(K), 12, 4])
# scores[j, i] holds the output of calculate_metrics for horizon K[j] and model i.
# Only the first six of the twelve model slots are filled here.
scores = np.zeros([len(K), 12, 8])

# The prediction keys use the aliases from this older naming:
# protein_names = ['GDF15','HE4','TSP2','ANTR2','p130','A1AC','TNFsR1','Angp2','MMP12','SPON2']
key_templates = ['{}_y{}_agesex_lr', '{}_y{}_baseline2_lr',
                 '{}_y{}_agesexGDF15_lr', '{}_y{}_agesexHE4_lr',
                 '{}_y{}_agesexTSP2_lr', '{}_y{}_agesexANTR2_lr']

# NOTE: the loop variable is k itself, so after this cell k equals K[-1], not k_plot.
for j, k in enumerate(K):
    keys = [template.format(dataset, k) for template in key_templates]
    pred_key = []
    # Every model is compared against the first model in keys (age+sex).
    baseline = pred_test_dict[keys[0]]
    outcome = y[k][I_test]
    for i, key in enumerate(keys):
        pred = pred_test_dict[key]
        scores[j, i] = calculate_metrics(pred, baseline, outcome)


In [None]:
# AUC against prediction horizon for age+sex, baseline and the first four single-protein models.
agesexproteins = ['Age+sex+'+name for name in protein_names]
name_keys = ['Age+sex','Baseline'] + agesexproteins[:4]
# name_keys = ['Age+sex+protein L1','Age+sex+protein L2','Baseline+protein','Baseline+blood+Protein','Cox elnet Age+sex+protein']

fig = plt.figure(figsize=[10,6])
for model_idx in range(len(name_keys)):
    # The first metric column of scores is plotted as the AUC; line styles are taken
    # from the end of line_cycle, colours from the start of color_cycle.
    plt.plot(K_true, scores[:, model_idx, 0],
             linestyle=line_cycle[-model_idx-1], color=color_cycle[model_idx])

# Axis decoration is the same for every curve, so it is applied once.
plt.ylabel('AUC')
plt.xlabel('Event within years')
plt.xticks(K_true)

plt.legend(name_keys)
plt.grid()
if save_plot:
    figure_name = '{}_{}_4proteins_AUC.png'.format(endpoint,dataset)
    plt.savefig(folder+plots+plot_folder+figure_name, bbox_inches="tight")
plt.show()

## GDF15 properties
    - Mor_data.ipynb for original code
    - See Mor_plot_important_proteins.ipynb to plot distributions of many interesting proteins

In [None]:

# Distribution of log(GDF15) among the kept samples, and its relation to age.

# The log-transformed level is used by every plot below, so compute it once.
log_gdf15 = np.log(pn_info.loc[keep_samples,'GDF15'])

log_gdf15.hist(bins=100,density=True)
plt.xlabel('Log(Protein level)')
plt.ylabel('Density')
plt.title('GDF15 distribution')
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_GDF15_distribution.png'.format(dataset),bbox_inches="tight")
plt.show()

age = np.array(pn_info.loc[keep_samples,'Age_at_sample_collection_2'])
fig = plt.figure(figsize=[10,6])
plt.scatter(age,log_gdf15,alpha=.1,marker = '.',zorder=-1 ,color='k')

# Straight-line (degree 1) least-squares fit, evaluated at the distinct ages.
age_grid = np.unique(age)
linear_fit = np.poly1d(np.polyfit(age, log_gdf15, 1))
best_line, = plt.plot(age_grid, linear_fit(age_grid),'r')

# Lowess smoother on 10% of the data per local fit.
lowess= sm.nonparametric.lowess
z = lowess(log_gdf15,age,frac=0.1,delta = 0.01 * np.ptp(age))
lowess_line, = plt.plot(z[:,0],z[:,1],color='b')

# Pass the line handles explicitly. With labels only, matplotlib pairs them with the
# artists in an implicit order, which (depending on the matplotlib version) can attach
# 'Best line' to the scatter points, since those were drawn first.
plt.legend([best_line, lowess_line],['Best line','Lowess'])
plt.grid()
plt.xlabel('Age')
plt.ylabel('Log(Protein level)')
plt.title('GDF15')
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_GDF15vsage.png'.format(dataset),bbox_inches="tight")
plt.show()

## New data prediction
    - Mor_results_newdata.ipynb for original code

In [None]:
# Column of the prediction tables that is shown in the living-vs-dead box plot and
# included in the name of the saved figure.
# NOTE(review): presumably 'pred_y4' is horizon index 4 (death within 5 years) - confirm.
pred_year = 'pred_y4'

In [None]:
# Read the per-sample prediction tables (indexed by Barcode2d) of the protein,
# age+sex and baseline models.
# dataset = 'Old_18105'
protein_csv = "{}_{}_protein_prediction_all.csv".format(endpoint,dataset)
agesex_csv = "{}_{}_agesex_prediction_all.csv".format(endpoint,dataset)
baseline_csv = "{}_{}_baseline2_prediction_all.csv".format(endpoint,dataset)

pred = pd.read_csv(folder+pred_folder + protein_csv, index_col='Barcode2d')
pred_as = pd.read_csv(folder+pred_folder + agesex_csv, index_col='Barcode2d')

# Only the baseline model is read from the updated prediction folder when requested.
if updated_predictions:
    print('Use updated predictions')
baseline_folder = pred_folder_update if updated_predictions else pred_folder
pred_baseline = pd.read_csv(folder+baseline_folder + baseline_csv, index_col='Barcode2d')

In [None]:
# Name of the second dataset and the samples kept for it; keep_samples_new is used
# to select rows of pn_info in the following cells.
dataset_new = 'New_18105'
keep_samples_new = keep_samples_dict[dataset_new]

In [None]:
# Split the samples of the new dataset by the event_death flag and keep their index labels.
new_dataset_info = pn_info.loc[keep_samples_new]
died = new_dataset_info['event_death']
HERA_death = new_dataset_info[died].index
HERA_living = new_dataset_info[~died].index

In [None]:

# Box plot of the predicted risk (column pred_year) for deceased and living participants
# of the new dataset, one box per model.

def _dead_and_living(prediction_table):
    """Return [risk of the deceased, risk of the living] taken from column pred_year."""
    return [prediction_table.loc[HERA_death][pred_year],
            prediction_table.loc[HERA_living][pred_year]]

# One colour per model, taken from the notebook-wide color_cycle.
boxprops_pro = {'color': color_cycle[3], 'linewidth': 2}
boxprops_as = {'color': color_cycle[0], 'linewidth': 2}
# boxprops_gdf = dict(color='C2', linewidth=2)
boxprops_baseline = {'color': color_cycle[1], 'linewidth': 2}

# Deceased boxes sit at x = 1, 1.5, 2 and living boxes at x = 3, 3.5, 4.
bp1 = plt.boxplot(_dead_and_living(pred), positions=[1,3], boxprops=boxprops_pro)
bp2 = plt.boxplot(_dead_and_living(pred_baseline), positions=[1.5,3.5], boxprops=boxprops_baseline)
bp3 = plt.boxplot(_dead_and_living(pred_as), positions=[2,4], boxprops=boxprops_as)

legend_handles = [boxes['boxes'][0] for boxes in (bp1, bp2, bp3)]
plt.legend(legend_handles, ['Age+sex+Protein','Baseline','Age+sex'], loc='upper right')
plt.xticks([1.5,3.5], labels=['Dead','Living'])
plt.ylabel('Predicted risk')
plt.xlabel('')
plt.axis([0,5,-0.05,1])
plt.grid()
if save_plot:
    # NOTE(review): unlike the other figures in this notebook this path does not include
    # plot_folder - confirm that this is intended.
    figure_name = '{}_{}_{}_{}_boxplot_living_vs_dead.png'.format(endpoint,dataset,dataset_new,pred_year)
    plt.savefig(folder+plots+figure_name)
# plt.show()

## Data
    - Originally from Mor_data.ipynb

In [None]:
# Describe the samples in keep_samples: age distribution, sampling dates and basic counts.
I_plot = keep_samples

# Age at sample collection in 5-year bins from 10 to 105.
age_bins = np.arange(10,110,5)
pn_info.loc[I_plot,'Age_at_sample_collection_2'].hist(bins=age_bins)
# plt.title('Age distribution')
plt.xlabel('Age')
plt.ylabel('Number of participants')
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_age_distribution.png'.format(dataset),bbox_inches="tight")
plt.show()

# NOTE: this converts the collection-time column of pn_info to datetime in place.
pn_info['Time_of_plasma_collection_2'] = pd.to_datetime(pn_info.Time_of_plasma_collection_2)
collection_time = pn_info['Time_of_plasma_collection_2']
# Sampling date as a fractional year (year + day of year / 365), in half-year bins.
sampling_date = collection_time.dt.year + collection_time.dt.dayofyear/365
half_year_bins = np.arange(2000,2006,0.5)
sampling_date[I_plot].hist(bins=half_year_bins)
plt.xlabel('Sampling year')
plt.ylabel('Number of participants')
# plt.title("Time of sample collection")
if save_plot:
    plt.savefig(folder+plots+plot_folder+'{}_sampling_date_distribution.png'.format(dataset),bbox_inches="tight")
plt.show()


# Counts by the value of the sex column (1 is reported as men, 2 as women).
sex_code = pn_info.loc[I_plot,'sex']
print('Men: ', (sex_code==1).sum())
print('Women: ', (sex_code==2).sum())
print('Samples: ', len(I_plot))
print('Deaths: ', use_event[I_plot].sum())
print('')
print('')


## Proteins selected with forward selection in order  
    - This comes directly from Mor_feature_ranking.ipynb

In [None]:
# Load the test-set predictions of the forward-selection ("fewp") models.
# If the file cannot be read, continue with an empty dict.
fewp_path = folder+pred_folder + "{}_{}_fewp_test_prediction.pkl".format(endpoint,dataset)
# fewp_path = folder+pred_folder + "{}_{}_fewp_noGDF15_test_prediction.pkl".format(endpoint,dataset)
try:
    # The context manager closes the file handle even if unpickling fails.
    with open(fewp_path,'rb') as file:
        pred_test_dict = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError):
    print('No test predictions')
    pred_test_dict = {}


In [None]:
# AUC on the test set of the forward-selection models for horizon index k = 4,
# plotted against the model index 0..nump-1.
# k=k_plot
# k=1
k = 4
k2 = k + 1

nump = 101
K = [k]

# One R_pROC_AUC result per forward-selection model.
AUC_CI = []
for k in K:
    for n_proteins in range(nump):
        forward_key = '{}_y{}_agesex_forwardp{}_l2'.format(dataset,k,n_proteins)
        # Re-index the stored predictions by the test samples and take the single column.
        pred = pd.DataFrame(pred_test_dict[forward_key], index=I_test)[0]
        AUC_CI.append(R_pROC_AUC(y[k][I_test], pred))
AUC_CI = np.array(AUC_CI)

fig = plt.figure(figsize=[8,5])
# fig.add_subplot(1,2,1)

# Column 1 of each result is plotted as the AUC.
plt.plot(AUC_CI[:,1])
plt.ylabel('AUC')
plt.xlabel('Number of proteins')
plt.grid()
plt.title('{}-Year Mortality'.format(k+1))
if save_plot:
    figure_name = '{}_{}_NumProtein_forward_AUC_y{}.png'.format(endpoint,dataset,k)
    plt.savefig(folder+plots+plot_folder+figure_name)
    # plt.savefig(folder+plots+'{}_{}_NumProtein_forward_noGDF15_AUC_y{}.png'.format(endpoint,dataset,k))


In [None]:
# AUC on the test set of the forward-selection models for horizon index k = 9,
# plotted against the model index 0..nump-1.
# k=k_plot
# k=1
k = 9
k2 = k + 1

nump = 101
K = [k]

# One R_pROC_AUC result per forward-selection model.
AUC_CI = []
for k in K:
    for n_proteins in range(nump):
        forward_key = '{}_y{}_agesex_forwardp{}_l2'.format(dataset,k,n_proteins)
        # Re-index the stored predictions by the test samples and take the single column.
        pred = pd.DataFrame(pred_test_dict[forward_key], index=I_test)[0]
        AUC_CI.append(R_pROC_AUC(y[k][I_test], pred))
AUC_CI = np.array(AUC_CI)

fig = plt.figure(figsize=[8,5])
# fig.add_subplot(1,2,1)

# Column 1 of each result is plotted as the AUC.
plt.plot(AUC_CI[:,1])
plt.ylabel('AUC')
plt.xlabel('Number of proteins')
plt.grid()
plt.title('{}-Year Mortality'.format(k+1))
if save_plot:
    figure_name = '{}_{}_NumProtein_forward_AUC_y{}.png'.format(endpoint,dataset,k)
    plt.savefig(folder+plots+plot_folder+figure_name)
    # plt.savefig(folder+plots+'{}_{}_NumProtein_forward_noGDF15_AUC_y{}.png'.format(endpoint,dataset,k))

## Experiments with baseline features
     - Mor_Trad_20210106.ipynb has the functions to plot the baseline experiments 

## Cross validation results 
    - See Mor_predict_cv_results.ipynb to plot the cross-validation experiments

## Protein correlation heatmap
    - See Mor_top_protein.ipynb