In [None]:
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, log_loss
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LinearRegression
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE, RFECV, SelectPercentile, SelectFpr, SelectFdr, SelectFwe
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sksurv.metrics import concordance_index_censored
import matplotlib.pyplot as plt
import pickle
import sys
sys.path.append('/odinn/users/thjodbjorge/Python_functions/')
# import Predict_functions as pf
from Calculate_score import calculate_metrics

In [None]:
# Load the probe and sample annotation tables, choose the endpoint, build the
# per-horizon binary outcomes and define the train/validation/test splits.
# raw_data = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/raw_with_info.csv',index_col = 'Barcode2d' )
probe_info = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/probe_info.csv', index_col = 'SeqId')

pn_info = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/pn_info_Mor/pn_info_Mor_event.csv',index_col = 'Barcode2d' )
probes_to_skip = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/probes_to_skip.txt')['probe']

# Project folder and the sub-folders (relative to it) used by the cells below.
folder = '/odinn/users/thjodbjorge/Proteomics/Mortality2/'
feat_folder = 'Features2/'
pred_folder = 'Predictions_cv2/'
pred_folder_update = 'Predictions4/'
plots = 'Plots5_plots_in_paper/'


endpoints = ['death']
# endpoints = ['death','Cdeath','Gdeath','Ideath','Jdeath','Otherdeath']
time_to_event = pn_info.time_to_death
no_event_before = pn_info.no_death_before

# ICD group that defines each cause-specific endpoint.
icd_group_by_endpoint = {'Cdeath': 'C', 'Gdeath': 'G', 'Ideath': 'I', 'Jdeath': 'J'}

# NOTE: `endpoint` and `use_event` deliberately outlive this loop; the cells
# below use the values left by the last endpoint in `endpoints`.
for endpoint in endpoints:
    if endpoint == 'death':
        use_event = pn_info.event_death
    elif endpoint in icd_group_by_endpoint:
        use_event = pn_info.event_death & (pn_info.ICD_group == icd_group_by_endpoint[endpoint])
    elif endpoint == 'Otherdeath':
        # Deaths whose ICD group is none of the cause-specific groups above.
        use_event = pn_info.event_death & ~pn_info.ICD_group.isin(list(icd_group_by_endpoint.values()))
    else:
        # Previously an unknown endpoint silently reused the previous `use_event`.
        raise ValueError('Unknown endpoint: {}'.format(endpoint))
    print(use_event.sum())


# y[i] flags an event with time_to_event <= i+1 (i = 0..17); the plot titles
# below report horizon k as "k+1 Year".
y = [use_event & (time_to_event <= horizon) for horizon in range(1, 19)]

# shuffle=False gives deterministic contiguous folds. random_state is omitted
# because it has no effect without shuffling and recent scikit-learn versions
# raise a ValueError when it is combined with shuffle=False.
kf = KFold(n_splits=10, shuffle=False)
# 70% train, then the remaining 30% is split evenly into validation and test.
I_train_main, I_test_main = train_test_split(pn_info.index, train_size=0.7, random_state = 10)
I_val_main, I_test_main = train_test_split(I_test_main, train_size=0.5, random_state = 10)


# Sample sets to keep, keyed by dataset name.
# NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
with open(folder+"{}_keep_samples.pkl".format('Mor'),'rb') as file:
    keep_samples_dict = pickle.load(file)

print(keep_samples_dict.keys())
# keep_samples_keys = ['Old_18105', 'Old_60105', 'Old_6080','Old_18105_C', 'Old_18105_I', 'Old_18105_J', 'Old_18105_G','Old_18105_Other']
# keep_samples_keys = ['Old_6080']

In [None]:
# Load the cross-validated predictions for the chosen dataset and overlay the
# updated predictions on top of them.
dataset = 'Old_18105'
plot_folder = '{}_{}/'.format(endpoint,dataset)
print(plot_folder)

pred_file_name = "{}_{}_predict_cv.pkl".format(endpoint,dataset)

# NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
with open(folder+pred_folder+pred_file_name,'rb') as file:
    pred_dict_cv = pickle.load(file)

with open(folder+pred_folder_update+pred_file_name,'rb') as file:
    pred_dict_cv_update = pickle.load(file)

# Earlier versions of this cell also merged "..._predict_cv_corr.pkl",
# "..._predict_cv_MLP.pkl", "..._predict_cv_SVM.pkl" and "..._predict_cv_XGB.pkl"
# from pred_folder in the same way:
# pred_dict_cv = {**pred_dict_cv, **pred_dict_cv_MLP,**pred_dict_cv_XGB}

# Print the same slice before and after the merge to see whether the update
# replaced this model's predictions.
check_key = '{}_baseline2_lr_y9'.format(dataset)
print(pred_dict_cv[check_key][2][0][:5])
for key,value in pred_dict_cv_update.items():
    print(key)
    # Entries from the update folder take precedence over the original ones.
    pred_dict_cv[key] = value
print(pred_dict_cv[check_key][2][0][:5])

In [None]:
# Score every model in pred_dict_cv on each CV fold of the training set and
# store the per-fold scores in "<endpoint>_<dataset>_score_cv.pkl".
k=9
keep_samples = keep_samples_dict[dataset]

I_train = I_train_main.intersection(keep_samples)
I_test = I_test_main.intersection(keep_samples)
tte_train = time_to_event[I_train]
y_train = y[k][I_train]

score_dict = {}

# NOTE(review): every key is scored against y[k] for the k set above, even if
# the key name carries a different horizon (e.g. '..._y4'). Confirm this is
# intended before comparing models across horizons.
keys = pred_dict_cv.keys()
# keys = ['PCEprs_4080_boruta_y9_coxl2','PCEprs_4080_boruta_y9_coxl2end']
# keys=['{}_all_y{}_lrl1'.format(dataset,k)]
print(keys)

# Risk-category edges passed to calculate_metrics.
bins = [0,0.075,1]

# Reference model whose predictions calculate_metrics receives as second argument.
# base_key = '{}_agesex_lr_y{}'.format(dataset,k)
base_key = '{}_agesex_cox_y{}'.format(dataset,k)

# The folds do not depend on the model (shuffle=False), so split only once.
folds = list(kf.split(I_train))

for key in keys:
    out = []
    for fold_num, (train_index, test_index) in enumerate(folds):
        fold = pred_dict_cv[key][2][fold_num]
        numf = pred_dict_cv[key][0][fold_num]
        y_test = y_train.iloc[test_index]
        tte_test = tte_train.iloc[test_index]
        base_pred = np.array(pred_dict_cv[base_key][2][fold_num][0]).reshape(-1)

        # One flat row per fold: for each case, len(numf[i]), then the values
        # returned by calculate_metrics, then the concordance index.
        fold_scores = []
        for i,case in enumerate(fold):
            pred = np.array(case).reshape(-1)
            fold_scores.append(len(numf[i]))
            fold_scores.extend(calculate_metrics(pred,base_pred,y_test,bins=bins))
            fold_scores.append(concordance_index_censored(y_test,tte_test,pred)[0])

        out.append(fold_scores)
    score_dict[key] = out

# The context manager closes the file even if pickling fails.
with open(folder+pred_folder+"{}_{}_score_cv.pkl".format(endpoint,dataset),"wb") as f:
    pickle.dump(score_dict,f)

In [None]:
# Reload the saved CV scores (falling back to the in-memory score_dict) and
# scatter the fold-averaged scores of every model for horizon k.
try:
    # NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
    with open(folder+pred_folder+"{}_{}_score_cv.pkl".format(endpoint,dataset),'rb') as file:
        score_dict = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError) as err:
    # Missing or unreadable file: keep whatever score_dict is already in memory.
    print('No score dict', err)


# Keep the keys that carry the tag 'y<k>' not followed by another digit, so
# that e.g. k=1 does not also select 'y10'..'y18'.
horizon_tag = 'y{}'.format(k)
keys = [key for key in score_dict.keys()
        if any(not tail[:1].isdigit() for tail in key.split(horizon_tag)[1:])]

# Column 0 is plotted on the x axis; columns 1 and 2 are the ones named 'AUC'
# and 'Bries' (Brier) when df_res is built further down.
for metric_col, metric_label in [(1, 'AUC'), (2, 'Brier score')]:
    for key in keys:
        mean_scores = np.array(score_dict[key]).mean(axis=0)
        plt.scatter(mean_scores[0], mean_scores[metric_col])
    plt.xlabel('Num')
    plt.ylabel(metric_label)
    lgd = plt.legend(keys,loc='center left', bbox_to_anchor=(1.0, 0.5))
    # plt.savefig(folder+"Predictions_cv/{}_{}_AUC_y{}.png".format(endpoint,dataset,k), bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.show()

In [None]:
# Reload the saved CV scores (falling back to the in-memory score_dict) and
# build df_res: one row per model with its fold-averaged scores for horizon k.
try:
    # NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
    with open(folder+pred_folder+"{}_{}_score_cv.pkl".format(endpoint,dataset),'rb') as file:
        score_dict = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError) as err:
    # Missing or unreadable file: keep whatever score_dict is already in memory.
    print('No score dict', err)


# keys = ['PCEprs_4080_boruta_y9_coxl2','PCEprs_4080_boruta_y9_coxl2end']
# Keep the keys that carry the tag 'y<k>' not followed by another digit, so
# that e.g. k=1 does not also select 'y10'..'y18'.
horizon_tag = 'y{}'.format(k)
keys = [key for key in score_dict.keys()
        if any(not tail[:1].isdigit() for tail in key.split(horizon_tag)[1:])]

# One column per model, built in a single step instead of growing the frame.
## AUC, Bries, logloss, IDI, NRI, AP, NRI_events, NRI_ctrl
df = pd.DataFrame({key: np.array(score_dict[key]).mean(axis=0) for key in keys})
df_res = df.transpose()
# Assumes one case per fold, i.e. ten score values per model.
df_res.columns = ['Num','AUC', 'Bries', 'logloss', 'IDI', 'NRI', 'AP', 'NRI_events', 'NRI_ctrl','Cindex']

In [None]:
# Raise the row limit so the whole model table is displayed, then show it
# ordered by AUC.
pd.set_option('display.max_rows', 999)
df_res.sort_values(by='AUC')
# Other columns to sort by: 'Cindex', 'logloss', 'Bries', 'IDI', 'AP', 'NRI'

In [None]:
# Font sizes shared by all figures below.
SMALL_SIZE = 14
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# (rc group, settings) pairs applied in order to matplotlib's global defaults.
rc_overrides = [
    ('font', {'size': SMALL_SIZE}),           # default text size
    ('axes', {'titlesize': BIGGER_SIZE}),     # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),     # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),     # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),     # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),     # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),   # figure title
]
for rc_group, rc_settings in rc_overrides:
    plt.rc(rc_group, **rc_settings)


### PRS

In [None]:
# Models compared in the PRS figures: (feature-set tag used in the prediction
# key, label shown on the plot). All are the '_lr' models at horizon k.
prs_models = [
    ('agesex', 'Age+sex'),
    ('agesexprs', 'Age+sex+PRS'),
    ('baseline2', 'Baseline'),
    ('baseline2prs', 'Baseline+PRS'),
]
keys = ['{}_{}_lr_y{}'.format(dataset, feature_tag, k) for feature_tag, _ in prs_models]
key_names = [plot_label for _, plot_label in prs_models]

# Alternative comparison used earlier (requires k = 9):
# keys = ['{}_tradstatprsprotein_coxelnet'.format(dataset),'{}_tradstatprsprotein_elnet'.format(dataset),'{}_tradstatprsprotein_mlp'.format(dataset),'{}_tradstatprsprotein_xgb'.format(dataset)]
# key_names = ['Cox elnet','Elnet','MLP','XGB']

In [None]:
# Box plots of the per-fold CV scores for the models in `keys`.
# Score columns follow the df_res naming: 1 = AUC, 2 = Brier, 4 = IDI.

# --- AUC (saved to the plot folder) ---
auc_by_model = [np.array(score_dict[key])[:,1] for key in keys]
plt.figure(figsize=[8,6])
plt.boxplot(auc_by_model,labels = key_names)
# Overlay the mean over folds for each model.
plt.scatter(range(1,len(keys)+1), np.mean(np.array(auc_by_model),axis=1))
plt.grid()
plt.ylabel('AUC')
plt.xlabel('Model')
plt.title(' CV {}-Year Mortality'.format(k+1))
plt.savefig(folder+plots+plot_folder+'{}_{}_baselineprscv_AUC_y{}.png'.format(endpoint,dataset,k),bbox_inches="tight")
plt.show()

# --- Brier score (not saved) ---
brier_by_model = [np.array(score_dict[key])[:,2] for key in keys]
plt.figure(figsize=[12,6])
plt.boxplot(brier_by_model,labels = key_names)
plt.grid()
plt.ylabel('Brier score')
plt.xlabel('Model')
plt.title(' CV {}-Year Mortality'.format(k+1))
plt.show()

# --- IDI (not saved) ---
idi_by_model = [np.array(score_dict[key])[:,4] for key in keys]
plt.figure(figsize=[10,6])
plt.boxplot(idi_by_model,labels = key_names)
plt.grid()
plt.ylabel('IDI')
plt.xlabel('Model')
plt.title(' CV {}-Year Mortality'.format(k+1))
plt.show()

### Comparison of modeling methods for protein model

In [None]:
# Models compared in the method-comparison figures: (method tag used in the
# prediction key, label shown on the plot), all on the 'agesexboruta' features.
# NOTE(review): this switches k to 4, but score_dict holds whatever the scoring
# cell computed for its own k; confirm the scores were produced for this horizon.
k=4
method_models = [
    ('elnet', 'Elnet'),
    ('l1', 'L1'),
    ('l2', 'L2'),
    ('coxelnet', 'Cox Elnet'),
    ('xgb', 'XGB'),
    ('mlp', 'MLP'),
]
keys = ['{}_agesexboruta_{}_y{}'.format(dataset, method_tag, k) for method_tag, _ in method_models]
key_names = [plot_label for _, plot_label in method_models]

In [None]:
# Box plots of the per-fold CV scores for the modelling methods in `keys`,
# one figure per metric, each saved to the plot folder.

# Tick label = model name plus the value in score column 0 of the first fold
# (shown as "# proteins" on the x axis).
key_num = [name + '\n {:0.0f}'.format(np.array(score_dict[key])[:,0][0])
           for key, name in zip(keys, key_names)]

# (score column, y-axis label, tag used in the saved file name).
# Columns follow the df_res naming: 1 = AUC, 2 = Brier, 6 = AP, -1 = C-index.
metric_panels = [
    (1, 'AUC', 'AUC'),
    (2, 'Brier score', 'Bries'),
    (-1, 'C-index', 'Cind'),
    (6, 'Average Precision', 'AP'),
]

for metric_col, metric_label, file_tag in metric_panels:
    scores_by_model = [np.array(score_dict[key])[:,metric_col] for key in keys]

    plt.figure(figsize=[8,6])
    # Every panel uses the same tick labels so they match the x-axis label.
    plt.boxplot(scores_by_model,labels = key_num)
    # Overlay the mean over folds for each model.
    plt.scatter(range(1,len(keys)+1), np.mean(np.array(scores_by_model),axis=1))
    plt.ylabel(metric_label)
    plt.xlabel('Model / # proteins')
    plt.title(' CV {} Year Mortality'.format(k+1))
    plt.grid()
    plt.savefig(folder+plots +plot_folder+'{}_{}_modelscv_all_{}_y{}.png'.format(endpoint,dataset,file_tag,k),bbox_inches="tight")
    plt.show()