In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, log_loss
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE, RFECV, SelectPercentile, SelectFpr, SelectFdr, SelectFwe
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectFdr
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import pickle
from sksurv.metrics import concordance_index_censored
import lifelines as ll
# from lifelines.utils.sklearn_adapter import sklearn_adapter
# CoxRegression = sklearn_adapter(ll.CoxPHFitter, event_col = 'event')
import sys
sys.path.append('/odinn/users/thjodbjorge/Python_functions/')
import Predict_functions as pf
from Calculate_score import calculate_metrics, make_class_table
from R_functions import R_pROC,R_pROC_compareROC,R_pROC_compareROC_boot, R_pROC_AUC, R_timeROC, R_timeROC_CI, R_timeROC_pval, R_NRIbin,R_NRIcens,R_NRIcensipw, R_censROC, R_hoslem, R_Greenwood_Nam

# Raw measurement table, indexed by the 'Barcode2d' column.
raw_data = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/raw_with_info.csv',
    index_col='Barcode2d',
)
# Probe annotation table, indexed by 'SeqId'.
probe_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/probe_info.csv',
    index_col='SeqId',
)

# Per-sample information (events, covariates), indexed like raw_data by 'Barcode2d'.
pn_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/pn_info_Mor/pn_info_Mor_event.csv',
    index_col='Barcode2d',
)

# Probes excluded from the analysis: the explicit skip list plus the
# non-protein probes that were included in the raw data.
skip_list = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/probes_to_skip.txt')['probe']
nopro = pd.read_csv('/odinn/users/thjodbjorge/Proteomics/Data/no_protein_probes.txt', header=None)[0]
probes_to_skip = set(skip_list) | set(nopro)

In [None]:
# --- Output locations; the sub-folders are appended to `folder` ---
folder = '/odinn/users/thjodbjorge/Proteomics/Mortality2/'
feat_folder = 'Features2/'
pred_folder = 'Predictions3/'
plots = 'Plots2/'
save_plot = True

endpoints = ['death']
# endpoints = ['death','Cdeath','Gdeath','Ideath','Jdeath','Otherdeath']
# event_date = event_date_death
time_to_event = pn_info.time_to_death
no_event_before = pn_info.no_death_before

# Cause-specific endpoints restrict the death event to one ICD group;
# 'Otherdeath' is a death in none of these groups.
icd_group_by_endpoint = {'Cdeath': 'C', 'Gdeath': 'G', 'Ideath': 'I', 'Jdeath': 'J'}
for endpoint in endpoints:
    if endpoint == 'death':
        use_event = pn_info.event_death
    elif endpoint in icd_group_by_endpoint:
        use_event = pn_info.event_death & (pn_info.ICD_group == icd_group_by_endpoint[endpoint])
    elif endpoint == 'Otherdeath':
        use_event = pn_info.event_death & ~pn_info.ICD_group.isin(list(icd_group_by_endpoint.values()))
    else:
        # Unknown endpoint names are ignored, as before.
        continue
    print(use_event.sum())
# NOTE: `endpoint` and `use_event` keep the values of the LAST entry in
# `endpoints`; the rest of the notebook relies on these two globals.

# y[i] is the binary label "event within i+1 years" for i = 0..17.
y = [use_event & (time_to_event <= years) for years in range(1, 19)]

# random_state is only meaningful together with shuffle=True; passing it with
# shuffle=False raises ValueError in current scikit-learn. The folds are unchanged.
kf = KFold(n_splits=10, shuffle=False)
I_train_main, I_test_main = train_test_split(pn_info.index, train_size=0.7, random_state = 10)
# I_val_main, I_test_main = train_test_split(I_test_main, train_size=0.5, random_state = 10)


# Mapping dataset name -> samples to keep; the context manager closes the file.
with open(folder+"{}_keep_samples.pkl".format('Mor'),'rb') as file:
    keep_samples_dict = pickle.load(file)

dataset = 'Old_60105'


In [None]:
do_prediction = True
if do_prediction:

    # Restrict the global train/test split to the samples kept for `dataset`.
    keep_samples = keep_samples_dict[dataset]
    I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
    I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

    # Split sizes and event counts; y[n-1] is the label "event within n years".
    # NOTE(review): the messages say 'MI' although the labels come from use_event.
    print('Training set: {}, MI within 15: {}, 10: {}, 5: {}, 2: {}'.format(
        len(I_train), *[y[horizon][I_train].sum() for horizon in (14, 9, 4, 1)]))
    print('Test set: {}, MI within 15: {}, 10: {}, 5: {}, 2: {}'.format(
        len(I_test), *[y[horizon][I_test].sum() for horizon in (14, 9, 4, 1)]))

    def _fill_missing_with_split_mean(column):
        """Fill NaNs in X[column]: training rows get the training mean, test rows the test mean.

        NOTE(review): test rows are imputed with the TEST-set mean, not the
        training mean -- confirm this is intended.
        """
        missing_rows = X[X[column].isna()].index
        X.loc[I_train.intersection(missing_rows), column] = X.loc[I_train, column].mean()
        X.loc[I_test.intersection(missing_rows), column] = X.loc[I_test, column].mean()

    # ### Select data and normalize
    # Natural log of the raw_data columns from position 16 onward, minus skipped probes.
    X = np.log(raw_data.iloc[:,16:].drop(probes_to_skip,axis=1))
    all_protein = X.columns

    # Age/sex terms.
    # NOTE(review): `.values` assigns by position, so these two lines rely on
    # pn_info and raw_data having the same row order -- confirm.
    X['sex'] = pn_info[['sex']].values-1
    X['age'] = pn_info[['Age_at_sample_collection_2']].values
    X['age2'] = X['age']**2
#     X['age3'] = X['age']**3
    X['agesex'] = X['age']*X['sex']
    X['age2sex'] = X['age2']*X['sex']

    agesex = ['age','sex','agesex','age2','age2sex']

    X['lnage'] = np.log(X['age'])
    X['lnage2'] = X['lnage']**2

    X['PAD'] = pn_info['PAD']
    _fill_missing_with_split_mean('PAD')

    # Clinical covariates.
    X['CAD'] = ~pn_info.no_CAD_before
    X['MI'] = ~pn_info.no_MI_before
    X['cancer'] = pn_info.cancer_margin
    X['ApoB'] = X['SeqId.2797-56']
    X['Smoker'] = pn_info['Smoker'].astype(int).values
    X['diabetes'] = pn_info['T2D'].astype(int).values
    X['HTN_treated'] = pn_info[['HTN_treated']].astype(int).values
#     X['statin'] = pn_info['statin'].astype(int).values
    X['statin'] = pn_info['statin_estimate_unsure'].astype(int).values
    X['ApoBstatin']  = X['ApoB']*X['statin']

    X['cancer1y']  = pn_info['cancer1y']
    X['cancer5y']  = pn_info['cancer5y']

    X['GDF15'] = X['SeqId.4374-45'].copy()
    X['GDF152'] = X['GDF15']**2
    X['GDF15age']  = X['GDF15']*X['age']
    X['GDF15sex']  = X['GDF15']*X['sex']

    X['bmi'] = pn_info['bmi']
#     X.loc[no_bmi_ind,'bmi'] = X.loc[I_train].bmi.mean()
    _fill_missing_with_split_mean('bmi')
    X['bmi2'] = X['bmi']*X['bmi']

    # Blood measurements: print the number of missing values, then impute.
    X['Platelets'] = pn_info['Platelets']
    print(X['Platelets'].isna().sum())
    _fill_missing_with_split_mean('Platelets')
    X['Platelets2'] = X['Platelets']*X['Platelets']

    X['Creatinine'] = pn_info['Creatinine']
    print(X['Creatinine'].isna().sum())
    _fill_missing_with_split_mean('Creatinine')

    X['Triglycerides'] = pn_info['Triglycerides']
    print(X['Triglycerides'].isna().sum())
    _fill_missing_with_split_mean('Triglycerides')

    # Interaction terms; column '<base><other>' is X[base]*X[other].
    # The iteration order reproduces the original column order.
    for base, other in [('bmi','age'),('bmi','sex'),('bmi2','age'),('bmi2','sex'),('PAD','age'),('PAD','sex')]:
        X[base+other] = X[base]*X[other]

    for base in ['ApoB','Smoker','diabetes','statin','CAD','MI','HTN_treated','cancer',
                 'Platelets','Creatinine','Triglycerides','Platelets2','cancer1y','cancer5y']:
        X[base+'age'] = X[base]*X['age']

    for base in ['ApoB','Smoker','diabetes','statin','CAD','MI','HTN_treated','cancer',
                 'Platelets','Creatinine','Triglycerides']:
        X[base+'sex'] = X[base]*X['sex']

    # Age-bin dummies (first level dropped) and their interactions with age and sex.
    X = X.join(pd.get_dummies(pn_info['agebin'],drop_first = True,prefix='age'))
    X['ageage2'] = X['age']*X['age_2.0']
    X['ageage3'] = X['age']*X['age_3.0']
    X['ageage4'] = X['age']*X['age_4.0']

    agebins = ['age_2.0','age_3.0','age_4.0', 'ageage2','ageage3','ageage4']
    agebinssex = [s+'sex' for s in agebins]
    X[agebinssex] = (X[agebins].transpose()*X['sex']).transpose()

    # Polygenic risk score columns copied from pn_info.
    PRS = ['nonHDL_prs', 'HT_prs', 'CAD_prs', 'Cancer_prs', 'Stroke2_prs', 'alz_Jansen',
       'pgc_adhd_2017', 'PD_Nalls_2018', 'edu_160125', 'dep_2018', 'bpd_2018',
       'giant_bmi', 'schizo_clozuk', 'iq_2018', 'ipsych_pgc_aut_2017',
       'pgc_Anorexia_2019']
    X[PRS] = pn_info[PRS]

    # Named feature groups.
    trad = ['ApoB','Smoker','diabetes','HTN_treated','statin','CAD','MI','bmi','bmi2']
    tradage = ['ApoBage','Smokerage','diabetesage','CADage','MIage','HTN_treatedage','bmiage']
    tradsex = ['ApoBsex','Smokersex','diabetessex','CADsex','MIsex','HTN_treatedsex','bmisex']

    tradcoxR = ['Smoker','Smokersex','diabetes','diabetesage','HTN_treated','HTN_treatedage','MI','MIage','CAD','bmi','bmiage','statin','statinage']
    tradextralog = ['Smokersex','diabetessex','CADsex','CADage','MIage','HTN_treatedage','bmiage','statinage','bmi2age','ApoBstatin']
    tradextralognosex = ['CADage','MIage','HTN_treatedage','bmiage']
    tradblood = ['Creatinine','Triglycerides','Platelets','Platelets2','Plateletsage','Creatinineage','Platelets2age']
    tradcancer = ['cancer','cancerage']
    extra_cancer = ['cancer1y','cancer5y','cancer1yage','cancer5yage']

    # Standardize both splits with the TRAINING mean and standard deviation.
    X_train = X.loc[I_train]
    X_test = X.loc[I_test]
    train_mean = X_train.mean()
    train_std = X_train.std()
    X_train = (X_train-train_mean)/train_std
    X_test = (X_test-train_mean)/train_std

    ## For survival analysis (added after scaling, so 'event' is not standardized)
    X_train['event'] = use_event[I_train]
    X_test['event'] = use_event[I_test]

    tte_train = time_to_event[I_train]
    tte_test = time_to_event[I_test]

    ysurv_train = pd.DataFrame({
        'event': use_event[I_train],
        'time_to_event': time_to_event[I_train],
    })


In [None]:
# dataset = 'Old_6080'
# try: 
#     file = open(folder+pred_folder+"{}_{}_predict.pkl".format(endpoint,dataset),'rb')
#     pred_dict = pickle.load(file)
# except:
#     pred_dict = {}
# pred_dict.keys()

In [None]:
# Index into the label list `y` used for the plots; `k` starts out equal to it.
k = k_plot = 9


## Bootstrap selection


In [None]:

# Saved feature selections for this endpoint/dataset (file closed on exit).
with open(folder+feat_folder+"{}_{}_features.pkl".format(endpoint,dataset),'rb') as file:
    features_dict = pickle.load(file)

#         boruta = sorted(features_dict['{}_boruta_y{}'.format(dataset,k)])
# Saved bootstrap results for this endpoint/dataset.
with open(folder+pred_folder + "{}_{}_bootstrap.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)

# Absolute values of the bootstrap array at index 0 of its second axis.
# NOTE(review): presumably rows are bootstrap replicates and columns are
# model coefficients -- confirm against the code that wrote the pickle.
boot = np.abs(np.array(bootstrap_dict['{}_y{}_asprotein_l1_boot'.format(dataset,k)])[:,0,:])
feat =  sorted(features_dict['{}_boruta_y{}'.format(dataset,k)])


plot_folder = '{}_{}/'.format(endpoint,dataset)

keep_samples = keep_samples_dict[dataset]

I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

# Binary labels for horizon index k on each split.
y_train = y[k][I_train]
y_test= y[k][I_test]

print(bootstrap_dict.keys())

In [None]:
# Per column of `boot`: how often the value is non-zero, and its mean magnitude.
num = np.sum(boot>0,axis=0)
rank = np.mean(boot,axis=0)
# Only the last len(feat) columns are paired with `feat`.
# NOTE(review): presumably the leading columns belong to the age/sex terms -- confirm.
df_feat = pd.DataFrame([feat, num[-len(feat):], rank[-len(feat):]]).transpose()
pd.options.display.max_rows = 100
# Order by selection count (column 1), ties broken by mean magnitude (column 2).
# The second sort must be stable to keep the tie-break order from the first;
# the default 'quicksort' is not stable, so 'mergesort' is requested explicitly.
feat_sorted = df_feat.sort_values(2,ascending=False).sort_values(1, ascending=False, kind='mergesort')
nump = len(feat_sorted)
print(nump)
feat_sorted.head(10)

#### Training

In [None]:
do_training = False

# Resume from previously saved models when the pickle exists. Only a missing
# or truncated file falls back to an empty dict; other errors are raised so
# that existing results are not silently overwritten later.
try:
    with open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}

if do_training:
    k=k_plot
    # Model i uses the age/sex terms plus the i top-ranked features.
    for i in range(0,100):
        print(feat_sorted.iloc[:i,0])
        try:
            feat = list(agesex) + list(feat_sorted.iloc[:i,0])

            pred_dict['{}_y{}_agesex_l1bootp{}_elnet'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl1l2')
            # Checkpoint after every model so an interrupted run keeps its progress.
            with open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb") as f:
                pickle.dump(pred_dict,f)
            print('Done i = ',i)
        except Exception as e:
            # Best effort: report the failure and continue with the next model size.
            print('Fail boruta', i)
            print(e)
pred_dict.keys()

In [None]:
# Drop the first six characters of each feature name to obtain the key used
# in probe_info (presumably the 'SeqId.' prefix, as in 'SeqId.4374-45').
prefix_length = 6
feat_sorted_ind = [name[prefix_length:] for name in feat_sorted[0]]
# pd.DataFrame([feat_sorted[0] probe_info.loc[feat_sorted_ind,'Target']]).transpose()
target_names = probe_info.loc[feat_sorted_ind,'Target']
feat_sorted['Target'] = target_names.values
feat_sorted.head(20)

## Bootstrap 1000

In [None]:

# Saved feature selections for this endpoint/dataset (file closed on exit).
with open(folder+feat_folder+"{}_{}_features.pkl".format(endpoint,dataset),'rb') as file:
    features_dict = pickle.load(file)

#         boruta = sorted(features_dict['{}_boruta_y{}'.format(dataset,k)])
# Saved bootstrap results for this endpoint/dataset.
with open(folder+pred_folder + "{}_{}_bootstrap.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)

# Absolute values of the 'boot1000' array at index 0 of its second axis.
# NOTE(review): presumably rows are bootstrap replicates and columns are
# model coefficients -- confirm against the code that wrote the pickle.
boot = np.abs(np.array(bootstrap_dict['{}_y{}_asprotein_l1_boot1000'.format(dataset,k)])[:,0,:])
feat =  sorted(features_dict['{}_boruta_y{}'.format(dataset,k)])


plot_folder = '{}_{}/'.format(endpoint,dataset)

keep_samples = keep_samples_dict[dataset]

I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

# Binary labels for horizon index k on each split.
y_train = y[k][I_train]
y_test= y[k][I_test]

print(bootstrap_dict.keys())

In [None]:
# Per column of `boot`: how often the value is non-zero, and its mean magnitude.
num = np.sum(boot>0,axis=0)
rank = np.mean(boot,axis=0)
# Only the last len(feat) columns are paired with `feat`.
# NOTE(review): presumably the leading columns belong to the age/sex terms -- confirm.
df_feat = pd.DataFrame([feat, num[-len(feat):], rank[-len(feat):]]).transpose()
pd.options.display.max_rows = 100
# Order by selection count (column 1), ties broken by mean magnitude (column 2).
# The second sort must be stable to keep the tie-break order from the first;
# the default 'quicksort' is not stable, so 'mergesort' is requested explicitly.
feat_sorted = df_feat.sort_values(2,ascending=False).sort_values(1, ascending=False, kind='mergesort')
nump = len(feat_sorted)
print(nump)
feat_sorted.head(10)

#### Training

In [None]:
do_training = True

# Resume from previously saved models when the pickle exists. Only a missing
# or truncated file falls back to an empty dict; other errors are raised so
# that existing results are not silently overwritten later.
try:
    with open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}

if do_training:
    k=k_plot
    # Model i uses the age/sex terms plus the i top-ranked features.
    for i in range(0,100):
        print(feat_sorted.iloc[:i,0])
        try:
            feat = list(agesex) + list(feat_sorted.iloc[:i,0])

            pred_dict['{}_y{}_agesex_l1boot1000p{}_l2'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl2')
            # Checkpoint after every model so an interrupted run keeps its progress.
            with open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb") as f:
                pickle.dump(pred_dict,f)
            print('Done i = ',i)
        except Exception as e:
            # Best effort: report the failure and continue with the next model size.
            print('Fail boruta', i)
            print(e)
pred_dict.keys()

In [None]:
# Drop the first six characters of each feature name to obtain the key used
# in probe_info (presumably the 'SeqId.' prefix, as in 'SeqId.4374-45').
prefix_length = 6
feat_sorted_ind = [name[prefix_length:] for name in feat_sorted[0]]
# pd.DataFrame([feat_sorted[0] probe_info.loc[feat_sorted_ind,'Target']]).transpose()
# New feat_sorted column -> probe_info column it is copied from (order preserved).
annotation_columns = {
    'Target': 'Target',
    'Target_name': 'TargetFullName',
    'Gene_name': 'GeneName',
    'Uniprot': 'UniProt Full Name',
    'Uniprot2': 'Uniprot Short Names',
}
for new_column, probe_column in annotation_columns.items():
    feat_sorted[new_column] = probe_info.loc[feat_sorted_ind,probe_column].values
feat_sorted.head(20)

## Forward selection

In [None]:
# NOTE(review): `dataset` changes here, but X_train/X_test were built for the
# dataset selected earlier in the notebook -- confirm they match before training.
dataset = 'Old_18105'
# Forward-selection results; the context manager closes the file.
with open(folder+pred_folder + "{}_{}_forward.pkl".format(endpoint,dataset),'rb') as file:
# with open(folder+pred_folder + "{}_{}_forward_noGDF15.pkl".format(endpoint,dataset),'rb') as file:
# with open(folder+pred_folder + "{}_{}_forward_noGDF15HE4.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)
bootstrap_dict.keys()

In [None]:
# Pick the forward-selection result for horizon index k.
# NOTE(review): k is set to 4 here, while the training cell below resets
# k = k_plot -- confirm which horizon's feature order is intended.
# k = k_plot
k=4
boot = bootstrap_dict['{}_y{}_asprotein_lr'.format(dataset,k)]
# boot = bootstrap_dict['{}_y{}_baselineprotein_lr'.format(dataset,k)]
# boot = bootstrap_dict['Old_18105_asprotein_lr_boot']

In [None]:
# Element 1 of the stored result is turned into a table.
# Column 0 is used as the ordered feature list (presumably in selection order -- confirm).
boot_df = pd.DataFrame(boot[1])
# boot_df.set_index(0,inplace=True)
# boot_df.sort_values(1)
display(boot_df.head())
feat_sorted = boot_df[0]

In [None]:
# Display the ordered feature list.
# boot_df.sort_values(1)
# feat_sorted = list(boot_df.mean(axis=1).sort_values().index)
feat_sorted

In [None]:

# try: 
#     file = open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb')
#     pred_dict = pickle.load(file)
# except:
#     print('No test predictions')
#     pred_dict = {}
# pred_dict.keys()

#### Training

In [None]:
do_training = True

# Resume from previously saved models when the pickle exists. Only a missing
# or truncated file falls back to an empty dict; other errors are raised so
# that existing results are not silently overwritten later.
try:
    with open(folder+pred_folder + "{}_{}_fewp_noGDF15_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}
print(pred_dict.keys())

if do_training:
    k=k_plot
    y_train = y[k][I_train]
    y_test = y[k][I_test]
    print(k)
    # Model i uses the age/sex terms plus the first i forward-selected features.
    for i in range(0,101):
        print(feat_sorted[:i])
        try:
            feat = list(agesex) + list(feat_sorted[:i])

            pred_dict['{}_y{}_agesex_forwardp{}_l2'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl2')
            # Checkpoint after every model so an interrupted run keeps its progress.
#             with open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb") as f:
            with open(folder+pred_folder+"{}_{}_fewp_noGDF15_predict.pkl".format(endpoint,dataset),"wb") as f:
                pickle.dump(pred_dict,f)
            print('Done i = ',i)
        except Exception as e:
            # Best effort: report the failure and continue with the next model size.
            print('Fail boruta', i)
            print(e)
pred_dict.keys()

In [None]:
# do_training = True

# try: 
#     file = open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb')
# #     file = open(folder+pred_folder + "{}_{}_fewpnoGDF15_predict.pkl".format(endpoint,dataset),'rb')
#     pred_dict = pickle.load(file)
# except:
#     print('No test predictions')
#     pred_dict = {}
# print(pred_dict.keys())

# if do_training:
#     for k in [0,2,3,5,6,7,8,10,11,12,13,14]:
#         print(k)
#         boot = bootstrap_dict['{}_y{}_asprotein_lr'.format(dataset,k)]
#         boot_df = pd.DataFrame(boot[1])
#         display(boot_df.head())
#         feat_sorted = boot_df[0]
        
#         y_train = y[k][I_train]
#         y_test = y[k][I_test]    
#         print(k)
#         for i in range(0,101):
#             print(feat_sorted[:i])  
#             try:
#                 feat = []        
#                 feat.extend(agesex)
#                 feat.extend(feat_sorted[:i])

#                 pred_dict['{}_y{}_agesex_forwardp{}_l2'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl2')
#                 f = open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb")
# #                 f = open(folder+pred_folder+"{}_{}_fewpnoGDF15_predict.pkl".format(endpoint,dataset),"wb")
#                 pickle.dump(pred_dict,f)
#                 f.close()
#                 print('Done i = ',i)
#             except Exception as e:
#                 print('Fail boruta', i)
#                 print(e)
# pred_dict.keys()

## Forward boot

In [None]:
# Forward-selection results; the context manager closes the file.
with open(folder+pred_folder + "{}_{}_forward.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)

bootstrap_dict.keys()

In [None]:
# Bootstrapped forward-selection result for the current dataset and horizon index k.
# boot = bootstrap_dict['Old_18105_asprotein_lr']
boot = bootstrap_dict['{}_y{}_asprotein_lr_boot'.format(dataset,k)]

In [None]:
# Display only: rows ordered by column 1 (the result is not stored).
boot.sort_values(1)

In [None]:
# Display only: row-wise mean over the columns, in ascending order.
boot.mean(axis=1).sort_values()

In [None]:
# boot.sort_values(1)
# Features ordered by ascending row-wise mean of `boot`.
row_means = boot.mean(axis=1)
feat_sorted = list(row_means.sort_values().index)
feat_sorted

#### Training

In [None]:
do_training = True

# Resume from previously saved models when the pickle exists. Only a missing
# or truncated file falls back to an empty dict; other errors are raised so
# that existing results are not silently overwritten later.
try:
    with open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}

if do_training:
    k=k_plot
    y_train = y[k][I_train]
    y_test = y[k][I_test]

    # Model i uses the age/sex terms plus the first i features of feat_sorted.
    for i in range(0,100):
        print(feat_sorted[:i])
        try:
            feat = list(agesex) + list(feat_sorted[:i])

            pred_dict['{}_y{}_agesex_forward_bootp{}_l2'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl2')
            # Checkpoint after every model so an interrupted run keeps its progress.
            with open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb") as f:
                pickle.dump(pred_dict,f)
            print('Done i = ',i)
        except Exception as e:
            # Best effort: report the failure and continue with the next model size.
            print('Fail boruta', i)
            print(e)
pred_dict.keys()

In [None]:
# Drop the first six characters of each feature name (presumably the 'SeqId.'
# prefix) to obtain probe_info keys, then show feature names next to the
# 'Target' annotation.
feat_sorted_ind = [f[6:] for f in feat_sorted]

pd.DataFrame([feat_sorted, probe_info.loc[feat_sorted_ind,'Target']]).transpose()

## Backwards selection

In [None]:
# The RFE ("backward") results are read from the forward-selection pickle;
# the context manager closes the file.
with open(folder+pred_folder + "{}_{}_forward.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)
bootstrap_dict.keys()

In [None]:
# RFE result for the current dataset and horizon index k.
feat = bootstrap_dict['{}_y{}_asprotein_RFElr'.format(dataset,k)]

In [None]:
# Display the RFE result.
feat

In [None]:
# Skip the first five rows and order the remaining entries by column 0.
# NOTE(review): presumably the first five rows are the age/sex terms and
# column 0 holds the RFE ranking -- confirm.
feat_sorted = feat[5:].sort_values(0).index
feat_sorted

In [None]:
do_training = True

# Resume from previously saved models when the pickle exists. Only a missing
# or truncated file falls back to an empty dict; other errors are raised so
# that existing results are not silently overwritten later.
try:
    with open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}

if do_training:
    k=k_plot
    y_train = y[k][I_train]
    y_test = y[k][I_test]

    # Model i uses the age/sex terms plus the first i features of feat_sorted.
    for i in range(0,100):
        print(feat_sorted[:i])
        try:
            feat = list(agesex) + list(feat_sorted[:i])

            pred_dict['{}_y{}_agesex_backward_p{}_l2'.format(dataset,k,i)] = pf.predict(feat=feat,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,model_type='lrl2')
            # Checkpoint after every model so an interrupted run keeps its progress.
            with open(folder+pred_folder+"{}_{}_fewp_predict.pkl".format(endpoint,dataset),"wb") as f:
                pickle.dump(pred_dict,f)
            print('Done i = ',i)
        except Exception as e:
            # Best effort: report the failure and continue with the next model size.
            print('Fail boruta', i)
            print(e)
pred_dict.keys()

## Testing

In [None]:
do_prediction = True

# Trained models. Only a missing or truncated file falls back to an empty dict.
try:
#     with open(folder+pred_folder + "{}_{}_fewp_predict.pkl".format(endpoint,dataset),'rb') as file:
    with open(folder+pred_folder + "{}_{}_fewp_noGDF15_predict.pkl".format(endpoint,dataset),'rb') as file:
        pred_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_dict = {}

# Previously saved test-set predictions, extended below.
try:
#     with open(folder+pred_folder + "{}_{}_fewp_test_prediction.pkl".format(endpoint,dataset),'rb') as file:
    with open(folder+pred_folder + "{}_{}_fewp_noGDF15_test_prediction.pkl".format(endpoint,dataset),'rb') as file:
        pred_test_dict = pickle.load(file)
except (FileNotFoundError, EOFError):
    print('No test predictions')
    pred_test_dict = {}

keep_samples = keep_samples_dict[dataset]

I_train = I_train_main.intersection(keep_samples)
I_test = I_test_main.intersection(keep_samples)

# k = k_plot
# k=9
if do_prediction:
#     y_train = y[k][I_train]
#     y_test = y[k][I_test]

#             model_key = '{}_boruta_y9_coxl2'.format(dataset)
    for model_key in pred_dict.keys():
        print(model_key)
        # Each stored entry is indexed: element 0 is used as the fitted model,
        # element 3 as its feature list.
        model = pred_dict[model_key][0]
        feat = pred_dict[model_key][3]

        # Select rows by I_test explicitly: downstream cells index the stored
        # arrays with I_test, so the row order must match it. This fails
        # instead of silently misaligning if X_test was built for another dataset.
        pred_test = model.predict_proba(X_test.loc[I_test, feat])[:,1]
        pred_test_dict[model_key] = pred_test

    with open(folder+pred_folder+"{}_{}_fewp_noGDF15_test_prediction.pkl".format(endpoint,dataset),"wb") as f:
        pickle.dump(pred_test_dict,f)
    print('Done')

In [None]:
VERY_SMALL_SIZE = 12
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 16

# (rc group, settings) pairs, applied in this order.
rc_font_settings = (
    ('font', {'size': SMALL_SIZE}),               # default text size
    ('axes', {'titlesize': BIGGER_SIZE}),         # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),         # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),         # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),         # y tick labels
    ('legend', {'fontsize': VERY_SMALL_SIZE}),    # legend
    ('figure', {'titlesize': BIGGER_SIZE}),       # figure title
)
for rc_group, rc_kwargs in rc_font_settings:
    plt.rc(rc_group, **rc_kwargs)

### Forward selection testing

In [None]:
k=k_plot
# k=1
k2=k+1

# Number of models evaluated (0..100 added features).
nump = 101
K = [k]
# Result arrays; only column 0 is filled.
NRI_low = np.zeros([nump,5])
NRI_middle =  np.zeros([nump,5])
NRI_high =  np.zeros([nump,5])
IDI =  np.zeros([nump,5])
AUC =  np.zeros([nump,5])
AUC_CI = []
# These lists stay empty: the R_NRIbin confidence-interval computation is disabled.
NRI_low_CI = []
NRI_middle_CI = []
NRI_high_CI = []
out_list = []
for j,k in enumerate(K):
    # Labels and the baseline (model with 0 added features) do not depend on i,
    # so they are built once per horizon instead of once per model.
    y_true = y[k][I_test]
    baseline = pd.DataFrame(pred_test_dict['{}_y{}_agesex_forwardp0_l2'.format(dataset,k)],index=I_test)[0]
    l = 0
    for i in range(nump):
        pred = pd.DataFrame(pred_test_dict['{}_y{}_agesex_forwardp{}_l2'.format(dataset,k,i)],index=I_test)[0]

        # The three calls differ only in the risk threshold (0.3 / 0.5 / 0.7).
        # Indices into the result follow the column names used for res_df below:
        # 0 = AUC, 3 = IDI, 4 = NRI.
        out = calculate_metrics(pred,baseline,y_true,bins=[0,0.3,1])
        out_list.append(out)
        NRI_low[i,l] = out[4]
        NRI_middle[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.5,1])[4]
        NRI_high[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.7,1])[4]
        IDI[i,l] = out[3]
        AUC[i,l] = out[0]

        AUC_CI.append(R_pROC_AUC(y_true,pred))
AUC_CI = np.array(AUC_CI)


In [None]:
# Forward-selection results of the noGDF15 run; the context manager closes the file.
with open(folder+pred_folder + "{}_{}_forward_noGDF15.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)
boot = bootstrap_dict['{}_y{}_asprotein_lr'.format(dataset,k)]

# Element 1 of the result as a table, ordered by column 1.
boot = pd.DataFrame(boot[1])
boot = boot.sort_values(1)

In [None]:
# One row per model; the column names label the elements returned by calculate_metrics.
# NOTE(review): 'Bries' looks like a typo for 'Brier'; left unchanged because
# the name ends up in the exported CSV header.
metric_columns = ['AUC', 'Bries', 'logloss', 'IDI', 'NRI', 'AP', 'NRI_events', 'NRI_ctrl']
res_df = pd.DataFrame(out_list,columns = metric_columns)
# R_pROC_AUC columns: 0 -> lower bound, 1 -> AUC, 2 -> upper bound.
for column_name, ci_position in (('AUC_pROC', 1), ('AUC_pROC_low', 0), ('AUC_pROC_high', 2)):
    res_df[column_name] = AUC_CI[:,ci_position]
# Row 0 is the baseline without added proteins, hence the leading empty string.
added_proteins = list(boot[0])
res_df['protein'] =  [''] + added_proteins
res_df

In [None]:
# Write the results table to the plots folder (noGDF15 variant of the file name).
# res_df.to_csv(folder+plots+'{}_{}_NumProtein_forward_y{}.csv'.format(endpoint,dataset,k))
res_df.to_csv(folder+plots+'{}_{}_NumProtein_forward_noGDF15_y{}.csv'.format(endpoint,dataset,k))

In [None]:

# Figure 1: test AUC (middle column of AUC_CI) against the number of added proteins.
fig = plt.figure(figsize=[8,5])
auc_ax = fig.add_subplot(1,1,1)
auc_ax.plot(AUC_CI[:,1])
auc_ax.set_ylabel('AUC')
auc_ax.set_xlabel('Number of proteins')
auc_ax.grid()
auc_ax.set_title('{}-Year Mortality'.format(k+1))

# fig.savefig(folder+plots+'{}_{}_NumProtein_forward_AUC_y{}.png'.format(endpoint,dataset,k))
fig.savefig(folder+plots+'{}_{}_NumProtein_forward_noGDF15_AUC_y{}.png'.format(endpoint,dataset,k))

# Figure 2 (not saved): NRI at the three risk thresholds.
nri_fig = plt.figure()
nri_ax = nri_fig.add_subplot(1,1,1)
for nri_values in (NRI_low, NRI_middle, NRI_high):
    nri_ax.plot(nri_values[:,0])
# nri_ax.plot(IDI)
nri_ax.set_ylabel('NRI')
nri_ax.set_xlabel('Number of proteins')
nri_ax.grid()
nri_ax.legend(['threshold = 0.3','threshold = 0.5','threshold=0.7'])
nri_ax.set_title('Age+sex+protein')

print(AUC_CI[:,1])

## 1000 bootstrap testing

In [None]:
k=k_plot
k2=k+1

# Number of models evaluated (0..99 added features).
nump = 100
K = [k_plot]
# Result arrays; only column 0 is filled.
NRI_low = np.zeros([nump,5])
NRI_middle =  np.zeros([nump,5])
NRI_high =  np.zeros([nump,5])
IDI =  np.zeros([nump,5])
AUC =  np.zeros([nump,5])
AUC_CI = []
# These lists stay empty: the R_NRIbin confidence-interval computation is disabled.
NRI_low_CI = []
NRI_middle_CI = []
NRI_high_CI = []
out_list = []
for j,k in enumerate(K):
    # Labels and the baseline (model with 0 added features) do not depend on i,
    # so they are built once per horizon instead of once per model.
    y_true = y[k][I_test]
    baseline = pd.DataFrame(pred_test_dict['{}_y{}_agesex_l1boot1000p0_l2'.format(dataset,k)],index=I_test)[0]
    l = 0
    for i in range(nump):
        pred = pd.DataFrame(pred_test_dict['{}_y{}_agesex_l1boot1000p{}_l2'.format(dataset,k,i)],index=I_test)[0]

        # The three calls differ only in the risk threshold (0.3 / 0.5 / 0.7).
        # out[0] is stored as AUC, out[3] as IDI and out[4] as NRI.
        out = calculate_metrics(pred,baseline,y_true,bins=[0,0.3,1])
        out_list.append(out)
        NRI_low[i,l] = out[4]
        NRI_middle[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.5,1])[4]
        NRI_high[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.7,1])[4]
        IDI[i,l] = out[3]
        AUC[i,l] = out[0]

        AUC_CI.append(R_pROC_AUC(y_true,pred))
AUC_CI = np.array(AUC_CI)


In [None]:

# Two panels: AUC (left) and NRI at three thresholds (right) against the number of proteins.
fig = plt.figure(figsize=[16,5])

auc_ax = fig.add_subplot(1,2,1)
auc_ax.plot(AUC_CI[:,1])
# auc_ax.fill_between(range(AUC_CI.shape[0]),AUC_CI[:,0], AUC_CI[:,2], alpha=0.1)
auc_ax.set_ylabel('AUC')
auc_ax.set_xlabel('Number of proteins')
auc_ax.grid()
auc_ax.set_title('Age+sex+protein')

nri_ax = fig.add_subplot(1,2,2)
for nri_values in (NRI_low, NRI_middle, NRI_high):
    nri_ax.plot(nri_values[:,0])
# nri_ax.plot(IDI)
nri_ax.set_ylabel('NRI')
nri_ax.set_xlabel('Number of proteins')
nri_ax.grid()
nri_ax.legend(['threshold = 0.3','threshold = 0.5','threshold=0.7'])
nri_ax.set_title('Age+sex+protein')

fig.savefig(folder+plots+'{}_{}_NumProtein_Lassoboot_y{}.png'.format(endpoint,dataset,k))

In [None]:
k=k_plot
# k=9
k2=k+1

# Number of models evaluated (0..99 added features).
nump = 100
K = [k]
# Result arrays; only column 0 is filled.
NRI_low = np.zeros([nump,5])
NRI_middle =  np.zeros([nump,5])
NRI_high =  np.zeros([nump,5])
IDI =  np.zeros([nump,5])
AUC =  np.zeros([nump,5])
AUC_CI = []
# These lists stay empty: the R_NRIbin confidence-interval computation is disabled.
NRI_low_CI = []
NRI_middle_CI = []
NRI_high_CI = []
out_list = []
for j,k in enumerate(K):
    # Labels and the baseline (model with 0 added features) do not depend on i,
    # so they are built once per horizon instead of once per model.
    y_true = y[k][I_test]
    baseline = pd.DataFrame(pred_test_dict['{}_y{}_agesex_backward_p0_l2'.format(dataset,k)],index=I_test)[0]
    l = 0
    for i in range(nump):
        pred = pd.DataFrame(pred_test_dict['{}_y{}_agesex_backward_p{}_l2'.format(dataset,k,i)],index=I_test)[0]

        # The three calls differ only in the risk threshold (0.3 / 0.5 / 0.7).
        # out[0] is stored as AUC, out[3] as IDI and out[4] as NRI.
        out = calculate_metrics(pred,baseline,y_true,bins=[0,0.3,1])
        out_list.append(out)
        NRI_low[i,l] = out[4]
        NRI_middle[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.5,1])[4]
        NRI_high[i,l] = calculate_metrics(pred,baseline,y_true,bins=[0,0.7,1])[4]
        IDI[i,l] = out[3]
        AUC[i,l] = out[0]

        AUC_CI.append(R_pROC_AUC(y_true,pred))
AUC_CI = np.array(AUC_CI)


In [None]:

# Two panels: AUC (left) and NRI at three thresholds (right) against the number of proteins.
fig = plt.figure(figsize=[16,5])

auc_ax = fig.add_subplot(1,2,1)
auc_ax.plot(AUC_CI[:,1])
# auc_ax.fill_between(range(AUC_CI.shape[0]),AUC_CI[:,0], AUC_CI[:,2], alpha=0.1)
auc_ax.set_ylabel('AUC')
auc_ax.set_xlabel('Number of proteins')
auc_ax.grid()
auc_ax.set_title('Age+sex+protein')

nri_ax = fig.add_subplot(1,2,2)
for nri_values in (NRI_low, NRI_middle, NRI_high):
    nri_ax.plot(nri_values[:,0])
# nri_ax.plot(IDI)
nri_ax.set_ylabel('NRI')
nri_ax.set_xlabel('Number of proteins')
nri_ax.grid()
nri_ax.legend(['threshold = 0.3','threshold = 0.5','threshold=0.7'])
nri_ax.set_title('Age+sex+protein')

fig.savefig(folder+plots+'{}_{}_NumProtein_backward_y{}.png'.format(endpoint,dataset,k))

print(AUC_CI[:,1])

### Test statistics metrics forward selection

In [None]:
# Forward-selection results; the context manager closes the file.
with open(folder+pred_folder + "{}_{}_forward.pkl".format(endpoint,dataset),'rb') as file:
    bootstrap_dict = pickle.load(file)
k = 9
print(k)
print(bootstrap_dict.keys())
boot = bootstrap_dict['{}_y{}_asprotein_lr'.format(dataset,k)]
# Element 1 of the result as a table indexed by its first column
# (non-inplace, so re-running the cell gives the same result).
boot_df = pd.DataFrame(boot[1]).set_index(0)

In [None]:
# Features ordered by ascending row-wise mean of boot_df.
# (A preceding `boot_df.sort_values(1)` whose result was discarded has been removed.)
feat_sorted = list(boot_df.mean(axis=1).sort_values().index)
feat_sorted

In [None]:
# Fit one unpenalized logistic regression per model size on the training set
# and record log-likelihood, LLR p-value, AIC and BIC.
res_list = []
metrics_list = []
y_train = y[k][I_train]
for i in range(0,101):
    selected = feat_sorted[:i]
    print(selected)

    # Age/sex terms plus the first i features.
    feat = list(agesex) + list(selected)

    design = sm.add_constant(X_train[feat])
    model = sm.Logit(y_train,design)
    res = model.fit(disp=0)
    metrics_list.append([res.llf, res.llr_pvalue, res.aic,res.bic])
    res_list.append(res)

In [None]:
# BIC (column 3 of metrics_list) against the number of proteins, with the minimum marked.
bic_values = np.array(metrics_list)[:,3]
plt.plot(bic_values)
plt.xlabel('Number of proteins')
plt.ylabel('BIC')
plt.scatter(np.argmin(bic_values), np.min(bic_values))
# Title typo fixed ('Traing' -> 'Training').
plt.title('Training set {} year prediction'.format(k+1))
plt.savefig(folder+plots+'{}_{}_NumProtein_forward_BIC_y{}.png'.format(endpoint,dataset,k))

In [None]:
# AIC (column 2 of metrics_list) against the number of proteins.
plt.plot(np.array(metrics_list)[:,2])

In [None]:
# Model size with the lowest BIC, and the features that model uses.
best_size = np.argmin(np.array(metrics_list)[:,3])
print(best_size)
(feat_sorted[:best_size])


In [None]:
# List the feature selections stored in the loaded features pickle.
print(features_dict.keys())