In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, log_loss
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE, RFECV, SelectPercentile, SelectFpr, SelectFdr, SelectFwe
import matplotlib.pyplot as plt
import pickle
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import sys
sys.path.append('/odinn/users/thjodbjorge/Python_functions/')
import Predict_functions as pf
from Calculate_score import calculate_metrics

In [None]:
# Locations used when reading and writing results, plus the protein
# correction mode that the preprocessing cells branch on.
folder = "/odinn/users/thjodbjorge/Proteomics/Mortality2/"
feat_folder = "Features2/"
pred_folder = "Predictions3/"
# Alternative correction mode: 'sitesampleageqt'
corr_type = "None"


In [None]:
# Protein measurements: either the raw table (default) or the pre-computed
# quantile-transformed table, depending on corr_type.
if corr_type != 'qt':
    print('Load raw protein values')
    raw_data = pd.read_csv(
        '/odinn/users/thjodbjorge/Proteomics/Data/raw_with_info.csv',
        index_col='Barcode2d',
    )
else:
    print('Load qt transformed proteins')
    proteins = pd.read_csv(
        '/odinn/users/thjodbjorge/Proteomics/Data/protein_data/protein_qt.csv',
        index_col='Barcode2d',
    )

# Probe annotation, indexed by SeqId.
probe_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/probe_info.csv',
    index_col='SeqId',
)

# Per-sample information, indexed by Barcode2d.
pn_info = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/pn_info_Mor/pn_info_Mor_event.csv',
    index_col='Barcode2d',
)
# Names of the probe columns that are dropped before modelling.
probes_to_skip = pd.read_csv(
    '/odinn/users/thjodbjorge/Proteomics/Data/probes_to_skip.txt'
)['probe']

In [None]:

# pQTL data are only needed for the 'pqtl' correction mode.
if corr_type == 'pqtl':
    pqtl_protein = pd.read_csv(
        '/odinn/users/egilf/pQTL/for_benedikt/pQTL_conditional_04052020.gor',
        sep='\t',
    )
    # Earlier source, kept for reference:
    # '/odinn/users/steinthora/proteomics/proteomic_project/Data/pQTL_Merged_08052020.csv'
    pqtl = pd.read_csv(
        '/odinn/users/thjodbjorge/Proteomics/pqtl/pqtl_combined_meanimp.csv',
        index_col='PN',
    )

    # Re-index the genotype table by sample: join on PN, then discard the key.
    pqtl = pd.merge(pn_info['PN'], pqtl, left_on='PN', right_index=True).drop('PN', axis=1)

    # For every protein column, list the sentinel markers recorded for its
    # SeqId (column name without its first six characters, '-' -> '_').
    pro_pqtl = {
        probe: list(
            pqtl_protein[pqtl_protein.SeqId == probe[6:].replace('-', '_')]['SentinelMarker']
        )
        for probe in raw_data.iloc[:, 16:].columns
    }

In [None]:
# Column of X that the models below are trained to predict.
endpoint = 'age'

# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise ValueError if it is set together with shuffle=False, so it
# is omitted here. The folds are contiguous and deterministic either way.
kf = KFold(n_splits=10, shuffle=False)
I_train_main, I_test_main = train_test_split(pn_info.index, train_size=0.7, random_state = 10)
# I_val_main, I_test_main = train_test_split(I_test_main, train_size=0.5, random_state = 10)


# Sample sets to keep, keyed by dataset name.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(folder+"{}_keep_samples.pkl".format('Mor'),'rb') as file:
    keep_samples_dict = pickle.load(file)

# print(keep_samples_dict.keys())
# keep_samples_keys = ['Old_18105', 'Old_60105', 'Old_6080','Old_18105_C', 'Old_18105_I', 'Old_18105_J', 'Old_18105_G','Old_18105_Other']
keep_samples_keys = ['Old_18105']#,'Old_60105']
# keep_samples_keys = ['Old_18105_Neoplasms','Old_18105_I','Old_18105_J','Old_18105_G','Old_18105_Other']

# Number of principal components removed when corr_type == 'PCA'.
skip_PC = 5

In [None]:
def _remove_covariate_effects(train, test, targets, covariates):
    """Replace each target column by its residual after OLS on `covariates`.

    The regression (with an added constant) is fitted on `train` only; the
    fitted effect is then subtracted from both `train` and `test`.
    Both frames are modified in place.
    """
    # The design matrices are the same for every target, so build them once.
    design_train = sm.add_constant(train[covariates])
    design_test = sm.add_constant(test[covariates])
    for p in targets:
        corr_model = sm.OLS(train[p], design_train).fit()
        train[p] = train[p] - corr_model.predict(design_train)
        test[p] = test[p] - corr_model.predict(design_test)


for dataset in keep_samples_keys:

    print(dataset)
    keep_samples = keep_samples_dict[dataset]

    I_train = I_train_main.intersection(keep_samples)#.intersection(have_prs)
    I_test = I_test_main.intersection(keep_samples)#.intersection(have_prs)

    # ### Select data and normalize
    if corr_type == 'qt':
        # Work on a copy: the covariate columns added below must not end up
        # in `proteins`, otherwise a second pass through this loop would
        # treat them as protein columns.
        X = proteins.copy()
    else:
        X = np.log(raw_data.iloc[:,16:].drop(probes_to_skip,axis=1))

    all_protein = X.columns
    # NOTE(review): `.values` assigns by position, so these two lines assume
    # pn_info has the same rows in the same order as X -- confirm.
    X['sex'] = pn_info[['sex']].values-1
    X['age'] = pn_info[['Age_at_sample_collection_2']].values

    X['age2'] = X['age']**2
#     X['age3'] = X['age']**3
    X['agesex'] = X['age']*X['sex']
    X['age2sex'] = X['age2']*X['sex']

    agesex = ['age','sex','agesex','age2','age2sex']

    # Missing PAD values are filled with the mean PAD of the split
    # (train or test) the sample belongs to.
    X['PAD'] = pn_info['PAD']
    no_bmi = (X['PAD'].isna())
    no_bmi_ind = X[no_bmi].index
    X.loc[I_train.intersection(no_bmi_ind),'PAD'] = X.loc[I_train].PAD.mean()
    X.loc[I_test.intersection(no_bmi_ind),'PAD'] = X.loc[I_test].PAD.mean()

    X['site'] = (pn_info['site'] == 'DC').astype(int)
    X['Sample_age'] = pn_info['Sample_age']

    # Optional: PAD2 = previously saved age prediction minus age.
    # Kept best-effort, but the reason for skipping is now printed and the
    # file handle is always closed.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    try:
        print('Load age dictonary')
        with open(folder+pred_folder+"age_predict.pkl",'rb') as file:
            age_dict = pickle.load(file)
        PAD2 = age_dict['{}_sexprotein_lasso'.format(dataset)][4]-X.age
        X['PAD2'] = PAD2
    except Exception as err:
        print('No file to load')
        print(repr(err))

    # Explicit copies so the per-column assignments below act on independent
    # frames rather than on slices of X.
    X_train = X.loc[I_train].copy()
    X_test = X.loc[I_test].copy()

    if corr_type == 'sitesampleage':
        print('Correct proteins fro site and sample age')
        _remove_covariate_effects(X_train, X_test, all_protein, ['site','Sample_age'])
        print('Correction done')

    if corr_type == 'sitesampleageqt':
        print('Correct proteins for site and sample age')
        _remove_covariate_effects(X_train, X_test, all_protein, ['site','Sample_age'])
        print('Normalize proteins')
        transformer = QuantileTransformer(n_quantiles=50000, output_distribution = 'normal',random_state=10)
        transformer.fit(X_train[all_protein])
        X_train[all_protein] = transformer.transform(X_train[all_protein])
        X_test[all_protein] = transformer.transform(X_test[all_protein])

    if corr_type == 'batch':
        # NOTE(review): `batch_var` is not defined anywhere in the visible
        # notebook; this branch raises NameError unless it is defined
        # elsewhere before this cell runs.
        _remove_covariate_effects(X_train, X_test, all_protein, batch_var)

    if corr_type == 'PCA':
        # Remove the part of the protein data explained by the first skip_PC
        # principal components (fitted on the training set only).
        pca1 = PCA(skip_PC)
        x_pca1 = pca1.fit_transform(X_train[all_protein])
        x_1 = pca1.inverse_transform(x_pca1)
        X_train[all_protein] = X_train[all_protein] - x_1

        x_pca1 = pca1.transform(X_test[all_protein])
        x_1 = pca1.inverse_transform(x_pca1)
        X_test[all_protein] = X_test[all_protein] - x_1

    # Standardise every column using training-set statistics only.
    train_mean = X_train.mean()
    train_std = X_train.std()

    X_train = (X_train-train_mean)/train_std
    X_test = (X_test-train_mean)/train_std

    print('Done')

    # Start from the saved predictions if they exist. Only a missing file
    # resets the dictionary: any other failure (e.g. a corrupt pickle) is
    # allowed to surface, because silently continuing with an empty dict
    # would let the later save overwrite the existing results.
    print('Load prediction dictonary')
    try:
        with open(folder+pred_folder+"{}_predict.pkl".format(endpoint),'rb') as file:
            pred_dict = pickle.load(file)
    except FileNotFoundError:
        print('No file to load')
        pred_dict = {}

In [None]:
# Inspect the standardised PAD2 column of the training set.
X_train.loc[:, 'PAD2']

In [None]:
# Correlation between the two PAD columns in the training set.
pearsonr(X_train['PAD'], X_train['PAD2'])

In [None]:
# Targets come from X (not the standardised frames), so they stay on the
# original scale of the endpoint column.
y_train, y_test = (X.loc[rows, endpoint] for rows in (I_train, I_test))

In [None]:
# Features: sex plus every protein column; the lasso penalty is chosen by
# 5-fold cross-validation.
feat = ['sex', *all_protein]
model = LassoCV(cv=5, n_jobs=-1)
model.fit(X_train[feat], y_train)


In [None]:
# Mean cross-validated MSE across folds for every alpha that was tried.
mean_cv_mse = model.mse_path_.mean(axis=1)
plt.plot(model.alphas_, mean_cv_mse)

# Score (as returned by model.score) on the training set, then the test set.
train_score = model.score(X_train[feat], y_train)
print(train_score)
test_score = model.score(X_test[feat], y_test)
print(test_score)

In [None]:
# Standardise the selected features of every sample in X with the
# training-set mean and standard deviation.
feat_mean, feat_std = train_mean[feat], train_std[feat]
(X[feat] - feat_mean) / feat_std

In [None]:


# Predictions for the training split, the test split, and every sample in X
# (the latter standardised with the training-set statistics first).
X_all_scaled = (X[feat] - train_mean[feat]) / train_std[feat]

train_pred = model.predict(X_train[feat])
test_pred = model.predict(X_test[feat])
pred_all = model.predict(X_all_scaled)

In [None]:
# Observed target (x-axis) against prediction (y-axis), training split.
plt.scatter(x=y_train, y=train_pred)

In [None]:
# Observed target (x-axis) against prediction (y-axis), test split.
plt.scatter(x=y_test, y=test_pred)

In [None]:
# Pearson correlation between observed and predicted target, training split.
pearsonr(x=y_train, y=train_pred)

In [None]:
# Prediction for every sample in X plotted against age.
print(endpoint)
plt.scatter(X['age'], pred_all)

In [None]:
# Store the fitted model with its predictions and feature list under a
# dataset-specific key; the tuple order is
# (model, train_pred, test_pred, feat, pred_all).
result_key = '{}_sexprotein_lasso'.format(dataset)
pred_dict[result_key] = (model, train_pred, test_pred, feat, pred_all)

In [None]:
# Persist the prediction dictionary for this endpoint. The context manager
# closes the file even if pickle.dump raises.
with open(folder+pred_folder+"{}_predict.pkl".format(endpoint),"wb") as f:
    pickle.dump(pred_dict,f)

In [None]:
# Correlation of (target + PAD) with the training predictions.
pad_train = X.loc[I_train, 'PAD']
pearsonr(y_train + pad_train, train_pred)

In [None]:
# Correlation of PAD with the training-set prediction error
# (prediction minus target).
train_error = train_pred - y_train
pearsonr(X.loc[I_train, 'PAD'], train_error)

In [None]:
# Features that the lasso kept (non-zero coefficient).
has_nonzero_coef = np.abs(model.coef_) > 0
pd.DataFrame(feat)[has_nonzero_coef]

In [None]:
# f = open(folder+pred_folder+"{}_predict.pkl".format(endpoint),"wb")
# pickle.dump(pred_dict,f)
# f.close()

#### Model from the paper

In [None]:
# UniProt accessions of the proteins in the model from the paper, as a
# single-column table.
paper_uniprot_ids = [
    'P00533', 'P61769', 'Q2UY09', 'P49755', 'Q76LX8',
    'Q9GZX9', 'Q9H4F8', 'P07949', 'Q92626', 'Q9Y5H3',
    'P01034', 'P19438', 'Q8WWX9', 'P20333', 'Q01974',
    'Q96DX5', 'Q9BXY4', 'P21757', 'P07998', 'Q99988',
    'O00300', 'P45379', 'Q13790', 'Q01995', 'Q12805',
    'Q4LDE5', 'Q9NP99', 'Q9H5V8', 'O76076', 'Q2I0M5',
    'O95633', 'Q96GP6', 'Q9BU40', 'P41222', 'P21246',
]
df = pd.DataFrame(paper_uniprot_ids, columns=['UniProt'])


In [None]:
# Display the single-column table of UniProt accessions.
df

In [None]:
# UniProt column of the probe annotation table (indexed by SeqId).
probe_info['UniProt']

In [None]:
# Look up the probe(s) for each UniProt accession; the left join keeps
# accessions that have no matching probe.
probe_lookup = probe_info.reset_index()[['SeqId', 'UniProt', 'TargetFullName']]
df.merge(probe_lookup, how='left', on='UniProt')

In [None]:
# Probe annotation reduced to identifier, UniProt accession and target name.
lookup_columns = ['SeqId', 'UniProt', 'TargetFullName']
probe_info.reset_index()[lookup_columns]