In [2]:
import pandas as pd
from utils import *

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Both supplementary tables come from the same workbook and are read with the same
# layout option: header=3 takes the column names from the fourth spreadsheet row.
gadd_sheets = pd.read_excel(
    'Data/Other_Biomarkers/Gadd_Suppl.xlsx',
    sheet_name=['Supplementary Table 12', 'Supplementary Table 17'],
    header=3,
)
gadd_allprot = gadd_sheets['Supplementary Table 12']
gadd_allprot_met = gadd_sheets['Supplementary Table 17']

In [8]:
# Number of rows listed under each ProteinScore label
gadd_allprot.loc[:, 'ProteinScore'].value_counts()

ProteinScore
Death                            201
Type 2 diabetes                  143
Ischaemic heart disease          136
COPD                             111
Liver disease                     93
Parkinson's disease               62
Vascular dementia                 42
Ischaemic stroke                  36
Alzheimer's dementia              36
Rheumatoid arthritis              33
Prostate cancer                   33
Colorectal cancer                 27
Amyotrophic lateral sclerosis     10
Breast cancer                      9
Cystitis                           8
Inflammatory bowel disease         7
Gynaecological cancer              7
Lung cancer                        6
Endometriosis                      5
Name: count, dtype: int64

In [4]:
# Keep only the rows whose ProteinScore is "Death". `.copy()` makes the subset an
# independent frame, so rewriting a column below never writes into a slice of `gadd_allprot`.
gadd_allprot_death = gadd_allprot[gadd_allprot['ProteinScore'] == "Death"].copy()
# Keep only the part of each protein name that precedes the first '.'
gadd_allprot_death['Protein'] = gadd_allprot_death['Protein'].str.split('.').str[0]
gadd_allprot_death

Unnamed: 0,Protein,Coefficient,ProteinScore
363,NPPB,0.013310,Death
364,DUOX2,0.009098,Death
365,SERPINB5,0.006486,Death
366,TSPAN1,0.051822,Death
367,ACTA2,0.019484,Death
...,...,...,...
559,GDNF,0.013876,Death
560,NEFL,0.212153,Death
561,KIRREL2,-0.013689,Death
562,AGR2,0.060899,Death


In [24]:
# Load the protein table and the NMR table
prots_set3 = pd.read_csv('Data/Other_Biomarkers/proteins_set3_imputed_other_biomarkers.csv')
NMR = pd.read_csv('Data/NMR_NotDerived.csv')

# Inner join on 'eid' keeps only the rows present in both tables;
# all column names are then lower-cased.
set3_prot_met = (
    prots_set3
    .merge(NMR, how='inner', on='eid')
    .rename(columns=str.lower)
)
set3_prot_met

Unnamed: 0,eid,a1bg,aamdc,aarsd1,abca2,abhd14b,abl1,abo,abraxas2,acaa1,...,m_hdl_p,m_hdl_pl,m_hdl_ce,m_hdl_fc,m_hdl_tg,s_hdl_p,s_hdl_pl,s_hdl_ce,s_hdl_fc,s_hdl_tg
0,1001997,0.3519,-0.01845,-0.72940,-0.206350,-0.08290,-1.5240,-0.0793,0.122200,0.69460,...,0.002145,0.30031,0.24383,0.037758,0.035120,0.007090,0.48011,0.25286,0.072361,0.038235
1,1002479,0.1195,0.62735,0.49685,-0.151700,0.84135,0.5843,-0.8762,0.186800,1.41210,...,0.003652,0.45491,0.40935,0.078638,0.038392,0.010595,0.68297,0.37739,0.114740,0.043757
2,1021769,-0.0050,-0.90435,-0.52115,0.193700,-0.47845,-0.3828,1.4314,-0.685900,-1.55405,...,0.003943,0.49751,0.40121,0.091757,0.069824,0.007478,0.58024,0.23912,0.103330,0.052798
3,1062421,-0.0128,-0.07355,-0.16825,-0.067533,-0.22740,-0.9587,0.9043,-0.599444,-0.47050,...,0.003229,0.41625,0.35149,0.068696,0.045558,0.009655,0.64296,0.33265,0.108430,0.051783
4,1065061,-0.0840,0.07045,-0.08035,-0.196200,0.00460,-0.2503,-2.1431,-0.109700,-0.65270,...,0.004887,0.56913,0.53882,0.119360,0.028059,0.009508,0.63051,0.33452,0.117470,0.023365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,6010736,-0.2907,0.92735,0.58480,-0.806650,0.86960,1.5103,0.0935,1.337500,0.12450,...,0.004481,0.56810,0.48260,0.104090,0.055275,0.010293,0.72557,0.35699,0.123040,0.045874
2156,6011825,-0.1062,0.17305,0.12625,-0.281500,0.05955,0.0193,-1.5305,0.624100,-0.04370,...,0.003958,0.47461,0.43833,0.087894,0.038621,0.009880,0.63914,0.34799,0.114600,0.039757
2157,6019414,-0.1140,-0.42655,-0.05650,-0.045250,-0.70250,-0.8900,-1.5465,-1.036500,-0.30770,...,0.005076,0.58833,0.57818,0.120440,0.021411,0.010462,0.67141,0.37377,0.123520,0.024533
2158,6021354,0.0256,-0.19625,-0.21925,-0.667800,-0.16385,-0.2491,-1.6394,-0.765100,-0.42735,...,0.003519,0.42887,0.40256,0.079400,0.025101,0.010119,0.62084,0.36107,0.111260,0.030732


In [16]:
def calculate_biomarkers(protein_df, coefficients_df, name_col, weight_col):
    """Compute a weighted sum score for every row of ``protein_df``.

    Each column of ``protein_df`` whose name matches an entry of
    ``coefficients_df[name_col]`` is multiplied by the corresponding value of
    ``coefficients_df[weight_col]``, and the products are summed across columns.
    Names are compared case-insensitively. Columns without a coefficient and
    coefficients without a column are ignored.

    Parameters
    ----------
    protein_df : pandas.DataFrame
        One row per sample. Columns whose names do not match any coefficient
        are left out of the score.
    coefficients_df : pandas.DataFrame
        Table with the feature names in ``name_col`` and their weights in
        ``weight_col``. Names must be strings; if a name occurs more than once
        (ignoring case) the last weight listed is the one used.
    name_col : str
        Column of ``coefficients_df`` holding the feature names.
    weight_col : str
        Column of ``coefficients_df`` holding the weights.

    Returns
    -------
    pandas.Series
        The sum score, indexed like ``protein_df``. Missing values are skipped
        by the sum. If no column matches a coefficient, every score is 0 and a
        ``UserWarning`` is emitted.
    """
    import warnings

    # Map lower-cased feature names to their weights
    coefficient_dict = (
        coefficients_df
        .set_index(name_col)[weight_col]
        .rename(index=str.lower)  # Convert keys to lowercase
        .to_dict()
    )

    # Pair each matching column with its weight, in the frame's own column order.
    # Column names are lower-cased for the lookup only, so frames whose columns are
    # not lower-case still match instead of silently scoring 0.
    common_proteins = []
    weights = []
    for column in protein_df.columns:
        key = column.lower() if isinstance(column, str) else column
        if key in coefficient_dict:
            common_proteins.append(column)
            weights.append(coefficient_dict[key])

    if not common_proteins:
        warnings.warn(
            f"No column of protein_df matches a name in '{name_col}'; all scores will be 0.",
            UserWarning,
            stacklevel=2,
        )

    # Multiply each matched column by its weight, then add up across columns
    weighted_values = protein_df[common_proteins].mul(weights, axis=1)
    sumscore = weighted_values.sum(axis=1)

    return sumscore



In [27]:
# Keep only the rows whose Outcome is "Death". `.copy()` makes the subset an independent
# frame, so rewriting a column below never writes into a slice of `gadd_allprot_met`.
gadd_allprot_met_death = gadd_allprot_met[gadd_allprot_met['Outcome'] == "Death"].copy()
# Keep the text before the first '.' so predictors use the same names as we do.
# NOTE(review): this is applied to every predictor row, whatever its score type —
# confirm that no metabolite name contains a '.' that should be kept.
gadd_allprot_met_death['Predictor'] = gadd_allprot_met_death['Predictor'].str.split('.').str[0]

# Split the weights into one frame per score type
# (the 'Score ' column name really does end with a space)
score_type = gadd_allprot_met_death['Score ']
gadd_allprot_met_death_prot = gadd_allprot_met_death[score_type == 'ProteinScore (proteomics only)']
gadd_allprot_met_death_met = gadd_allprot_met_death[score_type == 'MetabScore (metabolomics only)']
gadd_allprot_met_death_metprot = gadd_allprot_met_death[score_type == 'MetabProteinScore (metabolomics and proteomics)']
score_type.value_counts()

Score 
MetabProteinScore (metabolomics and proteomics)    87
ProteinScore (proteomics only)                     78
MetabScore (metabolomics only)                     20
Name: count, dtype: int64

In [32]:
# calculate_biomarkers lower-cases the coefficient names, so score against a lower-cased
# view of the protein table (the merged table used for the other scores is lower-cased
# the same way). Without this, columns that are not lower-case would not match.
prots_set3_lower = prots_set3.rename(columns=str.lower)
prots_set3['Gaddprot'] = calculate_biomarkers(prots_set3_lower, gadd_allprot_death, 'Protein', 'Coefficient')
prots_set3['Gaddprot2'] = calculate_biomarkers(prots_set3_lower, gadd_allprot_met_death_prot, 'Predictor', 'Coefficient')

In [30]:
# Score the merged table with each remaining weight set, one new column per score
for score_column, score_weights in (
    ('Gaddprotmet', gadd_allprot_met_death_metprot),
    ('Gaddmet', gadd_allprot_met_death_met),
):
    set3_prot_met[score_column] = calculate_biomarkers(set3_prot_met, score_weights, 'Predictor', 'Coefficient')

In [35]:
# Left join on 'eid' keeps every row of prots_set3; rows with no match in
# set3_prot_met get NaN in the two added score columns.
protein_scores = prots_set3[['eid', 'Gaddprot']]
combined_scores = set3_prot_met[['eid', 'Gaddprotmet', 'Gaddmet']]
df_save = protein_scores.merge(combined_scores, how='left', on='eid')

# Write the scores without the row index
df_save.to_csv('Data/Other_Biomarkers/Gadd_biomarkers.csv', index=False)