In [23]:
from pathlib import Path

import numpy as np
import pandas as pd

In [24]:
# Load the eQTL MR sheet and the eQTL colocalisation sheet from the same
# workbook in a single read (a list of sheet names returns a dict of frames).
EQTL_WORKBOOK = 'data/results_mr_coloc_significant.xlsx'
eqtl_sheets = pd.read_excel(EQTL_WORKBOOK, sheet_name=['BAG eqtl', 'BAG eqtl coloc'])
df_eqtl = eqtl_sheets['BAG eqtl']
df_eqtl_coloc = eqtl_sheets['BAG eqtl coloc']

In [25]:
# Base evidence table: one row per distinct (gene, druggability tier) pair
# found in the eQTL MR sheet. drop_duplicates() already returns a new frame.
df_evidence = df_eqtl[['hgnc_names', 'druggability_tier']].drop_duplicates()
print(len(df_evidence))

64


In [26]:
def _gene_counts(frame, count_column, eqtl_type=None):
    """Count rows per gene in `frame`.

    frame: table with an 'hgnc_names' column (and 'eQTL_type' when filtering).
    count_column: name given to the resulting count column.
    eqtl_type: when given, only rows with this 'eQTL_type' value are counted.

    Returns a two-column frame: 'hgnc_names' and `count_column`.
    """
    if eqtl_type is not None:
        frame = frame[frame['eQTL_type'] == eqtl_type]
    counts = frame['hgnc_names'].value_counts().reset_index()
    counts.columns = ['hgnc_names', count_column]
    return counts


# Number of MR eQTL rows per gene, split by blood and brain tissue
mr_eqtl_blood_cnt = _gene_counts(df_eqtl, 'eQTL_MR_blood', eqtl_type='Blood eQTL')
mr_eqtl_brain_cnt = _gene_counts(df_eqtl, 'eQTL_MR_brain', eqtl_type='Brain Tissue eQTL')

# Number of eQTL colocalisation rows per gene, split by blood and brain tissue
mr_eqtl_blood_coloc_cnt = _gene_counts(df_eqtl_coloc, 'eQTL_coloc_blood', eqtl_type='Blood eQTL')
mr_eqtl_brain_coloc_cnt = _gene_counts(df_eqtl_coloc, 'eQTL_coloc_brain', eqtl_type='Brain Tissue eQTL')

# Number of eQTL colocalisation rows per gene across all tissues
mr_eqtl_coloc_cnt = _gene_counts(df_eqtl_coloc, 'eQTL_coloc')

In [27]:
# Attach the colocalisation counts to the MR counts, per tissue. The left join
# keeps every gene that has an MR row; a gene without a matching
# colocalisation row gets -1 in the coloc column.
df_evidence_blood_eqtl = mr_eqtl_blood_cnt.merge(
    mr_eqtl_blood_coloc_cnt, on='hgnc_names', how='left'
).fillna(-1)
df_evidence_brain_eqtl = mr_eqtl_brain_cnt.merge(
    mr_eqtl_brain_coloc_cnt, on='hgnc_names', how='left'
).fillna(-1)

In [28]:
# pQTL evidence from the INTERVAL and deCODE cohorts: MR results plus
# colocalisation hits, reduced to (gene, flag) tables ready for merging.
PRIMARY_MR_METHODS = ['Inverse variance weighted', 'Wald ratio']


def _filter_primary_mr(mr_results):
    """Keep 'Inverse variance weighted' / 'Wald ratio' rows with MR-Egger p-value > 0.05.

    mr_results: MR result table with 'egger_pval' and 'method' columns.
    Returns the filtered rows (original index preserved).
    """
    # NaN > 0.05 evaluates to False, so rows without an Egger p-value are
    # dropped here as well.
    # NOTE(review): if Wald-ratio rows carry no Egger p-value they can never
    # survive this filter -- confirm that this is intended.
    passes_egger = mr_results['egger_pval'] > 0.05
    is_primary_method = mr_results['method'].isin(PRIMARY_MR_METHODS)
    return mr_results[passes_egger & is_primary_method]


def _coloc_hits(coloc_results, source, flag_column):
    """Return the genes listed for `source`, each flagged with 1 in `flag_column`.

    coloc_results: colocalisation table with 'Source' and 'Gene' columns.
    """
    hits = coloc_results.loc[coloc_results['Source'] == source, ['Gene']]
    hits = hits.rename(columns={'Gene': 'hgnc_names'})
    hits[flag_column] = 1
    return hits


df_pqtl_interval = pd.read_csv('results_interval_bag/mr_res_complete.csv')
df_pqtl_decode = pd.read_csv('results_decode_bag/mr_res_blood_all.csv')
df_pqtl_coloc = pd.read_csv('coloc_results/coloc_results_bag.csv')

# MR results: apply the filter once, then keep only the gene name and the
# 'significant' flag under a cohort-specific column name. The deCODE file
# names its gene column 'Gene'.
df_pqtl_interval = _filter_primary_mr(df_pqtl_interval)[['hgnc_names', 'significant']].rename(
    columns={'significant': 'pqtl_interval'}
)
df_pqtl_decode = _filter_primary_mr(df_pqtl_decode)[['Gene', 'significant']].rename(
    columns={'Gene': 'hgnc_names', 'significant': 'pqtl_decode'}
)

# Colocalisation results: every row present in the file is flagged as 1.
df_pqtl_coloc_interval = _coloc_hits(df_pqtl_coloc, 'INTERVAL', 'pqtl_coloc_interval')
df_pqtl_coloc_decode = _coloc_hits(df_pqtl_coloc, 'deCODE', 'pqtl_coloc_decode')

In [29]:
# Combine MR and colocalisation flags per cohort. The outer join keeps genes
# that appear on either side; whichever flag is missing after the join is
# set to -1.
df_evidence_pqtl_interval = df_pqtl_interval.merge(
    df_pqtl_coloc_interval, on='hgnc_names', how='outer'
).fillna(-1)
df_evidence_pqtl_decode = df_pqtl_decode.merge(
    df_pqtl_coloc_decode, on='hgnc_names', how='outer'
).fillna(-1)

In [30]:
# Left-join every evidence table onto the gene list, in a fixed order:
# blood eQTL, brain eQTL, INTERVAL pQTL, deCODE pQTL.
for evidence_part in (
    df_evidence_blood_eqtl,
    df_evidence_brain_eqtl,
    df_evidence_pqtl_interval,
    df_evidence_pqtl_decode,
):
    df_evidence = df_evidence.merge(evidence_part, on='hgnc_names', how='left')

In [31]:
# Genes absent from an evidence table come out of the left joins as NaN;
# turn those into 0. Existing -1 values are left untouched.
# Legend -- 1: significant, -1: tested not significant, 0: not tested
df_evidence = df_evidence.fillna(0)

In [32]:
# Gene-drug table used for the per-gene drug counts.
# Rows with direction 'N' are kept: the filter below is intentionally disabled.
# df_drugs = df_drugs[df_drugs['direction'] != 'N']
DRUG_TABLE = 'data/gene_drug_Mergedindication_beta_direction.tsv'
df_drugs = pd.read_csv(DRUG_TABLE, sep='\t')
print(len(df_drugs))

525


In [33]:
# Subset of drug rows whose 'agingdrug' column is 'Y'
is_ageing_drug = df_drugs['agingdrug'] == 'Y'
ageing_drugs = df_drugs[is_ageing_drug]
print(len(ageing_drugs))

36


In [34]:
# Per-gene non-null counts of every column, with 'gene' kept as a regular
# column (as_index=False) instead of resetting the index afterwards.
drug_cnt = df_drugs.groupby('gene', as_index=False).count()
ageing_drug_cnt = ageing_drugs.groupby('gene', as_index=False).count()

In [35]:
# Keep the gene name and the 'drug.x' count, renamed so they line up with the
# evidence table's key and column naming.
drug_cnt = drug_cnt[['gene', 'drug.x']].rename(
    columns={'gene': 'hgnc_names', 'drug.x': 'drug_cnt'}
)
ageing_drug_cnt = ageing_drug_cnt[['gene', 'drug.x']].rename(
    columns={'gene': 'hgnc_names', 'drug.x': 'ageing_drug_cnt'}
)

# Attach both counts to the evidence table; genes without drugs stay NaN here.
for count_table in (drug_cnt, ageing_drug_cnt):
    df_evidence = df_evidence.merge(count_table, on='hgnc_names', how='left')

In [36]:
# Genes with no drug rows have NaN counts after the merge; make them 0,
# then show the resulting column layout.
df_evidence = df_evidence.fillna(0)
df_evidence.columns

Index(['hgnc_names', 'druggability_tier', 'eQTL_MR_blood', 'eQTL_coloc_blood',
       'eQTL_MR_brain', 'eQTL_coloc_brain', 'pqtl_interval',
       'pqtl_coloc_interval', 'pqtl_decode', 'pqtl_coloc_decode', 'drug_cnt',
       'ageing_drug_cnt'],
      dtype='object')

In [37]:
# Cast every evidence flag and drug count to int in a single astype call
integer_columns = [
    'eQTL_MR_blood', 'eQTL_MR_brain', 'eQTL_coloc_blood', 'eQTL_coloc_brain',
    'pqtl_interval', 'pqtl_decode', 'pqtl_coloc_decode', 'pqtl_coloc_interval',
    'drug_cnt', 'ageing_drug_cnt',
]
df_evidence = df_evidence.astype({column: int for column in integer_columns})

In [38]:
# Total evidence per gene: row-wise sum over the eight evidence columns,
# with -1 counted as 0.
evidence_columns = [
    'eQTL_MR_blood', 'eQTL_MR_brain', 'eQTL_coloc_blood', 'eQTL_coloc_brain',
    'pqtl_interval', 'pqtl_decode', 'pqtl_coloc_decode', 'pqtl_coloc_interval',
]
df_evidence['evidence_count'] = df_evidence[evidence_columns].replace(-1, 0).sum(axis=1)

In [39]:
# Number of genes at each evidence_count value
evidence_count_distribution = df_evidence['evidence_count'].value_counts()
evidence_count_distribution

evidence_count
2    30
1    27
3     5
4     2
Name: count, dtype: int64

In [40]:
# Rank genes: most evidence first, ties broken by ageing-drug count and then
# by total drug count (all descending).
ranking_keys = ['evidence_count', 'ageing_drug_cnt', 'drug_cnt']
df_evidence = df_evidence.sort_values(by=ranking_keys, ascending=False)

In [41]:
# Final column layout: identifiers, the eight evidence flags, drug counts, total
output_columns = [
    'hgnc_names', 'druggability_tier',
    'eQTL_MR_blood', 'eQTL_MR_brain', 'eQTL_coloc_blood', 'eQTL_coloc_brain',
    'pqtl_interval', 'pqtl_decode', 'pqtl_coloc_interval', 'pqtl_coloc_decode',
    'drug_cnt', 'ageing_drug_cnt', 'evidence_count',
]
df_evidence = df_evidence[output_columns]


In [42]:
# Save the numeric evidence table to the 'evidence' sheet.
evidence_xlsx = Path('evidence/evidence.xlsx')
# to_excel does not create missing folders, so make sure the target exists
evidence_xlsx.parent.mkdir(parents=True, exist_ok=True)
df_evidence.to_excel(evidence_xlsx, sheet_name='evidence', index=False)

In [43]:
# Turn the eight evidence flags into display marks: 1 -> ✔, -1 -> ✘, 0 -> ''.
# Any other value is left as it is.
# The columns are addressed by name and replaced wholesale: writing strings
# into the existing int columns through an iloc slice is an incompatible-dtype
# assignment, and a positional slice silently depends on the column order.
# NOTE: this overwrites the numeric flags in df_evidence, so run it only after
# the numeric table has been exported.
mark_columns = [
    'eQTL_MR_blood', 'eQTL_MR_brain', 'eQTL_coloc_blood', 'eQTL_coloc_brain',
    'pqtl_interval', 'pqtl_decode', 'pqtl_coloc_interval', 'pqtl_coloc_decode',
]
marks = {1: '✔', -1: '✘', 0: ''}
df_evidence = df_evidence.copy()  # detach from the column-selection result before adding columns
df_evidence[mark_columns] = df_evidence[mark_columns].replace(marks)

In [44]:
# Save the tick/cross version of the evidence table to the 'evidence' sheet.
evidence_mark_xlsx = Path('evidence/evidence_mark.xlsx')
# to_excel does not create missing folders, so make sure the target exists
evidence_mark_xlsx.parent.mkdir(parents=True, exist_ok=True)
df_evidence.to_excel(evidence_mark_xlsx, sheet_name='evidence', index=False)