In [6]:
#
# -ukbb.ipynb
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# 
# Keila Velazquez-Arcelay
# Description: Imports EFO, HPO, ORDO, CHEBI, and MONDO datasets, wrangles the files, and
# intersects with opentargets results containing introgressed circadian associations.
# 
# EFO: https://www.ebi.ac.uk/efo/
# MONDO: https://obofoundry.org/ontology/mondo.html
# HPO: https://hpo.jax.org/app/download/ontology
# 
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

'\n# \n# Keila Velazquez-Arcelay\n# Description: Imports EFO, HPO, ORDO, CHEBI, and MONDO datasets, wrangles the files, and\n# intersects with opentargets results containing introgressed circadian associations.\n# \n# EFO: https://www.ebi.ac.uk/efo/\n# MONDO: https://obofoundry.org/ontology/mondo.html\n# HPO: https://hpo.jax.org/app/download/ontology\n# \n'

In [7]:
# Locations of the ontology files used below. The .obo files are parsed with
# parse_file(); the ORDO and CHEBI CSV exports are read directly with pandas.
_ONTOLOGY_DIR = '/dors/capra_lab/data/gene_ontology'

# OBO flat files
EFO = _ONTOLOGY_DIR + '/efo.obo'
HPO = _ONTOLOGY_DIR + '/hp.obo'
MONDO = _ONTOLOGY_DIR + '/mondo.obo'

# CSV exports
EFO_CSV = _ONTOLOGY_DIR + '/EFO.csv'
HPO_CSV = _ONTOLOGY_DIR + '/HP.csv'
MONDO_CSV = _ONTOLOGY_DIR + '/MONDO.csv'
ORDO_CSV = _ONTOLOGY_DIR + '/ORDO.csv'
CHEBI_CSV = _ONTOLOGY_DIR + '/CHEBI.csv'

In [8]:
import pandas as pd
import re, os
import numpy as np

In [10]:

def parse_file(file):
    """Extract (ID, name) pairs from the ``[Term]`` stanzas of an OBO file.

    Used to wrangle the EFO, HPO, and MONDO datasets.

    Parameters
    ----------
    file : str
        Path to an OBO flat file.

    Returns
    -------
    pandas.DataFrame
        One row per ``[Term]`` stanza that has an ``id:`` tag, with columns
        ``traitEfos`` (the term ID with ':' replaced by '_', e.g.
        ``EFO_0000001``) and ``traitEfos_Name`` (the value of the ``name:``
        tag, or None when the stanza has no name).
    """
    records = []
    term_id = term_name = None
    in_term = False

    # Read-only: the ontology files are inputs and must not require write access.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('['):
                # A new stanza header closes the previous stanza.
                if in_term and term_id is not None:
                    records.append((term_id, term_name))
                # Only [Term] stanzas are collected; others (e.g. [Typedef]) are skipped.
                in_term = line.startswith('[Term]')
                term_id = term_name = None
            elif in_term:
                # Look the tags up by name instead of by position, and strip the
                # tag prefix only, so names containing "id: " are left intact.
                if term_id is None and line.startswith('id:'):
                    term_id = line[len('id:'):].strip()
                elif term_name is None and line.startswith('name:'):
                    term_name = line[len('name:'):].strip()

    # Flush the last stanza, which has no following header to close it.
    if in_term and term_id is not None:
        records.append((term_id, term_name))

    df = pd.DataFrame(records, columns=['traitEfos', 'traitEfos_Name'])
    # Ontology IDs use ':' (EFO:0000001); the opentargets table uses '_'.
    df['traitEfos'] = df['traitEfos'].str.replace(':', '_', regex=False)
    return df


In [None]:
# 
# Load the OpenTargets associations and reshape `traitEfos` so that each row
# carries exactly one trait ID.
opentargets = pd.read_csv('../bin/raw/opentargets_output_browning18_associations_all.txt', sep='\t')

# `traitEfos` holds a stringified list such as "['EFO_1', 'EFO_2']". Strip the
# outer "['" / "']" delimiters. regex=True is passed explicitly because the
# default of Series.str.replace changed to literal matching in pandas 2.0.
trait_ids = opentargets['traitEfos'].str.replace(r"\['|'\]", "", regex=True)

# Drop rows with a missing trait or an empty list ("[]" has no quotes to strip).
has_trait = trait_ids.notnull() & ~trait_ids.isin(['[]'])

# .copy() so the column assignment below does not write into a view of the raw frame.
opentargets = opentargets.loc[has_trait].copy()
opentargets['traitEfos'] = trait_ids[has_trait].map(lambda ids: ids.split("', '"))

# One row per (association, trait ID); duplicates can only be dropped after the
# explode because list values are not hashable.
opentargets = opentargets.explode('traitEfos').drop_duplicates()


In [44]:

# LOAD FILES
def _read_label_csv(path, **read_kwargs):
    """Read an ontology CSV and keep its first two columns as (ID, name).

    The ID column is reduced to the text after the last '/'.
    Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    labels = pd.read_csv(path, sep=',', **read_kwargs).iloc[:, :2].drop_duplicates()
    labels.columns = ['traitEfos', 'traitEfos_Name']
    labels['traitEfos'] = labels['traitEfos'].replace('.*/', '', regex=True)
    return labels


# OBO-based ontologies; from the EFO file keep only IDs containing 'EFO_'.
efo_terms = parse_file(EFO)
efo = efo_terms[efo_terms['traitEfos'].str.contains('EFO_')]
hpo = parse_file(HPO)
mondo = parse_file(MONDO)

# CSV-based ontologies.
ordo = _read_label_csv(ORDO_CSV)
chebi = _read_label_csv(CHEBI_CSV, low_memory=False)

# Single ID -> name lookup table across all five ontologies.
traits = pd.concat([efo, hpo, mondo, ordo, chebi])

# Inner join: associations whose trait ID is absent from every ontology are dropped.
opentargets_new = opentargets.merge(traits, on='traitEfos')


In [45]:
# Number of distinct variants before annotating with ontology names
# (dropna=False so a missing var_id would still count once, as with drop_duplicates).
opentargets['var_id'].nunique(dropna=False)

506638

In [46]:
# Number of distinct variants that survive the merge with the ontology table
# (dropna=False so a missing var_id would still count once, as with drop_duplicates).
opentargets_new['var_id'].nunique(dropna=False)

506360

In [None]:
#opentargets['traitEfos'] = opentargets['traitEfos'].replace('[]',np.NaN)
#opentargets = opentargets.dropna()






In [94]:
# Variants per trait among associations with p <= 5e-8
# (presumably the conventional genome-wide significance cutoff).
is_significant = opentargets_new['pval'] <= 5e-8
significant_pairs = opentargets_new.loc[is_significant, ['traitEfos', 'var_id']].drop_duplicates()
significant_pairs.value_counts('traitEfos')


traitEfos
EFO_0004526    288
EFO_0004980    259
EFO_0004528    252
EFO_0005091    246
EFO_0004339    243
              ... 
EFO_0003911      1
EFO_1001228      1
EFO_0003839      1
EFO_1001870      1
CHEBI_46195      1
Length: 193, dtype: int64

In [86]:
# Distinct variant IDs in the annotated table (first occurrence of each is kept).
annotated_var_ids = opentargets_new['var_id']
annotated_var_ids.drop_duplicates()

0            1_7341713_G_A
2           1_21268169_C_T
3           1_21268641_C_T
4           1_21271643_C_T
5           1_21276142_C_T
                ...       
378412    13_113799845_C_T
384184     11_45544911_A_C
384861     12_72075140_G_T
386264     10_71479444_C_T
386266    12_106133815_T_C
Name: var_id, Length: 10405, dtype: object

In [92]:
# Variants per trait across all associations, regardless of p-value.
trait_variant_pairs = opentargets_new[['traitEfos', 'var_id']].drop_duplicates()
trait_variant_pairs.value_counts('traitEfos')

traitEfos
EFO_0007010    5572
EFO_0007937    4150
EFO_0008111    3766
EFO_0007874    1963
EFO_0007814    1647
               ... 
EFO_0010400       1
EFO_0010536       1
EFO_0009851       1
EFO_0005058       1
EFO_0010988       1
Length: 2480, dtype: int64

In [48]:
# Keep only the columns that go into the saved table, in this order.
output_columns = ['var_id', 'traitEfos', 'traitEfos_Name', 'traitReported', 'beta', 'pval']
opentargets_new = opentargets_new.loc[:, output_columns]
                 

In [50]:
# Inspect the OpenTargets association table (not the merged `opentargets_new`).
opentargets

Unnamed: 0,studyId,eaf,beta,se,nTotal,nCases,oddsRatio,pval,traitCategory,traitEfos,traitReported,numAssocLoci,var_id
0,GCST90010242,0.048000,-0.265458,0.094162,1323,,,0.004986,measurement,EFO_0007937,Neurogenic locus notch homolog protein 3 levels,1.0,1_788362_T_C
3,FINNGEN_R5_ST19_INJURY_MUSCLE_TENDON_WRIST_HAN...,0.004193,0.773300,0.266900,201216,2180.0,2.166905,0.003763,"injury, poisoning or other complication",EFO_0000546,Injury of muscle and tendon at wrist and hand ...,0.0,1_788906_G_A
4,FINNGEN_R5_M13_ARTHROSIS_KNEE,0.004178,0.303800,0.100400,170017,22796.0,1.354998,0.002490,musculoskeletal or connective tissue disease,EFO_0004616,Gonarthrosis,13.0,1_788906_G_A
5,FINNGEN_R5_M13_ARTHROSIS,0.004195,0.235100,0.082700,184454,37233.0,1.265035,0.004471,Uncategorised,EFO_0002506,Arthrosis,19.0,1_788906_G_A
6,FINNGEN_R5_M13_TROCHANTERICBURSITIS,0.004214,1.080800,0.328100,169245,1604.0,2.947036,0.000988,musculoskeletal or connective tissue disease,MONDO_0002471,Trochanteric bursitis,0.0,1_788906_G_A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17160157,GCST007090,0.004400,0.238500,0.083000,403124,24955.0,1.269344,0.004077,musculoskeletal or connective tissue disease,EFO_0004616,Knee osteoarthritis,12.0,9_138126544_T_G
17160158,SAIGE_495,0.000233,2.060000,0.686013,401837,26332.0,7.845970,0.002610,Uncategorised,EFO_0000270,Asthma,41.0,9_138131817_G_A
17160159,GCST90002340,0.003196,0.955600,0.333510,521594,,,0.004179,measurement,EFO_0005091,Monocyte count,1036.0,9_138131817_G_A
17160160,GCST90002310,0.000663,0.539475,0.181204,563946,,,0.002920,measurement,EFO_0004509,Hemoglobin concentration,893.0,9_138148887_G_T


In [49]:
# SAVE
# Tab-separated, without the DataFrame index.
output_path = '../bin/raw/opentargets_introgressed_associations.tab'
opentargets_new.to_csv(output_path, sep='\t', index=False)