In [68]:
import requests
import json
import csv
from pprint import pprint
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

In [96]:
## get the chemicals by route from our cdc exposure data
def get_route_object(route, data):
    """Return the exposure-route record for ``route``, tagged with its name.

    Args:
        route: Key of the exposure route to look up in ``data``.
        data: Mapping of route name -> route record (itself a mapping).

    Returns:
        A new dict holding the record's items plus ``'route': route``, or
        ``None`` (after printing the error) when the lookup fails or the
        record is not a mapping.
    """
    try:
        # Build a fresh dict instead of calling ``update`` on the stored
        # record, so the caller's ``data`` is not modified as a side effect.
        return {**data[route], 'route': route}
    except (LookupError, TypeError) as e:
        # LookupError: unknown route; TypeError: ``data`` is not subscriptable
        # or the stored record is not a mapping.
        print(e)
        return None

# Keep the file handle and the parsed document under different names so that
# ``routes`` only ever refers to the decoded JSON data. JSON text is read as
# UTF-8 explicitly instead of relying on the platform default encoding.
with open('cdc_exposure_routes.json', 'r', encoding='utf-8') as routes_file:
    routes = json.load(routes_file)
    
## get chebi 2 pathway map
# curl -O "https://reactome.org/download/current/ChEBI2Reactome.txt"

# Parse chebi pathways tab sep file from reactome into json
# One dict per row of the tab-separated mapping file. Columns 0, 1, 3, 4 and 5
# are kept; column 2 is not used here.
chebi_pathways = []
# newline='' leaves newline handling to the csv module, as its docs advise.
with open('ChEBI2Reactome.txt', 'r', newline='') as mapping_file:
    for line in csv.reader(mapping_file, delimiter='\t'):
        if len(line) < 6:
            # Blank or truncated rows cannot be mapped; skip them instead of
            # failing with IndexError.
            continue
        chebi_pathways.append({
            'chebi': line[0],
            'reactome': line[1],
            'pathway': line[3],
            'evidence': line[4],
            'species': line[5],
        })

In [112]:
## generate a pandas dataframe for all chems with given route
# Alternative route, kept for reference:
# route_chems = get_route_object(route='Volatile Organic Compounds (VOC)', data=routes)
route_chems = get_route_object(route='Metals and Metalloids', data=routes)
if route_chems is None:
    # get_route_object prints the lookup error and returns None; stop with a
    # clear message instead of failing on the subscript below.
    raise ValueError('exposure route could not be loaded from the routes data')

# One seed record per chemical listed under the selected route.
seed_data = [
    {
        'route': route_chems['route'],
        'chebi': route_chem['chebi'],
        'chemical': route_chem['label'],
    }
    for route_chem in route_chems['chemicals']
]
# Pin the column order so it does not depend on how the installed pandas
# orders dict keys; positional consumers expect (chebi, chemical, route).
chem_datframe = pd.DataFrame(data=seed_data, columns=['chebi', 'chemical', 'route'])

In [113]:
# Display the seed frame: one row per chemical of the selected route.
chem_datframe

Unnamed: 0,chebi,chemical,route
0,30513,antimony,Metals and Metalloids
1,27563,arsenic,Metals and Metalloids
2,82392,arsenobetaine,Metals and Metalloids
3,49900,arsenous acid,Metals and Metalloids
4,27131,trimethylarsine oxide,Metals and Metalloids
5,30514,caesium,Metals and Metalloids
6,29036,copper,Metals and Metalloids
7,27889,lead,Metals and Metalloids
8,27568,selenium,Metals and Metalloids
9,30440,thallium,Metals and Metalloids


In [114]:
## get all pathways that the chems are involved in into new dataframe
# Index the Reactome rows by ChEBI id once, so each chemical is a dict lookup
# instead of a full scan of chebi_pathways. File order is preserved per id.
pathways_by_chebi = {}
for path in chebi_pathways:
    pathways_by_chebi.setdefault(path['chebi'], []).append(path)

pathways_list = []
for _, row in chem_datframe.iterrows():
    # Read columns by label; positional access would silently pick the wrong
    # field if the column order of chem_datframe ever differs.
    for path in pathways_by_chebi.get(row['chebi'], []):
        # Build a new record rather than updating the shared chebi_pathways
        # entry in place, so re-running this cell starts from clean data.
        pathways_list.append({
            **path,
            'chemical': row['chemical'],
            'route': row['route'],
        })
# Pin the column order; positional consumers expect
# (chebi, chemical, evidence, pathway, reactome, route, species).
all_dataframe = pd.DataFrame(
    data=pathways_list,
    columns=['chebi', 'chemical', 'evidence', 'pathway', 'reactome', 'route', 'species'],
)
all_dataframe.head()

Unnamed: 0,chebi,chemical,evidence,pathway,reactome,route,species
0,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-ATH-3299685,Metals and Metalloids,Arabidopsis thaliana
1,29036,copper,IEA,Ion transport by P-type ATPases,R-ATH-936837,Metals and Metalloids,Arabidopsis thaliana
2,29036,copper,IEA,Elastic fibre formation,R-BTA-1566948,Metals and Metalloids,Bos taurus
3,29036,copper,IEA,Crosslinking of collagen fibrils,R-BTA-2243919,Metals and Metalloids,Bos taurus
4,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus


In [115]:
# api utility methods
def get_pathway_participants(pathway_id, timeout=30):
    """Fetch the participating physical entities of a Reactome pathway.

    Args:
        pathway_id: Reactome identifier inserted into the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://reactome.org/ContentService/data/participants/{}/participatingPhysicalEntities".format(pathway_id)
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status rather than handing an error payload to
    # the caller, which iterates the result as a list of participants.
    response.raise_for_status()
    return response.json()

def reactome_id2reference_entity(db_id, timeout=30):
    """Return the ``referenceEntity`` of a Reactome entry.

    Args:
        db_id: Reactome identifier inserted into the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``'referenceEntity'`` in the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response has no ``'referenceEntity'`` field.
    """
    url = 'https://reactome.org/ContentService/data/query/enhanced/{}'.format(db_id)
    response = requests.get(url, timeout=timeout)
    # Report HTTP failures directly instead of as a KeyError on the error body.
    response.raise_for_status()
    return response.json()['referenceEntity']

def execute_query(query):
    """Send ``query`` to the Wikidata SPARQL endpoint and return the converted JSON result."""
    wikidata = SPARQLWrapper('https://query.wikidata.org/sparql')
    # Request JSON so that convert() works on a JSON response.
    wikidata.setReturnFormat(JSON)
    wikidata.setQuery(query)
    response = wikidata.query()
    return response.convert()

def mygene_map_uniprot_entrez(uniprot, timeout=30):
    """Map an identifier to an Entrez gene id through the mygene.info query API.

    Args:
        uniprot: Value sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``entrezgene`` value of the single hit as a string, or ``None``
        when the response does not report exactly one hit or lacks the
        expected fields.
    """
    url ='http://mygene.info/v3/query'
    params = {'q': uniprot,
              'fields': 'entrezgene',
             }
    results = requests.get(url, params=params, timeout=timeout).json()
    try:
        # Only an unambiguous answer (exactly one hit) is accepted.
        if results['total'] == 1:
            return str(results['hits'][0]['entrezgene'])
    except (KeyError, IndexError, TypeError):
        # The response did not have the expected shape (for example an error
        # payload, or a hit without an entrezgene field): treat as "no mapping".
        return None
    return None

def get_monarch_diseases(entrez, timeout=30):
    """Fetch the disease associations of a gene from the Monarch API.

    Args:
        entrez: Gene id; it is prefixed with ``NCBIGene:`` to form the CURIE
            used in the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'mondo': <object id>, 'disease': <object label>}`` dicts,
        one per association; empty when the response lists none.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    curie = 'NCBIGene:{}'.format(entrez)
    url = 'https://api-dev.monarchinitiative.org/api/bioentity/gene/{}/diseases/'.format(curie)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # A missing or null 'associations' field is treated as "no diseases".
    associations = response.json().get('associations') or []
    return [
        {
            'mondo': assoc['object']['id'],
            'disease': assoc['object']['label'],
        }
        for assoc in associations
    ]

In [118]:
# gets pathway participants (proteins) from reactome and makes new combined dataframe
participant_uniprots = []
# Reactome dbId -> reference entity. The same participant can occur several
# times, so each id is fetched from the service only once.
reference_entities = {}
for _, row in all_dataframe.iterrows():
    # Read columns by label so the result does not depend on column order.
    participants = get_pathway_participants(pathway_id=row['reactome'])
    for part in participants:
        if part['className'] != 'Protein':
            continue
        db_id = part['dbId']
        if db_id not in reference_entities:
            reference_entities[db_id] = reactome_id2reference_entity(db_id)
        part_ref = reference_entities[db_id]
        if part_ref['databaseName'] != 'UniProt':
            continue
        # One output row per UniProt-backed protein participant (repeats kept).
        participant_uniprots.append({
            'chebi': row['chebi'],
            'chemical': row['chemical'],
            'evidence': row['evidence'],
            'pathway': row['pathway'],
            'reactome': row['reactome'],
            'route': row['route'],
            'species': row['species'],
            'uniprot': part_ref['identifier'],
        })
all_data_with_proteins = pd.DataFrame(
    data=participant_uniprots,
    columns=['chebi', 'chemical', 'evidence', 'pathway', 'reactome', 'route', 'species', 'uniprot'],
)
all_data_with_proteins.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


Unnamed: 0,chebi,chemical,evidence,pathway,reactome,route,species,uniprot
0,29036,copper,IEA,Ion transport by P-type ATPases,R-ATH-936837,Metals and Metalloids,Arabidopsis thaliana,Q9LT02
1,29036,copper,IEA,Elastic fibre formation,R-BTA-1566948,Metals and Metalloids,Bos taurus,F1N0H9
2,29036,copper,IEA,Elastic fibre formation,R-BTA-1566948,Metals and Metalloids,Bos taurus,Q2KJ89
3,29036,copper,IEA,Elastic fibre formation,R-BTA-1566948,Metals and Metalloids,Bos taurus,B0JYR0
4,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q95108
5,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q95108
6,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,A6H7J6
7,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,A6H7J6
8,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q9BGI1
9,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q29RJ1


In [123]:
# map to ncbi gene
# Look each distinct UniProt accession up once and reuse the answer for every
# row that repeats it, instead of issuing one HTTP request per row.
uniprot_ids = all_data_with_proteins['uniprot']
entrez_by_uniprot = {
    accession: mygene_map_uniprot_entrez(accession)
    for accession in uniprot_ids.unique()
}
all_data_with_proteins['ncbi_gene'] = uniprot_ids.map(entrez_by_uniprot.get)

In [126]:
# drop the rows with no gene id
# Take an explicit copy so the filtered frame is independent of the frame it
# was sliced from; assigning columns to it later is then unambiguous.
all_data_with_proteins = all_data_with_proteins[all_data_with_proteins.ncbi_gene.notnull()].copy()

In [127]:
# Preview the first rows after the rows without a gene id were dropped.
all_data_with_proteins.head()

Unnamed: 0,chebi,chemical,evidence,pathway,reactome,route,species,uniprot,ncbi_gene
0,29036,copper,IEA,Ion transport by P-type ATPases,R-ATH-936837,Metals and Metalloids,Arabidopsis thaliana,Q9LT02,832428
2,29036,copper,IEA,Elastic fibre formation,R-BTA-1566948,Metals and Metalloids,Bos taurus,Q2KJ89,535185
4,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q95108,281557
5,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q95108,281557
8,29036,copper,IEA,Detoxification of Reactive Oxygen Species,R-BTA-3299685,Metals and Metalloids,Bos taurus,Q9BGI1,282885


In [128]:
# get all diseases from monarchinitiative by gene id
gene2disease = []
# Query each distinct gene once. Repeated genes would add identical disease
# rows, which the inner merge on gene id then multiplies.
genes = all_data_with_proteins['ncbi_gene'].drop_duplicates().tolist()
for gene in genes:
    for disease in get_monarch_diseases(gene):
        gene2disease.append({**disease, 'gene': gene})
# Naming the columns keeps the frame mergeable on 'gene' even when no disease
# was found at all (an empty record list would otherwise give no columns).
gene2disease_df = pd.DataFrame(data=gene2disease, columns=['disease', 'gene', 'mondo'])
gene2disease_df.head()

Unnamed: 0,disease,gene,mondo
0,Dun coat colour in Dexter cattle,282105,OMIA:001249-9913
1,"Coat colour, brown in dog",403479,OMIA:001249-9615
2,"cutis laxa with severe pulmonary, gastrointest...",8425,MONDO:0013170
3,Duchenne muscular dystrophy,8425,MONDO:0010679
4,"cutis laxa, autosomal dominant 1",10516,MONDO:0007411


In [129]:
# merge gene2disease with all data for final output
# Inner join: only rows whose ncbi_gene has at least one disease record remain.
merged_final = all_data_with_proteins.merge(
    gene2disease_df,
    left_on='ncbi_gene',
    right_on='gene',
    how='inner',
)

In [130]:
# Display the merged result; it carries both join keys (ncbi_gene and gene).
merged_final

Unnamed: 0,chebi,chemical,evidence,pathway,reactome,route,species,uniprot,ncbi_gene,disease,gene,mondo
0,29036,copper,IEA,Melanin biosynthesis,R-BTA-5662702,Metals and Metalloids,Bos taurus,F1MBG9,282105,Dun coat colour in Dexter cattle,282105,OMIA:001249-9913
1,29036,copper,IEA,Melanin biosynthesis,R-CFA-5662702,Metals and Metalloids,Canis familiaris,E2RJM3,403479,"Coat colour, brown in dog",403479,OMIA:001249-9615
2,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q8N2S1,8425,"cutis laxa with severe pulmonary, gastrointest...",8425,MONDO:0013170
3,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q8N2S1,8425,Duchenne muscular dystrophy,8425,MONDO:0010679
4,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,"cutis laxa, autosomal dominant 1",10516,MONDO:0007411
5,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,pulmonary hypertension,10516,MONDO:0005149
6,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,emphysema,10516,MONDO:0005024
7,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,autosomal recessive cutis laxa type 1,10516,MONDO:0019572
8,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,autosomal dominant cutis laxa,10516,MONDO:0019571
9,29036,copper,TAS,Elastic fibre formation,R-HSA-1566948,Metals and Metalloids,Homo sapiens,Q9UBX5,10516,hereditary sensorimotor neuropathy with hypere...,10516,MONDO:0017237
