In [1]:
import pandas as pd
from pathlib import Path
import tools.obo_tools as ot
from tools.hetnet_file_processing import read_reactome, determine_evidence
from tools.processing import regularize_colnames, head, expand_col_on_char, combine_group_rows_on_char
from hetnet_ml.src import graph_tools as gt

load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
ext_dir = Path('../0_data/external/').resolve()
network_dir = load_dir.parent.parent.joinpath('03_Adding_GO_Annotations/out')

In [2]:
all_nodes = gt.remove_colons(pd.read_csv(network_dir.joinpath('nodes_all.csv'), dtype=str))
edges = gt.remove_colons(pd.read_csv(network_dir.joinpath('edges.csv'), dtype=str))

In [3]:
PWs = all_nodes.query('label == "Pathway"')
print("Pathways:  {:6,}".format(len(PWs)))
print(" Kegg:     {:6,}".format(PWs['id'].str.startswith('KEGG:').sum()))
print(" Reactome: {:6,}".format(PWs['id'].str.startswith('REACT:').sum()))

Pathways:   2,363
 Kegg:        503
 Reactome:  1,860


In [4]:
kegg = PWs[PWs['id'].str.startswith('KEGG:')]['id'].values
react = PWs[PWs['id'].str.startswith('REACT:')]['id'].values

In [5]:
print('Kegg edges: {:,}'.format(len(edges.query('start_id in @kegg or end_id in @kegg'))))
print('Reactome Edges: {:,}'.format(len(edges.query('start_id in @react or end_id in @react'))))

Kegg edges: 661,027
Reactome Edges: 1,253,874


In [6]:
edges.query('start_id in @kegg or end_id in @kegg')['type'].value_counts()

associated_with_CawPW    446328
associated_with_DawPW    185911
part_of_GpoPW             28788
Name: type, dtype: int64

In [7]:
edges.query('start_id in @react or end_id in @react')['type'].value_counts()

associated_with_CawPW    769595
associated_with_DawPW    377258
part_of_GpoPW            107021
Name: type, dtype: int64

A lot more reactome than Kegg...

In [8]:
all_node_ids = all_nodes['id'].values
new_nodes = []
new_edges = []

In [9]:
all_nodes.head(2)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id
0,MESH:C089250,(0.017ferrocene)amylose,Compound,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,,,,,,,
1,MESH:C114385,001-C8-NBD,Compound,D03.383.129.462.580/C114385|D12.644.456/C114385,,,,,,,,


In [10]:
edges.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,source,evidence,direct_evidence,corrected_pvalue,inference_gene_symbol,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,CTD,curated,,,,,,,,,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,CTD,curated,,,,,,,,,,,


# Reactome Genes

In [11]:
ncbi = read_reactome(load_dir.joinpath('NCBI2Reactome_PE_All_Levels.txt'))
hncbi = ncbi.query('taxon == "Homo sapiens"').copy()

In [12]:
hncbi['external_id'] = hncbi['external_id'].astype(str)

In [13]:
hncbi.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,1,R-HSA-6806490,A1BG [extracellular region],R-HSA-109582,https://reactome.org/PathwayBrowser/#/R-HSA-10...,Hemostasis,TAS,Homo sapiens
1,1,R-HSA-8848894,A1BG [platelet alpha granule lumen],R-HSA-109582,https://reactome.org/PathwayBrowser/#/R-HSA-10...,Hemostasis,TAS,Homo sapiens


In [14]:
hncbi['reactome_id'] = 'REACT:' + hncbi['reactome_id']

# Make a map from internal REACTOME id to external IDs
pe_ncbi_map = hncbi.set_index('pe_reactome_id')['external_id'].to_dict()

ncbi_edges = hncbi[['external_id', 'reactome_id', 'evidence_code']].copy()
ncbi_edges.columns = ['start_id', 'end_id', 'evidence_code']
ncbi_edges['type'] = 'part_of_GpoPW'
ncbi_edges.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type
0,1,REACT:R-HSA-109582,TAS,part_of_GpoPW
1,1,REACT:R-HSA-109582,TAS,part_of_GpoPW


In [15]:
print('Genes in Reactome: {:,}'.format(hncbi['external_id'].nunique()))
print('Mapped to Network: {:,}'.format(hncbi.query('external_id in @all_node_ids')['external_id'].nunique()))
print('Un-mappable: {:,}'.format(hncbi.query('external_id not in @all_node_ids')['external_id'].nunique()))

Genes in Reactome: 10,696
Mapped to Network: 10,610
Un-mappable: 86


In [16]:
react_ids = set(ncbi_edges['end_id'])
print('Unique Reactome Pathways: {:,}'.format(len(react_ids)))
print('Reactome Pathways not in Network: {:,}'.format(len(react_ids - set(all_nodes['id']))))

Unique Reactome Pathways: 2,239
Reactome Pathways not in Network: 423


423 is a fairly significant number... We can add these pathways to the newtork...

In [17]:
len(react_ids - set(react))

423

In [18]:
missing_react_ids = react_ids - set(react)
keep_cols = ['reactome_id', 'reactome_name']
new_pw_nodes = hncbi.query('reactome_id in @missing_react_ids')
new_pw_nodes = new_pw_nodes.drop_duplicates(subset=keep_cols)[keep_cols]
new_pw_nodes.columns = [c.split('_')[1] for c in keep_cols]
new_pw_nodes['label'] = 'Pathway'
new_pw_nodes.head()

Unnamed: 0,id,name,label
28,REACT:R-HSA-8956321,Nucleotide salvage,Pathway
43,REACT:R-HSA-8957275,Post-translational protein phosphorylation,Pathway
137,REACT:R-HSA-6807070,PTEN Regulation,Pathway
151,REACT:R-HSA-8878166,Transcriptional regulation by RUNX2,Pathway
152,REACT:R-HSA-8939211,ESR-mediated signaling,Pathway


In [19]:
ncbi_edges['source'] = 'Reactome'
ncbi_edges['evidence'] = ncbi_edges['evidence_code'].apply(determine_evidence)

In [20]:
new_nodes.append(new_pw_nodes)
new_edges.append(ncbi_edges)

## Chem PW

In [21]:
chem_pw = read_reactome(load_dir.joinpath('ChEBI2Reactome_PE_All_Levels.txt'))
chem_pw.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,10033,R-ALL-9014945,warfarin [cytosol],R-BTA-1430728,https://reactome.org/PathwayBrowser/#/R-BTA-14...,Metabolism,IEA,Bos taurus
1,10033,R-ALL-9014945,warfarin [cytosol],R-BTA-196854,https://reactome.org/PathwayBrowser/#/R-BTA-19...,Metabolism of vitamins and cofactors,IEA,Bos taurus


In [22]:
chem_hpw = chem_pw.query('taxon == "Homo sapiens"').copy()
chem_hpw.shape[0]

38843

In [23]:
chem_hpw['external_id'] = 'CHEBI:' + chem_hpw['external_id'].astype(str)
chebi_ids = chem_hpw['external_id'].unique()

Need to map from Chebi to MeSH if possible... 
Could also map to DrugBank

In [24]:
from biothings_client import get_client
mc = get_client('chem')

In [25]:
res = mc.getchems(chebi_ids)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-2081...done.


In [26]:
no_mychem = []
for r in res:
    if 'notfound' in r:
        no_mychem.append(r['query'])
print('ChEBI IDs not found in mychem.info: {:,}'.format(len(no_mychem)))

ChEBI IDs not found in mychem.info: 210


In [27]:
chem_hpw.query('external_id in @no_mychem').drop_duplicates(subset=['external_id']).head(10)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
16,CHEBI:10033,R-ALL-9014945,warfarin [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
1452,CHEBI:13166,R-HSA-5357708,Sec-tRNA(Sec) [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,IEA,Homo sapiens
1460,CHEBI:13170,R-HSA-5357730,Ser-tRNA(Sec) [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,IEA,Homo sapiens
1960,CHEBI:133202,R-ALL-8938276,(ADP-D-ribosyl)(n)-acceptor [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
2015,CHEBI:133203,R-ALL-8938282,(ADP-D-ribosyl)(n+1)-acceptor [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
3750,CHEBI:138387,R-ALL-9020583,(18S)-hydroperoxyicosapentaenoic acid [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
3859,CHEBI:138490,R-ALL-9022655,"5S,6S-epoxy-18(S)-HEPE [cytosol]",R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
3974,CHEBI:138563,R-ALL-9023708,"5S,6S-epoxy-18(R)-HEPE [cytosol]",R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
4147,CHEBI:138601,R-ALL-9024576,4(S)-Hp-17(R)-HDHA [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens
4192,CHEBI:138602,R-ALL-9024479,7(S)-Hp-17(R)-HDHA [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens


Lots of tRNAs and other quite specific things.. I don't htink these will add much as they're not the small molecules we're interested in...

Maybe warfarin? not sure why that one isn't mapped...

In [28]:
bad_keys = {'query', '_id', 'notfound'}
mesh_loc = {'drugcentral': 'mesh_descriptor_ui', 'ginas': 'MESH'}

chebi_to_mesh = dict()

for r in res:
    msh_id = set()
    for k, v in mesh_loc.items():
        potential_id = r.get(k, dict()).get('xrefs', dict()).get(v)
        if potential_id is not None:
            if type(potential_id) == str:
                msh_id.add('MESH:'+potential_id)
            elif type(potential_id == list):
                for pid in potential_id:
                    msh_id.add('MESH:'+pid)
    if len(msh_id) == 1:
        chebi_to_mesh[r['query']] = msh_id.pop()
        continue
    elif len(msh_id) > 1:
        chebi_to_mesh[r['query']] = '|'.join(msh_id)
        continue
    dbid = r.get('drugbank', dict()).get('id')
    if dbid is not None:
        chebi_to_mesh[r['query']] = 'DB:'+ dbid

In [29]:
drug_bank_ids = [v[3:] for v in chebi_to_mesh.values() if v.startswith('DB:')]
print('ChEBI IDs only mappable to Drugbank {:,}'.format(len(drug_bank_ids)))

ChEBI IDs only mappable to Drugbank 226


In [30]:
print('DrugBank IDs already in the network: {:,}'.format(len(all_nodes.query('drug_bank_ids in @drug_bank_ids'))))

DrugBank IDs already in the network: 127


In [31]:
dbid_to_msh = all_nodes.query('drug_bank_ids in @drug_bank_ids').set_index('drug_bank_ids')['id'].to_dict()
dbid_to_msh = {'DB:'+k: v for k, v in dbid_to_msh.items()}

In [32]:
chebi_to_mesh = {k: dbid_to_msh.get(v, v) for k, v in chebi_to_mesh.items()}

In [33]:
print('ChEBI IDs now mappable to MeSH: {:,}'.format(len([v for v in chebi_to_mesh.values() if v.startswith('MESH:')])))

ChEBI IDs now mappable to MeSH: 598


In [34]:
chem_hpw['mesh_id'] = chem_hpw['external_id'].map(chebi_to_mesh)

In [35]:
chm_exp = expand_col_on_char(chem_hpw.dropna(subset=['mesh_id']), 'mesh_id', '|')
chem_hpw = pd.concat([chm_exp, chem_hpw[chem_hpw['mesh_id'].isnull()]], sort=False)
print(len(chem_hpw))
print(len(chem_hpw.query('mesh_id in @all_node_ids')))

39314
16304


In [36]:
chebi_to_db = {k: v[3:] for k,v in chebi_to_mesh.items() if v.startswith('DB:')}

In [37]:
mychem_no_mesh_db = set(chebi_ids) - set(no_mychem) - set(chebi_to_mesh.keys())
chem_hpw.query('external_id in @mychem_no_mesh_db').drop_duplicates(subset=['external_id']).head(5)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon,mesh_id
62,CHEBI:10036,R-ALL-5696412,arachidyl ester [endoplasmic reticulum lumen],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,
125,CHEBI:10049,R-ALL-2509848,XTP [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,
477,CHEBI:10545,R-ALL-76342,e- [endoplasmic reticulum lumen],R-HSA-140180,https://reactome.org/PathwayBrowser/#/R-HSA-14...,COX reactions,TAS,Homo sapiens,
576,CHEBI:10668,R-HSA-6782371,"unspliced tRNA(Leu)(CAA) containing C-34,48 [n...",R-HSA-6782315,https://reactome.org/PathwayBrowser/#/R-HSA-67...,tRNA modification in the nucleus and cytosol,IEA,Homo sapiens,
776,CHEBI:10723,R-ALL-77343,tdec2-CoA [mitochondrial matrix],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,


Some of these are weird... electron... beta-particle... 

In [38]:
print('Chem Pathway Mappings')
print(' Edges Total:     {:8,}'.format(len(chem_hpw)))
e_map = len(chem_hpw.query('mesh_id in @all_node_ids'))
e_umap = len(chem_hpw.query('mesh_id not in @all_node_ids'))
print(' Number mapped:   {:8,}'.format(e_map))
print(' Number unmapped: {:8,}'.format(e_umap))
print('Nodes')

n_map = chem_hpw.query('mesh_id in @all_node_ids')['external_id'].nunique()
n_umap = chem_hpw.query('mesh_id not in @all_node_ids')['external_id'].nunique()

print(' Total Chems:     {:8,}'.format(chem_hpw['external_id'].nunique()))
print(' Chems mapped:    {:8,}'.format(n_map))
print(' Chems unmapped:  {:8,}'.format(n_umap))
print('Mapping Fractions')
print(' Nodes:           {:8.3%}'.format(n_map / (n_map + n_umap)))
print(' Eodes:           {:8.3%}'.format(e_map / (e_map + e_umap)))

Chem Pathway Mappings
 Edges Total:       39,314
 Number mapped:     16,304
 Number unmapped:   23,010
Nodes
 Total Chems:        2,081
 Chems mapped:         559
 Chems unmapped:     1,533
Mapping Fractions
 Nodes:            26.721%
 Eodes:            41.471%


Fairly unsuccessful endavour...  only about 1/4 of the nodes, and about 2/5 of the edges...

Chebi terms are more specific than MeSH... However, these could be interesting bridging nodes, as having relationships with both pathways and reactions.  We will add them in as new nodes.

In [39]:
new_chems = chem_hpw.query('mesh_id not in @all_node_ids').copy()

new_chems['pe_name'] = new_chems['pe_name'].str.split(' \[', expand=True)[0]
new_chems = new_chems[['external_id', 'pe_name']].drop_duplicates()
new_chems.columns = [c.split('_')[-1] for c in new_chems.columns]
new_chems['label'] = 'Compound'
new_chems.head(2)

Unnamed: 0,id,name,label
26,CHEBI:114785,Erlotinib,Compound
80,CHEBI:140420,eribaxaban,Compound


In [40]:
len(new_chems)

1968

#### String matching for Chebi to Mesh

In [41]:
# Perhaps we can do some string matching ot get some more matches...
new_chems['l_name'] = new_chems['name'].str.lower()
lower_chem_names = new_chems['l_name'].values
all_nodes['l_name'] = all_nodes['name'].str.lower()

In [42]:
name_to_mesh = all_nodes.query('l_name in @lower_chem_names and label != "Gene"').set_index('l_name')['id'].to_dict()
new_chems['mesh_id'] = new_chems['l_name'].map(name_to_mesh)
new_chems.head(2)

Unnamed: 0,id,name,label,l_name,mesh_id
26,CHEBI:114785,Erlotinib,Compound,erlotinib,
80,CHEBI:140420,eribaxaban,Compound,eribaxaban,


In [43]:
# Add these new mappings to the chebi to mesh mapper
chebi_to_mesh_by_name = new_chems.dropna(subset=['mesh_id']).set_index('id')['mesh_id'].to_dict()
chebi_to_mesh = {**chebi_to_mesh_by_name, **chebi_to_mesh}

In [44]:
new_chems = new_chems[new_chems['mesh_id'].isnull()][['id', 'name', 'label']]
len(new_chems)

1868

picked up about 100 via string mapping

#### One to Many Chebi to MeSH mappings

Some ChEBI IDs may map to multipe MeSH IDs...  If only one of those many MeSH ID is already contained in the network we will keep that mapping...


In [45]:
dupidx = chem_hpw.drop_duplicates(subset=['external_id', 'mesh_id'])['external_id'].duplicated(keep=False)
dupidx = dupidx[dupidx].index
dup_ids = (chem_hpw.loc[dupidx]
                    .sort_values('external_id')
                    .query('mesh_id in @all_node_ids')
                    .drop_duplicates(subset=['external_id'], keep=False)) # We will fix those with multiple MeSH ids next...
dup_ids

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon,mesh_id
27,CHEBI:114785,R-ALL-1173285,Erlotinib [cytosol],R-HSA-1236382,https://reactome.org/PathwayBrowser/#/R-HSA-12...,Constitutive Signaling by Ligand-Responsive EG...,TAS,Homo sapiens,MESH:D000069347
6255,CHEBI:16134,R-ALL-29382,NH3 [cytosol],R-HSA-112310,https://reactome.org/PathwayBrowser/#/R-HSA-11...,Neurotransmitter release cycle,TAS,Homo sapiens,MESH:D000641
6944,CHEBI:16243,R-ALL-8953397,quercetin [nucleoplasm],R-HSA-8935690,https://reactome.org/PathwayBrowser/#/R-HSA-89...,Digestion,TAS,Homo sapiens,MESH:D011794
13340,CHEBI:17688,R-ALL-30661,Nicotine [endoplasmic reticulum lumen],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,MESH:D009538
19232,CHEBI:3175,R-ALL-9627030,brimonidine [extracellular region],R-HSA-109582,https://reactome.org/PathwayBrowser/#/R-HSA-10...,Hemostasis,TAS,Homo sapiens,MESH:D000068438
19259,CHEBI:31932,R-ALL-6786175,OM [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,MESH:D000068557
19831,CHEBI:41423,R-ALL-2309786,celecoxib [cytosol],R-HSA-1430728,https://reactome.org/PathwayBrowser/#/R-HSA-14...,Metabolism,TAS,Homo sapiens,MESH:D000068579
20258,CHEBI:49040,R-ALL-265479,EZE [extracellular region],R-HSA-8963676,https://reactome.org/PathwayBrowser/#/R-HSA-89...,Intestinal absorption,TAS,Homo sapiens,MESH:D000069438
21513,CHEBI:68579,R-ALL-9015055,rivaroxaban [extracellular region],R-HSA-109582,https://reactome.org/PathwayBrowser/#/R-HSA-10...,Hemostasis,TAS,Homo sapiens,MESH:D000069552
21840,CHEBI:8874,R-ALL-9635054,rivastigmine [extracellular region],R-HSA-112311,https://reactome.org/PathwayBrowser/#/R-HSA-11...,Neurotransmitter clearance,TAS,Homo sapiens,MESH:D000068836


In [46]:
resolved_dups = dup_ids.set_index('external_id')['mesh_id'].to_dict()
chebi_to_mesh.update(resolved_dups)


Some ChEBIs will be mapped to multiple MeSH ids... that are already in the network. We need to see how many are already in the network, and if they need to be resolved to 'more correct' identifier.

In [47]:
(chem_hpw.query('mesh_id in @all_node_ids')
         .drop_duplicates(subset=['external_id', 'mesh_id'])['external_id']
         .duplicated(keep='first')
         .sum())

10

Ten ChEBI ids map to multiple mesh IDs that are already in the network...  This is a small enough number that maybe we can maunally determine what the correct one should be...

In [48]:
idx = (chem_hpw.query('mesh_id in @all_node_ids')
         .drop_duplicates(subset=['external_id', 'mesh_id'])['external_id']
         .duplicated(keep=False))
idx = idx[idx].index
doubled_ids = chem_hpw.loc[idx].sort_values('external_id')['mesh_id'].values

In [49]:
duped_mesh_to_chebi = chem_hpw.loc[idx].set_index('mesh_id')['external_id'].to_dict()
chebi_to_name = chem_hpw.loc[idx].set_index('external_id')['pe_name'].to_dict()

duped = all_nodes.query('id in @doubled_ids').copy()
duped['chebi'] = duped['id'].map(duped_mesh_to_chebi)
duped['chebi_name'] = duped['chebi'].map(chebi_to_name)

grpd = duped.groupby('chebi')
groups = list(grpd.groups.keys())
print(len(groups))

9


In [50]:
group_number = 0
cols = ['id', 'name', 'chebi', 'chebi_name']

grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
82292,MESH:D004295,Dihydroxyphenylalanine,CHEBI:15765,L-Dopa [cytosol]
109388,MESH:D007980,Levodopa,CHEBI:15765,L-Dopa [cytosol]


Dihydroxyphenlalanine is for the D-form form, where as Levodopa or L-Dopa is the L-form... Because the chebi name referenfces L-Dopa, we'll go with the L-form

In [51]:
chebi_to_mesh.update({'CHEBI:15765': 'MESH:D007980'})

In [52]:
group_number = 1

grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
68409,MESH:D002245,Carbon Dioxide,CHEBI:16526,CO2 [nucleoplasm]
84513,MESH:D004367,Dry Ice,CHEBI:16526,CO2 [nucleoplasm]


This means CO2, not Dry ice...

In [53]:
chebi_to_mesh.update({'CHEBI:16526': 'MESH:D002245'})

In [54]:
group_number = 2

grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
149608,MESH:D020156,Salicylic Acid,CHEBI:16914,SAL [mitochondrial matrix]
153893,MESH:D012980,Sodium Salicylate,CHEBI:16914,SAL [mitochondrial matrix]


So these both make sense, theres no glaring error, so we will map all chebi terms to both mesh nodes

In [55]:
group_number = 3
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
75475,MESH:C400424,Crinone,CHEBI:17026,P4 [extracellular region]
142611,MESH:D011374,Progesterone,CHEBI:17026,P4 [extracellular region]


Crinone is a product rather than the compound, so we will stick to Progesterone

In [56]:
chebi_to_mesh.update({'CHEBI:17026': 'MESH:D011374'})

In [57]:
group_number = 4
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
152758,MESH:D058428,Silica Gel,CHEBI:30563,SiO2 [cytosol]
152774,MESH:D012822,Silicon Dioxide,CHEBI:30563,SiO2 [cytosol]


In [58]:
chebi_to_mesh.update({'CHEBI:30563': 'MESH:D012822'})

The gel is for Chromotography, so we want Silicon Dioxide

In [59]:
group_number = 5
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
53240,MESH:D000068196,Albumin-Bound Paclitaxel,CHEBI:45863,PTXL [endoplasmic reticulum lumen]
133550,MESH:C495179,Pacliex,CHEBI:45863,PTXL [endoplasmic reticulum lumen]
133551,MESH:D017239,Paclitaxel,CHEBI:45863,PTXL [endoplasmic reticulum lumen]


Paclitaxel is the compound while the other two are formulations.... we will stick with just the compound

In [60]:
chebi_to_mesh.update({'CHEBI:45863': 'MESH:D017239'})

In [61]:
group_number = 6
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
73911,MESH:C024989,coenzyme Q10,CHEBI:46245,CoQ [mitochondrial inner membrane]
165263,MESH:D014451,Ubiquinone,CHEBI:46245,CoQ [mitochondrial inner membrane]


These are esssentiall synonyms... Both of these are highly represented and explicit, so we will continue to use both.

In [62]:
group_number = 7
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
55013,MESH:D000613,Aminobutyrates,CHEBI:59888,GABA [extracellular region]
93401,MESH:D005680,gamma-Aminobutyric Acid,CHEBI:59888,GABA [extracellular region]


Chebi usses the more spefcific term, GABA, so we will keep that one only.

In [63]:
chebi_to_mesh.update({'CHEBI:59888': 'MESH:D005680'})

In [64]:
group_number = 8
grpd.get_group(groups[group_number])[cols]

Unnamed: 0,id,name,chebi,chebi_name
114062,MESH:C030814,menatetrenone,CHEBI:78277,MK4 [endoplasmic reticulum lumen]
167630,MESH:D024482,Vitamin K 2,CHEBI:78277,MK4 [endoplasmic reticulum lumen]


The Chebi name is 'MK4' which more closely aligns with menatetrenone.  

In [65]:
chebi_to_mesh.update({'CHEBI:78277': 'MESH:C030814'})

#### Finishing the mappings and making the edges and new nodes

So now we've updated our Chebi to MeSH map:

1. to make some chebi to mesh through a DrugBank Intermediary
2. to map some chebi to mesh by name
3. To resolve some 1 chebi to multiple mesh issues...

So now we will:

1. reapply the chebi to mesh mappings
2. remove mesh ids that aren't already in the network
3. keep any new chem nodes as ChEBI identifiers
4. Keep track of any new drugbank or chebi mappings for current nodes

In [66]:
# still have some drugbank ids in the chebi_to_mesh mappers, so lets remove...
chebi_to_mesh = {k: v for k, v in chebi_to_mesh.items() if not v.startswith('DB:')}

In [67]:
# Make a mesh to chebi map
mesh_to_chebi = dict()
for k, v in chebi_to_mesh.items():
    if not v.startswith('DB:'):
        if '|' in v:
            for vsplit in v.split('|'):
                mesh_to_chebi[vsplit] = k
        else:
            mesh_to_chebi[v] = k
len(mesh_to_chebi), len(chebi_to_mesh)

(674, 675)

In [68]:
# Now that we've got some new mapping files, re-load and start over...
chem_pw = read_reactome(load_dir.joinpath('ChEBI2Reactome_PE_All_Levels.txt'))
chem_hpw = chem_pw.query('taxon == "Homo sapiens"').copy()
chem_hpw['external_id'] = 'CHEBI:' + chem_hpw['external_id']
chem_hpw['mesh_id'] = chem_hpw['external_id'].map(chebi_to_mesh)

# Expand the multiple mesh_id_mappings...
print(chem_hpw['mesh_id'].count())
chm_exp = expand_col_on_char(chem_hpw.dropna(subset=['mesh_id']), 'mesh_id', '|')
chm_mappable = chm_exp.query('mesh_id in @all_node_ids').copy()

# Get the unmappable ones...
null_lines = chem_hpw['mesh_id'].isnull()
not_in_net_lines = chm_exp.query('mesh_id not in @all_node_ids')
chm_unmappable = pd.concat([chem_hpw[null_lines], not_in_net_lines], sort=False)
chm_unmappable['mesh_id'] = chm_unmappable['external_id']

# Create new Chem Nodes with the Unmappables
new_chems = chm_unmappable.copy()
new_chems['pe_name'] = new_chems['pe_name'].str.split(' \[', expand=True)[0]
new_chems = new_chems[['external_id', 'pe_name']].drop_duplicates()
new_chems.columns = [c.split('_')[-1] for c in new_chems.columns]
new_chems['label'] = 'Compound'

# Now make the edges...
print(len(chem_hpw))
cpw_edges = pd.concat([chm_mappable, chm_unmappable], sort=False)
print(len(cpw_edges))

# Make a map from reactome_pe to our chem id
pe_chem_map = cpw_edges.set_index('pe_reactome_id')['mesh_id'].to_dict()

# Finish edge mapping
cpw_edges = cpw_edges[['mesh_id', 'reactome_id', 'evidence_code']].dropna().copy()
cpw_edges.columns = ['start_id', 'end_id', 'evidence_code']
cpw_edges['end_id'] = 'REACT:' + cpw_edges['end_id']
cpw_edges['type'] = 'part_of_CpoPW'
cpw_edges.head(2)

17528
38843
38864


Unnamed: 0,start_id,end_id,evidence_code,type
0,MESH:D014859,REACT:R-HSA-1430728,TAS,part_of_CpoPW
1,MESH:D014859,REACT:R-HSA-196854,TAS,part_of_CpoPW


In [69]:
cpw_edges['source'] = 'Reactome'
cpw_edges['evidence'] = cpw_edges['evidence_code'].apply(determine_evidence)

### Get name info for the newest CHEBI items added to the network

In [70]:
new_chems['id'].nunique(), len(new_chems)

(1445, 1874)

In [71]:
duped_chebis = new_chems[new_chems['id'].duplicated()]['id'].unique()

In [72]:
res1 = mc.getchems(duped_chebis, fields=['chebi.name'])

querying 1-130...done.


In [73]:
chebi_to_name = {r['query']: r['chebi']['name'] for r in res1 if 'notfound' not in r}

In [74]:
len(chebi_to_name)

101

In [75]:
new_chems['new_name'] = new_chems['id'].map(chebi_to_name)
new_chems['new_name'] = new_chems['new_name'].fillna(new_chems['name'])

In [76]:
idx = new_chems.drop_duplicates(subset=['id', 'new_name'])['id'].duplicated()
idx = idx[idx].index
new_chems.loc[idx, 'id'].nunique()

29

29 Is a small enough number to curate...

In [77]:
chebi_to_name_1 = {'CHEBI:138674': "10(S),17(S)-dihydroxy-omega 6-docosapentaenoic acid",
    'CHEBI:15748': "D-glucuronate",
    'CHEBI:17843': "transfer RNA",
    'CHEBI:25372': "molybdopterin cofactor",
    'CHEBI:25676': "oligopeptide",
    'CHEBI:29167': "tRNA-Cys",
    'CHEBI:29168': "tRNA-Gln",
    'CHEBI:29169': "tRNA-Leu",
    'CHEBI:29170': "tRNA-Ala",
    'CHEBI:29171': "tRNA-Arg",
    'CHEBI:29173': "tRNA-Met",
    'CHEBI:29174': "tRNA-Ile",
    'CHEBI:29175': "tRNA-Glu",
    'CHEBI:29176': "tRNA-Gly",
    'CHEBI:29177': "tRNA-Pro",
    'CHEBI:29178': "tRNA-His",
    'CHEBI:29179': "tRNA-Ser",
    'CHEBI:29180': "tRNA-Thr",
    'CHEBI:29182': "tRNA-Tyr",
    'CHEBI:29183': "tRNA-Val",
    'CHEBI:29184': "tRNA-Phe",
    'CHEBI:29185': "tRNA-Lys",
    'CHEBI:29186': "tRNA-Asp",
    'CHEBI:33697': "ribonucleic acid",
    'CHEBI:33699': "messenger RNA",
    'CHEBI:4705': "double-stranded DNA",
    'CHEBI:59524': "lipoarabinomannan",
    'CHEBI:67208': "double-stranded RNA",
    'CHEBI:9160': "single-stranded DNA",}

In [78]:
new_chems['new_name'] = new_chems['id'].map({**chebi_to_name, **chebi_to_name_1})
new_chems['new_name'] = new_chems['new_name'].fillna(new_chems['name'])

new_chems.drop_duplicates(subset=['id', 'new_name'])['id'].duplicated().sum()

0

In [79]:
new_chems = new_chems.drop('name', axis=1).rename(columns={'new_name': 'name'}).drop_duplicates(subset=['id', 'name'])
new_chems[['id', 'name', 'label']].head(2)

Unnamed: 0,id,name,label
62,CHEBI:10036,arachidyl ester,Compound
125,CHEBI:10049,XTP,Compound


In [80]:
new_chems['drug_bank_ids'] = new_chems['id'].map(chebi_to_db)
new_chems['mesh_ids'] = new_chems['id'].map(chebi_to_mesh)

In [81]:
new_chems.head(2)

Unnamed: 0,id,label,name,drug_bank_ids,mesh_ids
62,CHEBI:10036,Compound,arachidyl ester,,
125,CHEBI:10049,Compound,XTP,,


In [82]:
reactome_ids = set(new_pw_nodes['id']) | set(all_node_ids)

new_reactome_ids = cpw_edges.query('end_id not in @reactome_ids')['end_id'].unique()
new_reactome_ids  = [c.split(':')[1] for c in new_reactome_ids]

new_pw_nodes1 = chem_hpw.query('reactome_id in @new_reactome_ids').drop_duplicates(subset=['reactome_id'])
new_pw_nodes1 = new_pw_nodes1[['reactome_id', 'reactome_name']]
new_pw_nodes1['reactome_id'] = 'REACT:' + new_pw_nodes1['reactome_id']
new_pw_nodes1.columns = ['id', 'name']
new_pw_nodes1['label'] = 'Pathway'

In [83]:
new_pw_nodes1

Unnamed: 0,id,name,label
21427,REACT:R-HSA-2408499,Formation of selenosugars for excretion,Pathway
79122,REACT:R-HSA-1855156,IPs transport between ER lumen and nucleus,Pathway
84118,REACT:R-HSA-1660524,PIPs transport between plasma and early endoso...,Pathway
84375,REACT:R-HSA-1855196,IP3 and IP4 transport between cytosol and nucleus,Pathway
93447,REACT:R-HSA-1855170,IPs transport between nucleus and cytosol,Pathway
93449,REACT:R-HSA-1855184,IPs transport between cytosol and ER lumen,Pathway
93452,REACT:R-HSA-1855215,IPs transport between ER lumen and cytosol,Pathway
128761,REACT:R-HSA-1660502,PIPs transport between early and late endosome...,Pathway
128763,REACT:R-HSA-1660508,PIPs transport between late endosome and Golgi...,Pathway
128771,REACT:R-HSA-1660537,PIPs transport between early endosome and Golg...,Pathway


In [84]:
new_nodes.append(new_pw_nodes1)
new_nodes.append(new_chems)
new_edges.append(cpw_edges)

# MicroRNA to Pathways

In [85]:
mir = read_reactome(load_dir.joinpath('miRBase2Reactome_PE_All_Levels.txt'))
hmir = mir.query('taxon == "Homo sapiens"').copy()
hmir_names = hmir['pe_name'].str.split(' ', expand=True)[0]

In [86]:
hmir.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-1257604,https://reactome.org/PathwayBrowser/#/R-HSA-12...,PIP3 activates AKT signaling,TAS,Homo sapiens
1,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-162582,https://reactome.org/PathwayBrowser/#/R-HSA-16...,Signal Transduction,TAS,Homo sapiens


External IDs are in mirbase form... need them in RNACentral Form... Luckily RNACentral has cross-references

In [87]:
mirbase = pd.read_csv(load_dir.joinpath('mirbase.tsv'), sep='\t', header=None, dtype=str,
            names=['int_id', 'db_name', 'ext_id', 'tax_id', 'mir'], usecols=list(range(5)))

In [88]:
hmirbase = mirbase.query('tax_id == "9606"').copy()
hmirbase['int_id'] = hmirbase['int_id'] + '_9606'
mirbase_to_rnac = hmirbase.set_index('ext_id')['int_id'].to_dict()

In [89]:
hmir['rnac_id'] = mir['external_id'].map(mirbase_to_rnac)
hmir.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon,rnac_id
0,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-1257604,https://reactome.org/PathwayBrowser/#/R-HSA-12...,PIP3 activates AKT signaling,TAS,Homo sapiens,URS000032AA22_9606
1,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-162582,https://reactome.org/PathwayBrowser/#/R-HSA-16...,Signal Transduction,TAS,Homo sapiens,URS000032AA22_9606


In [90]:
len(hmir), len(hmir.query('rnac_id in @all_node_ids'))

(248, 248)

All now have a maping to nodes already in the network

In [91]:
pe_mir_map = hmir.set_index('pe_reactome_id')['rnac_id'].to_dict()

mir_edges = hmir[['rnac_id', 'reactome_id', 'evidence_code']].copy()
mir_edges.columns = ['start_id', 'end_id', 'evidence_code']
mir_edges['end_id'] = 'REACT:' + mir_edges['end_id']
mir_edges['type'] = 'part_of_NpoPW'
mir_edges.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type
0,URS000032AA22_9606,REACT:R-HSA-1257604,TAS,part_of_NpoPW
1,URS000032AA22_9606,REACT:R-HSA-162582,TAS,part_of_NpoPW


In [92]:
mir_edges['source'] = 'Reactome'
mir_edges['evidence'] = mir_edges['evidence_code'].apply(determine_evidence)

In [93]:
mir_reactome_ids = mir_edges['end_id']
len(set(mir_reactome_ids) - set(all_node_ids) - set(new_pw_nodes['id']) - set(new_pw_nodes1['id']))

0

All pathways already covered, no need to add new nodes.

In [94]:
new_edges.append(mir_edges)

# New node type - Reactions

In [95]:
ncbi_rxn = read_reactome(load_dir.joinpath('NCBI2Reactome_PE_Reactions.txt'))
ncbi_rxn.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,1,R-HSA-6806490,A1BG [extracellular region],R-HSA-481007,https://reactome.org/PathwayBrowser/#/R-HSA-48...,Exocytosis of platelet alpha granule contents,TAS,Homo sapiens
1,1,R-HSA-8848894,A1BG [platelet alpha granule lumen],R-HSA-481007,https://reactome.org/PathwayBrowser/#/R-HSA-48...,Exocytosis of platelet alpha granule contents,TAS,Homo sapiens


In [96]:
ncbi_rxn['external_id'] = ncbi_rxn['external_id'].astype(str)

In [97]:
hncbi_rxn = ncbi_rxn.query('taxon == "Homo sapiens"').copy()

In [98]:
# Update the Gene PE map...
pe_ncbi_map = {**hncbi_rxn.set_index('pe_reactome_id')['external_id'].to_dict(), **pe_ncbi_map}

hncbi_rxn['reactome_id'] = 'REACT:' + hncbi_rxn['reactome_id']
ncbi_rxn_edges = hncbi_rxn[['external_id', 'reactome_id', 'evidence_code']].copy()
ncbi_rxn_edges.columns = ['start_id', 'end_id', 'evidence_code']
ncbi_rxn_edges['type'] = 'part_of_GpoRX'
ncbi_rxn_edges.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type
0,1,REACT:R-HSA-481007,TAS,part_of_GpoRX
1,1,REACT:R-HSA-481007,TAS,part_of_GpoRX


In [99]:
ncbi_rxn_edges['source'] = 'Reactome'
ncbi_rxn_edges['evidence'] = ncbi_rxn_edges['evidence_code'].apply(determine_evidence)

In [100]:
print('Genes in Reactome: {:,}'.format(hncbi['external_id'].nunique()))
print('Mapped to Network: {:,}'.format(hncbi.query('external_id in @all_node_ids')['external_id'].nunique()))
print('Un-mappable: {:,}'.format(hncbi.query('external_id not in @all_node_ids')['external_id'].nunique()))

Genes in Reactome: 10,696
Mapped to Network: 10,610
Un-mappable: 86


In [101]:
keep_cols = ['reactome_id', 'reactome_name']
reaction_nodes = hncbi_rxn.drop_duplicates(subset=keep_cols)[keep_cols]
reaction_nodes.columns = [c.split('_')[1] for c in keep_cols]
reaction_nodes['label'] = 'Reaction'
reaction_nodes.head()

Unnamed: 0,id,name,label
0,REACT:R-HSA-481007,Exocytosis of platelet alpha granule contents,Reaction
2,REACT:R-HSA-6798748,Exocytosis of secretory granule lumen proteins,Reaction
4,REACT:R-HSA-6800434,Exocytosis of ficolin-rich granule lumen proteins,Reaction
6,REACT:R-HSA-158832,The acetyl group from acetyl-CoA is transferre...,Reaction
8,REACT:R-HSA-174967,NAT2 acetylation,Reaction


In [102]:
new_nodes.append(reaction_nodes)
new_edges.append(ncbi_rxn_edges)

## Compound - Reactions

In [103]:
chm_rxn = read_reactome(load_dir.joinpath('ChEBI2Reactome_PE_Reactions.txt'))
chm_rxn.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,10033,R-ALL-9014945,warfarin [cytosol],R-BTA-159790,https://reactome.org/PathwayBrowser/#/R-BTA-15...,VKORC1 reduces vitamin K epoxide to MK4 (vitam...,IEA,Bos taurus
1,10033,R-ALL-9014945,warfarin [cytosol],R-BTA-9026967,https://reactome.org/PathwayBrowser/#/R-BTA-90...,VKORC1 inhibitors binds VKORC1 dimer,IEA,Bos taurus


In [104]:
print(len(chm_rxn))
hchm_rxn = chm_rxn.query('taxon == "Homo sapiens"').copy()
print(len(hchm_rxn))
hchm_rxn['external_id'] = 'CHEBI:' + hchm_rxn['external_id'].astype(str)
hchm_rxn['reactome_id'] = 'REACT:' + hchm_rxn['reactome_id']

185171
24280


All chebi ids have previously been queried for....

In [105]:
hchm_rxn['mesh_id'] = hchm_rxn['external_id'].map(chebi_to_mesh)
hchm_rxn = expand_col_on_char(hchm_rxn, 'mesh_id', '|')
len(hchm_rxn), hchm_rxn['mesh_id'].count()

(24287, 12514)

Still only about 1/2 mapped... however, we may have already put some of these nodes in with chebi ids

In [106]:
new_chem_ids = new_chems['id'].values
len(hchm_rxn.query('external_id not in @new_chem_ids and mesh_id not in @all_node_ids'))

0

All chemicals accounted for, don't need to add new nodes of type Compound

In [107]:
rxns = reaction_nodes['id'].tolist()
hchm_rxn.query('reactome_id not in @rxns')['reactome_id'].nunique()

321

Some new reactions to add though...

In [108]:
keep_cols = ['reactome_id', 'reactome_name']
reaction_nodes1 = hchm_rxn.query('reactome_id not in @rxns')[keep_cols].copy()
reaction_nodes1 = reaction_nodes1.drop_duplicates(subset=keep_cols)
reaction_nodes1.columns = [c.split('_')[1] for c in keep_cols]
reaction_nodes1['label'] = 'Reaction'
reaction_nodes1.head(2)

Unnamed: 0,id,name,label
13,REACT:R-HSA-1497824,BH4 is oxidised to the BH3 radical during the ...,Reaction
15,REACT:R-HSA-3785704,DSB inducing agents induce double strand DNA b...,Reaction


In [109]:
fill_idx = hchm_rxn.query('mesh_id not in @all_node_ids').index
hchm_rxn.loc[fill_idx, 'mesh_id'] = hchm_rxn.loc[fill_idx, 'external_id']

# Update the pe chem map
pe_chem_map = {**hchm_rxn.set_index('pe_reactome_id')['mesh_id'].to_dict(), **pe_chem_map}

ch_rxn_edges = hchm_rxn[['mesh_id', 'reactome_id', 'evidence_code']].copy()
ch_rxn_edges.columns = ['start_id', 'end_id', 'evidence_code']
ch_rxn_edges['type'] = 'part_of_CpoRX'
ch_rxn_edges.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type
0,MESH:D014859,REACT:R-HSA-159790,TAS,part_of_CpoRX
1,MESH:D014859,REACT:R-HSA-9026967,TAS,part_of_CpoRX


In [110]:
ch_rxn_edges['source'] = 'Reactome'
ch_rxn_edges['evidence'] = ch_rxn_edges['evidence_code'].apply(determine_evidence)

In [111]:
new_nodes.append(reaction_nodes1)
new_edges.append(ch_rxn_edges)

## MicroRNA - Rxn

In [112]:
mi_rxn = read_reactome(load_dir.joinpath('miRBase2Reactome_PE_Reactions.txt'))
mi_rxn.head(2)

Unnamed: 0,external_id,pe_reactome_id,pe_name,reactome_id,reactome_url,reactome_name,evidence_code,taxon
0,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-8935785,https://reactome.org/PathwayBrowser/#/R-HSA-89...,"RUNX1 mRNA translation is inhibited by miR-17,...",TAS,Homo sapiens
1,MI0000071,R-HSA-8938437,miR-17 [cytosol],R-HSA-8938440,https://reactome.org/PathwayBrowser/#/R-HSA-89...,miR-17 binds RUNX1 mRNA,TAS,Homo sapiens


In [113]:
hmi_rxn = mi_rxn.query('taxon == "Homo sapiens"').copy()
hmi_rxn['reactome_id'] = 'REACT:' + hmi_rxn['reactome_id']
hmi_rxn['rnac_id'] = hmi_rxn['external_id'].map(mirbase_to_rnac)
hmi_rxn['rnac_id'].isnull().sum()

0

All micronra mapped succesfully

In [114]:
all_rxns = set(rxns) | set(reaction_nodes1['id'])

In [115]:
len(hmi_rxn.query('reactome_id not in @all_rxns'))

0

All reactions in the network already

In [116]:
# Update the pe mirna map
pe_mir_map = {**hmi_rxn.set_index('pe_reactome_id')['rnac_id'].to_dict(), **pe_mir_map}

mi_rxn_edges = hmi_rxn[['rnac_id', 'reactome_id', 'evidence_code']].copy()
mi_rxn_edges.columns = ['start_id', 'end_id', 'evidence_code']
mi_rxn_edges['type'] = 'part_of_NpoRX'
mi_rxn_edges.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type
0,URS000032AA22_9606,REACT:R-HSA-8935785,TAS,part_of_NpoRX
1,URS000032AA22_9606,REACT:R-HSA-8938440,TAS,part_of_NpoRX


In [117]:
mi_rxn_edges['source'] = 'Reactome'
mi_rxn_edges['evidence'] = mi_rxn_edges['evidence_code'].apply(determine_evidence)

In [118]:
new_edges.append(mi_rxn_edges)

# GO Terms to Reactome Values

In [119]:
go_cols = ['db', 'db_object_id', 'db_object_symbol','qualifier', 'go_id', 'db_reference', 'evidence_code',
 'with_or_from', 'aspect', 'db_object_name', 'db_object_synonym', 'db_object_type', 'taxon', 'date', 'assigned_by',
 'annotation_extension', 'gene_product_form_id']

keep_cols = ['start_id', 'end_id', 'type', 'qualifier', 'evidence_code', 'with_or_from', 'date', 'assigned_by']

In [120]:
go_react = pd.read_csv(load_dir.joinpath('gene_association.reactome.gz'), sep='\t', 
                       names=go_cols, header=None, comment='!', dtype=str)
go_react.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,UniProtKB,P01111,RASN_HUMAN,,GO:0005886,REACTOME:R-HSA-5672950,TAS,,C,,,protein,taxon:9606,20130528,Reactome,,
1,UniProtKB,P01112,RASH_HUMAN,,GO:0005886,REACTOME:R-HSA-5672950,TAS,,C,,,protein,taxon:9606,20130730,Reactome,,


In [121]:
go_react['db_reference'] = go_react['db_reference'].str.replace('REACTOME:', 'REACT:')

In [122]:
hgo_react = go_react.query('taxon == "taxon:9606"').copy()
len(hgo_react)

96119

In [123]:
all_node_ids = set(all_nodes['id']) | set(pd.concat(new_nodes, sort=False)['id'])

print(len(hgo_react.query('db_reference in @all_node_ids')))
print(len(hgo_react.query('go_id in @all_node_ids')))

95476
96117


In [124]:
hgo_react['aspect'].value_counts()

C    75112
P    18910
F     2097
Name: aspect, dtype: int64

In [125]:
pw_ids = pd.concat([all_nodes]+new_nodes, sort=False).query('label == "Pathway"')['id'].values
rxn_ids = pd.concat([all_nodes]+new_nodes, sort=False).query('label == "Reaction"')['id'].values

len(pw_ids), len(rxn_ids)

(2800, 12327)

In [126]:
hgo_rxns = hgo_react.query('db_reference in @rxn_ids')
hgo_pws = hgo_react.query('db_reference in @pw_ids')

In [127]:
aspect_map = {'P': 'involved_in_PWinBP', 'F': 'enables_PWeMF', 'C': 'part_of_PWpoCC'}
gpw_edges = hgo_pws.rename(columns={'db_reference': 'start_id', 'go_id': 'end_id'})
gpw_edges['type'] = gpw_edges['aspect'].map(aspect_map)
gpw_edges = gpw_edges[keep_cols]
gpw_edges.head(2)

Unnamed: 0,start_id,end_id,type,qualifier,evidence_code,with_or_from,date,assigned_by
5,REACT:R-HSA-5673001,GO:0000165,involved_in_PWinBP,,TAS,,20190308,Reactome
6,REACT:R-HSA-5673001,GO:0000165,involved_in_PWinBP,,TAS,,20190308,Reactome


In [128]:
aspect_map = {k: v.replace('PW', 'RX') for k, v in aspect_map.items()}
grx_edges = hgo_rxns.rename(columns={'db_reference': 'start_id', 'go_id': 'end_id'})
grx_edges['type'] = grx_edges['aspect'].map(aspect_map)
grx_edges = grx_edges[keep_cols]
grx_edges.head(2)

Unnamed: 0,start_id,end_id,type,qualifier,evidence_code,with_or_from,date,assigned_by
0,REACT:R-HSA-5672950,GO:0005886,part_of_RXpoCC,,TAS,,20130528,Reactome
1,REACT:R-HSA-5672950,GO:0005886,part_of_RXpoCC,,TAS,,20130730,Reactome


In [129]:
gpw_edges['source'] = 'Reactome'
gpw_edges['evidence'] = gpw_edges['evidence_code'].apply(determine_evidence)

In [130]:
grx_edges['source'] = 'Reactome'
grx_edges['evidence'] = grx_edges['evidence_code'].apply(determine_evidence)

In [131]:
new_edges.append(gpw_edges)
new_edges.append(grx_edges)

# Complexes

In [132]:
cplx = pd.read_csv(load_dir.joinpath('homo_sapiens.tsv'), sep='\t')
cplx.columns = regularize_colnames(cplx.columns)
cplx.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,neural progenitor-specific BAF ATP-dependent c...,9606,P51531(0)|O14497(0)|Q969G3(0)|Q6STE5(0)|Q8WUB8...,ECO:0005547(biological system reconstruction e...,-,GO:2000045(regulation of G1/S transition of mi...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [133]:
exp_cplx = expand_col_on_char(cplx, 'cross_references', '|')
exp_cplx.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also),An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:18809673(see-also),An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [134]:
react_cplx = exp_cplx[exp_cplx['cross_references'].str.startswith('reactome')].copy()
react_cplx[['complex_ac', 'recommended_name', 'cross_references']].head(2)

Unnamed: 0,complex_ac,recommended_name,cross_references
99,CPX-56,GLI1-SUFU complex,reactome:R-HSA-5610605(identity)
100,CPX-56,GLI1-SUFU complex,reactome:R-HSA-5610531(identity)


Some contain the term '(identity)', lets see how many do

In [135]:
len(react_cplx), react_cplx['cross_references'].str.contains('identity').sum()

(389, 386)

In [136]:
react_cplx[~react_cplx['cross_references'].str.contains('identity')][['complex_ac', 'recommended_name', 'cross_references']]

Unnamed: 0,complex_ac,recommended_name,cross_references
370,CPX-2108,epsilon DNA polymerase complex,reactome:R-HSA-68483(subset)
2223,CPX-382,Interleukin-12-receptor complex,reactome:R-HSA-8854487(subset)
2903,CPX-383,Interleukin-23-receptor complex,reactome:R-HSA-447186(subset)


In [137]:
react_cplx['xref'] = (react_cplx['cross_references']
                         .str.replace('reactome:', '', regex=False)
                         .str.replace('(identity)', '', regex=False)
                         .str.replace('(subset)', '', regex=False)
                         .apply(lambda s: s.split('.')[0]))

In [138]:
react_cplx['xref'].nunique(), react_cplx['complex_ac'].nunique()

(364, 272)

Relationship appears to be many-to-many

In [139]:
react_cplx_map = react_cplx.loc[:, ['complex_ac', 'recommended_name', 'xref']]
react_cplx_map.head(2)

Unnamed: 0,complex_ac,recommended_name,xref
99,CPX-56,GLI1-SUFU complex,R-HSA-5610605
100,CPX-56,GLI1-SUFU complex,R-HSA-5610531


In [140]:
react_complexes = react_cplx_map['xref'].unique()

In [141]:
cplx_to_pw = pd.read_csv(load_dir.joinpath('Complex_2_Pathway_human.txt'), sep='\t')
cplx_to_pw.head(2)

Unnamed: 0,complex,pathway,top_level_pathway
0,R-ALL-1006146,R-HSA-977606,R-HSA-168256
1,R-ALL-113656,R-HSA-113501,R-HSA-1640170


In [142]:
len(cplx_to_pw)

19987

In [143]:
len(cplx_to_pw.query('complex in @react_complexes'))

1110

Only about 5 percent are ones that we have a complex-portal xref....

In [144]:
cplx_to_pw['pathway'] = 'REACT:' + cplx_to_pw['pathway']
len(cplx_to_pw.query('complex in @react_complexes and pathway in @pw_ids'))

1110

All pathways have other linking members!

In [145]:
cplx_pw_edges = cplx_to_pw.merge(react_cplx_map, how='inner', left_on='complex', right_on='xref')
cplx_pw_edges = cplx_pw_edges.rename(columns={'complex_ac': 'start_id', 'pathway': 'end_id'})[['start_id', 'end_id']].copy()
cplx_pw_edges['type'] = 'part_of_XpoPW'

cplx_pw_edges.head(2)

Unnamed: 0,start_id,end_id,type
0,CPX-930,REACT:R-HSA-167161,part_of_XpoPW
1,CPX-915,REACT:R-HSA-167161,part_of_XpoPW


In [146]:
cplx_pw_edges['source'] = 'Reactome'
cplx_pw_edges['evidence'] = 'curated'

In [147]:
new_edges.append(cplx_pw_edges)

# Reactome to disease edges...

Reactome has links to diseases, but not easily downloadable.  The neo4j dump was downloaded to and cypher queries run to extract edges between diseases and reactome elements

In [148]:
react_dis = pd.read_csv(ext_dir.joinpath('AllReactome-to-DOID.csv'))

In [149]:
react_dis.head(2)

Unnamed: 0,labels(n1),n1.displayName,n1.schemaClass,n1.stId,type(e),d.displayName,d.identifier,d.databaseName
0,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",MUTYH-3 R231H [nucleoplasm],EntityWithAccessionedSequence,R-HSA-9606297,disease,lung squamous cell carcinoma,3907,DOID
1,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",p16INK4A D84G [cytosol],EntityWithAccessionedSequence,R-HSA-9632461,disease,lung squamous cell carcinoma,3907,DOID


In [150]:
react_dis.columns = [s.replace('n1.', 'item_').replace('d.', 'dis_') for s in react_dis.columns]
react_dis.columns = regularize_colnames(react_dis.columns)

In [151]:
react_dis.head(2)

Unnamed: 0,labelsn1,item_display_name,item_schema_class,item_st_id,typee,dis_display_name,dis_identifier,dis_database_name
0,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",MUTYH-3 R231H [nucleoplasm],EntityWithAccessionedSequence,R-HSA-9606297,disease,lung squamous cell carcinoma,3907,DOID
1,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",p16INK4A D84G [cytosol],EntityWithAccessionedSequence,R-HSA-9632461,disease,lung squamous cell carcinoma,3907,DOID


In [152]:
react_dis['item_schema_class'].value_counts()

EntityWithAccessionedSequence    3035
Complex                          1023
Reaction                          670
Pathway                           552
FailedReaction                    415
InstanceEdit                      397
ReferenceDatabase                 374
DefinedSet                        346
ChemicalDrug                      315
CandidateSet                      144
BlackBoxEvent                      73
GenomeEncodedEntity                26
SimpleEntity                       22
ProteinDrug                         6
OtherEntity                         5
Polymer                             3
Depolymerisation                    1
Polymerisation                      1
TopLevelPathway                     1
Name: item_schema_class, dtype: int64

In [153]:
react_dis['reactome_id'] = 'REACT:' + react_dis['item_st_id']
react_dis['doid'] = 'DOID:'+ react_dis['dis_identifier'].astype(str)

In [154]:
#disease ontology parent_to_chold map
do_nodes = ot.get_ontology_nodes(load_dir.joinpath('doid.obo'))
do_child_map = ot.get_children_map(ot.get_ontology_edges(load_dir.joinpath('doid.obo'), 'DOID'))

doid_to_name = do_nodes.set_index('id')['name'].to_dict()

In [155]:
react_dis['do_name'] = react_dis['doid'].map(doid_to_name)

In [156]:
dis_in_reactome = react_dis['doid'].unique()

parents = []
children = []

for dis in dis_in_reactome:
    for c in do_child_map[dis]:
        parents.append(dis)
        children.append(c)
        
do_children = pd.DataFrame({'p_dis':parents, 'c_dis':children})
do_children['p_name'] = do_children['p_dis'].map(doid_to_name)
do_children['c_name'] = do_children['c_dis'].map(doid_to_name)

In [157]:
child_counts = do_children['p_name'].value_counts().rename('num_child').to_frame()
child_counts['num_instances'] = react_dis['do_name'].value_counts()

In [158]:
child_counts.head(50)

Unnamed: 0,num_child,num_instances
disease,9383,7
cancer,2047,1271
eye disease,732,6
carcinoma,598,11
brain disease,493,9
cardiovascular system disease,470,7
inherited metabolic disorder,426,57
urinary system disease,368,6
disease of mental health,358,6
syndrome,308,4


In [159]:
bad_diseases = ['disease', 'syndrome']
react_dis = react_dis.query('do_name not in @bad_diseases').copy()

In [160]:
# Add the parent to the children
do_child_map = {k: {k}|v for k, v in do_child_map.items()}
react_dis['all_children'] = react_dis['doid'].map(do_child_map)

In [161]:
react_dis['all_children'] = react_dis['all_children'].fillna(react_dis['doid']).apply(lambda s: '|'.join(list(s)) if type(s) == set else s)

In [162]:
react_dis = expand_col_on_char(react_dis, 'all_children', '|')

In [163]:
len(react_dis)

2760968

In [164]:
disease_info = expand_col_on_char(all_nodes, 'alt_disease_ids', '|').dropna(subset=['alt_disease_ids'])
disease_info['alt_disease_ids'] = disease_info['alt_disease_ids'].str.replace('DO:DOID', 'DOID')

dis_map = disease_info.set_index('alt_disease_ids')['id'].to_dict()

In [165]:
react_dis['dis_id'] = react_dis['all_children'].map(dis_map)

In [166]:
react_dis.count()

labelsn1             2760968
item_display_name    2760968
item_schema_class    2760968
item_st_id           2737741
typee                2760968
dis_display_name     2760968
dis_identifier       2760968
dis_database_name    2760968
reactome_id          2737741
doid                 2760968
do_name              2759887
all_children         2760968
dis_id                562048
dtype: int64

## Disease to Phsical Entities...

In [167]:
dis_to_item = react_dis.query('item_schema_class == "EntityWithAccessionedSequence"').copy()

In [168]:
dis_to_item['item_id'] = dis_to_item['item_st_id'].map(pe_ncbi_map)
dis_to_item['item_id'] = dis_to_item['item_id'].fillna(dis_to_item['item_st_id'].map(pe_chem_map))
dis_to_item['item_id'] = dis_to_item['item_id'].fillna(dis_to_item['item_st_id'].map(pe_mir_map))

In [169]:
dis_to_item.count()

labelsn1             870324
item_display_name    870324
item_schema_class    870324
item_st_id           870324
typee                870324
dis_display_name     870324
dis_identifier       870324
dis_database_name    870324
reactome_id          870324
doid                 870324
do_name              869861
all_children         870324
dis_id               175805
item_id              745955
dtype: int64

In [170]:
id_to_type = pd.concat([all_nodes]+new_nodes, sort=False).set_index('id')['label'].to_dict()

dis_to_item['item_type'] = dis_to_item['item_id'].map(id_to_type)
dis_to_item['item_type'].value_counts()

Gene    745118
Name: item_type, dtype: int64

Appears to be some kind of curated Gene-to-Disesae Links

In [171]:
dis_to_item = dis_to_item.dropna(subset=['item_type', 'item_id', 'dis_id'])
dis_to_item.head(2)

Unnamed: 0,labelsn1,item_display_name,item_schema_class,item_st_id,typee,dis_display_name,dis_identifier,dis_database_name,reactome_id,doid,do_name,all_children,dis_id,item_id,item_type
826,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",PIK3R1 R574T [plasma membrane],EntityWithAccessionedSequence,R-HSA-2399540,disease,breast cancer,1612,DOID,REACT:R-HSA-2399540,DOID:1612,breast cancer,DOID:5675,MESH:D000230,5295,Gene
829,"[DatabaseObject,PhysicalEntity,GenomeEncodedEn...",PIK3R1 R574T [plasma membrane],EntityWithAccessionedSequence,R-HSA-2399540,disease,breast cancer,1612,DOID,REACT:R-HSA-2399540,DOID:1612,breast cancer,DOID:5050,MESH:D002286,5295,Gene


In [172]:
dis_to_item_edges = dis_to_item[['item_id', 'dis_id']].copy()
dis_to_item_edges = dis_to_item_edges.rename(columns={'item_id': 'start_id', 'dis_id': 'end_id'})
dis_to_item_edges['type'] = 'associated_with_GawD'
dis_to_item_edges['source'] = 'Reactome'
dis_to_item_edges['evidence'] = 'curated'

In [173]:
dis_to_item_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,evidence
826,5295,MESH:D000230,associated_with_GawD,Reactome,curated
829,5295,MESH:D002286,associated_with_GawD,Reactome,curated
830,5295,MESH:D058922,associated_with_GawD,Reactome,curated
833,5295,MESH:D018270,associated_with_GawD,Reactome,curated


In [174]:
new_edges.append(dis_to_item_edges)

### Pathway to Disease Links

In [175]:
pw_dis = react_dis.query('item_schema_class == "Pathway"').copy()
pw_dis.count()

labelsn1             215841
item_display_name    215841
item_schema_class    215841
item_st_id           215841
typee                215841
dis_display_name     215841
dis_identifier       215841
dis_database_name    215841
reactome_id          215841
doid                 215841
do_name              215742
all_children         215841
dis_id                45225
dtype: int64

In [176]:
pw_dis['reactome_id'] = 'REACT:'+pw_dis['item_st_id']
pw_dis = pw_dis.dropna(subset=['dis_id'])

In [177]:
pw_dis_edges = pw_dis.query('reactome_id in @pw_ids').copy()
pw_dis_edges = pw_dis_edges[['reactome_id', 'dis_id']].rename(columns={'reactome_id': 'start_id', 'dis_id': 'end_id'})
pw_dis_edges['type'] = 'associated_with_PWawD'
pw_dis_edges['source'] = 'Reactome'
pw_dis_edges['evidence'] = 'curated'
pw_dis_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,evidence
5386,REACT:R-HSA-5339717,MESH:D000230,associated_with_PWawD,Reactome,curated
5389,REACT:R-HSA-5339717,MESH:D002286,associated_with_PWawD,Reactome,curated
5390,REACT:R-HSA-5339717,MESH:D058922,associated_with_PWawD,Reactome,curated
5393,REACT:R-HSA-5339717,MESH:D018270,associated_with_PWawD,Reactome,curated


In [178]:
len(pw_dis_edges)

41670

In [179]:
new_edges.append(pw_dis_edges)

### Reaction to Disease Links

In [180]:
rx_dis = react_dis.query('item_schema_class == "Reaction"').copy()
rx_dis.count()

labelsn1             458526
item_display_name    458526
item_schema_class    458526
item_st_id           458526
typee                458526
dis_display_name     458526
dis_identifier       458526
dis_database_name    458526
reactome_id          458526
doid                 458526
do_name              458477
all_children         458526
dis_id                89146
dtype: int64

In [181]:
rx_dis['reactome_id'] = 'REACT:'+rx_dis['item_st_id']
rx_dis = rx_dis.dropna(subset=['dis_id'])

In [182]:
rx_dis_edges = rx_dis.query('reactome_id in @rxn_ids').copy()
rx_dis_edges = rx_dis_edges[['reactome_id', 'dis_id']].rename(columns={'reactome_id': 'start_id', 'dis_id': 'end_id'})
rx_dis_edges['type'] = 'associated_with_RXawD'
rx_dis_edges['source'] = 'Reactome'
rx_dis_edges['evidence'] = 'curated'
rx_dis_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,evidence
2426,REACT:R-HSA-2243937,MESH:D000230,associated_with_RXawD,Reactome,curated
2429,REACT:R-HSA-2243937,MESH:D002286,associated_with_RXawD,Reactome,curated
2430,REACT:R-HSA-2243937,MESH:D058922,associated_with_RXawD,Reactome,curated
2433,REACT:R-HSA-2243937,MESH:D018270,associated_with_RXawD,Reactome,curated


In [183]:
len(rx_dis_edges)

89138

In [184]:
new_edges.append(rx_dis_edges)

### Reaction to Disease Links

In [185]:
frx_dis = react_dis.query('item_schema_class == "FailedReaction"').copy()
frx_dis.count()

labelsn1             99318
item_display_name    99318
item_schema_class    99318
item_st_id           99318
typee                99318
dis_display_name     99318
dis_identifier       99318
dis_database_name    99318
reactome_id          99318
doid                 99318
do_name              99219
all_children         99318
dis_id               22059
dtype: int64

In [186]:
frx_dis['reactome_id'] = 'REACT:'+frx_dis['item_st_id']
frx_dis = frx_dis.dropna(subset=['dis_id'])

In [187]:
frx_dis_edges = frx_dis.query('reactome_id in @rxn_ids').copy()
frx_dis_edges = frx_dis_edges[['reactome_id', 'dis_id']].rename(columns={'reactome_id': 'start_id', 'dis_id': 'end_id'})
frx_dis_edges['type'] = 'disrupted_in_RXdiD'
frx_dis_edges['source'] = 'Reactome'
frx_dis_edges['evidence'] = 'curated'
frx_dis_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,evidence
5306,REACT:R-HSA-5339711,MESH:D000230,disrupted_in_RXdiD,Reactome,curated
5309,REACT:R-HSA-5339711,MESH:D002286,disrupted_in_RXdiD,Reactome,curated
5310,REACT:R-HSA-5339711,MESH:D058922,disrupted_in_RXdiD,Reactome,curated
5313,REACT:R-HSA-5339711,MESH:D018270,disrupted_in_RXdiD,Reactome,curated


In [188]:
len(frx_dis_edges)

20860

In [189]:
new_edges.append(frx_dis_edges)

# Putting it all together

In [190]:
new_nodes_df = pd.concat(new_nodes, sort=False)
new_edges_df = pd.concat(new_edges, sort=False)

In [191]:
new_nodes_df.head(2)

Unnamed: 0,id,name,label,drug_bank_ids,mesh_ids
28,REACT:R-HSA-8956321,Nucleotide salvage,Pathway,,
43,REACT:R-HSA-8957275,Post-translational protein phosphorylation,Pathway,,


In [192]:
new_edges_df.head(2)

Unnamed: 0,start_id,end_id,evidence_code,type,source,evidence,qualifier,with_or_from,date,assigned_by
0,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,
1,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,


In [193]:
new_edges_df['type'].value_counts()

part_of_GpoPW            198300
associated_with_GawD     151533
part_of_GpoRX            141135
associated_with_RXawD     89138
part_of_RXpoCC            74905
associated_with_PWawD     41670
part_of_CpoPW             38864
part_of_CpoRX             24287
disrupted_in_RXdiD        20860
involved_in_PWinBP        16869
involved_in_RXinBP         2041
enables_RXeMF              1661
part_of_XpoPW              1191
part_of_NpoPW               248
part_of_NpoRX               146
Name: type, dtype: int64

In [194]:
print('Total number of new edges: {:,}'.format(len(new_edges_df)))
print('Number of unique new edges: {:,}'.format(len(new_edges_df.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

Total number of new edges: 802,848
Number of unique new edges: 424,092


Why are so many edges duplicated? what kind are they?

In [195]:
ix = new_edges_df.duplicated(keep=False)
new_edges_df[ix].sort_values(['start_id', 'end_id', 'type']).head(10)

Unnamed: 0,start_id,end_id,evidence_code,type,source,evidence,qualifier,with_or_from,date,assigned_by
0,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,
1,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,
2,1,REACT:R-HSA-114608,TAS,part_of_GpoPW,Reactome,curated,,,,
3,1,REACT:R-HSA-114608,TAS,part_of_GpoPW,Reactome,curated,,,,
4,1,REACT:R-HSA-168249,TAS,part_of_GpoPW,Reactome,curated,,,,
5,1,REACT:R-HSA-168249,TAS,part_of_GpoPW,Reactome,curated,,,,
6,1,REACT:R-HSA-168249,TAS,part_of_GpoPW,Reactome,curated,,,,
7,1,REACT:R-HSA-168256,TAS,part_of_GpoPW,Reactome,curated,,,,
8,1,REACT:R-HSA-168256,TAS,part_of_GpoPW,Reactome,curated,,,,
9,1,REACT:R-HSA-168256,TAS,part_of_GpoPW,Reactome,curated,,,,


In [196]:
new_edges_df[ix]['type'].value_counts()

associated_with_GawD     150080
part_of_GpoPW            102808
part_of_GpoRX             53412
part_of_RXpoCC            51756
associated_with_RXawD     24754
part_of_CpoPW             16997
involved_in_PWinBP        16699
associated_with_PWawD     10292
disrupted_in_RXdiD         4781
part_of_CpoRX              3727
involved_in_RXinBP         1890
part_of_XpoPW               263
enables_RXeMF               182
part_of_NpoPW                12
part_of_NpoRX                 4
Name: type, dtype: int64

In [197]:
dup_types = new_edges_df[ix]['type'].unique()

dup_dfs = []

for dt in dup_types:
    dup_dfs.append(new_edges_df[ix].query('type == @dt').sort_values(['start_id', 'end_id', 'type']).head(6))
    
pd.concat(dup_dfs)

Unnamed: 0,start_id,end_id,evidence_code,type,source,evidence,qualifier,with_or_from,date,assigned_by
0,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,
1,1,REACT:R-HSA-109582,TAS,part_of_GpoPW,Reactome,curated,,,,
2,1,REACT:R-HSA-114608,TAS,part_of_GpoPW,Reactome,curated,,,,
3,1,REACT:R-HSA-114608,TAS,part_of_GpoPW,Reactome,curated,,,,
4,1,REACT:R-HSA-168249,TAS,part_of_GpoPW,Reactome,curated,,,,
5,1,REACT:R-HSA-168249,TAS,part_of_GpoPW,Reactome,curated,,,,
478,CHEBI:10545,REACT:R-HSA-1430728,TAS,part_of_CpoPW,Reactome,curated,,,,
479,CHEBI:10545,REACT:R-HSA-1430728,TAS,part_of_CpoPW,Reactome,curated,,,,
483,CHEBI:10545,REACT:R-HSA-196854,TAS,part_of_CpoPW,Reactome,curated,,,,
484,CHEBI:10545,REACT:R-HSA-196854,TAS,part_of_CpoPW,Reactome,curated,,,,


I see no reason for any of these duplications.... We'll just do a simple 'drop_duplicates' rather than merging any columns

In [198]:
new_edges_df = new_edges_df.drop_duplicates(subset=['start_id', 'end_id', 'type']).copy()

In [199]:
len(new_edges_df)

424092

In [200]:
all_node_ids = set(all_nodes['id']) | set(new_nodes_df['id'])

print(len(new_edges_df))
new_edges_df_filt = new_edges_df.query('start_id in @all_node_ids and end_id in @all_node_ids').copy()
print(len(new_edges_df_filt))

424092
421801


In [201]:
# Lests see how many of what kinds of edges had to be dropped...
new_edges_df.query('start_id not in @all_node_ids or end_id not in @all_node_ids')['type'].value_counts()

part_of_GpoRX    1483
part_of_GpoPW     806
enables_RXeMF       2
Name: type, dtype: int64

In [202]:
all_nodes_out = pd.concat([all_nodes.drop('l_name', axis=1), new_nodes_df], sort=False)
all_nodes_out['chebi_ids'] = all_nodes_out['id'].map(mesh_to_chebi)
all_nodes_out.head(2)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id,mesh_ids,chebi_ids
0,MESH:C089250,(0.017ferrocene)amylose,Compound,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,,,,,,,,,
1,MESH:C114385,001-C8-NBD,Compound,D03.383.129.462.580/C114385|D12.644.456/C114385,,,,,,,,,,


In [203]:
new_edges_df_filt['abbv'] = new_edges_df_filt['type'].apply(lambda s: s.split('_')[-1])
all_edges = pd.concat([edges, new_edges_df_filt], sort=False)
all_edges.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,source,evidence,direct_evidence,corrected_pvalue,inference_gene_symbol,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,CTD,curated,,,,,,,,,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,CTD,curated,,,,,,,,,,,


In [204]:
len(all_node_ids) == len(all_nodes_out)

True

In [205]:
all_edge_ids = all_edges[['start_id', 'end_id']].stack()
filt_nodes_out = all_nodes_out.query('id in @all_edge_ids')
len(filt_nodes_out)

111599

# Save to Disk

In [208]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
// hack to get the filename for this notebook

<IPython.core.display.Javascript object>

In [209]:
out_dir = Path('../2_pipeline/').joinpath(nb_name.split('.')[0]).joinpath('out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [210]:
gt.add_colons(new_nodes_df, id_name='identifier').to_csv(out_dir.joinpath('new_nodes.csv'), index=False)
gt.add_colons(new_edges_df).to_csv(out_dir.joinpath('new_edges.csv'), index=False)

In [211]:
gt.add_colons(all_nodes_out, id_name='identifier').to_csv(out_dir.joinpath('nodes_all.csv'), index=False)
gt.add_colons(filt_nodes_out, id_name='identifier').to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

gt.add_colons(all_edges).to_csv(out_dir.joinpath('edges.csv'), index=False)