In [5]:
import collections
import csv
import gzip
import io
import itertools
import json
import os
import re
import xml.etree.ElementTree as ET

import networkx as nx
import pandas
import requests

In [6]:
# Location of the gzipped DrugBank XML dump, relative to this notebook.
xml_path = os.path.join('../download', 'drugbank.xml.gz')

# Decompress on the fly, parse the whole document, and keep its root element.
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
    root = tree.getroot()

In [8]:
# XML namespace prefix (Clark notation) prepended to every tag name in the queries below.
ns = '{http://www.drugbank.ca}'
# Path templates for the calculated InChIKey / InChI property values.
# They still contain literal `{ns}` placeholders and must be passed through
# `.format(ns=ns)` before use; neither is formatted or used in the cells shown here.
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

In [9]:
# Collect one row per drug-protein interaction listed in the DrugBank XML.
# Only proteins that resolve to exactly one UniProtKB identifier are kept.
protein_columns = ['drugbank_id', 'category', 'organism', 'known_action',
                   'actions', 'uniprot_id', 'pubmed_ids']

# The path expressions do not change between iterations, so build them once.
actions_xpath = '{ns}actions/{ns}action'.format(ns=ns)
uniprot_xpath = (
    "{ns}polypeptide/{ns}external-identifiers/"
    "{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier"
).format(ns=ns)
references_xpath = "{ns}references[@format='textile']".format(ns=ns)

protein_rows = list()
for drug in root:
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            uniprot_ids = [polypep.text for polypep in protein.findall(uniprot_xpath)]
            # Skip proteins with zero or several UniProtKB identifiers.
            if len(uniprot_ids) != 1:
                continue

            actions = protein.findall(actions_xpath)
            # `findtext` returns None when the references element is missing;
            # default to '' so `re.findall` always receives a string.
            ref_text = protein.findtext(references_xpath, default='')
            pmids = re.findall(r'pubmed/([0-9]+)', ref_text)

            protein_rows.append({
                'drugbank_id': drugbank_id,
                'category': category,
                'organism': protein.findtext('{}organism'.format(ns)),
                'known_action': protein.findtext('{}known-action'.format(ns)),
                # An empty <action/> element has text None, which str.join rejects.
                'actions': '|'.join(action.text for action in actions if action.text),
                'uniprot_id': uniprot_ids[0],
                'pubmed_ids': '|'.join(pmids),
            })

# Passing the columns explicitly keeps the schema stable even when no rows were collected.
protein_df = pandas.DataFrame(protein_rows, columns=protein_columns)

In [11]:
# Load the drug-drug network from its Pajek (.net) file.
G_drugs = nx.read_pajek("../networks/drug_drug_network.net")

In [10]:
# Show the assembled drug-protein interaction table as the cell output.
protein_df

Unnamed: 0,drugbank_id,category,organism,known_action,actions,uniprot_id,pubmed_ids
0,DB00001,target,Human,yes,inhibitor,P00734,10505536|10912644|11055889|11467439|11807012|1...
1,DB00002,target,Human,yes,antagonist,P00533,10480573|10601294|10628369|11408594|11431346|1...
2,DB00002,target,Human,unknown,,O75015,16336752
3,DB00002,target,Human,unknown,,P00736,17139284|17016423
4,DB00002,target,Human,unknown,,P02745,17139284|17016423
...,...,...,...,...,...,...,...
20291,DB09028,target,Human,unknown,agonist,P43681,25517706|11553677
20292,DB09028,target,Human,unknown,agonist,P36544,25517706|11553677
20293,DB09028,target,Human,unknown,agonist,P32297,25517706|11553677
20294,DB09028,target,Human,unknown,agonist,Q15825,25517706|11553677


In [33]:
# Print the DrugBank ID of every interaction row whose protein is Q8WX93.
is_q8wx93 = protein_df['uniprot_id'] == 'Q8WX93'
for _, interaction in protein_df.loc[is_q8wx93].iterrows():
    print(interaction['drugbank_id'])

In [22]:
# Start from an empty undirected graph for the target-protein network.
G_targets = nx.Graph()

In [23]:
# Add to G_targets every UniProt ID that protein_df associates with a node of G_drugs.
# Node labels of G_drugs are matched against the `drugbank_id` column.
#
# Index the interaction table once instead of re-filtering the whole frame
# for every drug node.
targets_by_drug = {}
for drugbank_id, uniprot_id in zip(protein_df['drugbank_id'], protein_df['uniprot_id']):
    targets_by_drug.setdefault(drugbank_id, []).append(uniprot_id)

# Walk the drugs in graph order so target nodes are inserted in the same
# order as a per-node scan of the frame would produce.
for node in G_drugs.nodes():
    G_targets.add_nodes_from(targets_by_drug.get(node, ()))

In [17]:
# Print the UniProt ID of every interaction row recorded for drug DB00002.
is_db00002 = protein_df['drugbank_id'] == 'DB00002'
for _, interaction in protein_df.loc[is_db00002].iterrows():
    print(interaction['uniprot_id'])

P00533
O75015
P00736
P02745
P02746
P02747
P08637
P09871
P12314
P12318
P31994
P31995


In [27]:
# Report how many edges the loaded drug-drug network contains.
G_drugs.number_of_edges()

1200475

In [25]:
# Connect every pair of distinct target nodes with the same fixed weight (0.2),
# turning G_targets into a complete graph.
#
# `combinations` yields each unordered pair exactly once; the graph is
# undirected, so visiting (a, b) and (b, a) separately would only rewrite the
# same edge. The node list is snapshotted first so the graph is not iterated
# while it is being modified.
target_nodes = list(G_targets.nodes())
G_targets.add_edges_from(itertools.combinations(target_nodes, 2), weight=0.2)

In [28]:
# Save the target-target graph to disk in Pajek (.net) format.
nx.write_pajek(G_targets, "../networks/targets_targets_network.net")