# Parse the DrugBank XML and extract half-life of drugs to tsv files

Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module.

In [1]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET

import requests
import pandas

In [2]:
xml_path = os.path.join('download', 'drugbank.xml.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [3]:
ns = '{http://www.drugbank.ca}'
for i,drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'

In [4]:
ns = '{http://www.drugbank.ca}'
# calculated properties 
cal_logp_template = "{ns}calculated-properties/{ns}property[{ns}kind='logP']/{ns}value"
cal_logs_template = "{ns}calculated-properties/{ns}property[{ns}kind='logS']/{ns}value"
cal_ws_template = "{ns}calculated-properties/{ns}property[{ns}kind='Water Solubility']/{ns}value"
cal_mw_template = "{ns}calculated-properties/{ns}property[{ns}kind='Molecular Weight']/{ns}value"
cal_psa_template = "{ns}calculated-properties/{ns}property[{ns}kind='Polar Surface Area (PSA)']/{ns}value"
cal_refr_template = "{ns}calculated-properties/{ns}property[{ns}kind='Refractivity']/{ns}value"
cal_pola_template = "{ns}calculated-properties/{ns}property[{ns}kind='Polarizability']/{ns}value"
cal_rbc_template = "{ns}calculated-properties/{ns}property[{ns}kind='Rotatable Bond Count']/{ns}value"
cal_hdac_template = "{ns}calculated-properties/{ns}property[{ns}kind='H Bond Acceptor Count']/{ns}value"
cal_hbdc_template = "{ns}calculated-properties/{ns}property[{ns}kind='H Bond Donor Count']/{ns}value"
cal_pkasa_template = "{ns}calculated-properties/{ns}property[{ns}kind='pKa (strongest acidic)']/{ns}value"
cal_pkasb_template = "{ns}calculated-properties/{ns}property[{ns}kind='pKa (strongest basic)']/{ns}value"
cal_pc_template = "{ns}calculated-properties/{ns}property[{ns}kind='Physiological Charge']/{ns}value"
cal_nr_template = "{ns}calculated-properties/{ns}property[{ns}kind='Number of Rings']/{ns}value"
cal_bioa_template = "{ns}calculated-properties/{ns}property[{ns}kind='Bioavailability']/{ns}value"
cal_rof_template = "{ns}calculated-properties/{ns}property[{ns}kind='Rule of Five']/{ns}value"
# experimental properties
exp_mw_template = "{ns}experimental-properties/{ns}property[{ns}kind='Molecular Weight']/{ns}value"
exp_logp_template = "{ns}experimental-properties/{ns}property[{ns}kind='logP']/{ns}value"
exp_pka_template = "{ns}experimental-properties/{ns}property[{ns}kind='pKa']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['cal_logp'] = drug.findtext(cal_logp_template.format(ns = ns))
    row['logs'] = drug.findtext(cal_logs_template.format(ns = ns))
    row['water_solu'] = drug.findtext(cal_ws_template.format(ns = ns))
    row['cal_mol_weight'] = drug.findtext(cal_mw_template.format(ns = ns))
    row['pol_surf_area'] = drug.findtext(cal_psa_template.format(ns = ns))
    row['refractivity'] = drug.findtext(cal_refr_template.format(ns = ns))
    row['polarizability'] = drug.findtext(cal_pola_template.format(ns = ns))
    row['rota_bind_count'] = drug.findtext(cal_rbc_template.format(ns = ns))
    row['h_bond_acc_count'] = drug.findtext(cal_hdac_template.format(ns = ns))
    row['h_bond_don_count'] = drug.findtext(cal_hbdc_template.format(ns = ns))
    row['cal_pka_acidic'] = drug.findtext(cal_pkasa_template.format(ns = ns))
    row['cal_pka_basic'] = drug.findtext(cal_pkasb_template.format(ns = ns))
    row['physio_charge'] = drug.findtext(cal_pc_template.format(ns = ns))
    row['num_of_rings'] = drug.findtext(cal_nr_template.format(ns = ns))
    row['bioavailability'] = drug.findtext(cal_bioa_template.format(ns = ns))
    row['rule_of_five'] = drug.findtext(cal_rof_template.format(ns = ns))
    row['exp_mol_weight'] = drug.findtext(exp_mw_template.format(ns = ns))
    row['exp_logp'] = drug.findtext(exp_logp_template.format(ns = ns))
    row['exp_pka'] = drug.findtext(exp_pka_template.format(ns = ns))
    row['half-life'] = drug.findtext(ns + "half-life")
    rows.append(row)

In [5]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [6]:
drugbank_df = pandas.DataFrame.from_dict(rows)
drugbank_df.head()

Unnamed: 0,type,drugbank_id,name,cal_logp,logs,water_solu,cal_mol_weight,pol_surf_area,refractivity,polarizability,...,cal_pka_acidic,cal_pka_basic,physio_charge,num_of_rings,bioavailability,rule_of_five,exp_mol_weight,exp_logp,exp_pka,half-life
0,biotech,DB00001,Lepirudin,,,,,,,,...,,,,,,,6963.425,,,Approximately 1.3 hours
1,biotech,DB00002,Cetuximab,,,,,,,,...,,,,,,,145781.6,,,114 hrs
2,biotech,DB00003,Dornase alfa,,,,,,,,...,,,,,,,29253.9,,,
3,biotech,DB00004,Denileukin diftitox,,,,,,,,...,,,,,,,57647.3,,,70-80 min
4,biotech,DB00005,Etanercept,,,,,,,,...,,,,,,,51234.9,,,102 +/- 30 hrs in individuals with rheumatoid ...


In [14]:
drugbank_df['half-life'].value_counts()

                                                                                                                                                                                                                                                                                                                                               6439
2 hours                                                                                                                                                                                                                                                                                                                                          16
10 hours                                                                                                                                                                                                                                                                                                                        

In [7]:
# number of extracted drugs 
drugbank_df.shape[0]

7759

In [8]:
# number of extracted drugs with half life information
drugbank_df.shape[0] - drugbank_df['half-life'].isin(['']).sum(axis=0)

1320

In [9]:
# number of extracted drugs with half life information
drugbank_df.shape[0] - drugbank_df.isin([None]).sum(axis=0)

type                7759
drugbank_id         7759
name                7759
cal_logp            6866
logs                6746
water_solu          6746
cal_mol_weight      6870
pol_surf_area       6868
refractivity        6868
polarizability      6863
rota_bind_count     6868
h_bond_acc_count    6868
h_bond_don_count    6868
cal_pka_acidic      5890
cal_pka_basic       6438
physio_charge       6868
num_of_rings        6868
bioavailability     6868
rule_of_five        5433
exp_mol_weight       153
exp_logp            1391
exp_pka              417
half-life           7759
dtype: int64

In [10]:
# drugs with logP, MW, and solubility information
other_info = drugbank_df.isin([None])['cal_logp'] * drugbank_df.isin([None])['water_solu'] * drugbank_df.isin([None])['cal_mol_weight']
drugbank_df1 = drugbank_df[~other_info]
drugbank_df1.head()

  .format(op=op_str, alt_op=unsupported[op_str]))


Unnamed: 0,type,drugbank_id,name,cal_logp,logs,water_solu,cal_mol_weight,pol_surf_area,refractivity,polarizability,...,cal_pka_acidic,cal_pka_basic,physio_charge,num_of_rings,bioavailability,rule_of_five,exp_mol_weight,exp_logp,exp_pka,half-life
13,small molecule,DB00014,Goserelin,0.3,-4.7,2.83e-02 g/l,1269.4105,495.89,325.84,131.22,...,9.27,10.82,2,6,0,,,-2.0,,4-5 hours
34,small molecule,DB00035,Desmopressin,-1.0,-4.0,1.10e-01 g/l,1069.217,435.41,279.78,106.19,...,9.5,11.77,1,4,0,,,-4.2,,Oral t<sub>1/2</sub>=1.5-2.5 hours. Intranasal...
48,small molecule,DB00050,Cetrorelix,1.33,-5.3,6.94e-03 g/l,1431.038,495.67,384.16,148.09,...,9.49,11.11,1,6,0,,,,,~62.8 hours
86,small molecule,DB00091,Cyclosporine,4.12,-5.1,9.52e-03 g/l,1202.6112,278.8,327.14,133.24,...,11.83,-2.4,0,1,0,,,,,"Biphasic and variable, approximately 7 hours (..."
88,small molecule,DB00093,Felypressin,-1.1,-4.4,4.53e-02 g/l,1040.219,405.32,264.79,103.93,...,11.39,10.18,2,4,0,,,,,


In [11]:
# number of extracted drugs with logP, MW, solubility, and half life information
drugbank_df1.shape[0] - drugbank_df['half-life'].isin(['']).sum(axis=0)

431