In [48]:
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd
import time

In [2]:
# Get Drugbank data from xml file
tree = ET.parse('drugbank_full_database.xml')
root = tree.getroot()

In [19]:
# Index 5 is Bivalirudin (first occurrence of SMILE string)
# SMILE string is found under "{http://www.drugbank.ca}calculated-properties" tag
for el in root[0]:
    print(str(el.tag) + ": " + str(el.text))

{http://www.drugbank.ca}drugbank-id: DB00001
{http://www.drugbank.ca}drugbank-id: BTD00024
{http://www.drugbank.ca}drugbank-id: BIOD00024
{http://www.drugbank.ca}name: Lepirudin
{http://www.drugbank.ca}description: Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.
{http://www.drugbank.ca}cas-number: 138068-37-8
{http://www.drugbank.ca}unii: Y43GF64R34
{http://www.drugbank.ca}state: liquid
{http://www.drugbank.ca}groups: 
    
{http://www.drugbank.ca}general-references: 
    
{http://www.drugbank.ca}synthesis-reference: None
{http://www.drugbank.ca}indication: For the treatment of heparin-induced thrombocytopenia
{http://www.drugbank.ca}pharmacodynamics: Lepirudin is used to break up clots and to reduce thrombocytopenia. It binds to thr

In [93]:
# Complicated to find SMILE string via XML
calculated_properties = root[5].findall("{http://www.drugbank.ca}calculated-properties")
for prop in calculated_properties:
    if prop:
        for el in prop:
            for sub_el in el:
                print(sub_el.text)
            print()

logP
-0.76
ALOGPS
None
None

logS
-4.7
ALOGPS
None
None

Water Solubility
4.64e-02 g/l
ALOGPS
None
None

logP
-14
ChemAxon
None
None

IUPAC Name
(4S)-4-[(2S)-2-[(2S)-2-[(2S)-2-{2-[(2S)-2-(2-{2-[2-(2-{[(2S)-1-[(2S)-2-{[(2S)-1-[(2R)-2-amino-3-phenylpropanoyl]pyrrolidin-2-yl]formamido}-5-carbamimidamidopentanoyl]pyrrolidin-2-yl]formamido}acetamido)acetamido]acetamido}acetamido)-3-carbamoylpropanamido]acetamido}-3-carboxypropanamido]-3-phenylpropanamido]-4-carboxybutanamido]-4-{[(2S,3S)-1-[(2S)-2-{[(1S)-3-carboxy-1-{[(1S)-3-carboxy-1-{[(1S)-1-{[(1S)-1-carboxy-3-methylbutyl]carbamoyl}-2-(4-hydroxyphenyl)ethyl]carbamoyl}propyl]carbamoyl}propyl]carbamoyl}pyrrolidin-1-yl]-3-methyl-1-oxopentan-2-yl]carbamoyl}butanoic acid
ChemAxon
None
None

Traditional IUPAC Name
bivalirudin
ChemAxon
None
None

Molecular Weight
2180.2853
ChemAxon
None
None

Monoisotopic Weight
2178.985813062
ChemAxon
None
None

SMILES
CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H

In [None]:
########################################## HELPER FUNCTIONS ##########################################

In [28]:
# Get set of groups for a given XML element representing a drug.
def get_groups(drug_xml_element):
    groups_set = set()
    groups_list = drug_xml_element.findall("{http://www.drugbank.ca}groups")
    for groups in groups_list:
        for group in groups:
            groups_set.add(group.text)
    
    return groups_set

In [15]:
# Get drug ID.
def get_drug_id(drug_xml_element):
    return drug_xml_element.find("{http://www.drugbank.ca}drugbank-id").text

In [16]:
# URL to send HTTP request to for SMILE string is SMILE_URL_PREFIX + drug_id + SMILE_URL_SUFFIX
SMILE_URL_PREFIX = "https://www.drugbank.ca/structures/small_molecule_drugs/"
SMILE_URL_SUFFIX = ".smiles"

# Return SMILE string if it exists. Else return "error"
def get_smile_string(drug_xml_element):
    # Find first ID (which is the primary ID).
    drug_id = get_drug_id(drug_xml_element)
    url_str = SMILE_URL_PREFIX + drug_id + SMILE_URL_SUFFIX
    # Example URL: https://www.drugbank.ca/structures/small_molecule_drugs/DB00007.smiles
    try:
        conn = urllib.request.urlopen(url_str)
        # Decode bytes to string.
        return conn.read().decode("utf-8")
    except urllib.request.HTTPError:
        return "error"

In [17]:
# Get drug name.
def get_drug_name(drug_xml_element):
    return drug_xml_element.find("{http://www.drugbank.ca}name").text

In [59]:
# Store parsed XML drug data as dictionary, to be converted into pandas dataframe and then CSV
smile_data_dict = dict()
column_names = ["drug_id", "name", "smile"]
# Find all group names.
all_group_names = set()
for drug_xml_element in root:
    all_group_names.update(get_groups(drug_xml_element))
# Group names are {'withdrawn', 'illicit', 'investigational', 'experimental',
# 'approved', 'nutraceutical', 'vet_approved'}.

# Add each group name as binary column in dictionary.
for group_name in all_group_names:
    column_names.append(group_name)

# Initialize empty list for each column.
for col in column_names:
    smile_data_dict[col] = []
    
start_time = time.time()
# Generate dictionary.
for drug_xml_element in root:
# for i in range(1000):  # Takes 104 seconds
#     drug_xml_element = root[i]
    smile_str = get_smile_string(drug_xml_element)
    # Don't add current drug to dictionary if it does not have a SMILE string.
    if smile_str == "error":
        continue
    
    drug_id = get_drug_id(drug_xml_element)
    drug_name = get_drug_name(drug_xml_element)
    drug_groups = get_groups(drug_xml_element)
    
    smile_data_dict["drug_id"].append(drug_id)
    smile_data_dict["name"].append(drug_name)
    smile_data_dict["smile"].append(smile_str)
    for group in all_group_names:
        if group in drug_groups:
            smile_data_dict[group].append(1)
        else:
            smile_data_dict[group].append(0)

end_time = time.time()
print("Elapsed Time: " + str(end_time-start_time) + " sec")

Elapsed Time: 1497.4739336967468 sec


In [60]:
smile_data_dict

{'approved': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [63]:
len(smile_data_dict["name"])

10630

In [62]:
# Convert data dictionary to dataframe and then CSV. Write CSV to disk.
smile_data_df = pd.DataFrame(data=smile_data_dict)
smile_data_df.to_csv("drugbank_smile_data.csv", index=False)