In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# Row records (one dict per drug) are collected in this list by a later cell.
data = list()

# Read the DrugBank XML export from the working directory and keep handles on
# both the parsed tree and its root element.
tree = ET.parse('2024 drugbank full database.xml')
root = tree.getroot()

In [3]:
# Namespace prefix in '{uri}' form. It is prepended to every tag name used in
# the find / findall / findtext lookups further down.
namespace = '{http://www.drugbank.ca}'

In [4]:
# Output key -> child tag name, for each kind of nested record.
_REACTION_ELEMENT_FIELDS = {'drugbank_id': 'drugbank-id', 'name': 'name'}
_ARTICLE_FIELDS = {'ref_id': 'ref-id', 'pubmed_id': 'pubmed-id', 'citation': 'citation'}
_LINK_FIELDS = {'ref_id': 'ref-id', 'title': 'title', 'url': 'url'}
_PATENT_FIELDS = {
    'number': 'number',
    'country': 'country',
    'approved': 'approved',
    'expires': 'expires',
    'pediatric_extension': 'pediatric-extension',
}


def _text_or_none(parent, tag):
    """Return the ``.text`` of the first direct child `tag`, or None if absent.

    Differs from ``findtext`` on purpose: a child that exists but is empty
    yields None here, whereas ``findtext`` would yield ''.
    """
    node = parent.find(tag)
    return node.text if node is not None else None


def _records(parent, path, fields, ns):
    """Return one dict per element matching `path` below `parent`.

    `fields` maps each output key to the (un-prefixed) tag name of the child
    whose text becomes the value. Values are read with ``findtext``, so a
    missing child gives None.
    """
    return [
        {key: node.findtext(f'{ns}{tag}') for key, tag in fields.items()}
        for node in parent.findall(path)
    ]


def _drugbank_id(drug, ns):
    """Return the drug's primary DrugBank ID, else its first listed ID.

    NOTE(review): DrugBank's schema is understood to flag the canonical ID
    with primary="true" - confirm against the release being parsed. When no
    such element exists this falls back to the first <drugbank-id>.
    """
    primary = drug.findtext(f"{ns}drugbank-id[@primary='true']")
    if primary is not None:
        return primary
    return drug.findtext(f'{ns}drugbank-id')


def _calculated_properties(drug, ns):
    """Return the kind/value/source dicts under <calculated-properties>."""
    return [
        {
            'kind': _text_or_none(prop, f'{ns}kind'),
            'value': _text_or_none(prop, f'{ns}value'),
            'source': _text_or_none(prop, f'{ns}source'),
        }
        for prop in drug.findall(f'{ns}calculated-properties/{ns}property')
    ]


def _reactions(drug, ns):
    """Return one dict per <reaction>: its sequence plus left/right elements."""
    return [
        {
            'sequence': reaction.findtext(f'{ns}sequence'),
            'left_elements': _records(
                reaction, f'{ns}left-element', _REACTION_ELEMENT_FIELDS, ns),
            'right_elements': _records(
                reaction, f'{ns}right-element', _REACTION_ELEMENT_FIELDS, ns),
        }
        for reaction in drug.findall(f'{ns}reactions/{ns}reaction')
    ]


def _general_references(drug, ns):
    """Return ``(articles, links)`` from the first <general-references> child.

    Both lists are empty when the drug has no <general-references> element.
    """
    references = drug.find(f'{ns}general-references')
    if references is None:
        return [], []
    articles = _records(references, f'{ns}articles/{ns}article', _ARTICLE_FIELDS, ns)
    links = _records(references, f'{ns}links/{ns}link', _LINK_FIELDS, ns)
    return articles, links


def _drug_record(drug, ns):
    """Flatten one <drug> element into the row dict used for the DataFrame."""
    articles, links = _general_references(drug, ns)
    return {
        'Drugbank ID': _drugbank_id(drug, ns),
        'Name': drug.findtext(f'{ns}name'),
        'Description': drug.findtext(f'{ns}description'),
        'CAS Number': drug.findtext(f'{ns}cas-number'),
        'Average Mass': drug.findtext(f'{ns}average-mass'),
        'Monoisotopic Mass': drug.findtext(f'{ns}monoisotopic-mass'),
        'State': drug.findtext(f'{ns}state'),
        'Calculated Properties': _calculated_properties(drug, ns),
        'Reactions': _reactions(drug, ns),
        'Articles': articles,
        'Links': links,
        'Patents': _records(drug, f'{ns}patents/{ns}patent', _PATENT_FIELDS, ns),
    }


# Rebuild `data` from scratch rather than appending to a list created in an
# earlier cell, so re-running this cell cannot duplicate rows.
data = [_drug_record(drug, namespace) for drug in root.findall(f'{namespace}drug')]
        


In [5]:
# Tabulate the collected records (one row per drug) and order them by DrugBank ID.
# NOTE(review): the list-valued columns (Calculated Properties, Reactions,
# Articles, Links, Patents) are not flattened before export - check how they
# appear in the workbook.
df = pd.DataFrame(data).sort_values(by='Drugbank ID')

# Write the table to an Excel workbook, leaving out the index column.
df.to_excel('drug_data.xlsx', index=False)