# Map DrugBank IDs to UMLS CUIs

In [1]:
from collections import defaultdict

import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
# Parse DrugBank's XML file to map obsolete DB IDs

def map_dbids(fname):
    """Map every DrugBank ID listed for a drug to that drug's primary ID.

    Each ``<drug>`` entry in the DrugBank XML carries one primary
    ``<drugbank-id>`` plus any number of secondary ones. Secondary IDs of the
    form ``DBxxxxx`` are treated as obsolete IDs of the same drug. IDs in any
    other format (wrong prefix or wrong length) are ignored.

    Args:
        fname: Path to (or open file object for) the DrugBank XML dump.

    Returns:
        A pandas DataFrame with the columns ``primary_id`` and ``other_id``,
        one row per distinct (primary ID, listed ID) pair. The primary ID is
        itself one of the listed IDs, so every drug maps to itself as well.
        The columns are present even when no ID was found, so callers can
        always merge on them.

    Raises:
        ValueError: If a drug has no primary ID, or its primary ID does not
            start with ``DB``.
    """
    namespace = {"DB": "http://www.drugbank.ca"}

    # Length of a "DB" + five-digit identifier; filters out other ID formats.
    DB_ID_LEN = 7

    root = ET.parse(fname).getroot()

    rows = []
    for drug in root.iterfind("DB:drug", namespace):
        primary = drug.find("DB:drugbank-id[@primary='true']", namespace)
        primary_id = primary.text if primary is not None else None

        # An explicit raise (rather than assert) still fires under ``-O``.
        if not primary_id or not primary_id.startswith("DB"):
            raise ValueError(
                "drug entry without a valid primary DrugBank ID: {!r}".format(
                    primary_id
                )
            )

        for uid in drug.iterfind("DB:drugbank-id", namespace):
            id_val = uid.text

            # Empty elements have ``text`` set to None; skip them.
            if id_val and id_val.startswith("DB") and len(id_val) == DB_ID_LEN:
                rows.append((primary_id, id_val))

    return pd.DataFrame(
        rows, columns=["primary_id", "other_id"]
    ).drop_duplicates()

In [3]:
def read_umls(fname):
    """Read through MRCONSO.RRF and extract relevant info.

    Currently extracted information:
        1. DrugBank ID
        2. MeSH ID
        3. UNII

    These are selected by source abbreviation: rows whose source is one of
    DRUGBANK, MSH, MTHSPL or NCI_FDA are kept (presumably the latter two are
    the ones carrying UNII codes -- confirm against the UMLS release).

    Other data sources could be processed here, but diminishing returns kick
    in very quickly (they provide redundant data).

    For example, RxNorm mappings are almost a complete subset of the direct
    UNII mappings.

    Args:
        fname: Path to the pipe-delimited MRCONSO.RRF file.

    Returns:
        A pandas DataFrame with the columns ``cui``, ``code`` and ``source``,
        holding the distinct combinations found. The columns are present even
        when no row matched.
    """
    wanted_sources = {"DRUGBANK", "MSH", "MTHSPL", "NCI_FDA"}

    rows = []
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(fname, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            vals = line.split("|")

            # Fields used: 0 = concept ID, 11 = source abbreviation,
            # 13 = the source's own code for the concept.
            cui, sab, code = vals[0], vals[11], vals[13]

            if sab in wanted_sources:
                rows.append((cui, code, sab))

    return pd.DataFrame(
        rows, columns=["cui", "code", "source"]
    ).drop_duplicates()

In [4]:
# Load the CUI / code / source rows for the selected vocabularies from UMLS.
info = read_umls("data/MRCONSO.RRF")

In [5]:
# (rows, columns) of the extracted UMLS table.
info.shape

(535618, 3)

In [6]:
# Preview the first few extracted rows.
info.head()

Unnamed: 0,code,cui,source
0,D012711,C0000005,MSH
2,D015060,C0000039,MSH
16,D015061,C0000052,MSH
30,D010742,C0000074,MSH
32,D015055,C0000084,MSH


In [7]:
# Number of extracted rows contributed by each source vocabulary.
info["source"].value_counts()

MSH         370689
MTHSPL      137672
NCI_FDA      20698
DRUGBANK      6559
Name: source, dtype: int64

---

In [8]:
# The UMLS still lists some DrugBank IDs that have since been retired.
# Translate every UMLS DrugBank code to the drug's current primary ID using
# the ID table parsed from the DrugBank XML dump.

drugbank = (
    info[info["source"] == "DRUGBANK"]
    .drop("source", axis=1)
    .merge(
        map_dbids("data/drugbank_full.xml"),
        how="inner",
        left_on="code",
        right_on="other_id",
    )
    .rename(columns={"primary_id": "drugbank_id"})
    .loc[:, ["cui", "drugbank_id"]]
    .drop_duplicates()
)

In [9]:
# Preview the first few CUI -> current DrugBank ID pairs.
drugbank.head()

Unnamed: 0,cui,drugbank_id
0,C0000378,DB06262
1,C0000379,DB01509
2,C0000477,DB06637
3,C0878240,DB06637
4,C1449659,DB06637


In [10]:
# (rows, columns) of the CUI -> DrugBank ID mapping.
drugbank.shape

(6558, 2)

In [11]:
# Order rows by DrugBank ID, then CUI, before the table is written out.
drugbank = drugbank.sort_values(["drugbank_id", "cui"])

---

In [12]:
# Write the mapping as a tab-separated file, omitting the DataFrame index.
drugbank.to_csv("maps/drugbank.tsv", sep='\t', index=False)