# Extract drug data from DrugBank

The purpose of this notebook is to extract and format drug data for subsequent model annotation.

## DRUGBANK ONLINE
To utilize this notebook: 

1. Go to [DrugBank database](https://go.drugbank.com/releases/latest) and create an account.
2. Follow the instructions to obtain a free academic license.
3. Download and unzip the database file `"drugbank_all_full_database.xml.zip"`.
4. Rename the file `"full database.xml"` to `"drugbank_all_full_database.xml"`.
5. Remember to clear out any personal account information and ensure the downloaded DrugBank file remains local!

The function `download_database_DrugBank` takes a given username and a password and downloads the data, taking care to change the filename in the process.

Fields for the DrugBank XML schema are found [here](https://docs.drugbank.com/xml/#introduction).

Wishart DS, Feunang YD, Guo AC, Lo EJ, Marcu A, Grant JR, Sajed T, Johnson D, Li C, Sayeeda Z, Assempour N, Iynkkaran I, Liu Y, Maciejewski A, Gale N, Wilson A, Chin L, Cummings R, Le D, Pon A, Knox C, Wilson M. DrugBank 5.0: a major update to the DrugBank database for 2018. Nucleic Acids Res. 2017 Nov 8. doi: 10.1093/nar/gkx1037.

## Setup
### Import packages

In [1]:
from collections import defaultdict
from getpass import getpass
from warnings import warn
from xml.etree import ElementTree

import matplotlib.pyplot as plt
import pandas as pd

from rbc_gem_utils import (
    ROOT_PATH,
    INTERIM_PATH,
    DATABASE_PATH,
    ANNOTATION_PATH,
    get_annotation_df,
    read_rbc_model,
    check_database_version_online,
    check_version,
    show_versions,
    build_string,
    split_string,
    compare_tables,
    visualize_comparison,
    explode_column,
)
from rbc_gem_utils.database.drugbank import (
    DRUGBANK_NS,
    DRUGBANK_VERSION_EXPECTED,
    DRUGBANK_PATH,
    DRUGBANK_DB_TAG,
    DRUGBANK_PATHWAY_ELEMENTS,
    strip_ns_DrugBank,
    get_version_DrugBank,
    download_database_DrugBank,
)

from rbc_gem_utils.util import strip_plural, has_value_type

# Display versions of last time notebook ran and worked
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Check DrugBank version
If the version does not match the expected version, it is because the database has been updated since the last time this code was utilized. 
### Expected DrugBank version: 5.1.11
* Last release utilized: [5.1.11](https://go.drugbank.com/releases) published on **2024-01-03**
* Version in the DrugBank file is formatted as {major}.{minor}

In [2]:
# Warn when the online-version check for DrugBank does not pass.
online_is_current = check_database_version_online("DrugBank")
if not online_is_current:
    warn(
        "Online version of database has been updated since the last time notebook was used."
    )

# Choose the working directories from the comparison of the detected and
# expected DrugBank versions.
version = get_version_DrugBank()
versions_match = check_version(version, DRUGBANK_VERSION_EXPECTED, verbose=True)
if not versions_match:
    # Mismatch: work under the interim directory and use the expected version label.
    database_dirpath = f"{ROOT_PATH}{INTERIM_PATH}{DRUGBANK_PATH}"
    annotation_dirpath = f"{ROOT_PATH}{INTERIM_PATH}"
    version = DRUGBANK_VERSION_EXPECTED
else:
    # Match: work directly in the repository's database and annotation directories.
    database_dirpath = f"{ROOT_PATH}{DATABASE_PATH}{DRUGBANK_PATH}"
    annotation_dirpath = f"{ROOT_PATH}{ANNOTATION_PATH}"

Current and expected versions match.


#### Download new files and update database
If an argument is not provided (`arg=None`), its default value for the repository is used. 
Username and password must be provided for this function, be sure to remove personal information after use!

In [3]:
# Set to True to download the DrugBank release into `database_dirpath`.
download = False
if download:
    # Credentials are prompted for at run time instead of being typed into the
    # cell, so they are never stored in the notebook source. `getpass` is used
    # for the username as well so that it is not echoed into the saved output.
    download_database_DrugBank(
        username=getpass("DrugBank username: "),
        password=getpass("DrugBank password: "),
        database_dirpath=database_dirpath,
        version=version,
    )
# Location of the DrugBank XML file inside `database_dirpath`.
filepath = f"{database_dirpath}/drugbank_all_full_database.xml"

## Load RBC-GEM model

In [4]:
# Read the RBC-GEM model using the "xml" file type; the bare `model`
# expression on the last line displays the model summary as the cell output.
model = read_rbc_model(filetype="xml")
model

0,1
Name,RBC_GEM
Memory address,14c357c90
Number of metabolites,1967
Number of reactions,2788
Number of genes,653
Number of groups,74
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


In [5]:
# Model attribute whose annotations are collected; also used as the name of the ID column.
annotation_type = "genes"

# Annotation table for the model objects, with the "id" column renamed to `annotation_type`.
df_model_mappings = get_annotation_df(
    getattr(model, annotation_type),
    ["uniprot", "hgnc.symbol", "drugbank", "dbsnp", "mim"],
).rename({"id": annotation_type}, axis=1)

# Column by column: split each cell into its individual values, expand to one
# value per row, and discard the duplicate rows this produces.
for col in df_model_mappings.columns:
    df_model_mappings = (
        df_model_mappings.assign(**{col: df_model_mappings[col].apply(split_string)})
        .explode(col)
        .drop_duplicates()
    )

# Distinct, non-null identifiers present in the model annotations.
drugbank_ids = set(df_model_mappings["drugbank"].dropna())
uniprot_ids = set(df_model_mappings["uniprot"].dropna())

print(df_model_mappings.nunique())
df_model_mappings

genes           653
uniprot         653
hgnc.symbol     654
drugbank       2065
dbsnp          4129
mim            1133
dtype: int64


Unnamed: 0,genes,uniprot,hgnc.symbol,drugbank,dbsnp,mim
0,RPE,Q96AT9,RPE,DB00153,,180480
1,RPIA,P49247,RPIA,DB01756,rs121918591,180430
1,RPIA,P49247,RPIA,DB01756,rs121918591,608611
2,SORD,Q00796,SORD,DB00157,rs145813597,618912
2,SORD,Q00796,SORD,DB00157,rs145813597,182500
...,...,...,...,...,...,...
652,A4GALT,Q9NPC4,A4GALT,,rs28940572,607922
652,A4GALT,Q9NPC4,A4GALT,,rs397514502,111400
652,A4GALT,Q9NPC4,A4GALT,,rs397514502,607922
652,A4GALT,Q9NPC4,A4GALT,,rs28940571,111400


## Set notebook options

In [6]:
# Compare to/Overwrite previous files
compare = True  # compare the new table with the previously saved one
overwrite = True  # write into `annotation_dirpath`; otherwise the table goes to the interim directory
display_nunique = True  # print the number of distinct values per annotation column
print_footer_notes = False

# Annotation column used to link model genes to DrugBank entries.
mapping_key = "uniprot"

# NOTE(review): these two assignments replace the version-dependent directories
# selected in the version-check cell — confirm that this is intended.
database_dirpath = f"{ROOT_PATH}{DATABASE_PATH}"
annotation_dirpath = f"{ROOT_PATH}{ANNOTATION_PATH}"

#### Get query IDs

In [7]:
print(f"Mapping key: {mapping_key}")

# Distinct, non-null identifiers that will be looked up.
query_ids = df_model_mappings[mapping_key].dropna().unique()
assert len(query_ids) == len(set(query_ids)), "Duplicate IDs in list to query"

# Lookup from model gene to its mapping-key value.
gene_to_key = df_model_mappings.set_index(annotation_type)[mapping_key]
model_search_mapping = gene_to_key.to_dict()

print(f"Number of model genes associated with query: {len(model_search_mapping)}")
print(f"Number of unique IDs to query: {len(query_ids)}")
df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

Mapping key: uniprot
Number of model genes associated with query: 653
Number of unique IDs to query: 653


Unnamed: 0,genes,uniprot
0,RPE,Q96AT9
1,RPIA,P49247
2,SORD,Q00796
3,AKR7A2,O43488
4,SRM,P19623
...,...,...
648,SCD,O00767
649,RPS6KA4,O75676
650,GCNT2,Q8N0V5
651,GRIA1,P42261


## Parse DrugBank information into DataFrame

In [None]:
all_drug_dfs = {}

# NOTE(review): the `filepath` variable set in the download cell is not used
# here; the path is rebuilt from the current `database_dirpath` — confirm that
# both are meant to point at the same file.
drugbank_xml_path = f"{database_dirpath}{DRUGBANK_PATH}/drugbank_all_full_database.xml"

# Parse the whole DrugBank XML file and keep its root element for the extraction cells.
root = ElementTree.parse(drugbank_xml_path).getroot()
root

### Extract proteins

In [None]:
# Build one record per (drug, protein-relation entry, polypeptide) found in the
# DrugBank XML tree, then keep the records whose UniProt ID is in the model.
idx = 0
data = defaultdict(dict)
# Loop-invariant naming pieces, set once (the former `prefix = True` placeholder
# was overwritten before it was ever read).
prefix = "proteins."
key = "polypeptide"
subkey = "pfams"
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    for ptype in ["targets", "enzymes", "carriers", "transporters"]:
        elements = drug.findall(
            f"{DRUGBANK_NS}{ptype}/{DRUGBANK_NS}{strip_plural(ptype)}"
        )
        for element in elements:
            # Fields of the relation entry itself; identical for every
            # polypeptide listed under it, so they are collected once.
            element_fields = {
                f"{prefix}{strip_ns_DrugBank(child.tag)}": child.text
                for child in element
                if has_value_type(child)
            }
            # Entries without a <polypeptide> child produce no record.
            for polypeptide in element.findall(f"{DRUGBANK_NS}polypeptide"):
                record = {"drugbank-id": drugbank_id, f"{prefix}type": ptype}
                record.update(element_fields)

                # Identifier and source come from the polypeptide's XML attributes.
                record.update(
                    {
                        f"{prefix}{key}.uniprot-id": polypeptide.get("id"),
                        f"{prefix}{key}.source": polypeptide.get("source"),
                    }
                )
                # Value-typed children of the polypeptide element.
                record.update(
                    {
                        f"{prefix}{key}.{strip_ns_DrugBank(field.tag)}": field.text
                        for field in polypeptide
                        if has_value_type(field)
                    }
                )
                # Pfam identifiers are nested one level deeper and are joined
                # into a single string.
                record[f"{prefix}{key}.{subkey}"] = build_string(
                    [
                        pfam_id.text
                        for pfam_id in polypeptide.findall(
                            f"{DRUGBANK_NS}{subkey}/{DRUGBANK_NS}{strip_plural(subkey)}/{DRUGBANK_NS}identifier"
                        )
                        if has_value_type(pfam_id)
                    ]
                )

                data[idx] = record
                idx += 1

# All drug-protein records, with missing fields as empty strings.
df_drugs_proteins_data = (
    pd.DataFrame.from_dict(data, orient="index")
    .fillna("")
    .drop_duplicates()
    .reset_index(drop=True)
)
# Restrict to proteins whose UniProt ID is annotated in the model.
df_model_drugs_protein_data = (
    df_drugs_proteins_data[
        df_drugs_proteins_data["proteins.polypeptide.uniprot-id"].isin(uniprot_ids)
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(df_model_drugs_protein_data.nunique())
df_model_drugs_protein_data.head()

### Extract SNPs associated with drugs

In [None]:
# Collect one record per SNP effect / SNP adverse-reaction entry of every drug.
idx = 0
data = defaultdict(dict)
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    for key in ["snp-effects", "snp-adverse-drug-reactions"]:
        # The child tag is `strip_plural` applied to the last hyphen-separated
        # word of the container tag.
        child_tag = strip_plural(key.split("-")[-1])
        for element in drug.findall(f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{child_tag}"):
            record = {"drugbank-id": drugbank_id}
            for field in element:
                if has_value_type(field):
                    record[f"{strip_ns_DrugBank(field.tag)}"] = field.text
            data[idx] = record
            idx += 1

# All SNP records across DrugBank.
df_snp_data = (
    pd.DataFrame.from_dict(data, orient="index")
    .drop_duplicates()
    .reset_index(drop=True)
)
# Keep the SNPs of proteins annotated in the model, ordered by gene symbol.
in_model = df_snp_data["uniprot-id"].isin(uniprot_ids)
df_model_snp_data = (
    df_snp_data[in_model]
    .drop_duplicates()
    .sort_values("gene-symbol")
    .reset_index(drop=True)
)
print(df_model_snp_data.nunique())
df_model_snp_data.head()

### Extract pathways associated with drugs

In [None]:
# Collect one record per pathway entry of every drug.
idx = 0
data = defaultdict(dict)
key = "pathways"
pathway_xpath = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    for element in drug.findall(pathway_xpath):
        record = {"drugbank-id": drugbank_id}
        for subkey in DRUGBANK_PATHWAY_ELEMENTS:
            if subkey == "drugs":
                # All drugbank IDs in this field will be redundant
                # as long as they also appear in the original drugbank ID column
                continue
            if subkey in {"smpdb-id", "name", "category"}:
                # Plain text fields are copied as-is.
                record[f"{subkey}"] = element.findtext(f"{DRUGBANK_NS}{subkey}")
            else:
                # Any remaining element is read as a container of <uniprot-id>
                # children, which are joined into one string.
                record["uniprot-id"] = build_string(
                    [
                        uniprot_elem.text
                        for uniprot_elem in element.findall(
                            f"{DRUGBANK_NS}{subkey}/{DRUGBANK_NS}uniprot-id"
                        )
                    ]
                )
        data[idx] = record
        idx += 1

# All pathway records, with missing fields as empty strings.
df_pathways_all = (
    pd.DataFrame.from_dict(data, orient="index")
    .fillna("")
    .drop_duplicates()
    .reset_index(drop=True)
)
# Expand the joined UniProt IDs, then keep only proteins annotated in the model.
df_pathways_exploded = explode_column(df_pathways_all, name="uniprot-id")
df_pathways_data = df_pathways_exploded[
    df_pathways_exploded["uniprot-id"].isin(uniprot_ids)
].reset_index(drop=True)
print(df_pathways_data.nunique())
df_pathways_data.head()

### Extract products
A list of commercially available products in Canada and the United States that contain the drug.

In [None]:
# DISABLED: product extraction is commented out, so `df_products_data` is never
# created; the merge that would consume it in the annotations cell is commented
# out as well. Re-enable both together.
# idx  = 0
# data = defaultdict(dict)
# prefix = False
# for drug in root:
#     drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
#     # Get only drugbank IDs specified
#     key = "products"
#     prefix = f"{key}." if prefix else ""
#     elements = drug.findall(f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}")
#     for element in elements:
#         data[idx].update({"drugbank-id": drugbank_id})
#         data[idx].update({
#             f"{prefix}{strip_ns_DrugBank(subelement.tag)}": subelement.text
#             for subelement in element
#             if has_value_type(subelement) and strip_ns_DrugBank(subelement.tag) in {
#                 "ndc-product-code",
#                 "dpd-id",
#             }
#         })
#         idx += 1

# df_products_data = pd.DataFrame.from_dict(
#     data,
#     orient="index"
# ).drop_duplicates().reset_index(drop=True)
# print(df_products_data.nunique())
# df_products_data = df_products_data.groupby("drugbank-id", as_index=False).agg(lambda x: build_string(set(x.dropna().unique())))
# df_products_data

#### Extract annotations

In [None]:
# Start from the drug-protein table, keeping and renaming the annotation columns.
rename_mapping = {
    "drugbank-id": "drugbank",
    "proteins.polypeptide.uniprot-id": "uniprot",
    "proteins.polypeptide.pfams": "pfam",
    "proteins.polypeptide.gene-name": "hgnc.symbol",
}
df_annotations = (
    df_model_drugs_protein_data.loc[:, list(rename_mapping)]
    .rename(rename_mapping, axis=1)
    .reset_index(drop=True)
    .drop_duplicates()
)

# Left-merge the SNP table and then the pathway table on the UniProt ID.
# Columns already present in `df_annotations` come back with a "_drop" suffix
# from the right-hand table and are removed afterwards.
for df_extra, rename_mapping in [
    (
        df_model_snp_data,
        {
            "drugbank-id": "drugbank",
            "uniprot-id": "uniprot",
            "rs-id": "dbsnp",
            "gene-symbol": "hgnc.symbol",
        },
    ),
    (
        df_pathways_data,
        {
            "drugbank-id": "drugbank",
            "uniprot-id": "uniprot",
            "smpdb-id": "smpdb",
        },
    ),
]:
    df_annotations = df_annotations.merge(
        df_extra.loc[:, list(rename_mapping)].rename(rename_mapping, axis=1),
        on="uniprot",
        suffixes=("", "_drop"),
        how="left",
    )
# Disabled together with the commented-out products extraction cell:
# rename_mapping = {
#     "drugbank-id": "drugbank",
#     "ndc-product-code": "ndc",
#     "dpd-id": "cdpd",
# }
# df_annotations = df_annotations.merge(
#     df_products_data.loc[:, list(rename_mapping)].rename(rename_mapping, axis=1),
#     left_on="drugbank",
#     right_on="drugbank",
#     suffixes=("", "_drop"),
#     how="left"
# )

duplicate_cols = [col for col in df_annotations.columns if col.endswith("_drop")]
df_annotations = df_annotations.drop(labels=duplicate_cols, axis=1).drop_duplicates()

# Split the joined Pfam strings into one identifier per row; empty strings
# become NaN so that they are ignored by the aggregation below.
df_annotations = (
    df_annotations.assign(pfam=df_annotations["pfam"].apply(split_string))
    .explode("pfam")
    .drop_duplicates()
    .replace("", float("nan"))
)

# Collapse to one row per UniProt ID: each column becomes the sorted, distinct,
# non-null values joined by `build_string`.
df_annotations = (
    df_annotations.groupby("uniprot", as_index=False)
    .agg(lambda x: build_string(sorted(x.dropna().unique())))
    .replace("", float("nan"))
)

# Attach the model identifiers and keep only the rows that have a DrugBank annotation.
df_annotations = pd.merge(
    df_model_mappings[[annotation_type, mapping_key]].drop_duplicates(),
    df_annotations,
    on=mapping_key,
    how="left",
)
df_annotations = (
    # Sort by the ID column via `annotation_type` rather than a hard-coded
    # "genes", matching how the column is referenced everywhere else.
    df_annotations.sort_values(annotation_type)
    .dropna(subset=["drugbank"])
    .reset_index(drop=True)
)

# Compare the freshly built annotation table with the previously saved one.
if compare:
    try:
        # Previously saved table: tab-separated, first column is the index,
        # every value read as a string.
        df_previous = pd.read_csv(
            f"{annotation_dirpath}/{annotation_type}_{DRUGBANK_DB_TAG}.tsv",
            sep="\t",
            index_col=0,
            dtype=str,
        )
    except FileNotFoundError:
        # No earlier file: compare against an empty frame that has only the ID column.
        df_previous = pd.DataFrame([], columns=[annotation_type])
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    # Both tables are indexed by the model ID column before being compared.
    df_comparision = compare_tables(
        df_previous.set_index(annotation_type),
        df_annotations.set_index(annotation_type),
    )
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): `ax` is not passed to visualize_comparison and is rebound to
    # its return value — presumably the helper draws on the current axes; confirm.
    ax = visualize_comparison(df_comparision)


# Report, per column, how many distinct individual values the table contains.
if display_nunique:
    for col in df_annotations.columns:
        df = (
            df_annotations[col]
            .apply(split_string)
            # Series.explode takes no column name (its only parameter is
            # `ignore_index`), so it is called without arguments.
            .explode()
            .drop_duplicates()
        )
        print(f"{df.name}: {df.nunique()}")


# Write to the annotation directory when overwriting is enabled; otherwise
# write the table to the interim directory instead.
output_dirpath = annotation_dirpath if overwrite else f"{ROOT_PATH}{INTERIM_PATH}"
df_annotations.to_csv(
    f"{output_dirpath}/{annotation_type}_{DRUGBANK_DB_TAG}.tsv", sep="\t"
)
df_annotations