# Extract annotations and data from DrugBank

The purpose of this notebook is to extract and format Drug data for subsequent model annotation.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in the `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"uniprot"`
    * `"drugbank"`

## DRUGBANK ONLINE
To utilize this notebook: 

1. Go to [DrugBank database](https://go.drugbank.com/releases/latest) and create an account.
2. Follow the instructions to obtain a free academic license.
3. Download and unzip the database file `"drugbank_all_full_database.xml.zip"`.
4. Rename the file `"full database.xml"` to `"drugbank_all_full_database.xml"`.
5. Remember to clear out any personal account information and ensure the downloaded DrugBank file remains local!

The function `download_database_DrugBank` takes a given username and a password and downloads the data, taking care to change the filename in the process.
*  Note: Requires internet connection to download information from [DrugBank](https://go.drugbank.com/).

Fields for the DrugBank XML schema are found [here](https://docs.drugbank.com/xml/#introduction).

### Citations
Wishart DS, Feunang YD, Guo AC, Lo EJ, Marcu A, Grant JR, Sajed T, Johnson D, Li C, Sayeeda Z, Assempour N, Iynkkaran I, Liu Y, Maciejewski A, Gale N, Wilson A, Chin L, Cummings R, Le D, Pon A, Knox C, Wilson M. DrugBank 5.0: a major update to the DrugBank database for 2018. Nucleic Acids Res. 2017 Nov 8. doi: 10.1093/nar/gkx1037.

## Setup
### Import packages

In [None]:
from collections import defaultdict
from warnings import warn
from xml.etree import ElementTree

import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    check_database_release_online,
    compare_tables,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    visualize_comparison,
)
from rbc_gem_utils.database.drugbank import (
    DRUGBANK_DB_TAG,
    DRUGBANK_GENERAL_ELEMENTS,
    DRUGBANK_NS,
    DRUGBANK_PATHWAY_ELEMENTS,
    DRUGBANK_RELEASE_EXPECTED,
    DRUGBANK_URL,
    download_database_DrugBank,
    strip_ns_DrugBank,
)
from rbc_gem_utils.util import has_value_type, strip_plural

# Display the package versions in use; the saved output documents the
# environment in which the notebook last ran and worked
show_versions()

## Set notebook options

In [None]:
# Database tag, expected release, and URL as defined in rbc_gem_utils
db_tag = DRUGBANK_DB_TAG
expected_release = DRUGBANK_RELEASE_EXPECTED
database_url = DRUGBANK_URL
# Set to True to run the DrugBank download in the download cell below
download_database = False

# Figure size passed to plt.subplots for the comparison plot
compare_figsize = (5, 5)
# Compare the new annotation table against the previously saved one
compare = True
# Print per-column unique counts after the extraction steps
display_nunique = True
# Write the annotation table to disk, replacing any existing file
overwrite = True

# NOTE(review): not referenced by the cells visible here -- confirm it is still needed
print_footer_notes = False

# Column used to match DrugBank records to model genes.
# Best mapping key is drugbank or uniprot
mapping_key = "uniprot"

## Check DrugBank release
* If the current release does not match the expected release, it is because the database has been updated since the last time this code was utilized.
    * If the notebook works without needing any significant modifications, the only update needed is to the release in the [drugbank.py](../../src/rbc_gem_utils/database/drugbank.py) source code file to resolve the issue.
* Version in the DrugBank file is formatted as {major}.{minor}

In [None]:
# A release mismatch redirects every output of this notebook into "interim"
# directories so that results from an unexpected release are kept separate.
release_is_expected = check_database_release_online(db_tag, verbose=True)
use_interim = not release_is_expected
if use_interim:
    warn(
        "Online release of database has been updated since the last time notebook was used."
    )

temp_dirname = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=temp_dirname)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_dirname)

# Create both output directories (and any missing parents) up front
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(parents=True, exist_ok=True)

#### Download new files and update database
If an argument is not provided (`arg=None`), its default value for the repository is used.
A username and password must be provided for this function. Be sure to remove personal information after use!

In [None]:
if download_database:
    # Download data. Replace the placeholder credentials before running and
    # remove them again afterwards so they are never saved with the notebook.
    download_database_DrugBank(
        username="USERNAME",
        password="PASSWORD",
        database_dirpath=database_dirpath,
        # `current_release` was never defined in this notebook; request the
        # release the notebook was written against instead.
        release=expected_release,
    )
# Location of the (downloaded or manually placed and renamed) DrugBank XML file
filepath = database_dirpath / "drugbank_all_full_database.xml"

## Load RBC-GEM model

In [None]:
# Read the model file named after GEM_NAME from the model directory
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Extract current annotations from model

In [None]:
annotation_type = "genes"
annotation_cols = ["uniprot", "drugbank"]

# Gene annotations as a table; genes lacking the mapping key are discarded
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split semicolon-separated values so that each row holds a single value
for column_name in list(df_model_mappings.columns):
    df_model_mappings = explode_column(df_model_mappings, name=column_name, sep=";")

df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
print(df_model_mappings.nunique(dropna=True))

# UniProt accessions present in the model, used to filter DrugBank records
uniprot_ids = set(df_model_mappings["uniprot"].dropna().unique())

df_model_mappings

#### Get query IDs

In [None]:
print(f"Mapping key: {mapping_key}")

# Unique, non-missing identifiers of the chosen mapping key
query_ids = df_model_mappings[mapping_key].dropna().unique()
n_query_ids = len(query_ids)
assert len(set(query_ids)) == n_query_ids, "Duplicate IDs in list to query"

# Gene -> mapping-key lookup (a gene listed more than once keeps its last value)
mapping_series = df_model_mappings.set_index(annotation_type)[mapping_key]
model_search_mapping = mapping_series.to_dict()

print(f"Number of model genes associated with query: {len(model_search_mapping)}")
print(f"Number of unique IDs to query: {n_query_ids}")
df_model_mappings[[annotation_type, mapping_key]].drop_duplicates()

## Parse DrugBank information for annotations into DataFrames

In [None]:
# Parse the local DrugBank XML file; the cells below iterate over the
# children of this root element, treating each child as one drug entry
root = ElementTree.parse(filepath).getroot()
root

### Extract proteins

In [None]:
# Build one record per (drug, protein role, polypeptide) combination.
idx = 0
data = defaultdict(dict)
prefix = "proteins."
protein_types = ["targets", "enzymes", "carriers", "transporters"]
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    for ptype in protein_types:
        protein_path = f"{DRUGBANK_NS}{ptype}/{DRUGBANK_NS}{strip_plural(ptype)}"
        for element in drug.findall(protein_path):
            for polypeptide in element.findall(f"{DRUGBANK_NS}polypeptide"):
                record = {"drugbank-id": drugbank_id, f"{prefix}type": ptype}

                # Value-typed children of the target/enzyme/carrier/transporter
                for child in element:
                    if has_value_type(child):
                        record[f"{prefix}{strip_ns_DrugBank(child.tag)}"] = child.text

                # Polypeptide attributes, then its value-typed children
                key = "polypeptide"
                record[f"{prefix}{key}.uniprot-id"] = polypeptide.get("id")
                record[f"{prefix}{key}.source"] = polypeptide.get("source")
                for subelem in polypeptide:
                    if has_value_type(subelem):
                        record[f"{prefix}{key}.{strip_ns_DrugBank(subelem.tag)}"] = (
                            subelem.text
                        )

                # Pfam identifiers of the polypeptide joined into a single string
                subkey = "pfams"
                pfam_path = f"{DRUGBANK_NS}{subkey}/{DRUGBANK_NS}{strip_plural(subkey)}/{DRUGBANK_NS}identifier"
                pfam_ids = [
                    subelem.text
                    for subelem in polypeptide.findall(pfam_path)
                    if has_value_type(subelem)
                ]
                record[f"{prefix}{key}.{subkey}"] = build_string(pfam_ids)

                data[idx] = record
                idx += 1

df_drugs_proteins_data = pd.DataFrame.from_dict(data, orient="index")
df_drugs_proteins_data = (
    df_drugs_proteins_data.fillna("").drop_duplicates().reset_index(drop=True)
)

# Restrict to proteins whose UniProt accession is annotated in the model
is_model_protein = df_drugs_proteins_data["proteins.polypeptide.uniprot-id"].isin(
    uniprot_ids
)
df_model_drugs_protein_data = (
    df_drugs_proteins_data[is_model_protein].drop_duplicates().reset_index(drop=True)
)
if display_nunique:
    print(df_model_drugs_protein_data.nunique(dropna=True))
df_model_drugs_protein_data.head()

### Extract SNPs associated with drugs

In [None]:
# One record per SNP effect / SNP adverse drug reaction entry of every drug.
idx = 0
data = defaultdict(dict)
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    for key in ["snp-effects", "snp-adverse-drug-reactions"]:
        # Child tag is the singular of the last word ("effects", "reactions")
        child_tag = strip_plural(key.split("-")[-1])
        for element in drug.findall(f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{child_tag}"):
            record = {"drugbank-id": drugbank_id}
            for subelement in element:
                if has_value_type(subelement):
                    record[f"{strip_ns_DrugBank(subelement.tag)}"] = subelement.text
            data[idx] = record
            idx += 1

df_snp_data = pd.DataFrame.from_dict(data, orient="index")
df_snp_data = df_snp_data.drop_duplicates().reset_index(drop=True)

# Keep SNP entries for proteins in the model, ordered by gene symbol
in_model = df_snp_data["uniprot-id"].isin(uniprot_ids)
df_model_snp_data = df_snp_data[in_model].drop_duplicates()
df_model_snp_data = df_model_snp_data.sort_values("gene-symbol").reset_index(drop=True)
if display_nunique:
    print(df_model_snp_data.nunique(dropna=True))
df_model_snp_data.head()

### Extract pathways associated with drugs

In [None]:
# One record per pathway entry of every drug.
idx = 0
data = defaultdict(dict)
key = "pathways"
simple_fields = {"smpdb-id", "name", "category"}
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    pathway_path = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
    for element in drug.findall(pathway_path):
        record = {"drugbank-id": drugbank_id}
        for subkey in DRUGBANK_PATHWAY_ELEMENTS:
            if subkey == "drugs":
                # All drugbank IDs in this field will be redundant
                # as long as they also appear in the original drugbank ID column
                continue
            if subkey in simple_fields:
                record[subkey] = element.findtext(f"{DRUGBANK_NS}{subkey}")
                continue
            # Any remaining field contributes the UniProt IDs nested beneath it
            uniprot_path = f"{DRUGBANK_NS}{subkey}/{DRUGBANK_NS}uniprot-id"
            record["uniprot-id"] = build_string(
                [subelem.text for subelem in element.findall(uniprot_path)]
            )
        data[idx] = record
        idx += 1

df_pathways_data = pd.DataFrame.from_dict(data, orient="index")
df_pathways_data = df_pathways_data.fillna("").drop_duplicates().reset_index(drop=True)

# One UniProt ID per row, then keep only proteins annotated in the model
df_pathways_data = explode_column(df_pathways_data, name="uniprot-id", sep=";")
in_model = df_pathways_data["uniprot-id"].isin(uniprot_ids)
df_pathways_data = df_pathways_data[in_model].reset_index(drop=True)
if display_nunique:
    print(df_pathways_data.nunique(dropna=True))
df_pathways_data.head()

### Combine and format annotations

In [None]:
# DrugBank field name -> annotation key. Each key is listed exactly once
# (repeated keys in a dict literal are silently collapsed). The key order is
# significant: it determines the column order of every table selected below.
rename_mapping = {
    "drugbank-id": "drugbank",
    "proteins.polypeptide.uniprot-id": "uniprot",
    "proteins.polypeptide.pfams": "pfam",
    "proteins.polypeptide.gene-name": "hgnc.symbol",
    "gene-symbol": "hgnc.symbol",
    "uniprot-id": "uniprot",
    "rs-id": "dbsnp",
    "smpdb-id": "smpdb",
    "ndc-product-code": "ndc",
    "dpd-id": "cdpd",
}


def _select_and_rename(df_source):
    """Return the columns of `df_source` found in `rename_mapping`, renamed and de-duplicated."""
    keep = [x for x in rename_mapping if x in df_source.columns]
    return df_source.loc[:, keep].rename(rename_mapping, axis=1).drop_duplicates()


# Start with protein data
df_annotations = _select_and_rename(df_model_drugs_protein_data)

# Add SNP data, then pathway data. Columns already present on the left side
# get the "_drop" suffix on the right side and are removed afterwards.
# Product data (ndc / cdpd keys) can be merged the same way once a
# `df_products_data` table is extracted.
for df in (df_model_snp_data, df_pathways_data):
    df = _select_and_rename(df)
    df_annotations = df_annotations.merge(
        df,
        left_on="uniprot",
        right_on="uniprot",
        suffixes=("", "_drop"),
        how="left",
    ).drop_duplicates()

drop_columns = [col for col in df_annotations.columns if col.endswith("_drop")]
df_annotations = df_annotations.drop(labels=drop_columns, axis=1).drop_duplicates()
df_annotations = explode_column(df_annotations, "pfam", sep=";")
df_annotations = df_annotations.drop_duplicates().replace("", pd.NA)

# Collapse to one row per UniProt ID; every other column becomes a sorted
# string of its unique non-missing values
df_annotations = (
    df_annotations.groupby("uniprot", as_index=False)
    .agg(lambda x: build_string(sorted(x.dropna().unique())))
    .replace("", pd.NA)
)

# Attach the model gene identifiers and keep only genes with a DrugBank hit
df_annotations = pd.merge(
    df_model_mappings[[annotation_type, mapping_key]].drop_duplicates(),
    df_annotations,
    left_on=mapping_key,
    right_on=mapping_key,
    how="left",
)
df_annotations = (
    df_annotations.sort_values(annotation_type)
    .dropna(subset=["drugbank"])
    .reset_index(drop=True)
)

df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)

# Single definition of the output location, used for both reading and writing
annotation_filepath = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"
if compare:
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath,
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No earlier table: compare against an empty one
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): the Axes created above is not passed to visualize_comparison;
    # presumably it draws on the current axes -- confirm against its signature.
    ax = visualize_comparison(df_comparision)

if display_nunique:
    for col in df_annotations.columns:
        df = explode_column(df_annotations, name=col, sep=";")
        df = df[col].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations

## Extract data from DrugBank into DataFrames

In [None]:
# Container for every table exported at the end of the notebook
all_drug_dfs = {}

# Gene / UniProt / DrugBank mapping with one DrugBank ID per row
df_mappings = (
    df_annotations.loc[:, ["genes", "uniprot", "drugbank"]]
    .assign(drugbank=lambda frame: frame["drugbank"].str.split(";"))
    .explode("drugbank")
    .drop_duplicates()
)
print(df_mappings.nunique())

# From here on, extraction is limited to these drugs and proteins
drugbank_ids = set(df_mappings["drugbank"].dropna().unique())
uniprot_ids = set(df_mappings["uniprot"].dropna().unique())
df_mappings

### Extract general information

In [None]:
# One record of general information per selected drug.
idx = 0
data = defaultdict(dict)
# Use ordered tuples rather than sets: iterating a set of strings has a
# run-dependent order, which made the column order of the exported table
# change between sessions.
text_fields = ("name", "cas-number")
name_list_fields = ("products", "international-brands")
for drug in root:
    # General information
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    data[idx].update({"drugbank-id": drugbank_id})
    data[idx].update({attr: drug.get(attr) for attr in ["type", "created", "updated"]})

    # Plain-text fields, taken in the order given by DRUGBANK_GENERAL_ELEMENTS
    for key in DRUGBANK_GENERAL_ELEMENTS:
        if key not in text_fields:
            continue
        element = drug.find(f"{DRUGBANK_NS}{key}")
        if element is not None and has_value_type(element):
            data[idx].update({key: element.text})

    # Names of all products / international brands joined into one string each
    subkey = "name"
    for key in name_list_fields:
        entry_path = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
        data[idx].update(
            {
                f"{key}": build_string(
                    [
                        element.findtext(f"{DRUGBANK_NS}{subkey}")
                        for element in drug.findall(entry_path)
                    ]
                )
            }
        )

    # All synonyms joined into one string
    key = "synonyms"
    data[idx].update(
        {
            f"{key}": build_string(
                [
                    element.text
                    for element in drug.findall(
                        f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
                    )
                ]
            )
        }
    )

    idx += 1

df_drugbank_data = (
    pd.DataFrame.from_dict(data, orient="index")
    .fillna("")
    .drop_duplicates()
    .reset_index(drop=True)
)
# The exported table keeps the timestamps; the displayed table drops them
all_drug_dfs["General"] = df_drugbank_data
df_drugbank_data = df_drugbank_data.drop(["created", "updated"], axis=1)
df_drugbank_data

### Extract drug categories and Mesh IDs

In [None]:
# One record per (drug, category) pair for the selected drugs.
# For mesh-id: https://registry.identifiers.org/registry/mesh
idx = 0
data = defaultdict(dict)
key = "categories"
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    category_path = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
    for element in drug.findall(category_path):
        data[idx] = {
            "drugbank-id": drugbank_id,
            "category": element.findtext(f"{DRUGBANK_NS}category"),
            "mesh-id": element.findtext(f"{DRUGBANK_NS}mesh-id"),
        }
        idx += 1


df_drug_category = pd.DataFrame.from_dict(data, orient="index")
# Empty strings become missing values before duplicates are removed
df_drug_category = (
    df_drug_category.replace("", float("nan")).drop_duplicates().reset_index(drop=True)
)
all_drug_dfs["Categories"] = df_drug_category
df_drug_category

### Extract ATC codes

In [None]:
# One record per ATC code of every selected drug.
idx = 0
data = defaultdict(dict)
key = "atc-codes"
# Labels are paired positionally with the child elements of each ATC code
atc_levels = ["chemical", "pharmacological", "therapeutic", "anatomical"]
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    atc_path = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
    for element in drug.findall(atc_path):
        record = {
            "drugbank-id": drugbank_id,
            "substance.code": element.get("code"),
            "substance.description": drug.findtext(f"{DRUGBANK_NS}name"),
        }
        for level, subelement in zip(atc_levels, element):
            record[f"{level}.description"] = subelement.text
            record[f"{level}.code"] = subelement.get("code")
        data[idx] = record
        idx += 1

df_atc_codes_data = pd.DataFrame.from_dict(data, orient="index")

# Keep the first column in place and reverse the order of all the others
atc_columns = list(df_atc_codes_data.columns)
df_atc_codes_data = df_atc_codes_data.loc[:, atc_columns[:1] + atc_columns[1:][::-1]]

# Left-merge onto the general table so that every selected drug is listed
df_atc_codes_data = df_drugbank_data[["drugbank-id"]].merge(
    df_atc_codes_data,
    left_on="drugbank-id",
    right_on="drugbank-id",
    how="left",
)
df_atc_codes_data = df_atc_codes_data.drop_duplicates().reset_index(drop=True)
all_drug_dfs["ATC"] = df_atc_codes_data

print(df_atc_codes_data.nunique())
df_atc_codes_data

### Extract drug interactions
Extracted drug interactions are confined to those that directly map into the reconstruction.

In [None]:
# Interaction fields carry the "drug-interactions." prefix so that they do
# not overwrite the "drugbank-id" and "name" of the drug being processed.
use_prefix = True

idx = 0
data = defaultdict(dict)
key = "drug-interactions"
prefix = f"{key}." if use_prefix else ""
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    interaction_path = f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{strip_plural(key)}"
    for element in drug.findall(interaction_path):
        interacting_id = element.findtext(f"{DRUGBANK_NS}drugbank-id")
        # Only interactions whose partner is also a selected drug are kept
        if interacting_id not in drugbank_ids:
            continue
        record = {
            "drugbank-id": drugbank_id,
            "name": drug.findtext(f"{DRUGBANK_NS}name"),
        }
        for subkey in ["drugbank-id", "name", "description"]:
            record[f"{prefix}{subkey}"] = element.findtext(f"{DRUGBANK_NS}{subkey}")
        data[idx] = record
        idx += 1

df_drug_interactions = pd.DataFrame.from_dict(data, orient="index")
df_drug_interactions = (
    df_drug_interactions.fillna("").drop_duplicates().reset_index(drop=True)
)

# Drug interactions go two ways, generate a field to keep only unique interactions
id_pairs = df_drug_interactions[["drugbank-id", "drug-interactions.drugbank-id"]]
df_drug_interactions["drug;drug"] = id_pairs.apply(
    lambda x: build_string(sorted(x.values)), axis=1
)
df_drug_interactions = df_drug_interactions.drop_duplicates(
    subset=["drug;drug"]
).reset_index(drop=True)

column_names = {
    "drugbank-id": "drugbank_A",
    "name": "name_A",
    "drug-interactions.drugbank-id": "drugbank_B",
    "drug-interactions.name": "name_B",
    "drug;drug": "drugbank_A;drugbank_B",
}
df_drug_interactions = df_drug_interactions.rename(columns=column_names)
all_drug_dfs["Interactions"] = df_drug_interactions

print(df_drug_interactions.nunique())
df_drug_interactions

### Extract protein data

In [None]:
# One record per (drug, protein role, polypeptide) for the selected drugs.
idx = 0
data = defaultdict(dict)

# Set to True to namespace the protein columns with "proteins.".
# The prefix is resolved once, up front: previously the boolean flag was
# overwritten by a string inside the loop, and the merge below ignored it.
use_prefix = False
prefix = "proteins." if use_prefix else ""
uniprot_col = f"{prefix}polypeptide.uniprot-id"

for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    for ptype in ["targets", "enzymes", "carriers", "transporters"]:
        elements = drug.findall(
            f"{DRUGBANK_NS}{ptype}/{DRUGBANK_NS}{strip_plural(ptype)}"
        )
        for element in elements:
            for polypeptide in element.findall(f"{DRUGBANK_NS}polypeptide"):
                data[idx].update({"drugbank-id": drugbank_id, f"{prefix}type": ptype})
                # Value-typed children of the target/enzyme/carrier/transporter
                data[idx].update(
                    {
                        f"{prefix}{strip_ns_DrugBank(child.tag)}": child.text
                        for child in element
                        if has_value_type(child)
                    }
                )

                # Polypeptide attributes, then its value-typed children
                key = "polypeptide"
                data[idx].update(
                    {
                        f"{prefix}{key}.uniprot-id": polypeptide.get("id"),
                        f"{prefix}{key}.source": polypeptide.get("source"),
                    }
                )
                data[idx].update(
                    {
                        f"{prefix}{key}.{strip_ns_DrugBank(subelem.tag)}": subelem.text
                        for subelem in polypeptide
                        if has_value_type(subelem)
                    }
                )

                # Pfam identifiers joined into a single string
                subkey = "pfams"
                data[idx].update(
                    {
                        f"{prefix}{key}.{subkey}": build_string(
                            [
                                subelem.text
                                for subelem in polypeptide.findall(
                                    f"{DRUGBANK_NS}{subkey}/{DRUGBANK_NS}{strip_plural(subkey)}/{DRUGBANK_NS}identifier"
                                )
                                if has_value_type(subelem)
                            ]
                        )
                    }
                )

                idx += 1

df_proteins = (
    pd.DataFrame.from_dict(data, orient="index")
    .fillna("")
    .drop_duplicates()
    .reset_index(drop=True)
)
# Keep only proteins annotated in the model
df_proteins = df_proteins[df_proteins[uniprot_col].isin(uniprot_ids)]
df_proteins = df_proteins.drop_duplicates().reset_index(drop=True)
# Attach model gene IDs; the helper "uniprot" column is dropped again because
# the polypeptide column already carries the accession
df_proteins = (
    df_mappings[["genes", "uniprot"]]
    .merge(df_proteins, left_on="uniprot", right_on=uniprot_col)
    .drop_duplicates()
    .drop(["uniprot"], axis=1)
    .reset_index(drop=True)
)

all_drug_dfs["Proteins"] = df_proteins
df_proteins

### Extract drug associations with SNPs

In [None]:
# One record per SNP effect / SNP adverse drug reaction of the selected drugs.
idx = 0
data = defaultdict(dict)
for drug in root:
    drugbank_id = drug.findtext(f"{DRUGBANK_NS}drugbank-id[@primary='true']")
    # Get only drugbank IDs specified
    if drugbank_ids and drugbank_id not in drugbank_ids:
        continue
    for key in ["snp-effects", "snp-adverse-drug-reactions"]:
        # Child tag is the singular of the last word ("effects", "reactions")
        child_tag = strip_plural(key.split("-")[-1])
        for element in drug.findall(f"{DRUGBANK_NS}{key}/{DRUGBANK_NS}{child_tag}"):
            record = {"drugbank-id": drugbank_id}
            for subelement in element:
                if has_value_type(subelement):
                    record[f"{strip_ns_DrugBank(subelement.tag)}"] = subelement.text
            data[idx] = record
            idx += 1

df_snp_data = pd.DataFrame.from_dict(data, orient="index")
df_snp_data = df_snp_data.fillna("").drop_duplicates().reset_index(drop=True)

# Keep only entries for proteins annotated in the model
in_model = df_snp_data["uniprot-id"].isin(uniprot_ids)
df_snp_data = df_snp_data[in_model].drop_duplicates().reset_index(drop=True)
all_drug_dfs["SNP"] = df_snp_data

df_snp_data

## Export drug data for subsequent visualization

In [None]:
# Write each collected table to its own tab-separated file
# (the DataFrame index is written as the first column)
print(list(all_drug_dfs))
for sheet_name, df in all_drug_dfs.items():
    output_path = database_dirpath / f"{sheet_name}_DrugBank.tsv"
    df.to_csv(output_path, sep="\t")