# Extract annotations and data from DrugCentral

The purpose of this notebook is to extract and format drug data for subsequent model annotation.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"drugcentral"`
    * `"uniprot"`
* Currently uses the 2021 TSV file download.

### Citations
Avram S, Wilson TB, Curpan R, Halip L, Borota A, Bora A, Bologa CG, Holmes J, Knockel J, Yang JJ, Oprea TI. DrugCentral 2023 extends human clinical data and integrates veterinary drugs. Nucleic Acids Res. 2023 Jan 6;51(D1):D1276-D1287. doi: 10.1093/nar/gkac1085. PMID: 36484092; PMCID: PMC9825566.

## Setup
### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    compare_tables,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    visualize_comparison,
)
from rbc_gem_utils.database import DRUGCENTRAL_DB_TAG

## Set notebook options

In [None]:
# Switches controlling table comparison, summary printing, and file output
compare_figsize = (5, 5)
compare = True
display_nunique = True
overwrite = True
# NOTE(review): not read by any cell visible in this notebook — confirm whether
# a download step is expected before the DrugCentral table is loaded
download_database = True
db_tag = DRUGCENTRAL_DB_TAG

# Annotation key used to join model genes to the DrugCentral table
mapping_key = "uniprot"

# When `use_interim` is set, both directories are requested with use_temp="interim"
use_interim = False
temp_kind = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=temp_kind)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_kind)

# Create the directories (and any missing parents) if they do not exist yet
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(parents=True, exist_ok=True)

## Load DrugCentral data

In [None]:
# Load the DrugCentral drug-target interaction table; every column is read as text
df_drugcentral = pd.read_csv(
    database_dirpath / "drug.target.interaction.tsv",
    sep="\t",
    dtype=str,
)
# Keep only human entries. The match is a plain, case-sensitive substring test,
# and a missing ORGANISM value counts as non-human (`na=False`) so that rows of
# unknown origin are not carried into the model annotations.
is_human = df_drugcentral["ORGANISM"].str.contains(
    "Homo sapiens", regex=False, na=False
)
df_drugcentral = df_drugcentral.loc[is_human]
df_drugcentral

## Load RBC-GEM model

In [None]:
# Read the model file "<GEM_NAME>.xml" from the model directory
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

### Extract current annotations from model

In [None]:
# Model attribute being annotated and the annotation keys requested for it
annotation_type = "genes"
annotation_cols = ["uniprot", "drugbank", "drugcentral", "hgnc.symbol"]

# Tabulate the requested gene annotations; rows without a value for the
# mapping key cannot be joined to DrugCentral and are dropped
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Run explode_column with the ";" separator over every column
# (presumably yields one value per row — confirm against the helper)
for col in list(df_model_mappings.columns):
    df_model_mappings = explode_column(df_model_mappings, name=col, sep=";")

df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)

# Number of distinct non-missing values per column
print(df_model_mappings.nunique(dropna=True))

df_model_mappings

### Map data to model

In [None]:
# Lower-case every DrugCentral column name, except for the three columns that
# are renamed to the annotation keys used on the model side
rename_mapping = {col: col.lower() for col in df_drugcentral.columns}
rename_mapping.update(
    {
        "ACCESSION": "uniprot",
        "GENE": "hgnc.symbol",
        "STRUCT_ID": "drugcentral",
    }
)

# Attach DrugCentral records to model genes through the shared mapping key
df_model_drugcentral = pd.merge(
    df_model_mappings.loc[:, [annotation_type, mapping_key]],
    df_drugcentral.rename(columns=rename_mapping),
    on=mapping_key,
    how="left",
)

# Drop genes without a DrugCentral structure ID, keep the columns of interest,
# and collapse repeated rows
df_model_drugcentral = (
    df_model_drugcentral.dropna(subset=["drugcentral"])
    .loc[
        :,
        [
            annotation_type,
            mapping_key,
            "hgnc.symbol",
            "drugcentral",
            "drug_name",
            "target_class",
            "action_type",
        ],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

if display_nunique:
    # Report the number of distinct values per column after splitting on ";"
    for col in df_model_drugcentral.columns:
        n_unique = explode_column(df_model_drugcentral, name=col, sep=";")[
            col
        ].nunique()
        print(f"{col}: {n_unique}")

if overwrite:
    # Save the model-specific subset of DrugCentral next to the source database file
    df_model_drugcentral.to_csv(
        database_dirpath / f"{db_tag}_{GEM_NAME}.tsv", sep="\t", index=False
    )

df_model_drugcentral

In [None]:
# DrugCentral columns that become gene annotations, keyed by their annotation name
rename_mapping = {
    "ACCESSION": "uniprot",
    "GENE": "hgnc.symbol",
    "STRUCT_ID": "drugcentral",
}
# Single location of the annotation table, used both for the comparison with
# the previously saved version and for writing the new one
annotation_filepath = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"

df_annotations = pd.merge(
    df_model_mappings.loc[:, [annotation_type, mapping_key]],
    df_drugcentral.loc[:, list(rename_mapping)].rename(columns=rename_mapping),
    on=mapping_key,
    how="left",
)
# NOTE(review): dropna() without a subset also discards matched rows that lack
# a gene symbol — confirm this is intended
df_annotations = df_annotations.dropna().drop_duplicates()
# Collapse to one row per model gene, combining the unique values of each
# column with build_string
df_annotations = df_annotations.groupby(annotation_type).agg(
    lambda x: build_string(list(x.dropna().unique()))
)
df_annotations = df_annotations.reset_index(drop=False).dropna().drop_duplicates()
# Represent missing and empty values uniformly as pd.NA
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)

if compare:
    compare_on_index = [annotation_type]
    try:
        df_previous = pd.read_csv(
            annotation_filepath,
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No earlier table on disk: compare against an empty one
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): the axes created above are not passed to the helper —
    # presumably it draws on the current axes; confirm against its signature
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Report the number of distinct values per column after splitting on ";"
    for col in df_annotations.columns:
        df = explode_column(df_annotations, name=col, sep=";")
        df = df[col].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

if overwrite:
    df_annotations.to_csv(annotation_filepath, sep="\t", index=False)

df_annotations