# Extract data from Continuous-Drug Combination DataBase

The purpose of this notebook is to extract and format drug combination data from the Continuous Drug Combination Database (C-DCDB).
Additionally, this notebook reconciles the DrugBank and PubChem IDs that are found in the data.


## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in the `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"drugbank"`
    * `"pubchem.compound"`
* Note: Requires internet connection to download information from the [C-DCDB](https://icc.ise.bgu.ac.il/medical_ai/CDCDB/).

### Citations
Shtar G, Azulay L, Nizri O, Rokach L, Shapira B. CDCDB: A large and continuously updated drug combination database. Sci Data. 2022 Jun 2;9(1):263. doi: 10.1038/s41597-022-01360-z. PMID: 35654801; PMCID: PMC9163158.

## Setup
### Import packages

In [None]:
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    compare_tables,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
)
from rbc_gem_utils.database import CDCDB_DB_TAG
from rbc_gem_utils.util import explode_column

# Display the package versions from the last time this notebook ran successfully
show_versions()

## Set notebook options

In [None]:
# Database tag for C-DCDB; passed to `get_dirpath` to locate the database directory
db_tag = CDCDB_DB_TAG

# Figure size (width, height) intended for the annotation comparison plot
compare_figsize = (5, 5)
# When True, compare the new annotation table against the previously saved one
compare = True
# When True, print the number of unique values found in each annotation column
display_nunique = True
# When True, write the generated alias/annotation tables to disk
overwrite = True

## Load RBC-GEM model

In [None]:
# Resolve the model directory, then read the model XML file named after GEM_NAME.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"
model = read_cobra_model(filename=model_filepath)
model

#### Load annotations

In [None]:
# Collect the DrugBank and PubChem compound annotations attached to the model genes.
annotation_type = "genes"
df_model_mappings = get_annotation_df(
    getattr(model, annotation_type), ["drugbank", "pubchem.compound"]
).rename({"id": annotation_type}, axis=1)

# Report the number of distinct values per column once multi-valued entries are split.
# `Series.explode` has no column parameter (its only argument is `ignore_index`),
# so it is called without arguments.
for col in df_model_mappings.columns:
    unique_values = (
        df_model_mappings[col].apply(split_string).explode().drop_duplicates()
    )
    print(f"{unique_values.name}: {unique_values.nunique()}")

# Unique DrugBank identifiers annotated on the model; used to filter the combinations.
drugbank_ids = (
    explode_column(df_model_mappings, "drugbank")["drugbank"].dropna().unique()
)
# Unique PubChem compound identifiers annotated on the model.
pubchem_ids = (
    explode_column(df_model_mappings, "pubchem.compound")["pubchem.compound"]
    .dropna()
    .unique()
)
df_model_mappings

## Load C-DCDB data and identify combinations

In [None]:
# When True, "interim" is passed as `use_temp` to `get_dirpath` for both directories.
use_interim = False
temp_dir = "interim" if use_interim else None

database_dirpath = get_dirpath("database", db_tag, use_temp=temp_dir)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_dir)

# Create both directories (including missing parents) if they do not exist yet
for dirpath in (database_dirpath, annotation_dirpath):
    dirpath.mkdir(parents=True, exist_ok=True)

# Read every column as a string, using the first CSV column as the index
df_database_simplified = pd.read_csv(
    database_dirpath / "web_preview.csv", dtype=str, index_col=0
)

### Format data

In [None]:
# Move the index read from the CSV back into a regular column.
df_database_simplified = df_database_simplified.reset_index(drop=False)
# Space left here if additional metadata needs to be added

# Missing values become empty strings so the string splitting below never sees NaN.
# `fillna` already returns a new frame, so the source table is left untouched.
df_combinations = df_database_simplified.fillna("")
# Drug names are comma-separated; store them as a sorted list.
df_combinations["drugs"] = df_combinations["drugs"].apply(
    lambda x: sorted(x.split(","))
)
# Identifier fields are semicolon-separated; store them as lists.
df_combinations["drugbank_identifiers"] = df_combinations["drugbank_identifiers"].apply(
    lambda x: x.split(";")
)
# Remove only a literal leading "CID" from each PubChem identifier.
# `str.lstrip("CID")` would instead strip any leading run of the characters
# "C", "I" and "D", which is not a prefix removal.
df_combinations["pubchem_identifiers"] = df_combinations["pubchem_identifiers"].apply(
    lambda x: [s[3:] if s.startswith("CID") else s for s in x.split(";")]
)
df_combinations

In [None]:
# Build a table of DrugBank <-> PubChem identifier pairs, one pair per row.
identifier_columns = ["drugbank_identifiers", "pubchem_identifiers"]
df_alias_mapping = (
    df_combinations[identifier_columns]
    # Both list columns are exploded in lockstep so paired identifiers stay aligned
    .explode(identifier_columns)
    # "NA" marks a missing identifier; convert it so that `dropna` removes the row
    .replace("NA", float("nan"))
    .drop_duplicates()
    .rename(
        columns={
            "drugbank_identifiers": "drugbank",
            "pubchem_identifiers": "pubchem.compound",
        }
    )
    .dropna()
    .reset_index(drop=True)
)

if overwrite:
    df_alias_mapping.to_csv(database_dirpath / "aliases.tsv", sep="\t")
df_alias_mapping

#### Extract as model annotations

In [None]:
annotation_type = "genes"
# Format as model annotations.
# For each identifier type annotated on the model, expand the multi-valued
# annotations to one identifier per row and attach the alias found in the
# alias table (left merge, so identifiers without an alias are kept).
mapped_tables = []
for key in ("drugbank", "pubchem.compound"):
    df_model_drugs = (
        df_model_mappings.set_index(annotation_type)[key]
        .apply(split_string)
        .explode()
        .dropna()
        .reset_index()
    )
    mapped_tables.append(
        df_model_drugs.merge(df_alias_mapping, left_on=key, right_on=key, how="left")
    )

df_annotations = pd.concat(mapped_tables, axis=0)
# Remove only a literal leading "CID". `.str.lstrip("CID")` would strip any
# leading run of the characters "C", "I" and "D" instead of the prefix.
df_annotations["pubchem.compound"] = df_annotations["pubchem.compound"].str.replace(
    r"^CID", "", regex=True
)
# Collapse to one row per model object, joining the sorted unique identifiers
# of every column with `build_string`.
df_annotations = df_annotations.groupby(annotation_type, as_index=False).agg(
    lambda x: build_string(sorted(x.dropna().unique()))
)


# Compare the freshly built annotations with the previously saved table, if any.
if compare:
    try:
        df_previous = pd.read_csv(
            annotation_dirpath / f"{annotation_type}_{CDCDB_DB_TAG}.tsv",
            sep="\t",
            index_col=0,
            dtype=str,
        )
    except FileNotFoundError:
        # No earlier file: compare against an empty table instead
        df_previous = pd.DataFrame([], columns=[annotation_type])
    # Use the notebook option rather than a duplicated literal figure size
    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    df_comparision = compare_tables(
        df_previous.set_index(annotation_type),
        df_annotations.set_index(annotation_type),
    )
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): `ax` is rebound to the return value of `visualize_comparison`;
    # presumably it draws on the current axes - confirm against rbc_gem_utils.
    ax = visualize_comparison(df_comparision)


# Print the number of distinct values per column after splitting multi-valued entries.
if display_nunique:
    for col in df_annotations.columns:
        # `Series.explode` has no column parameter (its only argument is
        # `ignore_index`), so it is called without arguments.
        unique_values = (
            df_annotations[col].apply(split_string).explode().drop_duplicates()
        )
        print(f"{unique_values.name}: {unique_values.nunique()}")

# Write the annotation table as a tab-separated file when overwriting is enabled.
if overwrite:
    annotations_filepath = annotation_dirpath / f"{annotation_type}_{CDCDB_DB_TAG}.tsv"
    df_annotations.to_csv(annotations_filepath, sep="\t")

df_annotations

### Map against model

In [None]:
# TODO deal with combinations that have NA in the identifier
# TODO deal with PubChem compounds
# Build the lookup set once instead of rebuilding it for every row.
model_drugbank_ids = set(drugbank_ids)
# Keep a combination only when every one of its DrugBank identifiers is in the model.
indicies_to_keep = sorted(
    {
        idx
        for idx, row in df_combinations["drugbank_identifiers"].dropna().items()
        if model_drugbank_ids.issuperset(row)
    }
    # .union([
    #     idx
    #     for idx, row in df_combinations["pubchem_identifiers"].dropna().items()
    #     if not set(row).difference(set(pubchem_ids))
)

df_model_combinations = df_combinations.loc[indicies_to_keep].reset_index(drop=True)
df_model_combinations

### Keep only pairwise combinations
* Combinations with more than two drug components are split into combination pairs

In [None]:
# Split every retained combination into its pairwise drug-drug edges.
id_columns = ["drugbank_identifiers", "pubchem_identifiers"]
records = []
for _, row in df_model_combinations.iterrows():
    row_values = row.to_dict()
    # Pairs are generated separately from the DrugBank IDs and from the PubChem IDs
    for identifiers in row[id_columns].values:
        # Skip an identifier list that contains a missing ("NA") entry
        if "NA" in identifiers:
            continue
        for node_a, node_b in combinations(identifiers, 2):
            # A fresh dict is built for every pair. Storing one shared, mutated
            # dict would make all pairs of a combination end up with the last
            # pair's nodes, and they would then be dropped as duplicates.
            records.append({**row_values, "node_A": node_a, "node_B": node_b})

# Naming the columns keeps the frame usable even when no pair was produced.
df_interactions = pd.DataFrame(
    records, columns=[*df_model_combinations.columns, "node_A", "node_B"]
)
df_interactions["edge_type"] = "drug-drug"
df_interactions["edge_subtype"] = "effective-combination"
df_interactions["edge_source"] = df_interactions["source"].apply(
    lambda x: build_string([CDCDB_DB_TAG, x])
)
# Use the join method instead of the build_string to avoid losing drugs that interact with themselves for being "duplicate" values
for list_column in ("drugs", "drugbank_identifiers", "pubchem_identifiers"):
    df_interactions[list_column] = df_interactions[list_column].apply(";".join)
df_interactions = df_interactions.drop_duplicates(
    ["node_A", "node_B", "drugs", "drugbank_identifiers", "pubchem_identifiers"]
)
df_interactions = df_interactions.reset_index(drop=True)
# Honor the notebook's `overwrite` option, like the other tables written above
if overwrite:
    df_interactions.to_csv(
        database_dirpath / f"EffectiveCombos_{CDCDB_DB_TAG}.tsv",
        sep="\t",
    )
df_interactions