# Extract annotations and data from TCDB

The purpose of this notebook is to extract and format TCDB data for subsequent model annotation.

Additionally, this notebook extracts data for transport protein classification.

## Notebook Requirements:
*  Model genes **must** have at least one of the following annotations stored in `object.annotation`. Values are expected to be separated by semicolons. Accepted keys currently include:
    * `"tcdb"`
    * `"uniprot"`
* Note: Requires an internet connection to download information from the [Transport Classification Database (TCDB)](https://www.tcdb.org/).

### Citation
Saier MH, Reddy VS, Moreno-Hagelsieb G, Hendargo KJ, Zhang Y, Iddamsetty V, Lam KJK, Tian N, Russum S, Wang J, Medrano-Soto A. The Transporter Classification Database (TCDB): 2021 update. Nucleic Acids Res. 2021 Jan 8;49(D1):D461-D467. doi: 10.1093/nar/gkaa1004. PMID: 33170213; PMCID: PMC7778945.

## Setup
### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    compare_tables,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    visualize_comparison,
)
from rbc_gem_utils.database.tcdb import TCDB_DB_TAG, download_database_TCDB

# Display package versions; the saved cell output records the versions from
# the last time this notebook ran and worked.
show_versions()

## Set notebook options

In [None]:
# Database tag: selects the database directory and is part of the output
# file names written by this notebook.
db_tag = TCDB_DB_TAG
# When True, the TCDB files are downloaded before they are loaded.
download_database = True
# When True, directories are requested with use_temp="interim" instead of None.
use_interim = False

# Figure size handed to plt.subplots for the comparison plot.
compare_figsize = (5, 5)
# When True, the resulting tables are written to disk.
overwrite = True
# When True, the number of unique values per column is printed.
display_nunique = True
# When True, the new annotation table is compared with the one saved earlier.
compare = True

# Annotation column used to drop unmapped genes and as the merge key against
# the TCDB accession table.
# TCDB is best if all mappings are up to date,
# UniProt may not work if TCDB maps to a secondary accession
mapping_key = "uniprot"

## Check TCDB release
* According to the [TCDB FAQ](https://www.tcdb.org/faq.php), the database is usually updated every week, although this mainly depends on the availability of new data.

In [None]:
# Both directory lookups share the same temporary-location selector.
temp_location = "interim" if use_interim else None
database_dirpath = get_dirpath("database", db_tag, use_temp=temp_location)
annotation_dirpath = get_dirpath("annotation", use_temp=temp_location)

# Ensure directories exist
for required_dirpath in (database_dirpath, annotation_dirpath):
    required_dirpath.mkdir(exist_ok=True, parents=True)

#### Download new files and update database

In [None]:
# Only call the TCDB download helper when the notebook option requests it;
# the helper is given the database directory to work in.
if download_database:
    download_database_TCDB(database_dirpath=database_dirpath)

## Load identifier and other data tables

In [None]:
# Reader options shared by the TCDB tables that are read without a header
# row; malformed lines in these files raise an error.
headerless_tsv_options = {
    "sep": "\t",
    "header": None,
    "on_bad_lines": "error",
    "dtype": str,
}

# UniProt accession -> TCDB identifier.
df_accessions = pd.read_csv(
    database_dirpath / "acc2tcid.tsv",
    names=["uniprot", "tcdb"],
    **headerless_tsv_options,
)
# Family identifier -> family name.
df_families = pd.read_csv(
    database_dirpath / "families.tsv",
    names=["Family", "fam_name"],
    **headerless_tsv_options,
)
# This table keeps its own header row. Known issue at line 7723: the field
# is duplicated, so it is okay to let pandas warn about that line and skip it.
df_superfamilies = pd.read_csv(
    database_dirpath / "listSuperfamilies.tsv",
    sep="\t",
    on_bad_lines="warn",
    dtype=str,
)

# The outer join keeps families that appear in only one of the two tables.
df_families = df_families.merge(df_superfamilies, on="Family", how="outer")
# Remove repeated rows and switch every column label to lower case.
df_families = df_families.drop_duplicates().rename(columns=str.lower)
df_families

## Load RBC-GEM model

In [None]:
# Read the model from the XML file named after GEM_NAME in the "model" directory.
model_dirpath = get_dirpath("model")
model = read_cobra_model(filename=model_dirpath / f"{GEM_NAME}.xml")
# Left as the last expression so the notebook displays the model.
model

### Extract current annotations from model

In [None]:
annotation_type = "genes"
annotation_cols = ["tcdb", "uniprot"]

# Build the annotation table for the model genes, label the identifier
# column after the annotation type, and keep only rows that have a value for
# the mapping key.
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Explode every column on ";" (presumably one row per individual value —
# confirm against explode_column).
for column_name in list(df_model_mappings.columns):
    df_model_mappings = explode_column(df_model_mappings, name=column_name, sep=";")

df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
# Number of distinct non-missing values per column.
print(df_model_mappings.nunique(dropna=True))
df_model_mappings

### Create data table

In [None]:
# Attach the TCDB accession table to the model mappings via the mapping key.
# Columns present in both tables get the "_MODEL" suffix on the model side
# and are then dropped, so the values from the TCDB table are the ones kept.
df_model_tcdb = (
    df_model_mappings.merge(
        df_accessions,
        on=mapping_key,
        how="left",
        suffixes=("_MODEL", ""),
    )
    .pipe(
        lambda frame: frame.drop(
            columns=[name for name in frame.columns if name.endswith("_MODEL")]
        )
    )
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add the family table columns for each TCDB identifier; the helper key
# column "#tcid" is removed once the merge is done.
df_model_tcdb = df_model_tcdb.merge(
    df_families,
    left_on="tcdb",
    right_on="#tcid",
    how="left",
)
df_model_tcdb = df_model_tcdb.rename(
    columns={name: name.lower() for name in df_families.columns}
).drop(columns="#tcid")

# Merge each cross-reference table onto the model table by TCDB identifier.
for filename in ["pfam", "refseq", "pdb", "go", "getSubstrates"]:
    table_path = database_dirpath / f"{filename}.tsv"
    if filename != "getSubstrates":
        # Three columns: the cross-reference value, the TCDB identifier and a
        # family column that is not needed here.
        df = pd.read_csv(
            table_path,
            sep="\t",
            header=None,
            names=[filename, "tcdb", "family"],
        ).drop(columns="family")
    else:
        # Two columns: the TCDB identifier and a "|"-separated list of
        # entries. Each entry is split on ";": the first piece is kept as
        # the "chebi" value and the last piece as the "substrate" value.
        df = pd.read_csv(
            table_path,
            sep="\t",
            header=None,
            names=["tcdb", "chebi"],
        )
        substrate_entries = df["chebi"].str.split("|")
        df["substrate"] = substrate_entries.apply(
            lambda x: build_string([s.split(";")[-1] for s in x])
        )
        df["chebi"] = substrate_entries.apply(
            lambda x: build_string([s.split(";")[0] for s in x])
        )

    # Left merge keeps every model row; rows multiply when an identifier has
    # several entries in the cross-reference table.
    df_model_tcdb = df_model_tcdb.merge(
        df.set_index("tcdb"),
        on="tcdb",
        how="left",
    ).reset_index(drop=True)

# Collapse the table to one row per (gene, TCDB identifier, UniProt
# accession); the unique non-missing values of every other column are
# combined with build_string.
df_model_tcdb = (
    df_model_tcdb.groupby(["genes", "tcdb", "uniprot"], as_index=False)
    .agg(lambda values: build_string(list(values.dropna().unique())))
    .drop_duplicates()
)

# Each classification level is the leading part of the TCDB identifier:
# four components for the subfamily down to one component for the class.
tcdb_level_patterns = {
    "subfamily": r"^\w+\W\w+\W\w+\W\w+",
    "family": r"^\w+\W\w+\W\w+",
    "subclass": r"^\w+\W\w+",
    "class": r"^\w+",
}
for level_name, level_pattern in tcdb_level_patterns.items():
    # The pattern is anchored, so findall yields at most one match per row
    # and explode turns the one-element lists back into scalars.
    df_model_tcdb[level_name] = (
        df_model_tcdb["tcdb"].str.findall(level_pattern).explode()
    )

# Keep only the columns of interest, in their final order.
df_model_tcdb = df_model_tcdb.loc[
    :,
    [
        "genes",
        "uniprot",
        "tcdb",
        "subfamily",
        "family",
        "subclass",
        "class",
        "fam_name",
        "fam_abbreviation",
        "superfamily",
        "pfam",
        "refseq",
        "pdb",
        "go",
        "chebi",
        "substrate",
    ],
]

# Recompute the family name and abbreviation from the derived family column.
# NOTE(review): Series.replace leaves values without a dictionary entry
# unchanged, so a family missing from df_families keeps its identifier here —
# confirm this fallback is intended.
for fam in ("fam_name", "fam_abbreviation"):
    family_lookup = df_families.set_index("family")[fam].to_dict()
    df_model_tcdb[fam] = df_model_tcdb["family"].replace(family_lookup)

# df_model_tcdb = df_model_tcdb.rename({"go": "GO"}, axis=1)
# Combine rows that differ only outside the (gene, TCDB identifier) pair into
# a single row per pair. Missing values are dropped *before* the cast to str:
# casting the whole frame first turns them into the literal text "nan", which
# dropna() cannot remove and which the NaN -> pd.NA replacement below would
# never match.
df_model_tcdb_final = df_model_tcdb.groupby(["genes", "tcdb"], as_index=False).agg(
    lambda values: build_string(values.dropna().astype(str).unique())
)

# Normalise the remaining missing-value markers (NaN and empty strings) to pd.NA.
df_model_tcdb_final = df_model_tcdb_final.replace(float("nan"), pd.NA).replace(
    "", pd.NA
)
# Print, for every column, how many distinct values remain after exploding
# the column on ";".
if display_nunique:
    for col in df_model_tcdb_final.columns:
        distinct_values = explode_column(df_model_tcdb_final, name=col, sep=";")[
            col
        ].drop_duplicates()
        print(f"{distinct_values.name}: {distinct_values.nunique()}")

# NOTE(review): the index is written as the first column here, whereas the
# annotation table is saved with index=False — confirm this is intended.
if overwrite:
    database_table_path = database_dirpath / f"{db_tag}_{GEM_NAME}.tsv"
    df_model_tcdb_final.to_csv(database_table_path, sep="\t")

df_model_tcdb_final

In [None]:
# Keep the identifier column plus the columns that become model annotations.
df_annotations = df_model_tcdb_final.loc[
    :, [annotation_type, "tcdb", "uniprot", "pfam", "refseq", "pdb", "go"]
]

# Explode and regroup to reduce duplicate mappings to genes
for column_name in list(df_annotations.columns):
    df_annotations = explode_column(df_annotations, name=column_name, sep=";")

# One row per gene; each column holds its sorted unique values combined with
# build_string, and missing-value markers are normalised to pd.NA.
df_annotations = (
    df_annotations.groupby(["genes"], as_index=False)
    .agg(lambda values: build_string(sorted(values.dropna().unique())))
    .replace(float("nan"), pd.NA)
    .replace("", pd.NA)
)
# Compare the new annotation table with the one saved by an earlier run.
if compare:
    compare_on_index = [annotation_type]
    previous_table_path = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"
    try:
        df_previous = pd.read_csv(
            previous_table_path,
            sep="\t",
            index_col=None,
            dtype=str,
        )
    except FileNotFoundError:
        # No earlier table exists: compare against an empty frame that only
        # has the index column.
        df_previous = pd.DataFrame([], columns=compare_on_index)
    else:
        # Use the same missing-value markers as the new table.
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)

    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    # NOTE(review): the axes created above are not passed to
    # visualize_comparison; presumably it draws on the current axes — confirm.
    ax = visualize_comparison(df_comparision)

# Print, for every column, how many distinct values remain after exploding
# the column on ";".
if display_nunique:
    for col in df_annotations.columns:
        distinct_values = explode_column(df_annotations, name=col, sep=";")[
            col
        ].drop_duplicates()
        print(f"{distinct_values.name}: {distinct_values.nunique()}")

# Save the annotation table without the index column.
if overwrite:
    annotation_table_path = annotation_dirpath / f"{annotation_type}_{db_tag}.tsv"
    df_annotations.to_csv(annotation_table_path, sep="\t", index=False)

df_annotations