# Create Tables for Evidence

Create tables summarizing evidence for RBC-GEM reconstruction components.
## Setup
### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    build_string,
    compare_tables,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
)

# Display the versions in use the last time this notebook ran and worked
show_versions()

## Load RBC-GEM model

In [None]:
# Read the reconstruction named by GEM_NAME from the model directory
model_dirpath = get_dirpath("model")
model = read_cobra_model(filename=model_dirpath / f"{GEM_NAME}.xml")

# Sort the reaction, gene, and metabolite lists of the loaded model
for component_list in (model.reactions, model.genes, model.metabolites):
    component_list.sort()
model

## Load omic evidence
### Proteomic evidence

In [None]:
# Read the per-study detection table; the first column becomes the row index
proteomic_table_path = (
    get_dirpath("proteomics", use_temp="external") / "proteomic_evidence_table.tsv"
)
df_proteomic_evidence = pd.read_csv(proteomic_table_path, sep="\t", index_col=0)

# Row-wise total over all columns of the table
study_counts = df_proteomic_evidence.sum(axis=1)

# For every row, collect the labels of the columns holding a truthy value and
# join them (sorted) into a single string
detected_studies = {}
for uniprot_id, value_dict in df_proteomic_evidence.T.to_dict().items():
    pubmed_labels = [
        str(pubmed) for pubmed, is_detected in value_dict.items() if is_detected
    ]
    detected_studies[uniprot_id] = build_string(sorted(pubmed_labels))

# Place the totals and the joined labels side by side, then name both columns
df_proteomic_evidence = pd.concat(
    (study_counts, pd.DataFrame.from_dict(detected_studies, orient="index")),
    axis=1,
)
df_proteomic_evidence.columns = [
    "proteomic evidence (#studies)",
    "proteomic evidence (pubmed)",
]
df_proteomic_evidence

### Create evidence tables

In [None]:
# Options read by the evidence-table cells below
compare = True  # compare each rebuilt table with the previously saved one and plot the result
overwrite = True  # write the rebuilt evidence and reference tables to disk
compare_figsize = (5, 5)  # figsize passed to plt.subplots for the comparison figure

#### Metabolites

In [None]:
attribute_type = "metabolites"


def _strip_compartment_suffix(metabolite_id, compartment):
    """Return ``metabolite_id`` without its trailing ``_<compartment>`` suffix.

    Only a suffix at the very end of the identifier is removed; an identical
    substring elsewhere in the identifier is left untouched. The identifier is
    returned unchanged when it does not end with the suffix or when
    ``compartment`` is empty.
    """
    suffix = f"_{compartment}"
    if compartment and metabolite_id.endswith(suffix):
        return metabolite_id[: -len(suffix)]
    return metabolite_id


# Load the previously saved table so curated references and notes carry over
evidence_filepath = get_dirpath("curation") / f"{attribute_type}_evidence.tsv"
try:
    df_previous = pd.read_csv(evidence_filepath, sep="\t", index_col=None)
except FileNotFoundError:
    # No table saved yet: include the curated columns that are read back below
    # so those lookups yield empty values instead of raising KeyError
    df_previous = pd.DataFrame([], columns=[attribute_type, "references", "notes"])
else:
    df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
previous_by_id = df_previous.set_index(attribute_type)

# One row per metabolite in the model
df_evidence = pd.DataFrame.from_dict(
    {
        idx: {
            "metabolites": metabolite.id,
            "metabolite": _strip_compartment_suffix(
                metabolite.id, metabolite.compartment
            ),
            "name": metabolite.name,
            "formula": metabolite.formula,
            "charge": metabolite.charge,
            "compartment": metabolite.compartment,
            "metabolomic evidence (#studies)": "",  # TODO
            "metabolomic evidence (pubmed)": "",  # TODO
        }
        for idx, metabolite in enumerate(model.metabolites)
    },
    orient="index",
)
df_evidence = df_evidence.set_index(attribute_type)

# Carry over curated references (re-sorted within each entry) and notes,
# aligned on the metabolite identifier
df_evidence["references"] = previous_by_id["references"]
df_evidence["references"] = (
    df_evidence["references"]
    .fillna("")
    .apply(lambda x: build_string(sorted(split_string(x))))
)
df_evidence["notes"] = previous_by_id["notes"]

df_evidence = df_evidence.reset_index(drop=False).sort_values(
    by=["metabolomic evidence (#studies)", attribute_type, "compartment"],
    ascending=[False, True, True],
)
df_evidence = df_evidence.reset_index(drop=True)
# Represent both missing values and empty strings as pd.NA
df_evidence = df_evidence.replace(float("nan"), pd.NA).replace("", pd.NA)
df_references = df_evidence[[attribute_type, "references"]].copy()

if compare:
    compare_on_index = [attribute_type]
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index), df_evidence.set_index(compare_on_index)
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)


if overwrite:
    df_evidence.to_csv(evidence_filepath, sep="\t", index=False)
    df_references.to_csv(
        get_dirpath("annotation") / f"{attribute_type}_References.tsv",
        sep="\t",
        index=False,
    )

df_evidence

#### Genes

In [None]:
attribute_type = "genes"

# Load the previously saved table so curated references and notes carry over
evidence_filepath = get_dirpath("curation") / f"{attribute_type}_evidence.tsv"
try:
    df_previous = pd.read_csv(evidence_filepath, sep="\t", index_col=None)
except FileNotFoundError:
    # No table saved yet: include the curated columns that are read back below
    # so those lookups yield empty values instead of raising KeyError
    df_previous = pd.DataFrame([], columns=[attribute_type, "references", "notes"])
else:
    df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
previous_by_id = df_previous.set_index(attribute_type)

# Start from the annotation table of the model genes
annotation_cols = ["uniprot", "ncbigene", "hgnc.symbol"]
model_genes = getattr(model, attribute_type)
df_evidence = get_annotation_df(model_genes, ["name"] + annotation_cols)
df_evidence = df_evidence.rename({"id": attribute_type}, axis=1).set_index(
    attribute_type
)
df_evidence["name"] = model_genes.list_attr("name")

# Attach the proteomic evidence columns by matching the "uniprot" column to the
# index of the proteomic table; genes without a match count as zero studies
df_evidence = df_evidence.merge(
    df_proteomic_evidence,
    left_on="uniprot",
    right_index=True,
    how="left",
)
df_evidence["proteomic evidence (#studies)"] = (
    df_evidence["proteomic evidence (#studies)"].fillna(0).astype(int)
)

# Carry over curated references (re-sorted within each entry) and notes,
# aligned on the gene identifier
df_evidence["references"] = previous_by_id["references"]
df_evidence["references"] = (
    df_evidence["references"]
    .fillna("")
    .apply(lambda x: build_string(sorted(split_string(x))))
)
df_evidence["notes"] = previous_by_id["notes"]

# Most-studied genes first, ties ordered by identifier
df_evidence = df_evidence.reset_index(drop=False).sort_values(
    by=["proteomic evidence (#studies)", attribute_type], ascending=[False, True]
)
df_evidence = df_evidence.reset_index(drop=True)
# Represent both missing values and empty strings as pd.NA
df_evidence = df_evidence.replace(float("nan"), pd.NA).replace("", pd.NA)
df_references = df_evidence[[attribute_type, "references"]].copy()

if compare:
    compare_on_index = [attribute_type]
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index), df_evidence.set_index(compare_on_index)
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)


if overwrite:
    df_evidence.to_csv(evidence_filepath, sep="\t", index=False)
    df_references.to_csv(
        get_dirpath("annotation") / f"{attribute_type}_References.tsv",
        sep="\t",
        index=False,
    )

# Gene identifier -> number of studies; read when building the reactions table
model_proteomics_count = df_evidence.set_index("genes")[
    "proteomic evidence (#studies)"
].to_dict()
df_evidence

#### Reactions

In [None]:
attribute_type = "reactions"


def _format_gene_evidence(reaction):
    """Return ``gene(count)`` entries for the reaction's genes with a nonzero count.

    Counts are looked up in ``model_proteomics_count``; entries are sorted and
    joined with ``build_string``.
    """
    return build_string(
        sorted(
            f"{gene.id}({model_proteomics_count[gene.id]})"
            for gene in reaction.genes
            if model_proteomics_count[gene.id] != 0
        )
    )


# Load the previously saved table so curated values carry over
evidence_filepath = get_dirpath("curation") / f"{attribute_type}_evidence.tsv"
try:
    df_previous = pd.read_csv(evidence_filepath, sep="\t", index_col=None)
except FileNotFoundError:
    # No table saved yet: include the curated columns that are read back below
    # so those lookups yield empty values instead of raising KeyError
    df_previous = pd.DataFrame(
        [], columns=[attribute_type, "spontaneous", "references", "notes"]
    )
else:
    df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
previous_by_id = df_previous.set_index(attribute_type)

# One row per reaction, skipping those in the "Pseudoreactions" subsystem
df_evidence = pd.DataFrame.from_dict(
    {
        idx: {
            "reactions": reaction.id,
            "name": reaction.name,
            "reaction": reaction.reaction,
            "gene reaction rule": reaction.gene_reaction_rule,
            "subsystem": reaction.subsystem,
            "spontaneous": int(float(reaction.annotation.get("spontaneous", 0))),
            "proteomic evidence (#studies)": _format_gene_evidence(reaction),
        }
        for idx, reaction in enumerate(
            model.reactions.query(lambda x: x.subsystem not in {"Pseudoreactions"})
        )
    },
    orient="index",
)
df_evidence = df_evidence.set_index(attribute_type)
# Preserve reactions marked spontaneous based on reference data
# NOTE(review): this replaces the annotation-derived value for every row, so a
# reaction absent from the previous table ends up with a missing value here;
# confirm that is intended
df_evidence["spontaneous"] = previous_by_id["spontaneous"]
# Carry over curated references (re-sorted within each entry) and notes,
# aligned on the reaction identifier
df_evidence["references"] = previous_by_id["references"]
df_evidence["references"] = (
    df_evidence["references"]
    .fillna("")
    .apply(lambda x: build_string(sorted(split_string(x))))
)
df_evidence["notes"] = previous_by_id["notes"]
df_evidence = df_evidence.reset_index(drop=False).sort_values(
    by=["subsystem", "reactions"], ascending=[True, True]
)
df_evidence = df_evidence.reset_index(drop=True)
# Represent both missing values and empty strings as pd.NA
df_evidence = df_evidence.replace(float("nan"), pd.NA).replace("", pd.NA)
df_references = df_evidence[[attribute_type, "references"]].copy()

if compare:
    compare_on_index = [attribute_type]
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index), df_evidence.set_index(compare_on_index)
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)


if overwrite:
    df_evidence.to_csv(evidence_filepath, sep="\t", index=False)
    df_references.to_csv(
        get_dirpath("annotation") / f"{attribute_type}_References.tsv",
        sep="\t",
        index=False,
    )

df_evidence

In [None]:
def _ids_where(mask):
    """Return the set of identifiers from the rows of df_evidence selected by mask."""
    return set(df_evidence[mask].loc[:, attribute_type].unique())


# Partition the rows by presence of literature references and of proteomic evidence
without_references = _ids_where(df_evidence["references"].isna())
with_references = _ids_where(df_evidence["references"].notna())
without_proteomics = _ids_where(df_evidence["proteomic evidence (#studies)"].isna())
with_proteomics = _ids_where(df_evidence["proteomic evidence (#studies)"].notna())
flagged_spontaneous = _ids_where(df_evidence["spontaneous"] == 1)

# Every row must fall into at least one of the groups above
all_grouped = (
    without_references
    | with_references
    | without_proteomics
    | with_proteomics
    | flagged_spontaneous
)
assert len(df_evidence) == len(all_grouped)

# Rows with neither references nor proteomic evidence, split by the spontaneous flag
no_evidence = without_references & without_proteomics
print(f"Known spontaneous: {len(no_evidence & flagged_spontaneous)}")
print(f"No clear evidence: {len(no_evidence - flagged_spontaneous)}")
print(f"Only omic evidence: {len(without_references & with_proteomics)}")
print(f"Only lit. evidence: {len(with_references & without_proteomics)}")
print(f"Both evidence types: {len(with_references & with_proteomics)}")
print(f"Total {attribute_type}: {len(df_evidence)}")