# Model Content Comparison 
The purpose of this notebook is to compare the current version of the RBC-GEM to earlier versions.
## Setup
### Import packages

In [None]:
from collections import defaultdict
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from cobra.manipulation import rename_genes
from rbc_gem_utils import GEM_NAME, GEM_URL, MODEL_PATH, read_cobra_model, show_versions
from rbc_gem_utils.visualization import cmap_map

# Report the package versions from the last time this notebook ran and worked.
show_versions()

# Use Arial as the font family for every figure in the notebook.
plt.rcParams.update({"font.family": "Arial"})

## Load RBC-GEM models
### Current model

In [None]:
# Project directories, resolved to absolute paths from the working directory.
data_path, models_path, figures_path = (
    Path(dirname).resolve() for dirname in ("data", "models", "figures")
)

# Version of the current model. It is also stored as `current_version`
# because `version` is reassigned when the earlier model is loaded.
version = "1.2.0"
current_version = version

# Figure export options.
imagetype, transparent, save_figures = "png", True, True

# Read the current model; the file name is GEM_NAME with "-" replaced by "_".
ftype = "xml"
current_model = read_cobra_model(
    models_path / f"{GEM_NAME.replace('-', '_')}.{ftype}"
)

# Sort the main model containers in place.
for container in (
    current_model.metabolites,
    current_model.genes,
    current_model.reactions,
):
    container.sort()

current_model

### Earlier models

In [None]:
# Load RBC-GEM 0.2.0 for comparison against the current model.
# Set `download = True` to fetch the model file first; otherwise the file must
# already be present in `models_path`.
download = False
version = "0.2.0"
ftype = "xml"

filepath = models_path / f"{GEM_NAME.replace('-', '_')}_{version}.{ftype}"

if download:
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        f"{GEM_URL}/{version}/{MODEL_PATH}/{GEM_NAME}.{ftype}", timeout=60
    )
    response.raise_for_status()

    # Always write the raw bytes. This keeps the saved file identical to the
    # download regardless of the encoding reported by the server or the
    # platform's default text encoding, so no binary/text check is needed.
    filepath.write_bytes(response.content)

# Fail early with an actionable message instead of an error from the loader.
if not filepath.exists():
    raise FileNotFoundError(
        f"Model file not found: {filepath}. "
        f"Set `download = True` to fetch version {version}."
    )

prev_model_020 = read_cobra_model(filepath)
prev_model_020.metabolites.sort()
prev_model_020.genes.sort()
prev_model_020.reactions.sort()
prev_model_020.groups.sort()

# For comparison: give the old "Transport reactions" group the ID
# "Transport, extracellular".
group = prev_model_020.groups.get_by_id("Transport reactions")
group.id = "Transport, extracellular"


# For comparison rename genes to their HGNC symbol and remove splice isoforms.
# Genes lacking an "hgnc.symbol" annotation keep their current ID rather than
# being mapped to None.
rename_genes(
    prev_model_020,
    {g.id: g.annotation.get("hgnc.symbol") or g.id for g in prev_model_020.genes},
)
prev_model_020.repair()
prev_model_020

In [None]:
# Number of reactions in the 0.2.0 model outside the "Pseudoreactions" subsystem.
sum(
    1
    for reaction in prev_model_020.reactions
    if reaction.subsystem != "Pseudoreactions"
)

In [None]:
# Models under comparison: the earlier version first, the current version second.
model_list = [prev_model_020, current_model]
# Per-model results, filled in by the comparison cells below.
model_data = defaultdict(dict)

# Figure export options for the comparison figures. These replace the values
# assigned in the cell that loads the current model.
save_figures, transparent, imagetype = True, False, "svg"

### Comparison of subsystems

In [None]:
# Read the subsystem table as strings, replace missing values with "", and
# group "Metabolism of other amino acids" with amino acids rather than
# treating it as "other".
df_pathways = (
    pd.read_csv(f"{data_path}/subsystems.tsv", sep="\t", index_col=None, dtype=str)
    .fillna("")
    .assign(
        category=lambda table: table["category"].replace(
            "Metabolism of other amino acids", "Amino acid metabolism"
        )
    )
)
df_pathways

In [None]:
# Bar geometry, font size, and the colormap sampling bounds.
barsize = 0.8
fontsize = 12
cmax = 0.8
cmin = 0.3

# Categories left out of the figure entirely.
categories_to_exclude = {"Pseudoreactions", "Model total"}

# Main figure categories and the colormap assigned to each one. Any category
# that is neither listed here nor excluded is mapped to "Other".
categories_to_keep = {
    "Amino acid metabolism": mpl.cm.spring,
    "Carbohydrate metabolism": mpl.cm.Greens,
    "Lipid metabolism": mpl.cm.Blues,
    "Metabolism of cofactors and vitamins": mpl.cm.summer,
    "Nucleotide metabolism": mpl.cm.winter,
    "Reactive species": mpl.cm.Reds,
    "Transport reactions": mpl.cm.Purples,
    "Other": mpl.cm.gray_r,
}

# One-letter abbreviation for each main category.
use_abbrevs = True
abbrevs = {
    "Amino acid metabolism": "A",
    "Carbohydrate metabolism": "C",
    "Lipid metabolism": "L",
    "Metabolism of cofactors and vitamins": "V",
    "Nucleotide metabolism": "N",
    "Reactive species": "R",
    "Transport reactions": "T",
    "Other": "O",
}

# One color per category, sampled at `cmax` from the category's colormap after
# passing it through `cmap_map`: unchanged (x * 1) for the normal palette and
# with x / 2 + 0.5 applied for the light palette.
colormaps_normal = {
    category: cmap_map(lambda x: x * 1, cmap)(cmax)
    for category, cmap in categories_to_keep.items()
}
colormaps_light = {
    category: cmap_map(lambda x: x / 2 + 0.5, cmap)(cmax)
    for category, cmap in categories_to_keep.items()
}

In [None]:
# General comparison of model content: for reactions, metabolites, and genes,
# count what was removed from, kept from, and added to the earlier model, then
# draw the counts as horizontal stacked bars ("Removed" extends left of zero).
categories = ["Removed", "Unchanged", "Added"]
colors_dict = dict(
    zip(categories, ["xkcd:light blue", "xkcd:purple", "xkcd:light red"])
)
attribute_types = ["reactions", "metabolites", "genes"]
edgecolor = "black"
linewidth = 0.5


def _summarize_model_content(model):
    """Return a text summary of the gene, metabolite, and reaction counts.

    Metabolites are counted once per ID after stripping a trailing
    ``_<compartment>``. Only the suffix is stripped, so the same character
    sequence elsewhere in an ID is left intact. Reactions whose subsystem is
    "Pseudoreactions" are not counted.
    """
    unique_metabolites = set()
    for metabolite in model.metabolites:
        suffix = f"_{metabolite.compartment}"
        met_id = metabolite.id
        if met_id.endswith(suffix):
            met_id = met_id[: -len(suffix)]
        unique_metabolites.add(met_id)
    n_reactions = sum(
        1 for reaction in model.reactions if reaction.subsystem != "Pseudoreactions"
    )
    return (
        f"Number of unique genes in {model.id}:\t\t{len(model.genes)}\n"
        f"Number of unique metabolites in {model.id}:\t{len(unique_metabolites)}\n"
        f"Number of biochemical reactions in {model.id}:\t{n_reactions}\n"
    )


data = defaultdict(dict)
for attribute_type in attribute_types:
    # IDs removed since the earlier model; no file means nothing was removed.
    try:
        df_removed = pd.read_csv(
            f"{data_path}/{attribute_type}_removed.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
    except FileNotFoundError:
        df_removed = pd.DataFrame([], columns=[attribute_type], dtype=str)
    if attribute_type == "reactions":
        # Boundary reactions are left out of the reaction comparison.
        obj_lists = [
            getattr(model, attribute_type).query(lambda x: not x.boundary)
            for model in model_list
        ]
    else:
        obj_lists = [getattr(model, attribute_type) for model in model_list]

    removed = df_removed[attribute_type].values
    # Build the lookup sets once instead of once per queried object.
    removed_ids = set(removed)
    shared = obj_lists[0].query(lambda x: x.id not in removed_ids)
    shared_ids = set(shared.list_attr("id"))
    new = obj_lists[1].query(lambda x: x.id not in shared_ids)
    data[attribute_type].update(
        dict(zip(categories, list(map(len, [removed, shared, new]))))
    )


df_data = pd.DataFrame.from_dict(data, orient="index")
df_data.index = df_data.index.str.capitalize()

fig, ax_general = plt.subplots(figsize=(6, 2))
total = df_data.cumsum(axis=1)
# Shift every segment left by the "Removed" count so "Unchanged" starts at 0.
offsets = df_data[categories[0]].values
labels = [s.capitalize() for s in attribute_types]
tick_pos = [0.05, 0.11, 0.17]
for name, column in df_data.items():
    ax_general.barh(
        tick_pos,
        column.values,
        height=0.045,
        left=total[name] - column.values - offsets,
        color=colors_dict[name],
        edgecolor=edgecolor,
        linewidth=linewidth,
    )
# The tick positions and labels do not depend on the bar segment; set them once.
ax_general.set_yticks(tick_pos, labels=labels)

# Remove spines
ax_general.spines["right"].set_visible(False)
ax_general.spines["top"].set_visible(False)
ax_general.spines["left"].set_visible(False)
ax_general.axvline(0, linestyle=":", color="black", alpha=0.8, linewidth=linewidth)
xticks = list(np.linspace(0, 2400, 7))
ax_general.set_xticks(xticks)
# Show absolute values so counts left of zero are not labelled as negative.
ax_general.xaxis.set_major_formatter(lambda x, pos: int(abs(x)))
ax_general.yaxis.set_tick_params(left=False, labelsize="xx-large")
ax_general.xaxis.set_tick_params(labelsize="x-large")
legend = ax_general.legend(
    handles=[
        mpl.patches.Patch(
            label=label, facecolor=color, edgecolor=edgecolor, linewidth=linewidth
        )
        for label, color in colors_dict.items()
    ],
    frameon=False,
    edgecolor="black",
    ncol=3,
    loc="lower center",
    bbox_to_anchor=(0.4, 1, 0, 0),
    fontsize="xx-large",
    handlelength=1,
)

fig.tight_layout()

# Text summary of both models, earlier version first.
for model in (prev_model_020, current_model):
    print(_summarize_model_content(model))

if save_figures:
    # Make sure the output directory exists before writing the figure.
    figures_path.mkdir(parents=True, exist_ok=True)
    fig.savefig(
        figures_path / f"Fig1_Panel_GeneralComparision.{imagetype}",
        transparent=transparent,
        dpi=None if imagetype != "png" else 600,
    )

In [None]:
# For each model, count the reactions, metabolites, and genes that belong to
# each subsystem category, then join the two per-model tables side by side.

# NOTE(review): `attribute_type` is not assigned in this cell. It is the loop
# variable left over from the general-comparison cell (last value "genes").
# The key is built once here so the dependence is explicit; the value is
# unchanged so any other cell reading the same key keeps working.
category_key = f"{attribute_type}-cat"

# Subsystem names per category. This is identical for every model, so it is
# computed once rather than inside the loop.
df_cat_subsystems = df_pathways.groupby("category")["name"].agg(list)

for model in model_list:
    data = defaultdict(dict)
    all_reactions = set()
    for category, subsystem_list in df_cat_subsystems.items():
        # Categories that are neither kept nor excluded are pooled as "Other".
        if category not in categories_to_keep and category not in categories_to_exclude:
            category = "Other"

        # Several source categories can map to "Other"; create its entry once.
        if category not in data:
            data[category] = {"Reactions": set(), "Metabolites": set(), "Genes": set()}
        # Keep only the subsystems that exist as groups in this model.
        subsystem_list = [x for x in subsystem_list if x in model.groups]
        reactions = [
            reaction
            for group in model.groups.get_by_any(subsystem_list)
            for reaction in group.members
        ]
        for reaction in reactions:
            all_reactions.add(reaction)
            data[category]["Reactions"].add(reaction)
            data[category]["Metabolites"].update(reaction.metabolites)
            data[category]["Genes"].update(reaction.genes)

    data["Model total"] = {
        "Reactions": model.reactions,
        "Metabolites": model.metabolites,
        "Genes": model.genes,
    }
    # Rows are categories, columns are the number of objects of each type.
    df_data = pd.DataFrame(
        {
            key: {k: len(x) for k, x in value_dict.items()}
            for key, value_dict in data.items()
        }
    ).T.sort_index()
    # Drop the rows listed in `categories_to_exclude`.
    df_data = df_data.loc[df_data.index.difference(categories_to_exclude)]

    model_data[model.id][category_key] = df_data
    print(model.id, "\n", df_data, "\n")

# Join on category; each column name is suffixed with its model's ID.
df_comparison = pd.merge(
    model_data[model_list[0].id][category_key],
    model_data[model_list[1].id][category_key],
    left_index=True,
    right_index=True,
    suffixes=(f"_{model_list[0].id}", f"_{model_list[1].id}"),
)
df_comparison