# Annotate RBC-GEM
Read the annotation tables and use them to annotate the model.
## Setup
### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    GEM_NAME,
    compare_tables,
    explode_column,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    visualize_comparison,
    write_cobra_model,
)
from rbc_gem_utils.annotation import set_sbo_default_annotations

# Display the package versions from the last time this notebook ran successfully
show_versions()

## Load RBC-GEM model

In [None]:
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.xml"

# Read the same file twice to get two separate model objects:
# `old_model` is kept as the reference, `new_model` receives the annotations.
old_model = read_cobra_model(filename=model_filepath)
new_model = read_cobra_model(filename=model_filepath)

## Set annotations on model

In [None]:
# True: replace each object's annotation dict; False: merge into the existing dict
reset_and_replace_annotation_dict = True
reset_sbo = True  # Using this option resets the model SBO terms to default values
annotation_types = ["reactions", "metabolites", "genes"]

for annotation_type in annotation_types:
    # Annotation table for this object type, with every value read as text
    df_annotation = pd.read_csv(
        model_dirpath / f"{annotation_type}.tsv", sep="\t", index_col=None, dtype=str
    )
    # Table of deprecated identifiers, used to resolve IDs missing from the model
    previous_id_mapping_df = pd.read_csv(
        get_dirpath("deprecatedIdentifiers")
        / f"{annotation_type}_deprecatedIdentifiers.tsv",
        sep="\t",
        index_col=None,
    )
    # First column whose name ends with "Retired" holds the retired identifiers
    retired_candidates = [
        c for c in previous_id_mapping_df.columns if c.endswith("Retired")
    ]
    retired_col = retired_candidates[0]

    model_objects = getattr(new_model, annotation_type)
    for idx, row in df_annotation.set_index(annotation_type).iterrows():
        try:
            item = model_objects.get_by_id(idx)
        except KeyError:
            # Check if ID was retired for another name
            is_retired_match = previous_id_mapping_df[retired_col].apply(
                lambda x: idx in split_string(x)
            )
            df_retired = previous_id_mapping_df[is_retired_match]
            if df_retired.empty:
                print(f"Not found, was this removed? {annotation_type}, {idx}")
                continue
            # The first column of the matching row supplies the replacement ID
            current_id = df_retired.iloc[:, 0].item()
            print(f"{idx} was renamed to {current_id}, making ID correction")
            idx = current_id
            item = model_objects.get_by_id(idx)

        # Split each non-missing cell; single-element results are unwrapped
        values = {}
        for key, raw_value in row.dropna().to_dict().items():
            parts = split_string(raw_value)
            values[key] = parts[0] if len(parts) == 1 else parts

        if not reset_and_replace_annotation_dict:
            item.annotation.update(values)
        else:
            item.annotation = values

if reset_sbo:
    set_sbo_default_annotations(new_model, annotation_types, verbose=False)

### Compare before overwriting

In [None]:
def _annotation_tables(model, annotation_types):
    """Build one annotation table per object type for ``model``.

    Parameters
    ----------
    model
        Model whose ``reactions``/``metabolites``/``genes`` style attributes
        are read via ``getattr(model, annotation_type)``.
    annotation_types
        Iterable of attribute names to extract tables for.

    Returns
    -------
    dict
        Maps each annotation type to a DataFrame whose "id" column has been
        renamed to the annotation type and whose rows are sorted by it.
    """
    tables = {}
    for annotation_type in annotation_types:
        model_objects = getattr(model, annotation_type)
        # Union of the annotation keys used by any object of this type
        all_annotation_keys = set()
        for annotation in model_objects.list_attr("annotation"):
            all_annotation_keys.update(annotation)
        # `get_annotation_df` is agnostic to the object type, so swap its "id"
        # column name for the `annotation_type`
        df = get_annotation_df(model_objects, sorted(all_annotation_keys)).rename(
            {"id": annotation_type}, axis=1
        )
        tables[annotation_type] = df.sort_values(annotation_type)
    return tables


# Same extraction for both models so the tables can be compared type by type
df_old_model = _annotation_tables(old_model, annotation_types)
df_new_model = _annotation_tables(new_model, annotation_types)

In [None]:
# Compare reaction annotations of the old model against the new model
annotation_type = "reactions"
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

old_table = df_old_model[annotation_type].set_index(annotation_type)
new_table = df_new_model[annotation_type].set_index(annotation_type)
df_comparision = compare_tables(old_table, new_table)
visualize_comparison(df_comparision)

In [None]:
# Compare metabolite annotations of the old model against the new model
annotation_type = "metabolites"
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

old_table = df_old_model[annotation_type].set_index(annotation_type)
new_table = df_new_model[annotation_type].set_index(annotation_type)
df_comparision = compare_tables(old_table, new_table)
visualize_comparison(df_comparision)

In [None]:
# Compare gene annotations of the old model against the new model
# (this cell uses a taller figure than the reaction/metabolite cells)
annotation_type = "genes"
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 20))

old_table = df_old_model[annotation_type].set_index(annotation_type)
new_table = df_new_model[annotation_type].set_index(annotation_type)
df_comparision = compare_tables(old_table, new_table)
visualize_comparison(df_comparision)

## Export model for all filetypes

In [None]:
# Write the annotated model once per file extension
export_filetypes = ("xml", "mat", "json", "yml")
for ftype in export_filetypes:
    export_filepath = model_dirpath / f"{GEM_NAME}.{ftype}"
    write_cobra_model(new_model, export_filepath)

## Extract annotations from model

In [None]:
# Settings read by the per-type extraction cells below
# Figure size for the reactions and metabolites comparison plots
# (the genes cell passes its own size)
compare_figsize = (10, 5)
# Compare the extracted table against the existing `{annotation_type}.tsv`
compare = True
# Print the number of unique values found in each annotation column
display_nunique = True
# Write the extracted table to `{annotation_type}.tsv`
overwrite = True

### Reactions

In [None]:
# Extract the reaction annotations from the annotated model into a table,
# optionally compare it with the table on disk, and optionally overwrite it.
annotation_type = "reactions"
model_objects = getattr(new_model, annotation_type)

# Union of the annotation keys used by any object of this type
all_annotation_keys = set()
for annotation in model_objects.list_attr("annotation"):
    all_annotation_keys.update(annotation)

# `get_annotation_df` is agnostic to the object type, so swap its "id" column
# name for the `annotation_type`
df_annotations = get_annotation_df(model_objects, sorted(all_annotation_keys)).rename(
    {"id": annotation_type}, axis=1
)

# Represent missing values (NaN and empty strings) uniformly as pd.NA
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
if compare:
    compare_on_index = [annotation_type]
    try:
        # Read every column as text, matching how this table is read when the
        # model is annotated; with inferred dtypes, numeric-looking identifiers
        # would be parsed as numbers and could be reported as changed.
        df_previous = pd.read_csv(
            model_dirpath / f"{annotation_type}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No table on disk yet: compare against an empty table
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count unique values per column after splitting ";"-separated entries
    for col in df_annotations.columns:
        df = explode_column(df_annotations, name=col, sep=";")
        df = df[col].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

if overwrite:
    df_annotations.to_csv(
        model_dirpath / f"{annotation_type}.tsv", sep="\t", index=False
    )

df_annotations

### Metabolites

In [None]:
# Extract the metabolite annotations from the annotated model into a table,
# optionally compare it with the table on disk, and optionally overwrite it.
annotation_type = "metabolites"
model_objects = getattr(new_model, annotation_type)

# Union of the annotation keys used by any object of this type
all_annotation_keys = set()
for annotation in model_objects.list_attr("annotation"):
    all_annotation_keys.update(annotation)

# `get_annotation_df` is agnostic to the object type, so swap its "id" column
# name for the `annotation_type`
df_annotations = get_annotation_df(model_objects, sorted(all_annotation_keys)).rename(
    {"id": annotation_type}, axis=1
)

# Represent missing values (NaN and empty strings) uniformly as pd.NA
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
if compare:
    compare_on_index = [annotation_type]
    try:
        # Read every column as text, matching how this table is read when the
        # model is annotated; with inferred dtypes, numeric-looking identifiers
        # would be parsed as numbers and could be reported as changed.
        df_previous = pd.read_csv(
            model_dirpath / f"{annotation_type}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No table on disk yet: compare against an empty table
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    fig, ax = plt.subplots(1, 1, figsize=compare_figsize)
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count unique values per column after splitting ";"-separated entries
    for col in df_annotations.columns:
        df = explode_column(df_annotations, name=col, sep=";")
        df = df[col].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

if overwrite:
    df_annotations.to_csv(
        model_dirpath / f"{annotation_type}.tsv", sep="\t", index=False
    )

df_annotations

### Genes

In [None]:
# Extract the gene annotations from the annotated model into a table,
# optionally compare it with the table on disk, and optionally overwrite it.
annotation_type = "genes"
model_objects = getattr(new_model, annotation_type)

# Union of the annotation keys used by any object of this type
all_annotation_keys = set()
for annotation in model_objects.list_attr("annotation"):
    all_annotation_keys.update(annotation)

# `get_annotation_df` is agnostic to the object type, so swap its "id" column
# name for the `annotation_type`
df_annotations = get_annotation_df(model_objects, sorted(all_annotation_keys)).rename(
    {"id": annotation_type}, axis=1
)

# Represent missing values (NaN and empty strings) uniformly as pd.NA
df_annotations = df_annotations.replace(float("nan"), pd.NA).replace("", pd.NA)
if compare:
    compare_on_index = [annotation_type]
    try:
        # Read every column as text, matching how this table is read when the
        # model is annotated; with inferred dtypes, numeric-looking identifiers
        # would be parsed as numbers and could be reported as changed.
        df_previous = pd.read_csv(
            model_dirpath / f"{annotation_type}.tsv",
            sep="\t",
            index_col=None,
            dtype=str,
        )
        df_previous = df_previous.replace(float("nan"), pd.NA).replace("", pd.NA)
    except FileNotFoundError:
        # No table on disk yet: compare against an empty table
        df_previous = pd.DataFrame([], columns=compare_on_index)
    df_comparision = compare_tables(
        df_previous.set_index(compare_on_index),
        df_annotations.set_index(compare_on_index),
    )

    # Taller figure than `compare_figsize` is used for the gene comparison
    fig, ax = plt.subplots(1, 1, figsize=(10, 20))
    ax.yaxis.set_tick_params(labelsize=8)
    ax = visualize_comparison(df_comparision)

if display_nunique:
    # Count unique values per column after splitting ";"-separated entries
    for col in df_annotations.columns:
        df = explode_column(df_annotations, name=col, sep=";")
        df = df[col].drop_duplicates()
        print(f"{df.name}: {df.nunique()}")

if overwrite:
    df_annotations.to_csv(
        model_dirpath / f"{annotation_type}.tsv", sep="\t", index=False
    )

df_annotations