# Reconcile annotations for the RBC-GEM

## Setup
### Import packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rbc_gem_utils import (
    ANNOTATION_PATH,
    COBRA_CONFIGURATION,
    INTERIM_PATH,
    MODEL_PATH,
    ROOT_PATH,
    compare_tables,
    read_rbc_model,
    visualize_comparison,
)

### Define configuration
#### COBRA Configuration

In [None]:
# Evaluate the COBRA configuration object imported from rbc_gem_utils as the cell's
# last expression so the notebook shows the settings in effect.
COBRA_CONFIGURATION

## Load RBC-GEM model
### Current Version: 0.2.0

In [None]:
# Read the RBC-GEM model through the project helper, requesting the XML file type.
model = read_rbc_model(filetype="xml")

# Call ``sort()`` on each component list of the model, in the same order as before:
# reactions, then genes, then metabolites.
for component in ("reactions", "genes", "metabolites"):
    getattr(model, component).sort()

# Last expression of the cell: show the model summary.
model

## Create annotation tables

In [None]:
# Directory holding the per-database annotation tables (<type>_<database>.tsv).
annotation_dirpath = f"{ROOT_PATH}{ANNOTATION_PATH}"

# compare:   when True, the save cells diff the new table against the saved one and plot it.
# overwrite: when True, the save cells write to MODEL_PATH; otherwise to INTERIM_PATH.
compare = True
overwrite = True

# The three kinds of model objects that get an annotation table.
annotation_types = ("reactions", "metabolites", "genes")

# Final reconciled table per annotation type (None until its section has been run).
model_annotation_dfs = dict.fromkeys(annotation_types)

# Per annotation type: {annotation key: [tagged columns still awaiting reconciliation]}.
to_double_check = {kind: {} for kind in annotation_types}

### Reactions

In [None]:
annotation_type = "reactions"
databases = ["MetAtlas"]

# Annotations currently stored with the model. Every column is tagged "_MODEL" so it
# can be told apart from the database-derived columns merged in below.
df_annotations = pd.read_csv(
    f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv",
    sep="\t",
    index_col=0,
    dtype=str,
)
df_annotations = df_annotations.set_index(annotation_type).add_suffix("_MODEL")

# Attach each database table, tagging its columns with "_<database>". The left merge
# keeps exactly the rows of the model table.
for db_tag in databases:
    df = pd.read_csv(
        f"{annotation_dirpath}/{annotation_type}_{db_tag}.tsv",
        sep="\t",
        index_col=0,
        dtype=str,
    ).set_index(annotation_type)
    df = df.add_suffix(f"_{db_tag}")
    df_annotations = df_annotations.merge(
        df,
        left_index=True,
        right_index=True,
        how="left",
    )

# Keep database tags on duplicate columns for reconciliation, remove for unique columns.
# Columns are grouped by their untagged name in a single pass, in column order, so the
# printed summary is the same on every run (iterating a set of names is not).
column_groups = {}
for name in df_annotations.columns:
    column_groups.setdefault(name.rsplit("_", 1)[0], []).append(name)
unique_columns = set(column_groups)

# Annotation keys provided by a single source: {tagged column name: untagged name}.
unique_mapping = {
    tagged[0]: unique for unique, tagged in column_groups.items() if len(tagged) == 1
}
# Annotation keys provided by several sources: these must be reconciled by hand.
to_double_check[annotation_type] = {
    unique: tagged for unique, tagged in column_groups.items() if len(tagged) > 1
}
print(f"Annotation columns to reconcile: {to_double_check[annotation_type]}")
df_annotations

#### Reconcile columns

In [None]:
# Reconcile every annotation key that is provided by more than one source.
# The keys are processed in a fixed order because each reconciled column is appended
# to the end of the table, so this order is also the column order of the saved file.
# A KeyError below means the key was not found among the columns to reconcile.
columns_to_check = to_double_check[annotation_type]

# rhea: prefer MetAtlas. The model value is only used as a fallback for non-boundary
# reactions; boundary reactions keep whatever MetAtlas provides (possibly nothing).
key = "rhea"
is_boundary = pd.Series(
    [model.reactions.get_by_id(idx).boundary for idx in df_annotations.index],
    index=df_annotations.index,
    dtype=bool,
)
df_column = df_annotations[f"{key}_MetAtlas"].fillna(
    df_annotations[f"{key}_MODEL"].where(~is_boundary)
)
df_annotations[key] = df_column
df_annotations = df_annotations.drop(columns_to_check.pop(key), axis=1)

# Prefer MetAtlas; preserve the old model ID wherever MetAtlas has no value so that
# no existing ID is lost.
for key in [
    "metanetx.reaction",
    "metatlas",
    "kegg.reaction",
    "spontaneous",
    "tcdb",
    "vmhreaction",
]:
    df_column = df_annotations[f"{key}_MetAtlas"].fillna(df_annotations[f"{key}_MODEL"])
    df_annotations[key] = df_column
    df_annotations = df_annotations.drop(columns_to_check.pop(key), axis=1)

# bigg.reaction: the model value is used as is.
key = "bigg.reaction"
df_column = df_annotations[f"{key}_MODEL"]
df_annotations[key] = df_column
df_annotations = df_annotations.drop(columns_to_check.pop(key), axis=1)

# reactome: take the model value, but only keep human annotations (IDs starting with
# "R-HSA"). Rows without a value are skipped by dropna() and are missing again once the
# column is assigned back, because the assignment aligns on the index.
key = "reactome"
df_column = (
    df_annotations[f"{key}_MODEL"]
    .dropna()
    .apply(
        lambda items: ";".join([x for x in items.split(";") if x.startswith("R-HSA")])
    )
)
df_annotations[key] = df_column
df_annotations = df_annotations.drop(columns_to_check.pop(key), axis=1)

# Overwrite: strip the source tag from single-source columns, discard any column
# literally named "REMOVE", and turn the index back into a regular column.
df_annotations = df_annotations.rename(unique_mapping, axis=1)
df_annotations = df_annotations.drop(columns="REMOVE", errors="ignore")
df_annotations = df_annotations.reset_index(drop=False)
model_annotation_dfs[annotation_type] = df_annotations
df_annotations

#### Save annotations

In [None]:
# Refuse to save while any multi-source column is still waiting to be reconciled.
assert not to_double_check[
    annotation_type
], f"There are still columns with data that need reconcilliation: {to_double_check[annotation_type]}"
df_annotations = model_annotation_dfs[annotation_type]

if compare:
    # Load the table currently stored with the model; fall back to an empty table
    # (only the ID column) when no file has been saved yet.
    previous_path = f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv"
    try:
        df_previous = pd.read_csv(previous_path, sep="\t", index_col=0, dtype=str)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=[annotation_type])

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    # Both tables are indexed by object ID before being compared.
    df_comparision = compare_tables(
        df_previous.set_index(annotation_type),
        df_annotations.set_index(annotation_type),
    )
    # NOTE(review): the axes created above are not passed to visualize_comparison;
    # confirm whether the helper draws on the current axes or creates its own.
    ax = visualize_comparison(df_comparision)


# Replace the model's table when overwriting is enabled; otherwise write to the
# interim directory.
output_dirpath = MODEL_PATH if overwrite else INTERIM_PATH
df_annotations.to_csv(f"{ROOT_PATH}{output_dirpath}/{annotation_type}.tsv", sep="\t")
df_annotations

### Metabolites

In [None]:
annotation_type = "metabolites"
databases = ["MetAtlas"]

# Annotations currently stored with the model. Every column is tagged "_MODEL" so it
# can be told apart from the database-derived columns merged in below.
df_annotations = pd.read_csv(
    f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv",
    sep="\t",
    index_col=0,
    dtype=str,
)
df_annotations = df_annotations.set_index(annotation_type).add_suffix("_MODEL")

# Attach each database table, tagging its columns with "_<database>". The left merge
# keeps exactly the rows of the model table.
for db_tag in databases:
    df = pd.read_csv(
        f"{annotation_dirpath}/{annotation_type}_{db_tag}.tsv",
        sep="\t",
        index_col=0,
        dtype=str,
    ).set_index(annotation_type)
    df = df.add_suffix(f"_{db_tag}")
    df_annotations = df_annotations.merge(
        df,
        left_index=True,
        right_index=True,
        how="left",
    )

# Keep database tags on duplicate columns for reconciliation, remove for unique columns.
# Columns are grouped by their untagged name in a single pass, in column order, so the
# printed summary is the same on every run (iterating a set of names is not).
column_groups = {}
for name in df_annotations.columns:
    column_groups.setdefault(name.rsplit("_", 1)[0], []).append(name)
unique_columns = set(column_groups)

# Annotation keys provided by a single source: {tagged column name: untagged name}.
unique_mapping = {
    tagged[0]: unique for unique, tagged in column_groups.items() if len(tagged) == 1
}
# Annotation keys provided by several sources: these must be reconciled by hand.
to_double_check[annotation_type] = {
    unique: tagged for unique, tagged in column_groups.items() if len(tagged) > 1
}
print(f"Annotation columns to reconcile: {to_double_check[annotation_type]}")
df_annotations

#### Reconcile columns

In [None]:
# Reconcile every annotation key that is provided by more than one source.
# Each entry is (annotation key, preferred source tag, fallback source tag or None).
# The entries are applied in order; because every reconciled column is appended to the
# end of the table, this order is also the column order of the saved file.
reconciliation_plan = [
    ("metanetx.chemical", "MetAtlas", "MODEL"),
    ("kegg.compound", "MetAtlas", "MODEL"),
    ("hmdb", "MetAtlas", "MODEL"),
    ("chebi", "MetAtlas", "MODEL"),
    ("bigg.metabolite", "MODEL", None),
    ("lipidmaps", "MetAtlas", "MODEL"),
    ("metatlas", "MetAtlas", "MODEL"),
    ("vmhmetabolite", "MetAtlas", "MODEL"),
    ("pubchem.compound", "MetAtlas", "MODEL"),
]
columns_to_check = to_double_check[annotation_type]

for key, preferred, fallback in reconciliation_plan:
    df_column = df_annotations[f"{key}_{preferred}"]
    if fallback is not None:
        # Preserve the old ID wherever the preferred source has no value so that no
        # existing ID is lost.
        df_column = df_column.fillna(df_annotations[f"{key}_{fallback}"])
    df_annotations[key] = df_column
    # A KeyError here means the key was not among the columns to reconcile.
    df_annotations = df_annotations.drop(columns_to_check.pop(key), axis=1)

# Overwrite: strip the source tag from single-source columns, discard any column
# literally named "REMOVE", and turn the index back into a regular column.
df_annotations = df_annotations.rename(unique_mapping, axis=1)
df_annotations = df_annotations.drop(columns="REMOVE", errors="ignore")
df_annotations = df_annotations.reset_index(drop=False)
model_annotation_dfs[annotation_type] = df_annotations
df_annotations

#### Save annotations

In [None]:
# Refuse to save while any multi-source column is still waiting to be reconciled.
assert not to_double_check[
    annotation_type
], f"There are still columns with data that need reconcilliation: {to_double_check[annotation_type]}"
df_annotations = model_annotation_dfs[annotation_type]

if compare:
    # Load the table currently stored with the model; fall back to an empty table
    # (only the ID column) when no file has been saved yet.
    previous_path = f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv"
    try:
        df_previous = pd.read_csv(previous_path, sep="\t", index_col=0, dtype=str)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=[annotation_type])

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    # Both tables are indexed by object ID before being compared.
    df_comparision = compare_tables(
        df_previous.set_index(annotation_type),
        df_annotations.set_index(annotation_type),
    )
    # NOTE(review): the axes created above are not passed to visualize_comparison;
    # confirm whether the helper draws on the current axes or creates its own.
    ax = visualize_comparison(df_comparision)


# Replace the model's table when overwriting is enabled; otherwise write to the
# interim directory.
output_dirpath = MODEL_PATH if overwrite else INTERIM_PATH
df_annotations.to_csv(f"{ROOT_PATH}{output_dirpath}/{annotation_type}.tsv", sep="\t")
df_annotations

### Genes

In [None]:
annotation_type = "genes"
databases = ["UniProt", "MIM", "MetAtlas"]
# NOTE(review): `objects` is not used anywhere in the visible notebook; kept in case a
# cell outside this view relies on it.
objects = model.reactions

# Annotations currently stored with the model. The "omim" column is renamed to "mim",
# then every column is tagged "_MODEL" so it can be told apart from the
# database-derived columns merged in below.
df_annotations = pd.read_csv(
    f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv",
    sep="\t",
    index_col=0,
    dtype=str,
)
df_annotations = (
    df_annotations.set_index(annotation_type)
    .rename({"omim": "mim"}, axis=1)
    .add_suffix("_MODEL")
)

# Attach each database table, tagging its columns with "_<database>". Unlike the
# reaction and metabolite tables, the merge is an outer one, so genes that only appear
# in a database table are kept as well.
for db_tag in databases:
    df = (
        pd.read_csv(
            f"{annotation_dirpath}/{annotation_type}_{db_tag}.tsv",
            sep="\t",
            index_col=0,
            dtype=str,
        )
        .set_index(annotation_type)
        .rename({"omim": "mim"}, axis=1)
        .add_suffix(f"_{db_tag}")
    )
    df_annotations = df_annotations.merge(
        df,
        left_index=True,
        right_index=True,
        how="outer",
    )

# Keep database tags on duplicate columns for reconciliation, remove for unique columns.
# Columns are grouped by their untagged name in a single pass, in column order, so the
# printed summary is the same on every run (iterating a set of names is not).
column_groups = {}
for name in df_annotations.columns:
    column_groups.setdefault(name.rsplit("_", 1)[0], []).append(name)
unique_columns = set(column_groups)

# Annotation keys provided by a single source: {tagged column name: untagged name}.
unique_mapping = {
    tagged[0]: unique for unique, tagged in column_groups.items() if len(tagged) == 1
}
# Annotation keys provided by several sources: these must be reconciled by hand.
to_double_check[annotation_type] = {
    unique: tagged for unique, tagged in column_groups.items() if len(tagged) > 1
}
print(f"Annotation columns to reconcile: {to_double_check[annotation_type]}")
df_annotations

#### Reconcile columns

In [None]:
# For genes every multi-source key is settled by taking one source's column as is.
# Maps annotation key -> tag of the source whose column is kept. The keys are handled
# in this order, which is also the order in which the reconciled columns are appended.
source_by_key = {
    "hgnc.symbol": "UniProt",
    "uniprot": "UniProt",
    "ccds": "UniProt",
    "uniprot.isoform": "UniProt",
    "ncbigene": "UniProt",
    "refseq": "UniProt",
    "drugbank": "UniProt",
    "mim": "MIM",
    "ensembl": "MetAtlas",
}

for key, source in source_by_key.items():
    df_column = df_annotations[f"{key}_{source}"].rename(key)
    df_annotations[key] = df_column
    # Drop all tagged variants of this key and mark it as reconciled.
    tagged_columns = to_double_check[annotation_type][key]
    df_annotations = df_annotations.drop(tagged_columns, axis=1)
    to_double_check[annotation_type].pop(key)

# Overwrite: strip the source tag from single-source columns, discard any column
# literally named "REMOVE", and turn the index back into a regular column.
df_annotations = df_annotations.rename(unique_mapping, axis=1)
columns_to_remove = [x for x in df_annotations.columns if x == "REMOVE"]
df_annotations = df_annotations.drop(columns_to_remove, axis=1)
df_annotations = df_annotations.reset_index(drop=False)
model_annotation_dfs[annotation_type] = df_annotations
df_annotations

#### Save annotations

In [None]:
# Refuse to save while any multi-source column is still waiting to be reconciled.
assert not to_double_check[
    annotation_type
], f"There are still columns with data that need reconcilliation: {to_double_check[annotation_type]}"
df_annotations = model_annotation_dfs[annotation_type]

if compare:
    # Load the table currently stored with the model; fall back to an empty table
    # (only the ID column) when no file has been saved yet.
    previous_path = f"{ROOT_PATH}{MODEL_PATH}/{annotation_type}.tsv"
    try:
        df_previous = pd.read_csv(previous_path, sep="\t", index_col=0, dtype=str)
    except FileNotFoundError:
        df_previous = pd.DataFrame([], columns=[annotation_type])

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    # Both tables are indexed by object ID before being compared.
    df_comparision = compare_tables(
        df_previous.set_index(annotation_type),
        df_annotations.set_index(annotation_type),
    )
    # NOTE(review): the axes created above are not passed to visualize_comparison;
    # confirm whether the helper draws on the current axes or creates its own.
    ax = visualize_comparison(df_comparision)


# Replace the model's table when overwriting is enabled; otherwise write to the
# interim directory.
output_dirpath = MODEL_PATH if overwrite else INTERIM_PATH
df_annotations.to_csv(f"{ROOT_PATH}{output_dirpath}/{annotation_type}.tsv", sep="\t")
df_annotations