# RBC-GEM 1.1.0 Updates
## Setup
### Import packages

In [None]:
import pandas as pd
from cobra.core import Gene, Group, Metabolite, Reaction
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    build_string,
    get_dirpath,
    read_cobra_model,
    split_string,
    write_cobra_model,
)
from rbc_gem_utils.qc import standardardize_metabolite_formulas

### Define configuration
#### COBRA Configuration

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# COBRA configuration object imported from rbc_gem_utils.
COBRA_CONFIGURATION

## Load RBC-GEM model
### Version: 1.0.0

In [None]:
# Read the model from "<GEM_NAME>.yml" inside the "model" directory.
model_dirpath = get_dirpath("model")
model_filepath = model_dirpath / f"{GEM_NAME}.yml"
model = read_cobra_model(filename=model_filepath)
# Trailing bare expression: displays the loaded model in the notebook.
model

### Deprecate identifiers

In [None]:
# Identifier replacements to apply, one lookup per model attribute type.
# Each lookup maps the identifier being retired to the identifier replacing it.
id_mapping_dicts = {
    "metabolites": {},
    "genes": {},
    "reactions": {"ARD": "ARDFE2"},
}

### Update model based on curation and proteomic evidence
* Add metabolites first, then genes, and lastly reactions
* Add metabolites with mass and charge balanced formulas (ChemAxon)

In [None]:
# Files on disk are only (re)written by the cells below when this is True.
overwrite = False
# Tables gathered per attribute type ("metabolites", "genes", "reactions"):
# the curated updates that were applied, and the merged evidence tables.
dataframes_updated, dataframes_evidence = {}, {}

In [None]:
# Retire outdated reaction identifiers: rename the matching reactions in the
# model and merge the new mappings into the table of deprecated identifiers.
attribute_type = "reactions"
id_mapping_dict = id_mapping_dicts[attribute_type]
deprecated_filepath = (
    get_dirpath("deprecatedIdentifiers")
    / f"{attribute_type}_deprecatedIdentifiers.tsv"
)

# One row per mapping; after the column reversal the replacement ID ("rxns")
# comes first and the retired ID ("rxnRetired") second.
id_mapping_df = pd.DataFrame.from_dict(id_mapping_dict, orient="index")
id_mapping_df = id_mapping_df.reset_index(drop=False)
id_mapping_df.columns = ["rxnRetired", "rxns"]
id_mapping_df = id_mapping_df.loc[:, id_mapping_df.columns[::-1]]

previous_id_mapping_df = pd.read_csv(
    deprecated_filepath,
    sep="\t",
    index_col=None,
)

for idx, row in id_mapping_df.iterrows():
    new_id, retiring = row[["rxns", "rxnRetired"]]
    # Rows where the ID being retired now was itself the replacement earlier.
    previously_retired = previous_id_mapping_df[
        previous_id_mapping_df["rxns"] == retiring
    ]
    retired_set_of_ids = {retiring}
    if not previously_retired.empty:
        # Carry over every ID previously retired in favor of `retiring`.
        # `.item()` requires exactly one matching row and raises otherwise.
        retired_set_of_ids.update(
            previously_retired["rxnRetired"].apply(split_string).item()
        )
        # Pulling the ID out of retirement
        if new_id in retired_set_of_ids:
            retired_set_of_ids.remove(new_id)
        retired_set_of_ids.add(retiring)
    id_mapping_df.loc[idx, "rxnRetired"] = build_string(retired_set_of_ids, sep=";")

# Rename the reactions in the model; IDs absent from the model are reported.
for old, new in id_mapping_dict.items():
    try:
        reaction = model.reactions.get_by_id(old)
    except KeyError:
        print(f"Could not map {old} to new ID.")
    else:
        reaction.id = new

model.repair()

id_mapping_df = pd.concat((id_mapping_df, previous_id_mapping_df), axis=0)
id_mapping_df = id_mapping_df.drop_duplicates().reset_index(drop=True)
if overwrite:
    # The table is read back with index_col=None, so the positional index must
    # not be written; otherwise every round trip adds an "Unnamed: 0" column.
    id_mapping_df.to_csv(
        deprecated_filepath,
        sep="\t",
        index=False,
    )
id_mapping_df

#### Metabolites

In [None]:
# Apply the curated metabolite updates to the model, then merge them into the
# cumulative metabolite evidence table.
attribute_type = "metabolites"

df_updated = pd.read_csv(
    get_dirpath("curated") / f"{attribute_type}_updated_1.1.0.tsv",
    sep="\t",
    index_col=0,
    dtype=str,
).fillna("")

attr_cols = ["metabolites", "name", "formula", "charge", "compartment"]
for idx, row in df_updated.iterrows():
    mid, name, formula, charge, compartment = row[attr_cols]
    if not model.metabolites.has_id(mid):
        # Add metabolite to model
        model.add_metabolites([Metabolite(mid)])

    metabolite = model.metabolites.get_by_id(mid)
    metabolite.name = name
    metabolite.formula = formula
    # Values are read as strings; a blank charge raises ValueError here.
    metabolite.charge = int(charge)
    metabolite.compartment = compartment

    # Every column that is not a core attribute is a note or an annotation.
    annotations_dict = row[~row.index.isin(attr_cols)].to_dict()
    notes = annotations_dict.pop("notes")
    if notes:
        metabolite.notes.update({"notes": notes})
    # Skip blank values and the "metabolomic*" columns.
    annotations_dict = {
        k: v
        for k, v in annotations_dict.items()
        if not k.startswith("metabolomic") and v
    }
    metabolite.annotation.update(annotations_dict)

# Pass every formula in the model through the QC helper and write the results back.
met_formulas = standardardize_metabolite_formulas(
    dict(zip(model.metabolites.list_attr("id"), model.metabolites.list_attr("formula")))
)
for mid, formula in met_formulas.items():
    model.metabolites.get_by_id(mid).formula = formula

dataframes_updated[attribute_type] = df_updated

try:
    df_previous_evidence = pd.read_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence table yet: start from an empty one.
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)


# Updated rows come first, so they win over older rows for the same ID.
df_evidence = pd.concat((df_updated, df_previous_evidence), axis=0)
df_evidence = df_evidence.drop_duplicates(subset=[attribute_type]).reset_index(
    drop=True
)
if overwrite:
    # The table is read back with index_col=None, so the positional index must
    # not be written; otherwise every round trip adds an "Unnamed: 0" column.
    df_evidence.to_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index=False,
    )

dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

#### Genes

In [None]:
# Apply the curated gene updates to the model, then merge them into the
# cumulative gene evidence table.
attribute_type = "genes"

df_updated = (
    pd.read_csv(
        get_dirpath("curated") / f"{attribute_type}_updated_1.1.0.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
    .fillna("")
    .reset_index(drop=True)
)

attr_cols = ["genes"]
for idx, row in df_updated.iterrows():
    # Between these three annotation fields, should be likely that the rest can get extracted from UniProt.
    gid = row[attr_cols[0]]
    if not model.genes.has_id(gid):
        gene = Gene(gid)
        model.genes.extend([gene])
    gene = model.genes.get_by_id(gid)
    # Every column other than the gene ID is an annotation; skip blank values
    # and the "proteomic*" columns.
    annotations_dict = row[~row.index.isin(attr_cols)].to_dict()
    annotations_dict = {
        k: v for k, v in annotations_dict.items() if not k.startswith("proteomic") and v
    }
    gene.annotation.update(annotations_dict)

dataframes_updated[attribute_type] = df_updated

try:
    df_previous_evidence = pd.read_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence table yet: start from an empty one.
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)


# Updated rows come first, so they win over older rows for the same ID.
df_evidence = pd.concat((df_updated, df_previous_evidence), axis=0).reset_index(
    drop=True
)
df_evidence = df_evidence.drop_duplicates(subset=[attribute_type]).reset_index(
    drop=True
)

if overwrite:
    # The table is read back with index_col=None, so the positional index must
    # not be written; otherwise every round trip adds an "Unnamed: 0" column.
    df_evidence.to_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index=False,
    )

dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

#### Reactions

In [None]:
# Apply the curated reaction updates to the model, then merge them into the
# cumulative reaction evidence table.
attribute_type = "reactions"

df_updated = pd.read_csv(
    get_dirpath("curated") / f"{attribute_type}_updated_1.1.0.tsv",
    sep="\t",
    index_col=None,
    dtype=str,
).fillna("")

attr_cols = ["reactions", "name", "reaction", "gene reaction rule", "subsystem"]
for idx, row in df_updated.iterrows():
    rid, name, reaction_str, gpr, subsystem = row[attr_cols]
    if not model.reactions.has_id(rid):
        # Add reaction to model
        model.add_reactions([Reaction(rid)])

    reaction = model.reactions.get_by_id(rid)
    reaction.build_reaction_from_string(reaction_str)
    reaction.name = name
    reaction.gene_reaction_rule = gpr
    reaction.subsystem = subsystem

    # Every column that is not a core attribute is a note or an annotation.
    annotations_dict = row[~row.index.isin(attr_cols)].to_dict()
    notes = annotations_dict.pop("notes")
    if notes:
        reaction.notes.update({"notes": notes})
    # Skip blank values and every "proteomic*" column, which includes
    # "proteomic evidence (#studies)".
    annotations_dict = {
        k: v for k, v in annotations_dict.items() if not k.startswith("proteomic") and v
    }
    reaction.annotation.update(annotations_dict)


dataframes_updated[attribute_type] = df_updated

try:
    df_previous_evidence = pd.read_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence table yet: start from an empty one.
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

# Updated rows come first, so they win over older rows for the same ID.
df_evidence = pd.concat((df_updated, df_previous_evidence), axis=0)
# Rewrite retired reaction IDs to their replacements before de-duplicating.
df_evidence[attribute_type] = df_evidence[attribute_type].replace(
    id_mapping_dicts[attribute_type]
)
df_evidence = df_evidence.drop_duplicates(subset=[attribute_type]).reset_index(
    drop=True
)
if overwrite:
    # The table is read back with index_col=None, so the positional index must
    # not be written; otherwise every round trip adds an "Unnamed: 0" column.
    df_evidence.to_csv(
        get_dirpath("curated") / f"{attribute_type}_evidence.tsv",
        sep="\t",
        index=False,
    )

dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

### Ensure all metabolites, genes, and reactions exist

In [None]:
# Compare the identifiers in the model against those in the evidence tables.
# Each "missing_*" set holds IDs present on only one of the two sides.
model_metabolite_ids = set(model.metabolites.list_attr("id"))
evidence_metabolite_ids = dataframes_evidence["metabolites"]["metabolites"].values
missing_metabolites = model_metabolite_ids.symmetric_difference(evidence_metabolite_ids)

model_gene_ids = set(model.genes.list_attr("id"))
evidence_gene_ids = dataframes_evidence["genes"]["genes"].values
missing_genes = model_gene_ids.symmetric_difference(evidence_gene_ids)

# Reactions in the "Pseudoreactions" subsystem are left out of the comparison.
non_pseudo_reactions = model.reactions.query(
    lambda rxn: rxn.subsystem != "Pseudoreactions"
)
model_reaction_ids = set(non_pseudo_reactions.list_attr("id"))
evidence_reaction_ids = dataframes_evidence["reactions"]["reactions"].values
missing_reactions = model_reaction_ids.symmetric_difference(evidence_reaction_ids)

# Counts are printed in the order: genes, metabolites, reactions.
for mismatched_ids in (missing_genes, missing_metabolites, missing_reactions):
    print(len(mismatched_ids))

In [None]:
# Bare tuple expression: displays the three sets of mismatched IDs in the notebook.
missing_genes, missing_metabolites, missing_reactions

In [None]:
# Print each mismatched gene ID on its own line.
for gene_id in missing_genes:
    print(gene_id)

#### Add exchanges

In [None]:
# Explicitly requested boundary reactions: metabolite ID -> boundary type.
boundaries = {
    # "adprbp_c": "demand"
}
for met_id, boundary_type in boundaries.items():
    boundary_met = model.metabolites.get_by_id(met_id)
    try:
        model.add_boundary(boundary_met, type=boundary_type)
    except ValueError:
        # Best effort: presumably raised when the boundary already exists.
        pass


# Try to add an exchange for every metabolite in compartment "e".
extracellular_mets = model.metabolites.query(lambda m: m.compartment == "e")
for extracellular_met in extracellular_mets:
    try:
        model.add_boundary(extracellular_met, type="exchange")
    except ValueError:
        # Best effort: presumably raised when the exchange already exists.
        pass

# File every boundary reaction under the "Pseudoreactions" subsystem.
for boundary_reaction in model.boundary:
    boundary_reaction.subsystem = "Pseudoreactions"

#### Reset subsystem groups

In [None]:
# Rebuild the subsystem groups from scratch: drop every existing group, then
# create one group per distinct reaction subsystem.
# Pass a copy of the group list: removing groups while iterating over
# `model.groups` itself can skip entries and leave stale groups behind.
model.remove_groups(list(model.groups))
for subsystem in sorted(set(model.reactions.list_attr("subsystem"))):
    reaction_list = model.reactions.query(lambda x: x.subsystem == subsystem)
    if subsystem not in model.groups:
        group = Group(id=subsystem, name=subsystem, members=reaction_list)
        model.add_groups([group])
    else:
        # Fallback if a group with this ID still exists: extend it in place.
        group = model.groups.get_by_id(subsystem)
        group.add_members(reaction_list)

### Check mass balancing

In [None]:
# Report every non-boundary reaction whose mass-balance check returns a
# non-empty (truthy) result, followed by that result and a blank line.
for reaction in model.reactions:
    if not reaction.boundary:
        imbalance = reaction.check_mass_balance()
        if imbalance:
            print(reaction)
            print(imbalance)
            print()

### Remove duplicated reaction

In [None]:
# "GLCt1" is treated as a duplicate of "GLC_Dt": copy its annotations onto the
# reaction that is kept, then remove the duplicate from the model.
duplicated_reaction, original_reaction = (
    model.reactions.get_by_id(reaction_id) for reaction_id in ("GLCt1", "GLC_Dt")
)
# Fix annotations
original_reaction.annotation.update(duplicated_reaction.annotation)

model.remove_reactions([duplicated_reaction])

### Export model

In [None]:
# Export the updated model as "<model.id>.xml" and "<model.id>.json".
# When `overwrite` is False, `use_temp="interim"` is passed to get_dirpath
# (presumably redirecting output away from the main model directory - confirm).
export_dirpath = get_dirpath("model", use_temp="interim" if not overwrite else None)
for extension in ("xml", "json"):
    # The model to write must be passed explicitly along with the target path.
    write_cobra_model(model, filename=export_dirpath / f"{model.id}.{extension}")
model