# RBC-GEM 1.2.0 Updates
## Setup
### Import packages

In [1]:
import pandas as pd

from cobra.core import Reaction, Gene, Metabolite, Group
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    ROOT_PATH,
    CURATION_PATH,
    INTERIM_PATH,
    read_rbc_model,
    write_rbc_model,
    build_string,
    split_string,
)
from rbc_gem_utils.qc import (
    standardardize_metabolite_formulas,
    reset_subsystem_groups,
    reset_reaction_bounds,
)
from rbc_gem_utils.annotation import set_sbo_default_annotations

### Define configuration
#### COBRA Configuration

In [2]:
# Display the active COBRApy configuration (solver, tolerance, default bounds, ...).
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Load RBC-GEM model
### Version: 1.1.0

In [3]:
# Load the starting model (version 1.1.0 per the heading above) in the "yml" format.
read_model = read_rbc_model(filetype="yml")

Set parameter Username
Academic license - for non-commercial use only - expires 2024-11-28


In [4]:
# Edit a copy so the model exactly as loaded remains available in `read_model`.
model = read_model.copy()

Read LP format model from file /var/folders/5t/hk8m3g6d1jn25x5rssjgsrmm0000gn/T/tmpqfcbcxhi.lp
Reading time = 0.02 seconds
: 2008 rows, 5952 columns, 23214 nonzeros


### Deprecate identifiers

In [5]:
# Identifier renames for this release, grouped by model attribute type and written
# as {old ID: new ID}. Only reactions are renamed; the metabolite and gene maps are
# empty.
id_mapping_dicts = {
    "metabolites": {},
    "genes": {},
    "reactions": {
        "NO22Natex": "NO2_2Natex",
        "CYSL": "CYSLNET",
        "CYSTCYSL": "CYSTL1",
        # NOTE(review): this entry maps the ID to itself, so no reaction is renamed,
        # yet "HSERL1" still ends up in the retired-ID table — confirm the intended
        # old ID.
        "HSERL1": "HSERL1",
        "SELCYSTL": "SELCYSTL1",
        "SELMETHL": "SELMETHL1",
    },
}

In [6]:
attribute_type = "reactions"
id_mapping_dict = id_mapping_dicts[attribute_type]
# Single definition of the table location; it is read below and rewritten at the end.
deprecated_ids_path = (
    f"{ROOT_PATH}/data/deprecatedIdentifiers/{attribute_type}_deprecatedIdentifiers.tsv"
)

# One row per rename made in this notebook: "rxns" holds the new (current) ID and
# "rxnRetired" the ID it replaces. Building the frame column-wise keeps both columns
# even when the mapping is empty.
id_mapping_df = pd.DataFrame(
    {
        "rxns": list(id_mapping_dict.values()),
        "rxnRetired": list(id_mapping_dict.keys()),
    }
)

# Identifiers already retired before this notebook ran.
previous_id_mapping_df = pd.read_csv(deprecated_ids_path, sep="\t", index_col=0)

for idx, row in id_mapping_df.iterrows():
    new_id, retiring = row[["rxns", "rxnRetired"]]
    # Rows in which the ID being retired now was itself the replacement for older IDs.
    previously_retired = previous_id_mapping_df[
        previous_id_mapping_df["rxns"] == retiring
    ]
    retired_set_of_ids = {retiring}
    if not previously_retired.empty:
        # Get all previously retired IDs. Looping over the matches (rather than
        # calling `.item()`) also works when more than one row matches.
        for retired_string in previously_retired["rxnRetired"]:
            retired_set_of_ids.update(split_string(retired_string))
        # Pulling the ID out of retirement
        retired_set_of_ids.discard(new_id)
        retired_set_of_ids.add(retiring)
    id_mapping_df.loc[idx, "rxnRetired"] = build_string(retired_set_of_ids, sep=";")

# Apply the renames to the model; IDs that are not in the model are only reported.
for old, new in id_mapping_dict.items():
    try:
        reaction = model.reactions.get_by_id(old)
    except KeyError:
        print(f"Could not map {old} to new ID.")
    else:
        reaction.id = new

# Rebuild the model's internal ID indexes after changing reaction IDs.
model.repair()

# New rows go first, followed by the existing table; exact duplicate rows are dropped.
id_mapping_df = pd.concat((id_mapping_df, previous_id_mapping_df), axis=0)
id_mapping_df = id_mapping_df.drop_duplicates().reset_index(drop=True)
# The index is written as the first column, matching `index_col=0` used when reading.
id_mapping_df.to_csv(deprecated_ids_path, sep="\t")
id_mapping_df

Unnamed: 0,rxns,rxnRetired
0,NO2_2Natex,NO22Natex
1,CYSLNET,CYSL
2,CYSTL1,CYSTCYSL
3,HSERL1,HSERL1
4,SELCYSTL1,SELCYSTL
...,...,...
186,DM_adprbp_c,SK_adprbp_c
187,DM_mi1345p_c,SK_mi1345p_c
188,DM_mi134p_c,SK_mi134p_c
189,DM_mi145p_c,SK_mi145p_c


### Update model based on curation and proteomic evidence
* Add metabolites first, then genes and lastly, reactions
* Add metabolites with mass and charge balanced formulas (ChemAxon)

In [7]:
# Per-attribute tables collected by the cells below: the curated updates as read, and
# the merged evidence tables as written.
dataframes_updated = {}
dataframes_evidence = {}
# True writes evidence tables under CURATION_PATH and exports the model without an
# explicit directory; False redirects both to INTERIM_PATH.
overwrite = True

#### Metabolites

In [8]:
attribute_type = "metabolites"

# Curated metabolite table for this release; missing cells are replaced with "".
df_updated = pd.read_csv(
    f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.2.0.tsv",
    sep="\t",
    index_col=None,
).fillna("")

# Columns copied onto Metabolite attributes; all remaining columns are treated as
# notes or annotations.
attr_cols = ["metabolites", "name", "formula", "charge", "compartment"]
for row_label, record in df_updated.iterrows():
    met_id, met_name, met_formula, met_charge, met_compartment = record[attr_cols]
    # Add the metabolite to the model if it is not there yet
    if not model.metabolites.has_id(met_id):
        model.add_metabolites([Metabolite(met_id)])

    metabolite = model.metabolites.get_by_id(met_id)
    metabolite.name = met_name
    metabolite.formula = met_formula
    metabolite.charge = int(met_charge)
    metabolite.compartment = met_compartment

    extra_fields = record.drop(labels=attr_cols).to_dict()
    # Free-text notes are stored in `notes`, not among the annotations.
    free_text = extra_fields.pop("notes")
    if free_text:
        metabolite.notes["notes"] = free_text
    # Keep non-empty fields only; metabolomic evidence columns are not annotations.
    for key, value in extra_fields.items():
        if not key.startswith("metabolomic") and value:
            metabolite.annotation[key] = value

# Run every formula in the model (not only the updated ones) through the project
# standardization helper and write the results back.
met_formulas = standardardize_metabolite_formulas(
    {met.id: met.formula for met in model.metabolites}
)
for met_id, standardized_formula in met_formulas.items():
    model.metabolites.get_by_id(met_id).formula = standardized_formula


dataframes_updated[attribute_type] = df_updated
# Evidence recorded so far; start from an empty table when no file exists yet.
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

# Updated rows come first, so they win when duplicate IDs are dropped after the
# retired IDs have been replaced by their current ones.
df_evidence = (
    pd.concat((df_updated, df_previous_evidence), axis=0)
    .assign(
        **{
            attribute_type: lambda merged: merged[attribute_type].replace(
                id_mapping_dicts[attribute_type]
            )
        }
    )
    .drop_duplicates(subset=[attribute_type])
    .reset_index(drop=True)
)
# `overwrite` selects the curation directory, otherwise the interim directory.
evidence_dir = CURATION_PATH if overwrite else INTERIM_PATH
df_evidence.to_csv(
    f"{ROOT_PATH}{evidence_dir}/{attribute_type}_evidence.tsv",
    sep="\t",
    index=False,
)
dataframes_evidence[attribute_type] = df_evidence
df_updated

Unnamed: 0,metabolites,name,formula,charge,compartment,metabolomic evidence (#studies),metabolomic evidence (pubmed),references,notes
0,2mop_c,2-methyl-3-oxopropanoate,C4H5O3,-1,c,,,,
1,34dhmald_c,"3,4-dihydroxymandelaldehyde",C8H8O4,0,c,,,,
2,34dhoxmand_c,"3,4-dihydroxymandelate",C8H7O5,-1,c,,,,
3,3mlda_c,Methylimidazoleacetate,C6H7N2O2,-1,c,,,,
4,3mlda_e,Methylimidazoleacetate,C6H7N2O2,-1,e,,,,
5,3mldz_c,Methylimidazole-acetaldehyde,C6H8N2O,0,c,,,,
6,3mox4hoxm_c,vanillylmandelate,C9H9O5,-1,c,,,,
7,3mox4hoxm_e,vanillylmandelate,C9H9O5,-1,e,,,,
8,4hoxpacd_c,4-hydroxyphenylacetaldehyde,C8H8O2,0,c,,,,
9,4hphac_c,4-hydroxyphenylacetate,C8H7O3,-1,c,,,,


#### Genes

In [9]:
attribute_type = "genes"

# Curated gene table for this release; missing cells are replaced with "".
df_updated = pd.read_csv(
    f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.2.0.tsv",
    sep="\t",
    index_col=None,
).fillna("")

# Only the ID column maps to a Gene attribute; the rest are notes or annotations.
attr_cols = ["genes"]
for idx, row in df_updated.iterrows():
    # The table carries three identifier annotations per gene (uniprot, ncbigene,
    # hgnc.symbol); presumably the remaining annotations can be retrieved from UniProt.
    gid = row[attr_cols[0]]
    if not model.genes.has_id(gid):
        gene = Gene(gid)
        model.genes.extend([gene])
    gene = model.genes.get_by_id(gid)
    annotations_dict = row[~row.index.isin(attr_cols)].to_dict()
    # Store free-text notes in `gene.notes`, as the metabolite and reaction cells do,
    # instead of letting them leak into the annotations.
    notes = annotations_dict.pop("notes", "")
    if notes:
        gene.notes.update({"notes": notes})
    # Keep non-empty fields only; proteomic evidence columns are not annotations.
    annotations_dict = {
        k: v for k, v in annotations_dict.items() if not k.startswith("proteomic") and v
    }
    gene.annotation.update(annotations_dict)


dataframes_updated[attribute_type] = df_updated
# Evidence recorded so far; start from an empty table when no file exists yet.
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)


# Updated rows come first, so they win when duplicate IDs are dropped below.
df_evidence = pd.concat((df_updated, df_previous_evidence), axis=0)
# Replace retired IDs by their current ones before de-duplicating.
df_evidence[attribute_type] = df_evidence[attribute_type].replace(
    id_mapping_dicts[attribute_type]
)
df_evidence = df_evidence.drop_duplicates(subset=[attribute_type]).reset_index(
    drop=True
)
# `overwrite` selects the curation directory, otherwise the interim directory.
if overwrite:
    df_evidence.to_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_evidence.tsv",
        sep="\t",
        index=False,
    )
else:
    df_evidence.to_csv(
        f"{ROOT_PATH}{INTERIM_PATH}/{attribute_type}_evidence.tsv",
        sep="\t",
        index=False,
    )
dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

Unnamed: 0,genes,uniprot,ncbigene,hgnc.symbol,proteomic evidence (#studies),proteomic evidence (pubmed),references,notes
0,G6PD,P11413,2539,G6PD,17.0,pubmed:16861337;pubmed:18614565;pubmed:2455556...,pubmed:3126064;pubmed:5643703;pubmed:5666113;p...,
1,PGD,P52209,5226,PGD,17.0,pubmed:16861337;pubmed:18614565;pubmed:1977864...,pubmed:521257;pubmed:3932573;pubmed:3994686,
2,CPT1A,P50416,1374,CPT1A,14.0,pubmed:16861337;pubmed:24555563;pubmed:2607847...,pubmed:1618773;pubmed:2039446;pubmed:38513237,
3,ACSS2,Q9NR19,55902,ACSS2,2.0,pubmed:30327373;pubmed:38260479,pubmed:4379089;pubmed:6115779;pubmed:38513237,
4,SLC22A16,Q86VW1,85413,SLC22A16,0.0,,pubmed:35108516;pubmed:38513237,
...,...,...,...,...,...,...,...,...
91,GALK2,Q01415,2585,GALK2,,,,
92,CRYL1,Q9Y2S2,51084,CRYL1,,,,
93,CRPPA,A4D126,729920,CRPPA,,,pubmed:31375477,
94,TRIM58,Q8NG06,25893,TRIM58,,,pubmed:25241935,


#### Reactions
* Addition of hemoglobin glycation
* All intracellular reactions converted to use/consume NH4, which is dominant at pH 7.25. Transport and protonation reactions for NH3 kept.
* Add NEDD8 neddylation and fix ubiquitin reaction GPRs accordingly
* Reversibility updates for phosphoribosyltransferase reactions
* GPR updates according to Complex Portal and proteomic data
    * Add PRMT5 methylosome complex
    * Add PI3K class III complex proteins
    * Multiple Ubiquitination complexes
    * Multiple VCP complexes and other ATPases
* Add IDH2 and MDH2 to GPRs
* Add SLC22A4 and SLC22A16 GPRs

In [10]:
attribute_type = "reactions"

# Curated reaction table for this release; missing cells are replaced with "".
df_updated = pd.read_csv(
    f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.2.0.tsv",
    sep="\t",
    index_col=None,
).fillna("")

# Columns copied onto Reaction attributes; all remaining columns are treated as
# notes or annotations.
attr_cols = ["reactions", "name", "reaction", "gene reaction rule", "subsystem"]
for row_label, record in df_updated.iterrows():
    rxn_id, rxn_name, rxn_equation, rxn_gpr, rxn_subsystem = record[attr_cols]
    try:
        # Add the reaction to the model if it is not there yet
        if not model.reactions.has_id(rxn_id):
            model.add_reactions([Reaction(rxn_id)])
    except ValueError:
        # Identify the offending table row before re-raising.
        print(row_label, rxn_id, rxn_name)
        raise

    reaction = model.reactions.get_by_id(rxn_id)
    reaction.build_reaction_from_string(rxn_equation)
    reaction.name = rxn_name
    reaction.gene_reaction_rule = rxn_gpr
    reaction.subsystem = rxn_subsystem

    extra_fields = record.drop(labels=attr_cols).to_dict()
    # Free-text notes are stored in `notes`, not among the annotations.
    free_text = extra_fields.pop("notes")
    if free_text:
        reaction.notes["notes"] = free_text
    # Keep non-empty fields only; every "proteomic..." column (including the
    # "#studies" one) is excluded by the prefix test.
    for key, value in extra_fields.items():
        if not key.startswith("proteomic") and value:
            reaction.annotation[key] = value


dataframes_updated[attribute_type] = df_updated
# Evidence recorded so far; start from an empty table when no file exists yet.
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_evidence.tsv",
        sep="\t",
        index_col=None,
        dtype=str,
    )
except FileNotFoundError:
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

# Updated rows come first, so they win when duplicate IDs are dropped after the
# retired IDs have been replaced by their current ones.
df_evidence = (
    pd.concat((df_updated, df_previous_evidence), axis=0)
    .assign(
        **{
            attribute_type: lambda merged: merged[attribute_type].replace(
                id_mapping_dicts[attribute_type]
            )
        }
    )
    .drop_duplicates(subset=[attribute_type])
    .reset_index(drop=True)
)
# `overwrite` selects the curation directory, otherwise the interim directory.
evidence_dir = CURATION_PATH if overwrite else INTERIM_PATH
df_evidence.to_csv(
    f"{ROOT_PATH}{evidence_dir}/{attribute_type}_evidence.tsv",
    sep="\t",
    index=False,
)
dataframes_evidence[attribute_type] = df_evidence
df_updated

Unnamed: 0,reactions,name,reaction,gene reaction rule,subsystem,spontaneous,proteomic evidence (#studies),references,notes,metatlas
0,HBGLYC,Hemoglobin glycation,hb_c + glc__D_c --> h2o_c + hba1c_c,HBZ or HBG1 or HBE1 or HBG2 or HBB or HBM or H...,Hemoglobin binding and degradation,0,,pubmed:934240;pubmed:27708063;pubmed:33441732,,
1,NO2_2Natex,Transport of nitrite via sodium cotransport,2.0 na1_e + no2_e --> 2.0 na1_c + no2_c,SLC17A7,"Transport, extracellular",0,,pubmed:10820226;pubmed:10962014;pubmed:1267265...,,
2,NH4t,Transport of ammonium,nh4_c <=> nh4_e,AQP3 or AQP1 or (RHAG and RHCE and RHD) or (RH...,"Transport, extracellular",0,,doi:10.1134/S1990747816040097;pubmed:11861637;...,,
3,CSND,Cytosine deaminase,csn_c + h2o_c + h_c --> nh4_c + ura_c,,Pyrimidine metabolism,0,,,GAPFILLING,
4,CTPS1,CTP synthase (ammonia),atp_c + nh4_c + utp_c --> adp_c + ctp_c + 2 h_...,CTPS1 or CTPS2,Pyrimidine metabolism,0,CTPS1 (16);CTPS2 (12),pubmed:10064135,,
...,...,...,...,...,...,...,...,...,...,...
110,SELMETHL1,Selenomethionine methanethiol-lyase (deaminating),selmet__L_c --> 2abuten_c + methsel_c,CTH,Selenocompound metabolism,0,CTH (2),pubmed:24636780;pubmed:24887198;pubmed:2764614...,,
111,2AMACT,Nonenzymatic tautomerization of 2-Aminoprop-2-...,2amac_c <=> 2imppa_c,CTH,"Glycine, serine and threonine metabolism",1,,pubmed:27646145,Spontaneous according to KEGG,
112,2IMPPADA,2-iminopropanoate aminohydrolase,2imppa_c + h2o_c --> pyr_c + nh4_c,RIDA,"Glycine, serine and threonine metabolism",1,,pubmed:27646145,Spontaneous according to KEGG,
113,2ABUTENT,Nonenzymatic tautomerization of 2-aminobut-2-e...,2abuten_c <=> 2ibut_c,,"Glycine, serine and threonine metabolism",1,,pubmed:24636780;pubmed:27646145,Spontaneous according to KEGG,


### Ensure all metabolites, genes, and reactions exist

In [11]:
# IDs present in exactly one of the model and the evidence table. Each set is empty
# when the two agree.
model_metabolite_ids = {met.id for met in model.metabolites}
missing_metabolites = model_metabolite_ids.symmetric_difference(
    dataframes_evidence["metabolites"]["metabolites"].values
)

model_gene_ids = {gene.id for gene in model.genes}
missing_genes = model_gene_ids.symmetric_difference(
    dataframes_evidence["genes"]["genes"].values
)

# Reactions in the "Pseudoreactions" subsystem are left out of the comparison.
model_reaction_ids = {
    rxn.id for rxn in model.reactions if rxn.subsystem != "Pseudoreactions"
}
missing_reactions = model_reaction_ids.symmetric_difference(
    dataframes_evidence["reactions"]["reactions"].values
)

# Mismatch counts, in the order genes, metabolites, reactions.
for mismatched_ids in (missing_genes, missing_metabolites, missing_reactions):
    print(len(mismatched_ids))

0
0
0


In [12]:
# List metabolite IDs found in only one of the model and the evidence table.
for x in missing_metabolites:
    print(x)

In [13]:
# List gene IDs found in only one of the model and the evidence table.
for x in missing_genes:
    print(x)

In [14]:
# List reaction IDs (pseudoreactions excluded) found in only one of the model and the
# evidence table.
for x in missing_reactions:
    print(x)

### Check for extra metabolites, genes, and reactions

In [15]:
# Print metabolites that do not take part in any reaction.
for met in model.metabolites.query(lambda x: not len(x.reactions)):
    print(f"{met.id}")

In [16]:
# Print genes that are not associated with any reaction.
for gene in model.genes.query(lambda x: not len(x.reactions)):
    print(gene.id)

In [17]:
# Print reactions that have no metabolites.
for reaction in model.reactions.query(lambda x: not len(x.metabolites)):
    print(reaction.id)

#### Add exchanges

In [18]:
# Extra boundary reactions to add, written as {metabolite ID: boundary type}.
# Currently empty; the commented entry shows the expected form.
boundaries = {
    # "adprbp_c": "demand"
}
for met_id, boundary_type in boundaries.items():
    target_metabolite = model.metabolites.get_by_id(met_id)
    try:
        model.add_boundary(target_metabolite, type=boundary_type)
    except ValueError:
        # Deliberately ignored (presumably the boundary reaction already exists).
        continue


# Give every metabolite in compartment "e" an exchange reaction.
extracellular_metabolites = [met for met in model.metabolites if met.compartment == "e"]
for extracellular in extracellular_metabolites:
    try:
        model.add_boundary(extracellular, type="exchange")
    except ValueError:
        # Deliberately ignored (presumably the exchange already exists).
        continue

# File all boundary reactions under the same subsystem.
for boundary_reaction in model.boundary:
    boundary_reaction.subsystem = "Pseudoreactions"

#### Reset subsystem groups

In [19]:
# Rebuild the model's groups with the project QC helper (presumably one group per
# reaction subsystem — confirm in rbc_gem_utils.qc), then display the model summary.
reset_subsystem_groups(model)
model

0,1
Name,RBC_GEM
Memory address,158734450
Number of metabolites,2057
Number of reactions,3030
Number of genes,779
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


### Check mass balancing

In [20]:
# Report every non-boundary reaction that is not mass/charge balanced.
for reaction in model.reactions:
    if reaction.boundary:
        continue
    # An empty result means balanced; compute it once and reuse it for the report.
    imbalance = reaction.check_mass_balance()
    if imbalance:
        print(reaction)
        print(imbalance)
        print()

6LTHPI: 6lthp_c --> h_c + sppt_c
{'charge': 2.0}

METHBCYTBR: 2.0 focytb5_c + methb_c --> 2.0 ficytb5_c + hb_c
{'charge': 1.0}

METHBFMNR: fmnh2_c + methb_c --> fmn_c + 3.0 h_c + hb_c
{'charge': 1.0}

POOL_FACOA: FAcoa_hs_c <=> 0.0004 FAcoa_hs_12_0_c + 0.0004 FAcoa_hs_13_0_c + 0.0133 FAcoa_hs_14_0_c + 0.0004 FAcoa_hs_14_5Z_c + 0.0004 FAcoa_hs_14_7Z_c + 0.0004 FAcoa_hs_14_9Z_c + 0.0004 FAcoa_hs_15_0_c + 0.222 FAcoa_hs_16_0_c + 0.0004 FAcoa_hs_16_7Z_c + 0.0219 FAcoa_hs_16_9Z_c + 0.0004 FAcoa_hs_17_0_c + 0.0004 FAcoa_hs_17_10Z_c + 0.0004 FAcoa_hs_17_9Z_c + 0.1498 FAcoa_hs_18_0_c + 0.025 FAcoa_hs_18_11Z_c + 0.0004 FAcoa_hs_18_13Z_c + 0.0025 FAcoa_hs_18_6Z9Z12Z15Z_c + 0.0029 FAcoa_hs_18_6Z9Z12Z_c + 0.0004 FAcoa_hs_18_6Z9Z_c + 0.0004 FAcoa_hs_18_7Z_c + 0.0004 FAcoa_hs_18_9E_c + 0.0084 FAcoa_hs_18_9Z12Z15Z_c + 0.1915 FAcoa_hs_18_9Z12Z_c + 0.1545 FAcoa_hs_18_9Z_c + 0.0004 FAcoa_hs_19_0_c + 0.0004 FAcoa_hs_20_0_c + 0.0215 FAcoa_hs_20_11Z14Z17Z_c + 0.0004 FAcoa_hs_20_11Z14Z_c + 0.0004 FAcoa_hs_2

### Set bounds

In [21]:
# Reset reaction bounds with the project QC helper. Judging by the output it prints
# each affected reaction ("Before: ..."); see rbc_gem_utils.qc for the exact rules.
reset_reaction_bounds(model)

Before: EX_ade_e: ade_e <=> 
Before: EX_adn_e: adn_e <=> 
Before: EX_35cgmp_e: 35cgmp_e --> 
Before: EX_adrnl__L_e: adrnl__L_e <-- 
Before: EX_ala__L_e: ala__L_e --> 
Before: EX_arg__L_e: arg__L_e <-- 
Before: EX_ascb__L_e: ascb__L_e --> 
Before: EX_3moxtyr_e: 3moxtyr_e --> 
Before: EX_4pyrdx_e: 4pyrdx_e --> 
Before: EX_ca2_e: ca2_e <=> 
Before: EX_35camp_e: 35camp_e --> 
Before: EX_chol_e: chol_e <-- 
Before: EX_cl_e: cl_e <=> 
Before: EX_co_e: co_e --> 
Before: EX_co2_e: co2_e <=> 
Before: EX_cys__L_e: cys__L_e <-- 
Before: EX_dhdascb__L_e: dhdascb__L_e <-- 
Before: EX_dpam__L_e: dpam__L_e <-- 
Before: EX_5aop_e: 5aop_e <-- 
Before: EX_etha_e: etha_e <-- 
Before: EX_fe2_e: fe2_e <=> 
Before: EX_fru_e: fru_e <=> 
Before: EX_fum_e: fum_e <=> 
Before: EX_gal_e: gal_e <-- 
Before: EX_ac_e: ac_e --> 
Before: EX_gam_e: gam_e <-- 
Before: EX_acald_e: acald_e <=> 
Before: EX_acnam_e: acnam_e <-- 
Before: EX_glc__D_e: glc__D_e <-- 
Before: EX_gln__L_e: gln__L_e <=> 
Before: EX_gly_e: gly_e <-

In [22]:
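# Optional check (disabled). Enabling it requires
# `from cobra.flux_analysis import find_blocked_reactions`, which is not imported above.
# blocked_reactions = find_blocked_reactions(model, open_exchanges=True)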

In [23]:
# blocked_reactions

### Export model

In [24]:
# Export the model as XML and JSON. With `overwrite` the writer's default location is
# used; otherwise the files go to the interim directory.
export_options = {} if overwrite else {"directory": f"{ROOT_PATH}{INTERIM_PATH}"}
write_rbc_model(model, filetype={"xml", "json"}, **export_options)
model

0,1
Name,RBC_GEM
Memory address,158734450
Number of metabolites,2057
Number of reactions,3030
Number of genes,779
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"
