# RBC-GEM 1.1.0 Updates
## Setup
### Import packages

In [1]:
import pandas as pd
from cobra.core import Gene, Group, Metabolite, Reaction
from rbc_gem_utils import (COBRA_CONFIGURATION, CURATION_PATH, INTERIM_PATH,
                           ROOT_PATH, build_string, read_rbc_model,
                           split_string, write_rbc_model)
from rbc_gem_utils.annotation import set_sbo_default_annotations
from rbc_gem_utils.qc import standardardize_metabolite_formulas

### Define configuration
#### COBRA Configuration

In [2]:
# Display the COBRA configuration object imported from rbc_gem_utils.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,glpk
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Load RBC-GEM model
### Version: 1.0.0

In [3]:
# Load the model to be updated, requesting the "yml" filetype.
model = read_rbc_model(filetype="yml")

### Deprecate identifiers

In [4]:
# Identifier renames per attribute type, keyed as {retired ID: replacement ID}.
id_mapping_dicts = {"metabolites": {}, "genes": {}, "reactions": {"ARD": "ARDFE2"}}

In [5]:
# Rename reactions in the model according to ``id_mapping_dicts`` and record
# the retired identifiers in the deprecated-identifiers table on disk.
attribute_type = "reactions"
id_mapping_dict = id_mapping_dicts[attribute_type]
deprecated_ids_filepath = (
    f"{ROOT_PATH}/data/deprecatedIdentifiers/{attribute_type}_deprecatedIdentifiers.tsv"
)

# One row per rename: current ID first, retired ID(s) second. Building the
# frame from explicit records keeps both columns even for an empty mapping.
id_mapping_df = pd.DataFrame(
    [(new_id, retiring) for retiring, new_id in id_mapping_dict.items()],
    columns=["rxns", "rxnRetired"],
)

previous_id_mapping_df = pd.read_csv(
    deprecated_ids_filepath,
    sep="\t",
    index_col=0,
)

for idx, row in id_mapping_df.iterrows():
    new_id, retiring = row[["rxns", "rxnRetired"]]
    retired_set_of_ids = {retiring}
    # If the ID being retired now was itself a replacement in the past, carry
    # every ID it replaced forward. More than one earlier row may match.
    previously_retired = previous_id_mapping_df.loc[
        previous_id_mapping_df["rxns"] == retiring, "rxnRetired"
    ]
    for retired_string in previously_retired.dropna():
        retired_set_of_ids.update(split_string(retired_string))
    # Pulling the ID out of retirement
    retired_set_of_ids.discard(new_id)
    retired_set_of_ids.add(retiring)
    # Sort so the written table is identical between runs.
    id_mapping_df.loc[idx, "rxnRetired"] = build_string(
        sorted(retired_set_of_ids), sep=";"
    )

# Apply the renames to the model; IDs that are not present are only reported.
for old, new in id_mapping_dict.items():
    try:
        reaction = model.reactions.get_by_id(old)
    except KeyError:
        print(f"Could not map {old} to new ID.")
    else:
        reaction.id = new

model.repair()

# New rows go first, followed by the previously recorded ones.
# NOTE(review): an earlier row whose "rxns" value has just been retired is
# kept as-is; confirm whether such superseded rows should be dropped.
id_mapping_df = pd.concat((id_mapping_df, previous_id_mapping_df), axis=0)
id_mapping_df = id_mapping_df.drop_duplicates().reset_index(drop=True)
id_mapping_df.to_csv(
    deprecated_ids_filepath,
    sep="\t",
)
id_mapping_df

Unnamed: 0,rxns,rxnRetired
0,ARDFE2,ARD
1,NADHload,DM_nadh
2,CAATPS2,CAATPS
3,BILIREDy,BILIRED
4,GTHOy,GTHOr
...,...,...
180,DM_adprbp_c,SK_adprbp_c
181,DM_mi1345p_c,SK_mi1345p_c
182,DM_mi134p_c,SK_mi134p_c
183,DM_mi145p_c,SK_mi145p_c


### Update model based on curation and proteomic evidence
* Add metabolites first, then genes, and lastly reactions
* Add metabolites with mass and charge balanced formulas (ChemAxon)

In [6]:
# Tables collected by the cells below, keyed by attribute type.
dataframes_updated = {}
dataframes_evidence = {}
# When True the merged evidence tables are written under CURATION_PATH,
# otherwise under INTERIM_PATH.
overwrite = False

#### Metabolites

In [7]:
# Create/update metabolites from the curated table, standardize all formulas,
# then merge the table into the metabolite evidence table.
attribute_type = "metabolites"

updated_filepath = f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.1.0.tsv"
df_updated = pd.read_csv(
    updated_filepath, sep="\t", index_col=0, dtype=str
).fillna("")

attr_cols = ["metabolites", "name", "formula", "charge", "compartment"]
for _, record in df_updated.iterrows():
    met_id, met_name, met_formula, met_charge, met_compartment = record[attr_cols]
    # Create the metabolite only when the model does not have it yet
    if not model.metabolites.has_id(met_id):
        model.add_metabolites([Metabolite(met_id)])

    metabolite = model.metabolites.get_by_id(met_id)
    metabolite.name = met_name
    metabolite.formula = met_formula
    metabolite.charge = int(met_charge)
    metabolite.compartment = met_compartment

    # Remaining columns: "notes" goes to the notes, everything else that is
    # non-empty and not a "metabolomic..." column becomes an annotation.
    extra_fields = record[~record.index.isin(attr_cols)].to_dict()
    notes = extra_fields.pop("notes")
    if notes:
        metabolite.notes["notes"] = notes
    metabolite.annotation.update(
        {
            key: value
            for key, value in extra_fields.items()
            if value and not key.startswith("metabolomic")
        }
    )

# Standardize the formula of every metabolite in the model
met_formulas = standardardize_metabolite_formulas(
    {metabolite.id: metabolite.formula for metabolite in model.metabolites}
)
for met_id, standardized_formula in met_formulas.items():
    model.metabolites.get_by_id(met_id).formula = standardized_formula

dataframes_updated[attribute_type] = df_updated

evidence_filename = f"{attribute_type}_evidence.tsv"
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{evidence_filename}",
        sep="\t",
        index_col=0,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence recorded yet; start from an empty table
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

# Updated rows come first so they win over previous rows for the same ID
df_evidence = (
    pd.concat((df_updated, df_previous_evidence), axis=0)
    .drop_duplicates(subset=[attribute_type])
    .reset_index(drop=True)
)
evidence_directory = CURATION_PATH if overwrite else INTERIM_PATH
df_evidence.to_csv(f"{ROOT_PATH}{evidence_directory}/{evidence_filename}", sep="\t")
dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

Unnamed: 0_level_0,metabolites,name,formula,charge,compartment,metabolomic evidence (#studies),metabolomic evidence (pubmed),references,notes
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,diiodthy__L_c,"3,3'-Diiodo-L-thyronine (T2)",C15H13I2NO4,0,c,,,,
,diiodthy__L_e,"3,3'-Diiodo-L-thyronine (T2)",C15H13I2NO4,0,e,,,,
,hser__L_e,L-homoserine,C4H9NO3,0,e,,,,
,hLkynr_c,3-hydroxy-L-kynurenine,C10H12N2O4,0,c,,,,
,hser__L_c,L-homoserine,C4H9NO3,0,c,,,,
,3mtp_c,3-(methylsulfanyl)propanoate,C4H7O2S,-1,c,,,,
,3mtp_e,3-(methylsulfanyl)propanoate,C4H7O2S,-1,e,,,,
,hLkynr_e,3-hydroxy-L-kynurenine,C10H12N2O4,0,e,,,,
,mercpur_e,6-mercaptopurine,C5H4N4S,0,e,,,,
,thiogmp_e,6-thioguanosine monophosphate,C10H12N5O7PS,-2,e,,,,


#### Genes

In [8]:
# Create/update genes from the curated table, then merge the table into the
# gene evidence table.
attribute_type = "genes"

updated_filepath = f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.1.0.tsv"
df_updated = (
    pd.read_csv(updated_filepath, sep="\t", index_col=0, dtype=str)
    .fillna("")
    .reset_index(drop=True)
)

attr_cols = ["genes"]
for _, record in df_updated.iterrows():
    # Between these three annotation fields, should be likely that the rest can get extracted from UniProt.
    gene_id = record[attr_cols[0]]
    # Register the gene only when the model does not have it yet
    if not model.genes.has_id(gene_id):
        model.genes.extend([Gene(gene_id)])
    gene = model.genes.get_by_id(gene_id)

    # Every non-empty column other than the ID and the "proteomic..." columns
    # is stored as an annotation.
    extra_fields = record[~record.index.isin(attr_cols)].to_dict()
    gene.annotation.update(
        {
            key: value
            for key, value in extra_fields.items()
            if value and not key.startswith("proteomic")
        }
    )

dataframes_updated[attribute_type] = df_updated

evidence_filename = f"{attribute_type}_evidence.tsv"
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{evidence_filename}",
        sep="\t",
        index_col=0,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence recorded yet; start from an empty table
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

# Updated rows come first so they win over previous rows for the same ID
df_evidence = (
    pd.concat((df_updated, df_previous_evidence), axis=0, ignore_index=True)
    .drop_duplicates(subset=[attribute_type])
    .reset_index(drop=True)
)

evidence_directory = CURATION_PATH if overwrite else INTERIM_PATH
df_evidence.to_csv(f"{ROOT_PATH}{evidence_directory}/{evidence_filename}", sep="\t")
dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

Unnamed: 0,genes,uniprot,ncbigene,hgnc.symbol,proteomic evidence (#studies),proteomic evidence (pubmed),references,notes
0,SLC16A7,O60669,9194,SLC16A7,,,,
1,ALDOB,P05062,229,ALDOB,,,,
2,ENO3,P13929,2027,ENO3,,,,
3,ACYP2,P14621,98,ACYP2,,,pubmed:7796909,
4,CDK2,P24941,1017,CDK2,,,,
5,CDC34,P49427,997,CDC34,,,,
6,ALDH3A2,P51648,224,ALDH3A2,,,,
7,ACTC1,P68032,70,ACTC1,,,,
8,ACTA1,P68133,58,ACTA1,,,,
9,CSNK1G2,P78368,1455,CSNK1G2,,,,


#### Reactions

In [9]:
# Create/update reactions from the curated table, then merge the table into
# the reaction evidence table (with retired reaction IDs mapped to new ones).
attribute_type = "reactions"

updated_filepath = f"{ROOT_PATH}{CURATION_PATH}/{attribute_type}_updated_1.1.0.tsv"
df_updated = pd.read_csv(
    updated_filepath, sep="\t", index_col=0, dtype=str
).fillna("")

attr_cols = ["reactions", "name", "reaction", "gene reaction rule", "subsystem"]
for _, record in df_updated.iterrows():
    rxn_id, rxn_name, rxn_equation, gpr_rule, rxn_subsystem = record[attr_cols]
    # Create the reaction only when the model does not have it yet
    if not model.reactions.has_id(rxn_id):
        model.add_reactions([Reaction(rxn_id)])

    reaction = model.reactions.get_by_id(rxn_id)
    reaction.build_reaction_from_string(rxn_equation)
    reaction.name = rxn_name
    reaction.gene_reaction_rule = gpr_rule
    reaction.subsystem = rxn_subsystem

    # Remaining columns: "notes" goes to the notes, everything else that is
    # non-empty and not a "proteomic..." column becomes an annotation.
    extra_fields = record[~record.index.isin(attr_cols)].to_dict()
    notes = extra_fields.pop("notes")
    if notes:
        reaction.notes["notes"] = notes
    reaction.annotation.update(
        {
            key: value
            for key, value in extra_fields.items()
            if value and not key.startswith("proteomic")
        }
    )


dataframes_updated[attribute_type] = df_updated

evidence_filename = f"{attribute_type}_evidence.tsv"
try:
    df_previous_evidence = pd.read_csv(
        f"{ROOT_PATH}{CURATION_PATH}/{evidence_filename}",
        sep="\t",
        index_col=0,
        dtype=str,
    )
except FileNotFoundError:
    # No evidence recorded yet; start from an empty table
    df_previous_evidence = pd.DataFrame([], columns=[attribute_type], dtype=str)

df_evidence = pd.concat((df_updated, df_previous_evidence), axis=0)
# Rewrite retired reaction IDs to their replacements before de-duplicating
renamed_ids = df_evidence[attribute_type].replace(id_mapping_dicts[attribute_type])
df_evidence[attribute_type] = renamed_ids
# Updated rows come first so they win over previous rows for the same ID
df_evidence = df_evidence.drop_duplicates(subset=[attribute_type]).reset_index(
    drop=True
)
evidence_directory = CURATION_PATH if overwrite else INTERIM_PATH
df_evidence.to_csv(f"{ROOT_PATH}{evidence_directory}/{evidence_filename}", sep="\t")
dataframes_evidence[attribute_type] = df_evidence
dataframes_updated[attribute_type]

Unnamed: 0_level_0,reactions,name,reaction,gene reaction rule,subsystem,spontaneous,proteomic evidence (#studies),references,notes,metatlas
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
298.0,ARDFE2,Acireductone dioxygenase [iron(II)-requiring],dhmtp_c + o2_c --> 2kmb_c + for_c + h_c,ADI1,Cysteine and methionine metabolism,1,,pubmed:27491795,According to description of EC 3.1.3.77. React...,MAR05389
799.0,BHMT,Betaine-homocysteine S-methyltransferase,glyb_c + hcys__L_c --> dmgly_c + met__L_c,BHMT,"Glycine, serine and threonine metabolism",0,,pubmed:24636780;pubmed:25200932;pubmed:25556331,,
812.0,ACYP,3-Phospho-D-glyceroyl phosphate phosphohydrolase,13dpg_c + h2o_c --> 3pg_c + h_c + pi_c,ACYP1 or ACYP2,Glycolysis / Gluconeogenesis,0,,pubmed:1645713;pubmed:3026468;pubmed:4311179,,
818.0,E14BPP,Erroneous formation of 4-phosphoerythronate,e14bp_c + h2o_c --> 4per_c + pi_c,ACYP1 or ACYP2,Glycolysis / Gluconeogenesis,0,,pubmed:1645713;pubmed:3026468,,
820.0,ENO,Enolase,2pg_c <=> h2o_c + pep_c,ENO1 or ENO2 or ENO3 or (ENO1 and ENO2) or (EN...,Glycolysis / Gluconeogenesis,0,,doi:10.1016/B978-0-12-677201-2.X5001-6;pubmed:...,,
...,...,...,...,...,...,...,...,...,...,...
,TRPTYRex,L-tyrptophan/L-tyrosine exchange,trp__L_c + tyr__L_e <=> trp__L_e + tyr__L_c,(SLC7A5 and SLC3A2),"Transport, extracellular",0,,pubmed:37976448;pubmed:36582828,,MAR05524
,TRPVALex,L-tyrptophan/L-valine exchange,trp__L_c + val__L_e <=> trp__L_e + val__L_c,(SLC7A5 and SLC3A2),"Transport, extracellular",0,,pubmed:37976448;pubmed:36582828,,MAR05529
,TYRVALex,L-tyrosine/L-valine exchange,tyr__L_c + val__L_e <=> tyr__L_e + val__L_c,(SLC7A5 and SLC3A2),"Transport, extracellular",0,,pubmed:37976448;pubmed:36582828,,MAR05546
,BILRUBABCte,ATP-binding Cassette (ABC) export of unconjuga...,bilirub_c + atp_c + h2o_c --> bilirub_e + adp...,ABCC4,"Transport, extracellular",0,,,,


### Ensure all metabolites, genes, and reactions exist

In [10]:
# Compare the IDs in the model against the IDs in the evidence tables. Each
# set holds the IDs found on only one of the two sides.
model_metabolite_ids = set(model.metabolites.list_attr("id"))
missing_metabolites = model_metabolite_ids ^ set(
    dataframes_evidence["metabolites"]["metabolites"].values
)

model_gene_ids = set(model.genes.list_attr("id"))
missing_genes = model_gene_ids ^ set(dataframes_evidence["genes"]["genes"].values)

# Reactions in the "Pseudoreactions" subsystem are left out of the comparison
model_reaction_ids = {
    rxn.id for rxn in model.reactions if rxn.subsystem != "Pseudoreactions"
}
missing_reactions = model_reaction_ids ^ set(
    dataframes_evidence["reactions"]["reactions"].values
)

for mismatched_ids in (missing_genes, missing_metabolites, missing_reactions):
    print(len(mismatched_ids))

0
0
1


In [11]:
# Show the mismatched IDs themselves (genes, metabolites, reactions).
missing_genes, missing_metabolites, missing_reactions

(set(), set(), {'GLCt1'})

In [12]:
# Print each mismatched gene ID on its own line (prints nothing when the set is empty).
for x in missing_genes:
    print(x)

#### Add exchanges

In [13]:
# Explicitly requested boundaries, keyed as {metabolite ID: boundary type}
boundaries = {
    # "adprbp_c": "demand"
}


def _add_boundary_ignoring_value_error(metabolite, boundary_type):
    """Add a boundary reaction to ``model``, ignoring any ``ValueError`` raised.

    Presumably the error means the boundary already exists; verify against
    the COBRApy ``Model.add_boundary`` documentation.
    """
    try:
        model.add_boundary(metabolite, type=boundary_type)
    except ValueError:
        pass


for met_id, boundary_type in boundaries.items():
    _add_boundary_ignoring_value_error(
        model.metabolites.get_by_id(met_id), boundary_type
    )

# Every metabolite in compartment "e" gets an exchange reaction
for extracellular_met in model.metabolites.query(lambda x: x.compartment == "e"):
    _add_boundary_ignoring_value_error(extracellular_met, "exchange")

# All boundary reactions are grouped under the "Pseudoreactions" subsystem
for boundary_reaction in model.boundary:
    boundary_reaction.subsystem = "Pseudoreactions"

#### Reset subsystem groups

In [14]:
# Rebuild the subsystem groups from scratch: one group per distinct reaction
# subsystem, containing exactly the reactions currently in that subsystem.
#
# A copy of the group list is passed because groups are removed from
# ``model.groups`` while the argument is being iterated; passing
# ``model.groups`` itself would skip groups and leave stale ones behind.
model.remove_groups(list(model.groups))
for subsystem in sorted(set(model.reactions.list_attr("subsystem"))):
    reaction_list = model.reactions.query(lambda x: x.subsystem == subsystem)
    # All old groups are gone and subsystem names are unique here, so every
    # subsystem gets a freshly created group.
    group = Group(id=subsystem, name=subsystem, members=reaction_list)
    model.add_groups([group])

### Check mass balancing

In [15]:
# Report every non-boundary reaction for which check_mass_balance() returns
# a non-empty result, together with that result.
for reaction in model.reactions:
    if not reaction.boundary:
        imbalance = reaction.check_mass_balance()
        if imbalance:
            print(reaction)
            print(imbalance)
            print()

6LTHPI: 6lthp_c --> h_c + sppt_c
{'charge': 2.0}

METHBCYTBR: 2.0 focytb5_c + methb_c --> 2.0 ficytb5_c + hb_c
{'charge': 1.0}

METHBFMNR: fmnh2_c + methb_c --> fmn_c + 3.0 h_c + hb_c
{'charge': 1.0}

POOL_FACOA: FAcoa_hs_c <=> 0.0004 FAcoa_hs_12_0_c + 0.0004 FAcoa_hs_13_0_c + 0.0133 FAcoa_hs_14_0_c + 0.0004 FAcoa_hs_14_5Z_c + 0.0004 FAcoa_hs_14_7Z_c + 0.0004 FAcoa_hs_14_9Z_c + 0.0004 FAcoa_hs_15_0_c + 0.222 FAcoa_hs_16_0_c + 0.0004 FAcoa_hs_16_7Z_c + 0.0219 FAcoa_hs_16_9Z_c + 0.0004 FAcoa_hs_17_0_c + 0.0004 FAcoa_hs_17_10Z_c + 0.0004 FAcoa_hs_17_9Z_c + 0.1498 FAcoa_hs_18_0_c + 0.025 FAcoa_hs_18_11Z_c + 0.0004 FAcoa_hs_18_13Z_c + 0.0025 FAcoa_hs_18_6Z9Z12Z15Z_c + 0.0029 FAcoa_hs_18_6Z9Z12Z_c + 0.0004 FAcoa_hs_18_6Z9Z_c + 0.0004 FAcoa_hs_18_7Z_c + 0.0004 FAcoa_hs_18_9E_c + 0.0084 FAcoa_hs_18_9Z12Z15Z_c + 0.1915 FAcoa_hs_18_9Z12Z_c + 0.1545 FAcoa_hs_18_9Z_c + 0.0004 FAcoa_hs_19_0_c + 0.0004 FAcoa_hs_20_0_c + 0.0215 FAcoa_hs_20_11Z14Z17Z_c + 0.0004 FAcoa_hs_20_11Z14Z_c + 0.0004 FAcoa_hs_2

### Remove duplicated reaction

In [16]:
# "GLCt1" duplicates "GLC_Dt": copy its annotations onto the reaction that is
# kept (overwriting shared keys), then remove the duplicate from the model.
duplicated_reaction, original_reaction = model.reactions.get_by_any(
    ["GLCt1", "GLC_Dt"]
)
original_reaction.annotation.update(duplicated_reaction.annotation)

model.remove_reactions([duplicated_reaction])

  warn("need to pass in a list")


### Export model

In [17]:
# Write the updated model for the "xml" and "json" filetypes, then display the model.
# NOTE(review): the model was read with filetype="yml" above; confirm whether
# the yml file should be exported here as well.
write_rbc_model(model, filetype={"xml", "json"})
model

0,1
Name,RBC_GEM
Memory address,1494e9050
Number of metabolites,2008
Number of reactions,2976
Number of genes,718
Number of groups,77
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"
