# Extract identifiers from HumanGEM

Use HumanGEM (1.18.0) to get the MetabolicAtlas identifiers for reactions and metabolites.
These identifiers are needed for annotation and for identifier tracking in later scripts.

## Setup
### Import packages

In [1]:
import pandas as pd

from rbc_gem_utils import (
    COBRA_CONFIGURATION, 
    REPO_PATH, 
    show_versions,
    read_rbc_model, write_rbc_model, 
    read_cobra_model,
)

from rbc_gem_utils.util import build_string
# Print the package, dependency, build-tool and platform versions so the
# environment of the last successful run is recorded in the cell output.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
cobra       0.29.0
depinfo      2.2.0
memote      0.16.1
notebook     7.0.6
scipy       1.11.4
simplejson missing

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
--------------------
Darwin  22.6.0-x86_64
CPython        3.12.0


### Define configuration
#### COBRA Configuration

In [2]:
# Display the COBRA configuration object imported from rbc_gem_utils
# (solver, tolerance, default bounds, ...) so the settings used are visible.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Load RBC-GEM model
### Version: 0.1.1

In [3]:
# Load the RBC-GEM model through the project helper, requesting the XML file type.
model = read_rbc_model(filetype="xml")
# Bare expression: the notebook renders the model summary table.
model

Set parameter Username
Academic license - for non-commercial use only - expires 2024-11-28


0,1
Name,iAB_RBC_283
Memory address,1466f6ea0
Number of metabolites,342
Number of reactions,469
Number of genes,346
Number of groups,41
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


## Load HumanGEM model
### Version: 1.18.0

In [4]:
# Load the HumanGEM model from the raw-data directory of the repository.
HumanGEM = read_cobra_model(f"{REPO_PATH}/data/raw/Human-GEM.xml")
# Bare expression: the notebook renders the model summary table.
HumanGEM

0,1
Name,HumanGEM
Memory address,146ed4cb0
Number of metabolites,8456
Number of reactions,12995
Number of genes,2889
Number of groups,148
Objective expression,1.0*MAR13082 - 1.0*MAR13082_reverse_11d67
Compartments,"Cytosol, Extracellular, Lysosome, Endoplasmic reticulum, Mitochondria, Peroxisome, Golgi apparatus, Nucleus, Inner mitochondria"


In [5]:
# List the `name` attribute of every group in HumanGEM.
# NOTE(review): some names carry leading whitespace (e.g. ' Arachidonic acid
# metabolism' in the output), so exact-name lookups may need `.strip()`.
HumanGEM.groups.list_attr("name")

[' Arachidonic acid metabolism',
 'Acyl-CoA hydrolysis',
 'Acylglycerides metabolism',
 'Alanine, aspartate and glutamate metabolism',
 'Alkaloids biosynthesis',
 'Amino sugar and nucleotide sugar metabolism',
 'Aminoacyl-tRNA biosynthesis',
 'Androgen metabolism',
 'Arachidonic acid metabolism',
 'Arginine and proline metabolism',
 'Artificial reactions',
 'Ascorbate and aldarate metabolism',
 'Beta oxidation of branched-chain fatty acids (mitochondrial)',
 'Beta oxidation of di-unsaturated fatty acids (n-6) (mitochondrial)',
 'Beta oxidation of di-unsaturated fatty acids (n-6) (peroxisomal)',
 'Beta oxidation of even-chain fatty acids (mitochondrial)',
 'Beta oxidation of even-chain fatty acids (peroxisomal)',
 'Beta oxidation of odd-chain fatty acids (mitochondrial)',
 'Beta oxidation of odd-chain fatty acids (peroxisomal)',
 'Beta oxidation of phytanic acid (peroxisomal)',
 'Beta oxidation of poly-unsaturated fatty acids (mitochondrial)',
 'Beta oxidation of unsaturated fatty acids

In [6]:
# Print the name, reaction string (with metabolite names) and compartments of
# every member of the "Urea cycle" group whose compartments are limited to
# the cytosol ("c") and/or the extracellular space ("e").
allowed_compartments = {"e", "c"}
target_group_name = 'Urea cycle'.lower()

for group in HumanGEM.groups.query(lambda x: x.name.lower() == target_group_name):
    for member in group.members:
        # Skip members that touch any compartment outside the allowed pair.
        if not set(member.compartments).issubset(allowed_compartments):
            continue
        print(member.name)
        print(member.build_reaction_string(use_metabolite_names=True))
        print(member.compartments)
        print()



4-aminobutanal <=> 1-pyrroline + H2O
{'c'}



In [11]:
# Inspect the RBC-GEM reaction BILIRBU (stoichiometry, GPR and bounds).
model.reactions.BILIRBU

0,1
Reaction identifier,BILIRBU
Name,Bilirubin UDP-glucuronosyltransferase
Memory address,0x1474ff9e0
Stoichiometry,bilirub_c + 2.0 h_c + udpglcur_c --> bilglcur_c + udp_c  Bilirubin cytosol + 2.0 H+ + UDP-D-glucuronate --> Bilirubin monoglucuronide + UDP C9H11N2O12P2
GPR,Ugt1A1_AT1 or Ugt1A4_AT1
Lower bound,0.0
Upper bound,1000.0


## Map model BiGG ID reactions to MetabolicAtlas via HumanGEM

In [12]:
# Build a BiGG -> MetabolicAtlas reaction ID table for RBC-GEM:
#   1. collect the "bigg.reaction" annotations of every HumanGEM reaction,
#   2. keep the BiGG IDs that are also reaction IDs in RBC-GEM,
#   3. overlay the manually curated corrections/additions,
#   4. write the result to data/interim/BiGGMetAtlasReactions.tsv.

# All reaction IDs of the RBC-GEM model (boundary reactions included).
model_reaction_ids = {reaction.id for reaction in model.reactions}

bigg_reactions = set()
# BiGG reaction ID -> list of HumanGEM reaction IDs annotated with it.
human_gem_mapping = {}
for reaction in HumanGEM.reactions:
    bigg_ids = reaction.annotation.get("bigg.reaction")
    if not bigg_ids:
        continue
    # The annotation value may be a single string or a list of strings.
    if isinstance(bigg_ids, str):
        bigg_ids = [bigg_ids]
    bigg_ids = set(bigg_ids)
    bigg_reactions.update(bigg_ids)
    for bigg_id in bigg_ids:
        human_gem_mapping.setdefault(bigg_id, []).append(reaction.id)


# The set above contains every model reaction, so the count includes boundaries.
print(f"Number of reactions in RBC-GEM (including boundaries): {len(model_reaction_ids)}")

intersection = sorted(bigg_reactions.intersection(model_reaction_ids))
print(f"Number of reactions that could be found: {len(intersection)}")
print()
id_mapping_dict = {}
for rbc_reaction in intersection:
    reaction_ids = human_gem_mapping.get(rbc_reaction, [])
    # More than one HumanGEM reaction shares this BiGG ID: flag for manual review.
    if len(reaction_ids) > 1:
        print(f"Check {rbc_reaction} for false mappings")
    id_mapping_dict[rbc_reaction] = build_string(reaction_ids)

manual_updates_and_corrections = {
    # Selected reactions below needed corrections.
    "CAATPS": "MAR07629",
    "CAT": "MAR03980",
    "CHLPCTD": "MAR00638",
    "CHOLK": "MAR00636",
    "CHOLt4": "MAR07734",
    "COt": "MAR07798",
    "CYStec": "MAR05084",
    "CYTK1": "MAR04024",
    "EX_ac_e": "MAR09086",
    "EX_adrnl_e": "MAR09095",
    "EX_ala__L_e": "MAR09061",
    "EX_arg__L_e": "MAR09066",
    "EX_chol_e": "MAR09083",
    "EX_cl_e": "MAR09150",
    "EX_dopa_e": "MAR09092",
    "EX_fe2_e": "MAR09076",
    "EX_gal_e": "MAR09140",
    "EX_gam_e": "MAR09168",
    "EX_glc__D_e": "MAR09034",
    "EX_gln__L_e": "MAR09063",
    "EX_h2o_e": "MAR09047",
    "EX_h_e": "MAR09079",
    "EX_hco3_e": "MAR09078",
    "EX_k_e": "MAR09081",
    "EX_lnlc_e": "MAR09035",
    "EX_met__L_e": "MAR09042",
    "EX_nac_e": "MAR09142",
    "EX_nh4_e": "MAR11420",
    "EX_nrpphr_e": "MAR09093",
    "EX_ocdcea_e": "MAR00650",
    "EX_phe__L_e": "MAR09043",
    "EX_pyr_e": "MAR09133",
    "EX_ribflv_e": "MAR09143",
    "EX_thmmp_e": "MAR09105",
    "FBA": "MAR04375",
    "FBP26": "MAR04706",
    "FUM": "MAR04408",
    "G6PDH2r": "MAR08971",
    "GALKr": "MAR04130",
    "GAMt1r": "MAR04996",
    "GAPD": "MAR04373",
    "GLNt4": "MAR05308",
    "MEPIVESSte": "MAR08922",
    "NADK": "MAR04269",
    "NH4t3r": "MAR01534",
    "PPPGO": "MAR11316",
    # Selected reactions needed manual updates
    "ACP1_FMN": "MAR06507",
    "ADRNLtu": "MAR09192",
    "ARD": "MAR05389",
    "ARGN": "MAR03816",
    "BILIRBU": "MAR11321",
    "C160CPT2rbc": "MAR02626",
    "C181CPT2rbc": "MAR11310",
    "CHLP": "MAR08424",
    "DGULND": "MAR08353",
    "DHAAt1r": "MAR08846",
    "DOPAMT": "MAR06763",
    "DPGM": "MAR04371",
    "DPGase": "MAR04372",
    "ENOPH": "MAR05387",
    "ETHAt": "MAR07896",
    "GALOR": "MAR08766",
    "GALT": "MAR08767",
    "GLCt1": "MAR05029",
    "GMPR": "MAR04419",
    "GPDDA1": "MAR00635",
    "GULND": "MAR06537",
    "HCO3_CLt": "MAR06525",
    "LEUKTRA4t": "MAR06254",
    "LEUKTRB4t": "MAR06255",
    "LNLCCPT2rbc": "MAR02742",
    "LTA4H": "MAR01080",
    "MDRPD": "MAR05386",
    "MI1345PP": "MAR06565",
    "MI145PK": "MAR06563",
    "MI145PP": "MAR06560",
    "MTRI": "MAR05385",
    "NADPN": "MAR07627",
    "NMNHYD": "MAR04264",
    "ORNDC": "MAR04212",
    "RNMK": "MAR04265",
    "SALMCOM": "MAR06750",
    "SALMCOM2": "MAR06746",
    "SBTD_D2": "MAR04315",
    "SBTR": "MAR04316",
    "SPMDtex2": "MAR04994",
    "TDP": "MAR04208",
    "THMTP": "MAR04207",
    "TMDPK": "MAR04204",
    "TMDPPK": "MAR04206",
    "UNK3": "MAR05391",
    "UPPDC1": "MAR04750",
    "XYLK": "MAR04595",
    "XYLTD_D": "MAR04593",
}

# Manual entries override automatic mappings and add reactions that the
# annotation-based lookup did not find.
id_mapping_dict.update(manual_updates_and_corrections)
id_mapping_df = pd.DataFrame.from_dict(id_mapping_dict, orient="index")
id_mapping_df = id_mapping_df.reset_index(drop=False)
id_mapping_df.columns = ["bigg", "metatlas"]

# NOTE(review): the default `index=True` writes the row index as an unnamed
# first column; kept as-is because downstream readers may rely on it.
id_mapping_df.to_csv(
    f"{REPO_PATH}/data/interim/BiGGMetAtlasReactions.tsv",
    sep="\t",
)
id_mapping_df

Number of reactions in RBC-GEM (excluding boundaries): 469
Number of reactions that could be found: 276

Check CAATPS for false mappings
Check CAT for false mappings
Check CHLPCTD for false mappings
Check CHOLK for false mappings
Check CHOLt4 for false mappings
Check COt for false mappings
Check CYStec for false mappings
Check CYTK1 for false mappings
Check EX_ac_e for false mappings
Check EX_adrnl_e for false mappings
Check EX_ala__L_e for false mappings
Check EX_arg__L_e for false mappings
Check EX_chol_e for false mappings
Check EX_cl_e for false mappings
Check EX_dopa_e for false mappings
Check EX_fe2_e for false mappings
Check EX_gal_e for false mappings
Check EX_gam_e for false mappings
Check EX_glc__D_e for false mappings
Check EX_gln__L_e for false mappings
Check EX_h2o_e for false mappings
Check EX_h_e for false mappings
Check EX_hco3_e for false mappings
Check EX_k_e for false mappings
Check EX_lnlc_e for false mappings
Check EX_met__L_e for false mappings
Check EX_nac_e for 

Unnamed: 0,bigg,metatlas
0,3MOXTYRESSte,MAR11306
1,4PYRDX,MAR08103
2,5AOPt2,MAR11307
3,ACALDt,MAR04948
4,ACGAM2E,MAR04527
...,...,...
317,TMDPPK,MAR04206
318,UNK3,MAR05391
319,UPPDC1,MAR04750
320,XYLK,MAR04595


## Map model BiGG ID metabolites to MetabolicAtlas via HumanGEM

In [13]:
# Build a BiGG -> MetabolicAtlas metabolite ID table for RBC-GEM:
#   1. reduce RBC-GEM metabolite IDs to compartment-free BiGG IDs,
#   2. collect the "bigg.metabolite" annotations of every HumanGEM metabolite,
#   3. keep the BiGG IDs present in both, overlay manual corrections,
#   4. write the result to data/interim/BiGGMetAtlasMetabolites.tsv.

# Strip only the trailing "_<compartment>" suffix. A plain str.replace would
# also delete the same token if it occurred earlier in the identifier.
model_metabolite_ids = set()
for metabolite in model.metabolites:
    compartment_suffix = f"_{metabolite.compartment}"
    base_id = metabolite.id
    if base_id.endswith(compartment_suffix):
        base_id = base_id[: -len(compartment_suffix)]
    model_metabolite_ids.add(base_id)

bigg_metabolites = set()
# BiGG metabolite ID -> list of HumanGEM metabolite IDs annotated with it.
human_gem_mapping = {}
for metabolite in HumanGEM.metabolites:
    bigg_ids = metabolite.annotation.get("bigg.metabolite")
    if not bigg_ids:
        continue
    # The annotation value may be a single string or a list of strings.
    if isinstance(bigg_ids, str):
        bigg_ids = [bigg_ids]
    bigg_ids = set(bigg_ids)
    bigg_metabolites.update(bigg_ids)
    for bigg_id in bigg_ids:
        # Drop the last character of the HumanGEM ID -- presumably the
        # single-letter compartment code (TODO confirm for all compartments).
        human_gem_mapping.setdefault(bigg_id, []).append(metabolite.id[:-1])


print(f"Number of metabolites in RBC-GEM (excluding compartments): {len(model_metabolite_ids)}")

intersection = sorted(bigg_metabolites.intersection(model_metabolite_ids))
print(f"Number of metabolites that could be found: {len(intersection)}")
print()

id_mapping_dict = {}
for rbc_metabolite in intersection:
    # De-duplicate (the same metabolite appears once per compartment) and sort
    # so the joined string is identical from run to run; iterating a set of
    # strings has no stable order across interpreter sessions.
    metabolite_ids = sorted(set(human_gem_mapping.get(rbc_metabolite, [])))
    if len(metabolite_ids) > 1:
        # Printed in dict-entry form so it can be pasted into the manual
        # corrections below after review.
        print(f'"{rbc_metabolite}": "{build_string(metabolite_ids)}",')
    id_mapping_dict[rbc_metabolite] = build_string(metabolite_ids)

manual_updates_and_corrections = {
    # Selected metabolites below needed corrections.
 
}

id_mapping_dict.update(manual_updates_and_corrections)
id_mapping_df = pd.DataFrame.from_dict(id_mapping_dict, orient="index")
id_mapping_df = id_mapping_df.reset_index(drop=False)
id_mapping_df.columns = ["bigg", "metatlas"]

# NOTE(review): the default `index=True` writes the row index as an unnamed
# first column; kept as-is because downstream readers may rely on it.
id_mapping_df.to_csv(
    f"{REPO_PATH}/data/interim/BiGGMetAtlasMetabolites.tsv",
    sep="\t",
)
id_mapping_df

Number of metabolites in RBC-GEM (excluding compartments): 267
Number of metabolites that could be found: 205



Unnamed: 0,bigg,metatlas
0,13dpg,MAM00247
1,23dpg,MAM00569
2,2kmb,MAM01016
3,2pg,MAM00674
4,35cgmp,MAM01433
...,...,...
200,xmp,MAM03150
201,xu5p__D,MAM01761
202,xylt,MAM03155
203,xylu__D,MAM01759
