# Derive base model
## Setup
### Import packages

In [None]:
from warnings import warn

import pandas as pd
from cobra.core import Group
from cobra.manipulation import remove_genes
from cobra.util.array import create_stoichiometric_matrix, nullspace
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    write_cobra_model,
)
from rbc_gem_utils.database.mgi import MGI_DB_TAG
from rbc_gem_utils.database.uniprot import UNIPROT_DB_TAG

# Show floating point values in pandas output with six digits of precision.
pd.set_option("display.precision", 6)

# gurobipy is imported directly so that its global parameters can be set below.
import gurobipy as gp

# Set Gurobi's "OutputFlag" and "LogToConsole" parameters to 0 to keep solver
# logging out of the notebook output.
gp.setParam("OutputFlag", 0)
gp.setParam("LogToConsole", 0)

# Print package and dependency versions used to run this notebook.
show_versions()

Set parameter Username
Academic license - for non-commercial use only - expires 2025-11-21

Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip 

### Define configuration
#### COBRA Configuration

In [2]:
# Use Gurobi as the optimization solver and (-1000, 1000) as the default
# reaction bounds for the rest of the notebook.
COBRA_CONFIGURATION.solver = "gurobi"
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Display the resulting configuration.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Define organism

In [3]:
# Organism whose model is derived. The value is validated against the supported
# organisms ("Human" or "Mouse") when the model is loaded.
organism = "Human"

## Load RBC-GEM model

In [4]:
# Reject unsupported organisms before any file is read.
valid_organisms = {"Human", "Mouse"}
if organism not in valid_organisms:
    raise ValueError(f"Organism must be one of the following: {valid_organisms}")

# The human model is read from the main model directory, while the mouse model
# is read from the same directory as the MGI database files.
if organism != "Mouse":
    sequence_dirpath = get_dirpath("database", UNIPROT_DB_TAG)
    rbc_gem = read_cobra_model(get_dirpath("model") / f"{GEM_NAME}.xml")
else:
    sequence_dirpath = get_dirpath("database", MGI_DB_TAG)
    rbc_gem = read_cobra_model(sequence_dirpath / f"{GEM_NAME}.xml")

# Map the bounds stored in the model file onto the configured defaults.
# NOTE(review): any reaction whose bounds are neither (0, 1000) nor (-1000, 0)
# is set to the full default range, including custom or blocked bounds.
# Confirm that this is intended.
for r in rbc_gem.reactions:
    lower, upper = r.bounds
    if (lower, upper) == (0.0, 1000.0):
        new_bounds = (0, COBRA_CONFIGURATION.upper_bound)
    elif (lower, upper) == (-1000.0, 0.0):
        new_bounds = (COBRA_CONFIGURATION.lower_bound, 0.0)
    else:
        new_bounds = COBRA_CONFIGURATION.bounds
    r.bounds = new_bounds

# Directory that receives the derived model(s) for this organism.
overlay_dirpath = get_dirpath("analysis").joinpath("OVERLAY", organism).resolve()
overlay_dirpath.mkdir(parents=True, exist_ok=True)
print(overlay_dirpath)
overwrite = True
rbc_gem

/Users/zhaiman/opt/github/RBC-GEM/data/analysis/OVERLAY/Human


0,1
Name,RBC_GEM
Memory address,147116950
Number of metabolites,2157
Number of reactions,3275
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


### Extract model
Use the full reconstruction, or set `reduced_model_id` to the ID of a pre-defined reduced RBC model to extract that model from the reconstruction.

In [5]:
# ID of a pre-defined reduced model to extract. With `None` (or the ID of the
# full reconstruction) the copy of the full model is used unchanged.
reduced_model_id = None
model = rbc_gem.copy()

# Boundary reaction types mapped to the ID prefix that identifies them.
boundary_types_dict = {
    "exchange": "EX_",
    "sink": "SK_",
    "demand": "DM_",
}


if reduced_model_id is not None and reduced_model_id != rbc_gem.id:
    # Table of the reactions that make up the reduced model, indexed by reaction ID.
    df_reactions = pd.read_csv(
        get_dirpath("analysis")
        / organism
        / "reduced_models"
        / f"model_reactions_{reduced_model_id}.tsv",
        sep="\t",
        index_col=0,
        dtype=str,
    ).fillna("")

    # Determine pseudoreactions if any, mapping each boundary reaction ID to
    # the ID of its metabolite. Only the leading prefix is stripped, so a
    # metabolite ID that happens to contain the prefix text stays intact.
    pseudoreactions = set()
    boundaries = {}
    for btype, prefix in boundary_types_dict.items():
        boundaries[btype] = {
            rid: rid[len(prefix) :]
            for rid in df_reactions.index
            if rid.startswith(prefix)
        }
        pseudoreactions.update(boundaries[btype])

    # Extract reactions from model that are not pseudoreactions
    reactions = sorted(df_reactions.index.difference(pseudoreactions))
    reactions = model.reactions.get_by_any(reactions)
    # Remove reactions not in list. A set of IDs gives constant-time lookups
    # instead of scanning the list of kept reactions for every model reaction.
    kept_reaction_ids = {kept.id for kept in reactions}
    model.remove_reactions(
        [x for x in model.reactions if x.id not in kept_reaction_ids],
        remove_orphans=True,
    )
    # Remove groups left without any members
    model.remove_groups([x for x in model.groups if not x.members])
    # Cleanup gene reaction rules, then drop genes no longer used by any reaction
    for reaction in reactions:
        reaction.gene_reaction_rule = df_reactions.loc[
            reaction.id, "gene_reaction_rule"
        ]
    to_remove = model.genes.query(lambda x: len(x.reactions) < 1)
    remove_genes(model, gene_list=to_remove, remove_reactions=False)

    # Add pseudoreactions, defining new ones if needed.
    for btype, boundary_dict in boundaries.items():
        for reaction_id, met_id in boundary_dict.items():
            try:
                met = model.metabolites.get_by_id(met_id)
            except KeyError:
                warn(f"{met_id} not in model")
            else:
                try:
                    model.add_boundary(met, type=btype)
                except ValueError:
                    # Presumably the boundary reaction already exists. The
                    # lookup keeps the original behavior of raising a KeyError
                    # when the ValueError had a different cause.
                    model.reactions.get_by_id(reaction_id)

    for reaction in model.boundary:
        reaction.subsystem = "Pseudoreactions"

    model.id = reduced_model_id
model

0,1
Name,RBC_GEM
Memory address,149d52650
Number of metabolites,2157
Number of reactions,3275
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


### Check blocked reactions

In [6]:
# blocked_reactions = find_blocked_reactions(model, open_exchanges=False);
# blocked_reactions = sorted([str(r) for r in model.reactions.get_by_any(blocked_reactions) if r.subsystem not in {"Pseudoreactions", "Transport, extracellular"}])
# blocked_reactions

### Set objective(s)

In [7]:
# Work on a copy so that the medium constraints used for this check do not
# leak into the model that is exported later.
test_model = model.copy()
# Close all boundary uptake, then allow uptake of one unit of glucose only.
for r in test_model.boundary:
    r.bounds = (0, COBRA_CONFIGURATION.upper_bound)
test_model.reactions.get_by_id("EX_glc__D_e").bounds = (-1, 0)

# Reactions whose summed flux defines the objective.
objective_rxns = ["NaKt"]
# Apply the objective to both models, building each expression from that
# model's own reactions. The test model must carry the objective as well,
# otherwise the solution printed below reflects whatever objective the copy
# inherited rather than the one defined here.
for target_model in (model, test_model):
    target_model.objective = sum(
        target_model.reactions.get_by_id(rid).flux_expression
        for rid in objective_rxns
    )

# Solve the constrained test model, then show the objective fluxes and every
# reaction carrying non-zero flux.
sol = test_model.optimize()
print(sol[objective_rxns])
sol.fluxes[sol.fluxes != 0]

NaKt    2.0
Name: fluxes, dtype: float64


ACt2           2.0
EX_glc__D_e   -1.0
EX_h_e         2.0
EX_lac__L_e    2.0
ENO            2.0
FBA            1.0
GAPD           2.0
LDH_L          2.0
NaKt           2.0
PFK            1.0
PGI            1.0
PGK            2.0
PGM            2.0
PYK            2.0
TPI            1.0
ADPGK          1.0
DADK4          1.0
NDPK9          1.0
ADK7           1.0
DADK7         -1.0
CYSTHRNaEx     3.0
GLC_Dt        -1.0
Kt1            4.0
PYR_ACtex      2.0
PYR_LLACtex   -2.0
THRCYSNaEx     3.0
Name: fluxes, dtype: float64

### Get dimension of nullspace

In [8]:
# Build the stoichiometric matrix of the model.
S = create_stoichiometric_matrix(model)
# Compute a basis for the nullspace of the stoichiometric matrix.
ns = nullspace(S)
# Display the shape of the basis array.
# NOTE(review): presumably (number of reactions, nullspace dimension); confirm
# against the cobra documentation.
ns.shape

(3275, 1258)

### Reset subsystems

In [9]:
# Drop every existing group. A snapshot list is passed instead of the live
# `model.groups` container so that removing groups cannot disturb the
# iteration over the groups being removed (which could leave stale groups).
model.remove_groups(list(model.groups))
# Rebuild one group per distinct subsystem from the reactions' `subsystem` attribute.
for subsystem in sorted(set(model.reactions.list_attr("subsystem"))):
    reaction_list = model.reactions.query(lambda x: x.subsystem == subsystem)
    if subsystem not in model.groups:
        group = Group(id=subsystem, name=subsystem, members=reaction_list)
        model.add_groups([group])
    else:
        # Defensive: a group with this ID still exists, so extend it. `group`
        # is bound to the group itself, not to the result of `add_members`.
        group = model.groups.get_by_id(subsystem)
        group.add_members(reaction_list)

### Assemble protein data
#### Get model mappings

In [10]:
annotation_type = "genes"
mapping_key = "uniprot"
annotation_cols = [mapping_key]

# Collect the requested annotations for the model genes, label the ID column
# by annotation type, and keep only genes that have a value for the mapping key.
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename({"id": annotation_type}, axis=1)
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split each column with `split_string` and expand the results so that every
# row holds a single value per column, without duplicates or missing entries.
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col).drop_duplicates().dropna()

# Order by gene and renumber the rows, then report the unique count per column.
df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
print(df_model_mappings.nunique(dropna=True))
df_model_mappings

genes      820
uniprot    820
dtype: int64


Unnamed: 0,genes,uniprot
0,A4GALT,Q9NPC4
1,AARS1,P49588
2,AASDHPPT,Q9NRN7
3,ABCA1,O95477
4,ABCA7,Q8IZY2
...,...,...
815,YES1,P07947
816,ZDHHC2,Q9UIJ5
817,ZDHHC20,Q5W0Z9
818,ZDHHC3,Q9NYG2


#### Load protein amino acid sequences

In [11]:
# Read the table of isoform sequences from the sequence directory.
# NOTE(review): the file name always uses the UniProt tag, also when
# `sequence_dirpath` points at the MGI directory; confirm for mouse.
isoforms_filepath = sequence_dirpath / f"{UNIPROT_DB_TAG}_isoforms_sequences.tsv"
df_isoforms_sequences = pd.read_csv(isoforms_filepath, sep="\t", index_col=None)
df_isoforms_sequences = df_isoforms_sequences.fillna(pd.NA)

# Report the UniProt IDs that have at least one row flagged in the "erythroid" column.
erythroid_isoforms = df_isoforms_sequences[df_isoforms_sequences["erythroid"]]
print(erythroid_isoforms["uniprot"].unique())


# Attach the sequences to the model genes through the mapping key, keeping
# every gene from the model mappings (left join).
df_sequences = pd.merge(
    df_model_mappings,
    df_isoforms_sequences,
    how="left",
    left_on=mapping_key,
    right_on=mapping_key,
)
# Keep only the columns needed downstream and drop rows without a sequence.
sequence_columns = ["genes", "sequence.id", "sequence"]
df_sequences = df_sequences[sequence_columns].dropna(subset=["sequence"], axis=0)
df_sequences

['P00167' 'P00387' 'P02730' 'P05089' 'P08397' 'P19367' 'P22303' 'P30613'
 'Q8N0V5' 'Q9H0P0']


Unnamed: 0,genes,sequence.id,sequence
0,A4GALT,Q9NPC4,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...
1,AARS1,P49588-1,MDSTLTASEIRQRFIDFFKRNEHTYVHSSATIPLDDPTLLFANAGM...
2,AARS1,P49588-2,MDSTLTASEIRQRFIDFFKRNEHTYVHSSATIPLDDPTLLFANAGM...
3,AASDHPPT,Q9NRN7-1,MVFPAKRFCLVPSMEGVRWAFSCGTWLPSRAEWLLAVRSIQPEEKE...
4,AASDHPPT,Q9NRN7-2,MVFPAKRFCLVPSMEGVRWAFSCGTWLPSRAEWLLAVRSIQPEEKE...
...,...,...,...
1851,ZDHHC20,Q5W0Z9-4,MAPWTLWRCCQRVVGWVPVLFITFVVVWSYYAYVVELCVFTIFGNE...
1852,ZDHHC3,Q9NYG2-1,MMLIPTHHFRNIERKPEYLQPEKCVPPPYPGPVGTMWFIRDGCGIA...
1853,ZDHHC3,Q9NYG2-2,MMLIPTHHFRNIERKPEYLQPEKCVPPPYPGPVGTMWFIRDGCGIA...
1854,ZDHHC5,Q9C0B5-1,MPAESGKRFKPSKYVPVSAAAIFLVGATTLFFAFTCPGLSLYVSPA...


### Remove genes without sequence data

In [12]:
# Build the lookup set once instead of rebuilding it inside the query function
# for every gene in the model.
genes_with_sequences = set(df_sequences["genes"])
# Genes without any sequence entry are removed from the model; reactions are kept.
genes_to_remove = model.genes.query(lambda x: x.id not in genes_with_sequences)
remove_genes(model, gene_list=genes_to_remove, remove_reactions=False)

### Export model

In [13]:
# The model is written to its own subdirectory of the overlay directory,
# named after the model ID.
model_dirpath = overlay_dirpath / model.id
model_dirpath.mkdir(exist_ok=True)
# Write the model to "<model id>.xml" inside that subdirectory.
write_cobra_model(model, filename=model_dirpath / f"{model.id}.xml")
# Display the summary of the exported model.
model

0,1
Name,RBC_GEM
Memory address,149d52650
Number of metabolites,2157
Number of reactions,3275
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"
