# Derive base model
## Setup
### Import packages

In [1]:
from warnings import warn

import pandas as pd
from cobra.core import Group
from cobra.manipulation import remove_genes
from cobra.util.array import create_stoichiometric_matrix, nullspace
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    write_cobra_model,
)
from rbc_gem_utils.database.mgi import MGI_DB_TAG
from rbc_gem_utils.database.uniprot import UNIPROT_DB_TAG

# Show six digits of precision when pandas renders floating point values.
pd.set_option("display.precision", 6)

import gurobipy as gp

# Switch off both Gurobi output parameters so solver logs stay out of the notebook.
for gurobi_param in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_param, 0)

# Show versions of notebook
show_versions()

Set parameter Username
Set parameter LicenseID to value 2664191

Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.2
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.3
openpyxl                              3.1.5
pandas                                2.2.3
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.3
scikit-learn                          1.6.1
scipy                                1.15.3
seaborn                              0.13.2

Build Tools Information
-----------

### Define configuration
#### COBRA Configuration

In [2]:
# Use Gurobi as the solver and -1000/+1000 as the default reaction bounds.
COBRA_CONFIGURATION.solver = "gurobi"
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Display the resulting configuration as the cell output.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,127
cache_directory,Path for the model cache,C:\Users\P7875\AppData\Local\opencobra\cobrapy\Cache
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


### Define organism

In [3]:
# Organism whose model is derived; the path setup accepts "Human" or "Mouse".
organism = "Mouse"

### Set paths

In [4]:
# Validate the organism first, then pick the directories holding its sequence
# data and its model file.
valid_organisms = {"Human", "Mouse"}
if organism not in valid_organisms:
    raise ValueError(f"Organism must be one of the following: {valid_organisms}")

if organism == "Mouse":
    # Load mouse GEM: the model is read from the same directory as the sequences
    sequence_dirpath = get_dirpath("database", MGI_DB_TAG)
    model_dirpath = sequence_dirpath
else:
    # Load human GEM: sequences and model live in separate directories
    sequence_dirpath = get_dirpath("database", UNIPROT_DB_TAG)
    model_dirpath = get_dirpath("model")

# Organism-specific output directory, created (with parents) if missing.
analysis_dirpath = get_dirpath("analysis")
overlay_dirpath = (analysis_dirpath / "OVERLAY" / organism).resolve()
overlay_dirpath.mkdir(exist_ok=True, parents=True)

## Load RBC-GEM model

In [5]:
# Read the model, then rescale every reaction's bounds to the configured defaults.
rbc_gem = read_cobra_model(model_dirpath / f"{GEM_NAME}.xml")

# Bounds recognized as irreversible keep their direction but take the
# configured magnitude.
irreversible_bounds = {
    (0.0, 1000.0): (0, COBRA_CONFIGURATION.upper_bound),
    (-1000.0, 0.0): (COBRA_CONFIGURATION.lower_bound, 0.0),
}
for rxn in rbc_gem.reactions:
    # NOTE(review): any other bounds, including custom constraints stored in
    # the file, are replaced by the fully reversible defaults - confirm intended.
    rxn.bounds = irreversible_bounds.get(rxn.bounds, COBRA_CONFIGURATION.bounds)


rbc_gem

0,1
Name,RBC_GEM
Memory address,1aa27edb510
Number of metabolites,2157
Number of reactions,3274
Number of genes,827
Number of groups,0
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


### Extract model
Use the full reconstruction (leave `reduced_model_id` as `None`), or set `reduced_model_id` to extract a pre-defined reduced RBC model from its reaction table.

In [6]:
# Work on a copy of the loaded reconstruction. With `reduced_model_id` left as
# None (or equal to the full model's ID) the copy is used unchanged; otherwise
# the model is trimmed to the reactions listed in
# `model_reactions_{reduced_model_id}.tsv`.
reduced_model_id = None
model = rbc_gem.copy()

# Boundary reaction type -> reaction ID prefix identifying that type.
boundary_types_dict = {
    "exchange": "EX_",
    "sink": "SK_",
    "demand": "DM_",
}


if reduced_model_id is not None and reduced_model_id != rbc_gem.id:
    df_reactions = pd.read_csv(
        get_dirpath("analysis")
        / organism
        / "reduced_models"
        / f"model_reactions_{reduced_model_id}.tsv",
        sep="\t",
        index_col=0,
        dtype=str,
    ).fillna("")

    # Determine pseudoreactions if any. Each boundary reaction ID is mapped to
    # its metabolite ID by stripping only the leading prefix, so a prefix
    # string that recurs later in the ID is left intact.
    pseudoreactions = set()
    boundaries = {}
    for btype, prefix in boundary_types_dict.items():
        boundaries[btype] = {
            r: r[len(prefix) :] for r in df_reactions.index if r.startswith(prefix)
        }
        pseudoreactions.update(boundaries[btype])

    # Extract reactions from model that are not pseudoreactions
    reactions = sorted(df_reactions.index.difference(pseudoreactions))
    reactions = model.reactions.get_by_any(reactions)
    # Remove reactions not in list. Membership is tested against a set of IDs
    # so the filter is linear rather than quadratic in the number of reactions.
    reaction_ids_to_keep = {reaction.id for reaction in reactions}
    model.remove_reactions(
        [x for x in model.reactions if x.id not in reaction_ids_to_keep],
        remove_orphans=True,
    )
    # Remove orphaned groups not in list
    model.remove_groups([x for x in model.groups if not x.members])
    # Cleanup gene reaction rules
    for reaction in reactions:
        reaction.gene_reaction_rule = df_reactions.loc[
            reaction.id, "gene_reaction_rule"
        ]
    to_remove = model.genes.query(lambda x: len(x.reactions) < 1)
    remove_genes(model, gene_list=to_remove, remove_reactions=False)

    # Add pseudoreactions, defining new ones if needed.
    for btype, boundary_dict in boundaries.items():
        for reaction_id, met_id in boundary_dict.items():
            try:
                met = model.metabolites.get_by_id(met_id)
            except KeyError:
                # The metabolite was removed with its reactions; skip it.
                warn(f"{met_id} not in model")
                continue
            try:
                model.add_boundary(met, type=btype)
            except ValueError:
                # The boundary could not be added (e.g. it already exists);
                # the lookup fails loudly if it is not present under this ID.
                model.reactions.get_by_id(reaction_id)

    for reaction in model.boundary:
        reaction.subsystem = "Pseudoreactions"

    model.id = reduced_model_id
model

0,1
Name,RBC_GEM
Memory address,1aa296cf750
Number of metabolites,2157
Number of reactions,3274
Number of genes,827
Number of groups,0
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


### Check blocked reactions

In [7]:
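# NOTE: optional blocked-reaction check, currently disabled. Re-enabling it requires
# `from cobra.flux_analysis import find_blocked_reactions`, which the import cell does not include.
# blocked_reactions = find_blocked_reactions(model, open_exchanges=False);
# blocked_reactions = sorted([str(r) for r in model.reactions.get_by_any(blocked_reactions) if r.subsystem not in {"Pseudoreactions", "Transport, extracellular"}])
# blocked_reactions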

### Set objective(s)

In [8]:
# Build a constrained copy for testing: every boundary reaction is limited to
# non-negative flux, then EX_glc__D_e alone is allowed flux in [-1, 0].
test_model = model.copy()
for r in test_model.boundary:
    r.bounds = (0, COBRA_CONFIGURATION.upper_bound)
test_model.reactions.get_by_id("EX_glc__D_e").bounds = (-1, 0)

objective_rxns = ["NaKt"]
# Give each model an objective built from its OWN reactions. Building the
# expression from `test_model` and assigning it to `model` would mix solver
# variables from two models and leave `test_model` untested.
for target_model in (model, test_model):
    target_model.objective = sum(
        [
            target_model.reactions.get_by_id(rid).flux_expression
            for rid in objective_rxns
        ]
    )

# Optimize the constrained copy to check the objective, then show the
# objective flux followed by all non-zero fluxes.
sol = test_model.optimize()
print(sol[objective_rxns])
sol.fluxes[sol.fluxes != 0]

NaKt    2.0
Name: fluxes, dtype: float64


EX_glc__D_e   -1.0
EX_h_e         2.0
EX_lac__L_e    2.0
ENO            2.0
FBA            1.0
HEX1           1.0
GAPD           2.0
LDH_L          2.0
NaKt           2.0
PFK            1.0
PGI            1.0
PGK            2.0
PGM            2.0
PYK            2.0
PYRt2          2.0
TPI            1.0
CYSTHRNaEx     3.0
GLC_Dt        -1.0
Kt1            4.0
PYR_LLACtex   -2.0
THRCYSNaEx     3.0
Name: fluxes, dtype: float64

### Get dimension of nullspace

In [9]:
# Stoichiometric matrix of the model and its nullspace.
S = create_stoichiometric_matrix(model)
ns = nullspace(S)
# Display the shape of the nullspace array; presumably the second entry is the
# nullspace dimension (one column per basis vector) - confirm against cobra docs.
ns.shape

(3274, 1257)

### Reset subsystems

In [10]:
# Rebuild the groups so there is one group per distinct reaction subsystem.
# A copy of the group list is passed: handing `model.groups` itself to
# `remove_groups` lets the container shrink while it is being iterated, which
# can leave some groups behind.
model.remove_groups(list(model.groups))
for subsystem in sorted(set(model.reactions.list_attr("subsystem"))):
    reaction_list = model.reactions.query(lambda x: x.subsystem == subsystem)
    if subsystem not in model.groups:
        group = Group(id=subsystem, name=subsystem, members=reaction_list)
        model.add_groups([group])
    else:
        # Defensive branch: extend a group that already exists under this ID.
        model.groups.get_by_id(subsystem).add_members(reaction_list)

### Assemble protein data
#### Get model mappings

In [11]:
# Build a table with one row per (gene, UniProt accession) pair, taken from the
# annotations of the model's genes.
annotation_type = "genes"
mapping_key = "uniprot"
annotation_cols = [mapping_key]

df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split each column's values and expand them so every row holds a single value.
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col).drop_duplicates().dropna()
df_model_mappings = df_model_mappings.sort_values(annotation_type)

# Report the number of unique values per column before renumbering the rows.
print(df_model_mappings.nunique(dropna=True))
df_model_mappings = df_model_mappings.reset_index(drop=True)
df_model_mappings

genes      827
uniprot    827
dtype: int64


Unnamed: 0,genes,uniprot
0,A4galt,Q67BJ4
1,Aars1,Q8BGQ7
2,Aasdhppt,Q9CQF6
3,Abca1,P41233
4,Abca7,Q91V24
...,...,...
822,Yes1,Q04736
823,Zdhhc2,P59267
824,Zdhhc20,Q5Y5T1
825,Zdhhc3,Q8R173


#### Load protein amino acid sequences

In [12]:
# Read the isoform sequence table from the organism's sequence directory.
isoforms_filepath = sequence_dirpath / f"{UNIPROT_DB_TAG}_isoforms_sequences.tsv"
df_isoforms_sequences = pd.read_csv(
    isoforms_filepath,
    sep="\t",
    index_col=None,
).fillna(pd.NA)

# Print the accessions of the rows flagged in the "erythroid" column.
is_erythroid = df_isoforms_sequences["erythroid"]
print(df_isoforms_sequences.loc[is_erythroid, "uniprot"].unique())


# Attach sequences to the model genes through the shared mapping key, then keep
# only the gene/sequence columns of rows that actually have a sequence.
df_sequences = df_model_mappings.merge(
    df_isoforms_sequences,
    left_on=mapping_key,
    right_on=mapping_key,
    how="left",
)
sequence_columns = ["genes", "sequence.id", "sequence"]
df_sequences = df_sequences.loc[:, sequence_columns].dropna(subset=["sequence"])
df_sequences

['P04919' 'P22907' 'P49282' 'P53657']


Unnamed: 0,genes,sequence.id,sequence
0,A4galt,Q67BJ4,MGISCSHLEETMSKPPDCLLRMLRGTPRQRVFTFFIISFKFMFLIS...
1,Aars1,Q8BGQ7,MDATLTAREIRERFINFFRRNEHTYVHSSATIPLDDPTLLFANAGM...
2,Aasdhppt,Q9CQF6-1,MVFPAKRLCVVPSMEGVRWAFSCGTWLPSRAEWLLAMRSIQPEEKE...
3,Aasdhppt,Q9CQF6-2,MVFPAKRLCVVPSMEGVRWAFSCGTWLPSRAEWLLAMRSIQPEEKE...
4,Abca1,P41233,MACWPQLRLLLWKNLTFRRRQTCQLLLEVAWPLFIFLILISVRLSY...
...,...,...,...
1104,Zdhhc20,Q5Y5T1-1,MAPWTLWRCCQRVVGWVPVLFITFVVVWSYYAYVVELCVSTISRTG...
1105,Zdhhc20,Q5Y5T1-2,MAPWTLWRCCQRVVGWVPVLFITFVVVWSYYAYVVELCVSTISRTG...
1106,Zdhhc3,Q8R173,MMLIPTHHFRDIERKPEYLQPEKCAPPPFPGPAGAMWFIRDGCGIA...
1107,Zdhhc5,Q8VDZ4-1,MPAESGKRFKPSKYVPVSAAAIFLVGATTLFFAFTCPGLSLNVSPA...


### Remove genes without sequence data

In [13]:
# Collect the gene IDs that have sequence data once, rather than rebuilding the
# set inside the predicate for every gene in the model.
genes_with_sequences = set(df_sequences["genes"])
# Drop genes lacking a sequence from the model while keeping their reactions.
genes_to_remove = model.genes.query(lambda x: x.id not in genes_with_sequences)
remove_genes(model, gene_list=genes_to_remove, remove_reactions=False)

### Export model

In [14]:
# Write the model to <overlay_dirpath>/<model.id>/<model.id>.xml, creating the
# model-specific directory if it does not exist yet.
new_model_dirpath = overlay_dirpath.joinpath(model.id)
new_model_dirpath.mkdir(exist_ok=True)
model_filepath = new_model_dirpath.joinpath(f"{model.id}.xml")
write_cobra_model(model, filename=model_filepath)
model

0,1
Name,RBC_GEM
Memory address,1aa296cf750
Number of metabolites,2157
Number of reactions,3274
Number of genes,826
Number of groups,1
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"
