# Create Protein-Constrained RBC model via OVERLAY workflow 
This notebook facilitates the construction of a proteome constrained model ("pcModel") via the OVERLAY methodology.
## Setup
### Import packages

In [2]:
import itertools
from collections import defaultdict
from pathlib import Path

import gurobipy as gp
import pandas as pd
from rbc_gem_utils import (COBRA_CONFIGURATION, DATABASE_PATH, ROOT_PATH,
                           build_string, get_annotation_df, read_cobra_model,
                           show_versions, split_string, write_cobra_model)
from rbc_gem_utils.analysis.overlay import (
    ATTR_SUBCLASS_DICT, DEFAULT_COMPARTMENT_CONSTRAINT_PREFIX,
    DEFAULT_CONCENTRATION_BOUND, DEFAULT_CONSTRAINT_PREFIX,
    DEFAULT_ENZYME_FORWARD_SUFFIX, DEFAULT_ENZYME_REVERSE_SUFFIX,
    DEFAULT_ENZYME_TOTAL_SUFFIX, DEFAULT_ISOFORM_CONSTRAINT_PREFIX,
    DEFAULT_KEFF, Enzyme, EnzymeDilution, Protein, ProteinDilution,
    ProteomeBudget, ProteomeBudgetDilution, construct_pcmodel_from_tables,
    create_complex_table, create_enzyme_table, create_protein_table,
    create_sequence_table)
from rbc_gem_utils.database.uniprot import UNIPROT_DB_TAG, UNIPROT_PATH
from rbc_gem_utils.util import strip_plural

# Switch off both Gurobi output parameters so the solver stays quiet in the notebook.
for gurobi_param in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_param, 0)

# Print package and dependency versions used to run this notebook
show_versions()

Set parameter Username

Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Informat

### Define configuration
#### COBRA Configuration

In [3]:
# Use Gurobi as the optimization solver for COBRA models.
COBRA_CONFIGURATION.solver = "gurobi"
# Default (lower, upper) reaction bounds: -1000 and 1000.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Show the resulting configuration as the cell output.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Load RBC model

In [4]:
# Identifier of the base model; it names both the model folder and the XML file.
model_id = "RBC3P"

# Settings defined here for the rest of the notebook (not used in this cell).
overwrite = True
save_figures = True
imagetype = "svg"

# Root directory for OVERLAY data; each model has its own sub-directory.
data_path = Path(f"{ROOT_PATH}/data/analysis/OVERLAY").resolve()
model_dirpath = data_path / model_id

model = read_cobra_model(filename=f"{model_dirpath}/{model_id}.xml")
model

0,1
Name,RBC3P
Memory address,1508db290
Number of metabolites,72
Number of reactions,85
Number of genes,103
Number of groups,10
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


In [5]:
annotation_type = "genes"
mapping_key = "uniprot"
annotation_cols = [mapping_key]

# Gene annotations as a table: one column of gene IDs, one of UniProt accessions.
# Genes without a UniProt annotation are discarded.
df_model_mappings = (
    get_annotation_df(model.genes, annotation_cols)
    .rename(columns={"id": annotation_type})
    .dropna(subset=[mapping_key])
)
# Split multi-valued cells and expand them to one value per row, column by column.
for col in list(df_model_mappings.columns):
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col)
    df_model_mappings = df_model_mappings.drop_duplicates().dropna()
df_model_mappings = df_model_mappings.sort_values(annotation_type)

# Number of distinct genes and accessions
print(df_model_mappings.nunique(dropna=True))
df_model_mappings = df_model_mappings.reset_index(drop=True)
df_model_mappings

genes      103
uniprot    103
dtype: int64


Unnamed: 0,genes,uniprot
0,ADA,P00813
1,ADK,P55263
2,AK1,P00568
3,ALDOA,P04075
4,ALDOB,P05062
...,...,...
98,TMEM109,Q9BVC6
99,TPI1,P60174
100,TRPC6,Q9Y210
101,TRPV2,Q9Y5S1


## Assemble data for PC-model
### Load protein data
#### Protein amino acid sequences


In [6]:
# Load the UniProt isoform/sequence table (tab-separated) and normalize missing values.
df_isoforms_sequences = pd.read_csv(
    f"{ROOT_PATH}{DATABASE_PATH}{UNIPROT_PATH}/{UNIPROT_DB_TAG}_isoforms_sequences.tsv",
    sep="\t",
    index_col=None,
)
df_isoforms_sequences = df_isoforms_sequences.fillna(pd.NA)
# Accessions that have at least one isoform flagged as erythroid
print(
    df_isoforms_sequences.loc[df_isoforms_sequences["erythroid"], "uniprot"].unique()
)
df_isoforms_sequences

['P00167' 'P00387' 'P02730' 'P05089' 'P08397' 'P19367' 'P22303' 'P30613'
 'Q8N0V5' 'Q9H0P0']


Unnamed: 0,uniprot,uniprot.isoform,sequence.id,canonical,erythroid,backup,avoid,keywords.erythroid,keywords.backup,keywords.avoid,sequence,sequence.length
0,A0AVT1,A0AVT1-1,A0AVT1-1,True,False,False,False,,,,MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALYSR...,1052
1,A0AVT1,A0AVT1-2,A0AVT1-2,False,False,False,False,,,,MLKNFALLGVGTSKEKGMITVTDPDLIEKSNLNRQFLFRPHHIQKP...,578
2,A0AVT1,A0AVT1-3,A0AVT1-3,False,False,False,False,,,,MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALYSR...,389
3,A0AVT1,A0AVT1-4,A0AVT1-4,False,False,False,False,,,,MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALYSR...,340
4,A4D126,A4D126-1,A4D126-1,True,False,False,False,,,,MEAGPPGSARPAEPGPCLSGQRGADHTASASLQSVAGTEPGRHPQA...,451
...,...,...,...,...,...,...,...,...,...,...,...,...
1848,Q9Y6M4,Q9Y6M4-3,Q9Y6M4-3,False,False,False,False,,,,MENKKKDKDKSDDRMARPSGRSGHNTRGTGSSSSGVLMVGPNFRVG...,423
1849,Q9Y6M4,Q9Y6M4-4,Q9Y6M4-4,False,False,False,False,,,,MENKKKDKDKSDDRMARPSGRSGHNTRGTGSSSSGVLMVGPNFRVG...,424
1850,Q9Y6M4,Q9Y6M4-5,Q9Y6M4-5,False,False,False,False,,,,MKSRAPQLHLEYRFYKQLGSGDGIPQVYYFGPCGKYNAMVLELLGP...,348
1851,Q9Y6M4,Q9Y6M4-6,Q9Y6M4-6,False,False,False,False,,,,MVLELLGPSLEDLFDLCDRTFSLKTVLMIAIQLISRMEYVHSKNLI...,311


#### Determine protein isoforms and associated sequences

In [7]:
# Pool every isoform flagged erythroid, canonical, or backup. An isoform with
# more than one flag appears in several slices; drop_duplicates() (called without
# a subset) removes those repeated rows. The sort then orders rows by accession
# and lists erythroid-flagged isoforms first within each accession.
df_model_isoforms_sequences = (
    pd.concat(
        (
            df_isoforms_sequences[df_isoforms_sequences["erythroid"]],
            df_isoforms_sequences[df_isoforms_sequences["canonical"]],
            df_isoforms_sequences[df_isoforms_sequences["backup"]],
        ),
        axis=0,
    )
    .fillna(pd.NA)
    .drop_duplicates()
    .sort_values(
        ["uniprot", "erythroid", "uniprot.isoform"], ascending=[True, False, True]
    )
)

# Flag counts and row total of the pooled candidates (before filtering)
print(
    df_model_isoforms_sequences[["canonical", "erythroid", "backup", "avoid"]].sum(
        axis=0
    )
)
print(f"Total: {len(df_model_isoforms_sequences)}")
# Keep only the needed columns; resetting the index gives the 0..n-1 row labels
# that the label-based `.loc` assignments below rely on.
df_model_isoforms_sequences = df_model_isoforms_sequences.loc[
    :,
    [
        "uniprot",
        "uniprot.isoform",
        "sequence.id",
        "sequence",
        "sequence.length",
        "canonical",
        "erythroid",
        "backup",
        "avoid",
    ],
].reset_index(drop=True)


# Decide which isoforms to keep. The assignments below are applied in order and
# later ones override earlier ones.
df_model_isoforms_sequences = df_model_isoforms_sequences.copy()
# 1) Default: keep canonical isoforms.
df_model_isoforms_sequences["keep"] = df_model_isoforms_sequences["canonical"].values
# 2) Rows flagged "avoid" are not kept. `to_avoid` maps row label -> accession.
to_avoid = df_model_isoforms_sequences[df_model_isoforms_sequences["avoid"]][
    "uniprot"
].to_dict()
df_model_isoforms_sequences.loc[
    list(to_avoid),
    "keep",
] = False

# 3) For accessions with an avoided isoform, keep every other pooled isoform of
#    that accession that is not itself flagged "avoid".
df_possible_backups = df_model_isoforms_sequences[
    df_model_isoforms_sequences["uniprot"].isin(list(to_avoid.values()))
]
df_possible_backups = df_possible_backups[~df_possible_backups["avoid"]]
df_model_isoforms_sequences.loc[
    list(df_possible_backups.index),
    "keep",
] = True
# 4) Erythroid isoforms are always kept; being last, this also overrides "avoid".
df_model_isoforms_sequences.loc[
    df_model_isoforms_sequences[df_model_isoforms_sequences["erythroid"]].index,
    "keep",
] = True
df_model_isoforms_sequences = df_model_isoforms_sequences[
    df_model_isoforms_sequences["keep"]
]

# Accessions in the full table that ended up with no kept isoform: add back their
# canonical isoform. Note that `lost_ids` is rebound from a set of accessions to
# a DataFrame, and the appended rows carry the full table's columns (no "keep").
lost_ids = set(df_isoforms_sequences["uniprot"].unique()).difference(
    set(df_model_isoforms_sequences["uniprot"].unique())
)
if lost_ids:
    lost_ids = df_isoforms_sequences[df_isoforms_sequences["uniprot"].isin(lost_ids)]
    df_model_isoforms_sequences = pd.concat(
        (df_model_isoforms_sequences, lost_ids[lost_ids["canonical"]]), axis=0
    )
# Flag counts and row total after filtering
print()
print(
    df_model_isoforms_sequences[["canonical", "erythroid", "backup", "avoid"]].sum(
        axis=0
    )
)
print(f"Total: {len(df_model_isoforms_sequences)}")

# Reduce to the columns needed for the merge with the model's gene mappings.
df_model_isoforms_sequences = df_model_isoforms_sequences.loc[
    :, ["uniprot", "sequence.id", "sequence"]
].copy()
# Attach sequences to model genes through the shared "uniprot" column. A gene
# whose accession kept several isoforms yields one row per isoform.
df_sequence_data = (
    df_model_mappings.merge(
        df_model_isoforms_sequences, left_on="uniprot", right_on="uniprot"
    )
    .loc[:, ["genes", "uniprot", "sequence.id", "sequence"]]
    .copy()
)
df_sequence_data

canonical    820
erythroid     10
backup        93
avoid         22
dtype: int64
Total: 887

canonical    807
erythroid     10
backup        53
avoid          2
dtype: int64
Total: 835


Unnamed: 0,genes,uniprot,sequence.id,sequence
0,ADA,P00813,P00813,MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGL...
1,ADK,P55263,P55263-1,MAAAEEEPKPKKLKVEAPQALRENILFGMGNPLLDISAVVDKDFLD...
2,AK1,P00568,P00568,MEEKLKKTKIIFVVGGPGSGKGTQCEKIVQKYGYTHLSTGDLLRSE...
3,ALDOA,P04075,P04075-1,MPYQYPALTPEQKKELSDIAHRIVAPGKGILAADESTGSIAKRLQS...
4,ALDOB,P05062,P05062,MAHRFPALTQEQKKELSEIAQSIVANGKGILAADESVGTMGNRLQR...
...,...,...,...,...
101,TMEM109,Q9BVC6,Q9BVC6,MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...
102,TPI1,P60174,P60174-1,MAPSRKFFVGGNWKMNGRKQSLGELIGTLNAAKVPADTEVVCAPPT...
103,TRPC6,Q9Y210,Q9Y210-1,MSQSPAFGPRRGSSPRGAAGAAARRNESQDYLLMDSELGEDGCPQA...
104,TRPV2,Q9Y5S1,Q9Y5S1,MTSPSSSPVFRLETLDGGQEDGSEADRGKLDFGSGLPPMESQFQGE...


In [8]:
# Build the per-protein sequence table and derive a protein identifier column.
mapping_key = "uniprot"
protein_id_key = (
    "sequence.id.genes"  # genes, uniprot, sequence.id, or sequence.id.genes are best,
)
# When True, only one row per UniProt accession is kept (see the end of the cell).
unique_gene_to_protein_map = True
isoform_transform = False
# Not used within this cell.
df_copy_numbers_data = None
# NOTE(review): create_sequence_table comes from rbc_gem_utils; the code below
# relies on its result having the columns genes, uniprot, sequence.id, sequence.
df_protein_data = create_sequence_table(
    df_sequences=df_sequence_data,
    mapping_key=mapping_key,
    isoform_transform=isoform_transform,
)
# Sequence IDs of accessions that occur on more than one row of df_sequence_data,
# i.e. accessions with several isoforms (`duplicated(False)` marks all repeats).
ordered_isoform_ids = df_sequence_data[df_sequence_data["uniprot"].duplicated(False)][
    "sequence.id"
]
df_isoforms = df_protein_data[
    df_protein_data["sequence.id"].isin(ordered_isoform_ids)
].copy()
print(f"Number of proteins: {len(df_isoforms[mapping_key].unique())}")
print(f"Number of isoforms: {len(df_isoforms['sequence.id'].unique())}")
# Reorder rows: multi-isoform entries first (in their original order), then all
# remaining sequence IDs.
df_protein_data = df_protein_data.set_index("sequence.id")
df_protein_data = pd.concat(
    (
        df_protein_data.loc[ordered_isoform_ids],
        df_protein_data.loc[df_protein_data.index.difference(ordered_isoform_ids)],
    ),
    axis=0,
)
df_protein_data = df_protein_data.reset_index(drop=False)
df_protein_data = df_protein_data.loc[
    :, ["genes", "uniprot", "sequence.id", "sequence"]
].copy()
# print(df_isoforms[mapping_key])
if protein_id_key == "sequence.id.genes":
    # Build "protein.id" from the sequence ID: split on "-", replace the accession
    # by its gene ID, and re-join with "_" (e.g. P00167-2 -> CYB5A_2).
    protein_id_key = "protein.id"
    sequence_id_updates = df_model_mappings.set_index("uniprot")["genes"].to_dict()
    df_protein_data["protein.id"] = df_protein_data["sequence.id"].apply(
        lambda seq_id: "_".join(
            [sequence_id_updates.get(x, x) for x in seq_id.split("-")]
        )
    )
    df_isoforms["protein.id"] = df_isoforms["sequence.id"].apply(
        lambda seq_id: "_".join(
            [sequence_id_updates.get(x, x) for x in seq_id.split("-")]
        )
    )
    # Entries that are not multi-isoform keep only the text before the first "_",
    # which removes the isoform number (e.g. O00555-8 -> CACNA1A).
    ids_to_fix = df_protein_data[
        ~df_protein_data["sequence.id"].isin(df_isoforms["sequence.id"])
    ].index
    df_protein_data.loc[ids_to_fix, "protein.id"] = df_protein_data.loc[
        ids_to_fix, "protein.id"
    ].apply(lambda x: x.split("_")[0])

# Use to remove duplicates
# Keeps the first row per accession (multi-isoform rows were placed first above).
# NOTE: this also overwrites protein_id_key with "genes", whatever was chosen above.
if unique_gene_to_protein_map:
    df_protein_data = df_protein_data.drop_duplicates(
        subset=["uniprot"],
        keep="first",
    )
    protein_id_key = "genes"

df_protein_data

Number of proteins: 3
Number of isoforms: 6


Unnamed: 0,genes,uniprot,sequence.id,sequence,protein.id
0,CYB5A,P00167,P00167-2,MAEQSDEAVKYYTLEEIQKHNHSKSTWLILHHKVYDLTKFLEEHPG...,CYB5A_2
2,CYB5R3,P00387,P00387-2,MKLFQRSTPAITLESPDIKYPLRLIDREIISHDTRRFRFALPSPQH...,CYB5R3_2
4,HK1,P19367,P19367-2,MDCEHSLSLPCRGAEAWEIGIDKYLYAMRLSDETLIDIMTRFRKEM...,HK1_2
6,CACNA1A,O00555,O00555-8,MARFGDEMPARYGGGGSGAAAGVVVGSGGGRGAGGSRQGGQPGAQR...,CACNA1A
7,GAPDHS,O14556,O14556,MSKRDIVLTNVTVVQLLRQPCPVTRAPPPPEPKAEVEPQPQPEPTP...,GAPDHS
...,...,...,...,...,...
101,SLC22A4,Q9H015,Q9H015,MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTP...,SLC22A4
102,PPA2,Q9H2U2,Q9H2U2-1,MSALLRLLRTGAPAAACLRLGTSAGTGSRRAMALYHTEERGQPCSQ...,PPA2
103,P2RX2,Q9UBL9,Q9UBL9-1,MAAAQPKYPAGATARRLARGCWSALWDYETPKVIVVRNRRLGVLYR...,P2RX2
104,TRPC6,Q9Y210,Q9Y210-1,MSQSPAFGPRRGSSPRGAAGAAARRNESQDYLLMDSELGEDGCPQA...,TRPC6


###  List all unique proteins, complexes, and enzymes
#### Option 1: Initialize draft tables
1. The draft tables are created and used to initialize the draft PC-model.
    * The protein table can be used to initialize proteins and their molar weight ($\textbf{d}$ vector).
    * The complex table can be used to initialize complexes with their subunit stoichiometry ($\textbf{C}$ matrix).
        * All stoichiometric coefficients are initialized at a value of one.
    * The enzyme table can be used to initialize enzymes with their effective rate constants ($\textbf{K}_\mathrm{eff}$ matrix).
        * All $k_\mathrm{eff}$ values are initialized at an average rate constant of 65 $s^{-1}$ (or equivalently, 234000 $hr^{-1}$).

2. The draft tables are made to facilitate curation and data replacement. Therefore, the draft PC-model is exported with the draft tables.
3. A refined PC-model can be created using the curated tables. 

#### Option 2: Load tables from files
4. The formation of a draft model can be skipped if the curated tables already exist. They can be loaded.

In [9]:
# Collects the protein, complex, and enzyme tables built in the cells below.
pcmodel_tables = {}
# Mapping of compartment labels to the labels used in the tables.
replace_compartments = {
    "c": "c",
    # Cytosol:extracellular --> plasma membrane
    "ce": "pm",
    "e": "e",  # Most extracellular reactions that occur are due to proteins bound to the external side of the membrane.
}

# Convert all protein compartments to one compartment
simplify_compartments = True
# Options passed through to the create_*_table helpers.
prefix = True
optional_columns = True

# Enzyme values for new tables
max_weight_fraction = 100
enzyme_keff_base = DEFAULT_KEFF
enzyme_forward_suffix = DEFAULT_ENZYME_FORWARD_SUFFIX
enzyme_reverse_suffix = DEFAULT_ENZYME_REVERSE_SUFFIX
enzyme_total_suffix = DEFAULT_ENZYME_TOTAL_SUFFIX

# Column used as the identifier key when creating each table type.
dict_of_id_keys = {
    "proteins": protein_id_key,
    "complexes": None,
    "enzymes": "reactions",
}

# Provide filepaths to load a specific model
# (all entries are commented out, so lookups raise KeyError and the cells below
# fall through to the general files or to creating draft tables)
model_filepaths = {
    # "proteins": f"{model_dirpath}/pcmodel_{model}_proteins.tsv",
    # "complexes": f"{model_dirpath}/pcmodel_{model}_complexes.tsv",
    # "enzymes": f"{model_dirpath}/pcmodel_{model}_enzymes.tsv",
    # "complex_keffs": f"{model_dirpath}/pcmodel_{model}_complex_keffs.tsv",
    # "enzyme_keffs": f"{model_dirpath}/pcmodel_{model}_enzyme_keffs.tsv",
}

# Provide filepaths to general files
# (only proteins and complexes are enabled; enzymes are created from scratch)
filepaths = {
    "proteins": f"{data_path}/pcmodel_proteins.tsv",
    "complexes": f"{data_path}/pcmodel_complexes.tsv",
    # "enzymes": f"{data_path}/pcmodel_enzymes.tsv",
    # "complex_keffs": f"{data_path}/pcmodel_complex_keffs.tsv",
    # "enzyme_keffs": f"{data_path}/pcmodel_enzyme_keffs.tsv",
}

##### Proteins

In [10]:
# Assemble the protein table, trying three sources in order:
#   1. a table previously generated for this specific model,
#   2. the general RBC-GEM protein table (used for its compartment assignments),
#   3. a draft table created from the model itself.
table_type = "proteins"
try:
    # Try loading previously built model proteins
    df_proteins = pd.read_csv(model_filepaths[table_type], sep="\t", index_col=None)
    print("Loaded from previously generated file")
except (FileNotFoundError, KeyError):
    # Otherwise try using main RBC-GEM files to make model proteins
    try:
        df_proteins = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    except (FileNotFoundError, KeyError):
        # Otherwise, make from scratch
        df_proteins = create_protein_table(
            model,
            df_protein_data=df_protein_data,
            id_key=dict_of_id_keys.get(table_type),
            prefix=prefix,
            optional_columns=optional_columns,
            annotation_columns=[
                "uniprot",
            ],
            replace_compartments=replace_compartments,
        )
        print("Created new table")
    else:
        # Take sequences from this notebook's protein data and only the
        # compartment from the loaded file, matched on UniProt accession.
        df_proteins = df_protein_data.merge(
            df_proteins[["uniprot", "compartment"]],
            left_on="uniprot",
            right_on="uniprot",
            how="left",
        )
        df_proteins["protein"] = df_proteins[protein_id_key].apply(
            lambda x: f"protein_{x}"
        )
        # "protein.id" exists only when the earlier cell was configured with
        # protein_id_key == "sequence.id.genes"; tolerate its absence so the
        # other documented identifier options do not fail with a KeyError.
        df_proteins = df_proteins.drop(columns="protein.id", errors="ignore")
        # Restrict to proteins whose gene is present in the model
        df_proteins = df_proteins[
            df_proteins["genes"].isin(model.genes.list_attr("id"))
        ].reset_index(drop=True)
        print("Loaded from main RBC-GEM file")

if simplify_compartments:
    # Collapse to one row per (gene, protein): the unique values of every other
    # column are joined with ";", then all proteins are placed in "pc".
    df_proteins = df_proteins.groupby(["genes", "protein"]).agg(
        lambda values: ";".join(
            [str(value) for value in list(values.dropna().unique())]
        )
    )
    df_proteins["compartment"] = "pc"
    df_proteins = df_proteins.reset_index(drop=False)

# Compartment-qualified identifier, e.g. protein_ADA + pc -> protein_ADA_pc
df_proteins["proteins"] = df_proteins[[strip_plural(table_type), "compartment"]].apply(
    lambda x: "_".join(x.values), axis=1
)
df_proteins = df_proteins.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_proteins.copy()
df_proteins

Loaded from main RBC-GEM file


Unnamed: 0_level_0,genes,uniprot,sequence.id,sequence,compartment,proteins
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
protein_ADA,ADA,P00813,P00813,MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGL...,pc,protein_ADA_pc
protein_ADK,ADK,P55263,P55263-1,MAAAEEEPKPKKLKVEAPQALRENILFGMGNPLLDISAVVDKDFLD...,pc,protein_ADK_pc
protein_AK1,AK1,P00568,P00568,MEEKLKKTKIIFVVGGPGSGKGTQCEKIVQKYGYTHLSTGDLLRSE...,pc,protein_AK1_pc
protein_ALDOA,ALDOA,P04075,P04075-1,MPYQYPALTPEQKKELSDIAHRIVAPGKGILAADESTGSIAKRLQS...,pc,protein_ALDOA_pc
protein_ALDOB,ALDOB,P05062,P05062,MAHRFPALTQEQKKELSEIAQSIVANGKGILAADESVGTMGNRLQR...,pc,protein_ALDOB_pc
...,...,...,...,...,...,...
protein_TMEM109,TMEM109,Q9BVC6,Q9BVC6,MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...,pc,protein_TMEM109_pc
protein_TPI1,TPI1,P60174,P60174-1,MAPSRKFFVGGNWKMNGRKQSLGELIGTLNAAKVPADTEVVCAPPT...,pc,protein_TPI1_pc
protein_TRPC6,TRPC6,Q9Y210,Q9Y210-1,MSQSPAFGPRRGSSPRGAAGAAARRNESQDYLLMDSELGEDGCPQA...,pc,protein_TRPC6_pc
protein_TRPV2,TRPV2,Q9Y5S1,Q9Y5S1,MTSPSSSPVFRLETLDGGQEDGSEADRGKLDFGSGLPPMESQFQGE...,pc,protein_TRPV2_pc


##### Complexes

In [11]:
# Assemble the complex table, trying in order: a model-specific file, the
# general RBC-GEM complex file (filtered to this model), or a draft table
# created from the model.
table_type = "complexes"
try:
    df_complexes = pd.read_csv(model_filepaths[table_type], sep="\t", index_col=None)
except (FileNotFoundError, KeyError):
    try:
        df_complexes = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    except (FileNotFoundError, KeyError):
        # Map each gene to its protein ID(s) from the protein table built above
        genes_to_proteins = (
            pcmodel_tables["proteins"]
            .groupby(["genes"], as_index=True)["proteins"]
            .agg(lambda x: build_string(list(x)))
            .to_dict()
        )
        cofactor_genes = {}
        # Create table
        df_complexes = create_complex_table(
            model,
            genes_to_proteins=genes_to_proteins,
            cofactor_genes=cofactor_genes,
            id_key=dict_of_id_keys.get(table_type),
            prefix=prefix,
            optional_columns=optional_columns,
            annotation_columns=[
                # "uniprot"
            ],
            replace_compartments=replace_compartments,
        )
    else:
        # General file loaded: keep only complexes whose genes are all in the model
        df_complexes = df_complexes[
            df_complexes["genes"].apply(
                lambda genes: all(
                    [model.genes.has_id(gene) for gene in genes.split(";")]
                )
            )
        ]
        # Keep only the reactions that exist in the model; drop complexes left
        # without any reaction.
        df_complexes["reactions"] = df_complexes["reactions"].apply(
            lambda reactions: ";".join(
                [r for r in reactions.split(";") if model.reactions.has_id(r)]
            )
        )
        df_complexes = df_complexes[df_complexes["reactions"] != ""]
        df_complexes = df_complexes.loc[
            :,
            [
                "complex",
                "subunits",
                "compartment",
                "reactions",
                "genes",
                "coefficients",
                "cofactors",
                "notes",
            ],
        ]

    # Address isoform mapping to complexes
    # (this section sits inside the outer `except`, so it is skipped when a
    # model-specific complex file was loaded)
    # For proteins whose gene appears on several rows of df_proteins, group the
    # protein IDs by the ID with its last "_"-separated part removed.
    isoforms_map = defaultdict(list)
    complex_name_update = defaultdict(list)
    for x in df_proteins[df_proteins["genes"].duplicated(False)].index:
        isoforms_map[x.rsplit("_", maxsplit=1)[0]].append(x)
        complex_name_update[
            x.rsplit("_", maxsplit=1)[0].replace("protein_", "")
        ].append(x.replace("protein_", ""))
    # Complexes having at least one subunit that is a key of isoforms_map
    df_isoforms_complexes = df_complexes[
        df_complexes["subunits"].apply(
            lambda proteins: bool(set(isoforms_map).intersection(proteins.split(";")))
        )
    ]
    # Expand each such complex into one row per combination of isoforms.
    df_updated_rows = []
    for _, row in df_isoforms_complexes.iterrows():
        # All names obtained by substituting isoform names into the "_"-separated
        # parts of the complex name
        complex_names = [
            complex_name
            for complex_name in itertools.product(
                *[complex_name_update.get(c, [c]) for c in row["complex"].split("_")]
            )
        ]
        # All subunit lists obtained by substituting isoforms for each subunit
        combos = [
            list(combo)
            for combo in itertools.product(
                *[
                    isoforms_map.get(protein, [protein])
                    for protein in row["subunits"].split(";")
                ]
            )
        ]

        # NOTE(review): names and subunit combinations are paired by position;
        # this assumes both products enumerate isoforms in the same order and
        # have the same length (zip silently truncates otherwise) - confirm.
        for complex_name, combo in zip(complex_names, combos):
            new_row = row.to_dict()
            new_row["complex"] = "_".join(complex_name)
            new_row["subunits"] = ";".join(combo)
            df_updated_rows.append(new_row)

    # Replace the original isoform-containing rows with their expanded versions
    df_complexes = pd.concat(
        (
            df_complexes[~df_complexes.index.isin(df_isoforms_complexes.index)],
            pd.DataFrame(df_updated_rows),
        ),
        axis=0,
    )

if simplify_compartments:
    # Collapse to one row per (subunits, complex): the unique values of every
    # other column are joined with ";", then all complexes are placed in "pc".
    df_complexes = df_complexes.groupby(["subunits", "complex"]).agg(
        lambda values: ";".join(
            [str(value) for value in list(values.dropna().unique())]
        )
    )
    df_complexes["compartment"] = "pc"
    df_complexes = df_complexes.reset_index(drop=False)

# Compartment-qualified complex identifier, e.g. cplx_MONOMER_ADA_pc
df_complexes["complexes"] = df_complexes[
    [strip_plural(table_type), "compartment"]
].apply(lambda x: "_".join(x.values), axis=1)
# Append the compartment to each subunit ID unless it already ends with "_pc"
df_complexes["subunits"] = df_complexes[["subunits", "compartment"]].apply(
    lambda values: ";".join(
        [
            "_".join((x, values["compartment"])) if not x.endswith("_pc") else x
            for x in values["subunits"].split(";")
        ]
    ),
    axis=1,
)
df_complexes = df_complexes.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_complexes.copy()
df_complexes

Unnamed: 0_level_0,subunits,compartment,reactions,genes,coefficients,cofactors,notes,complexes
complex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cplx_MONOMER_ADA,protein_ADA_pc,pc,ADA,ADA,1,,,cplx_MONOMER_ADA_pc
cplx_MONOMER_ADK,protein_ADK_pc,pc,ADNK1,ADK,1,,,cplx_MONOMER_ADK_pc
cplx_MONOMER_AK1,protein_AK1_pc,pc,ADK1,AK1,1,,,cplx_MONOMER_AK1_pc
cplx_HOMOTETRAMER_ALDOA,protein_ALDOA_pc,pc,FBA,ALDOA,4,,,cplx_HOMOTETRAMER_ALDOA_pc
cplx_HOMOTETRAMER_ALDOB,protein_ALDOB_pc,pc,FBA,ALDOB,4,,,cplx_HOMOTETRAMER_ALDOB_pc
...,...,...,...,...,...,...,...,...
cplx_MONOMER_TMEM109,protein_TMEM109_pc,pc,CA2t;Kt1,TMEM109,1,,,cplx_MONOMER_TMEM109_pc
cplx_HOMODIMER_TPI1,protein_TPI1_pc,pc,TPI,TPI1,2,,,cplx_HOMODIMER_TPI1_pc
cplx_HOMODIMER_TRPC6,protein_TRPC6_pc,pc,CA2t,TRPC6,2,,,cplx_HOMODIMER_TRPC6_pc
cplx_HOMOTETRAMER_TRPV2,protein_TRPV2_pc,pc,CA2t,TRPV2,4,,,cplx_HOMOTETRAMER_TRPV2_pc


##### Enzymes

In [12]:
# Assemble the enzyme table: model-specific file first, then the general file,
# and finally a draft table generated from the complex table.
table_type = "enzymes"
try:
    df_enzymes = pd.read_csv(model_filepaths[table_type], sep="\t", index_col=None)
except (FileNotFoundError, KeyError):
    try:
        df_enzymes = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    except (FileNotFoundError, KeyError):
        # Compartment-qualified complex ID -> reactions string
        complexes_to_reactions = (
            pcmodel_tables["complexes"].set_index("complexes")["reactions"].to_dict()
        )
        df_enzymes = create_enzyme_table(
            model,
            complexes_to_reactions=complexes_to_reactions,
            enzyme_keff_base=enzyme_keff_base,
            enzyme_forward_suffix=enzyme_forward_suffix,
            enzyme_reverse_suffix=enzyme_reverse_suffix,
            id_key=dict_of_id_keys.get(table_type),
            prefix=prefix,
            optional_columns=optional_columns,
            annotation_columns=[
                # "uniprot"
            ],
            replace_compartments=replace_compartments,
        )
        if replace_compartments:
            df_enzymes["compartment"] = df_enzymes["compartment"].replace(
                replace_compartments
            )

if simplify_compartments:
    # One row per (complexes, enzyme); unique values of the remaining columns are
    # joined with ";" and every enzyme is assigned to the "pc" compartment.
    df_enzymes = (
        df_enzymes.groupby(["complexes", "enzyme"])
        .agg(lambda values: ";".join(map(str, values.dropna().unique())))
        .assign(compartment="pc")
        .reset_index()
    )

# Compartment-qualified enzyme identifier, e.g. enzyme_NaKt_fwd_pc
df_enzymes[table_type] = df_enzymes[[strip_plural(table_type), "compartment"]].apply(
    lambda row: "_".join(row.values), axis=1
)
df_enzymes = df_enzymes.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_enzymes.copy()
df_enzymes

Unnamed: 0_level_0,complexes,compartment,reactions,enzymes,direction
enzyme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
enzyme_FCYTOB5OXR_fwd,cplx_COMPLEX_CYB5R3_B5A_pc;cplx_COMPLEX_CYB5R3...,pc,FCYTOB5OXR,enzyme_FCYTOB5OXR_fwd_pc,forward
enzyme_FCYTOB5OXR_rev,cplx_COMPLEX_CYB5R3_B5A_pc;cplx_COMPLEX_CYB5R3...,pc,FCYTOB5OXR,enzyme_FCYTOB5OXR_rev_pc,reverse
enzyme_METHBCYTBR_fwd,cplx_COMPLEX_HBA_CYB5A_pc;cplx_COMPLEX_HBB_CYB...,pc,METHBCYTBR,enzyme_METHBCYTBR_fwd_pc,forward
enzyme_METHBCYTBR_rev,cplx_COMPLEX_HBA_CYB5A_pc;cplx_COMPLEX_HBB_CYB...,pc,METHBCYTBR,enzyme_METHBCYTBR_rev_pc,reverse
enzyme_NaKt_fwd,cplx_HETERODIMER_ATP1_A1B1_pc;cplx_HETERODIMER...,pc,NaKt,enzyme_NaKt_fwd_pc,forward
...,...,...,...,...,...
enzyme_INSt_rev,cplx_MONOMER_SLC29A1_pc,pc,INSt,enzyme_INSt_rev_pc,reverse
enzyme_ADEt_fwd,cplx_MONOMER_SLC29A1_pc;cplx_MONOMER_SLC43A3_pc,pc,ADEt,enzyme_ADEt_fwd_pc,forward
enzyme_ADEt_rev,cplx_MONOMER_SLC29A1_pc;cplx_MONOMER_SLC43A3_pc,pc,ADEt,enzyme_ADEt_rev_pc,reverse
enzyme_HYXNt_fwd,cplx_MONOMER_SLC29A1_pc;cplx_MONOMER_SLC43A3_pc,pc,HYXNt,enzyme_HYXNt_fwd_pc,forward


## Create PC-model

In [13]:
# The tables are stored indexed by identifier; restore that index as a column
# before handing them to the model constructor.
protein_table, complex_table, enzyme_table = (
    pcmodel_tables[name].reset_index(drop=False)
    for name in ("proteins", "complexes", "enzymes")
)
max_weight_fraction = 100

pcmodel, final_pcmodel_tables = construct_pcmodel_from_tables(
    model,
    protein_table=protein_table,
    complex_table=complex_table,
    enzyme_table=enzyme_table,
    max_weight_fraction=max_weight_fraction,
    enzyme_keff_base=enzyme_keff_base,
    enzyme_forward_suffix=enzyme_forward_suffix,
    enzyme_reverse_suffix=enzyme_reverse_suffix,
    enzyme_total_suffix=enzyme_total_suffix,
    include_complex_dilutions=True,  # Relaxes constraints around complexes. Recommended to start; can be set to zero later or removed entirely
    irrev_rxn_complex_keff=0,  # Set as None to ignore, small number to keep in model, 0 to remove from complex-enzyme mapping
)
if simplify_compartments:
    pcmodel.compartments = {"pc": "protein compartment"}

# Print summary: for each model attribute, count the objects that belong to none
# of the listed subclasses, then count the objects of each subclass.
for attr, subclass_dict in ATTR_SUBCLASS_DICT.items():
    attr_objects = getattr(pcmodel, attr)
    subclasses = tuple(subclass_dict.values())
    n = len(attr_objects.query(lambda x: not isinstance(x, subclasses)))
    print(f"Number of {attr}: {n}")
    for key, subcls in subclass_dict.items():
        obj_list = attr_objects.query(lambda x: isinstance(x, subcls))
        n = len(obj_list)
        print(f"Number of {key}: {n}")
        if subcls not in (Enzyme, EnzymeDilution):
            continue
        # Enzyme-type objects are further broken down by the suffix in their ID
        n_forward = len(obj_list.query(lambda x: enzyme_forward_suffix in x.id))
        n_reverse = len(obj_list.query(lambda x: enzyme_reverse_suffix in x.id))
        n_total = len(obj_list.query(lambda x: enzyme_total_suffix in x.id))
        print(f"Forward variable: {n_forward}")
        print(f"Reverse variable: {n_reverse}")
        print(f"Summation variable : {n_total}")
    print()

# Derive two rate-constant tables from the final enzyme table: one row per
# (enzyme, complex) pair with its complex keff, and one row per enzyme with its
# enzyme keff. Both are stored in `final_pcmodel_tables`.
keff_table = final_pcmodel_tables["enzymes"].copy()
# Start from the reaction equation string of each enzyme's reaction.
keff_table["direction"] = keff_table["reactions"].apply(
    lambda rid: model.reactions.get_by_id(rid).reaction
)
# Show reversible arrows as forward arrows; reverse enzymes are flipped next.
keff_table["direction"] = keff_table["direction"].apply(
    lambda x: x.replace("<=>", "-->")
)
# Flip the arrow for reverse-direction enzymes. The configured
# `enzyme_reverse_suffix` (the one used to build the model) is tested instead
# of the package default, so a customized suffix is still recognized.
keff_table["direction"] = keff_table[["enzyme", "direction"]].apply(
    lambda x: (
        x["direction"].replace("-->", "<--")
        if x["enzyme"].endswith(enzyme_reverse_suffix)
        else x["direction"]
    ),
    axis=1,
)
# Complexes and their keffs are stored as parallel ";"-separated strings; split
# both and expand them together into one row per enzyme-complex pair.
keff_table["complexes"] = keff_table["complexes"].apply(lambda x: x.split(";"))
keff_table["complex_keff"] = keff_table["complex_keff"].apply(lambda x: x.split(";"))
keff_table = keff_table.explode(["complexes", "complex_keff"])
# Strip only the trailing "_<compartment>" tag from the complex ID; a plain
# str.replace would also remove the same text anywhere else in the ID.
keff_table["complex"] = keff_table[["complexes", "compartment"]].apply(
    lambda x: x["complexes"].removesuffix(f"_{x['compartment']}"), axis=1
)
# One row per (enzyme, complex); the first unique value is kept for each column.
keff_table = keff_table.groupby(["enzyme", "complex"], as_index=False).agg(
    lambda x: list(x.unique())[0]
)
keff_table = keff_table.loc[
    :,
    [
        "enzyme",
        "enzyme_keff",
        "complex",
        "complex_keff",
        "compartment",
        "reactions",
        "direction",
    ],
]
complex_keff_table = keff_table.drop("enzyme_keff", axis=1).drop_duplicates()
enzyme_keff_table = (
    keff_table.groupby(["enzyme", "enzyme_keff"], as_index=False)[
        ["reactions", "direction"]
    ]
    .agg(lambda x: list(x.unique())[0])
    .drop_duplicates()
)
final_pcmodel_tables["complex_keffs"] = complex_keff_table
final_pcmodel_tables["enzyme_keffs"] = enzyme_keff_table

# Report how many rate constants are non-zero
n_cplx_keff = len(
    complex_keff_table[complex_keff_table["complex_keff"].astype(float) != 0]
)
print(f"Number of non-zero complex rate constants: {n_cplx_keff}")

n_enzyme_keff = len(
    enzyme_keff_table[enzyme_keff_table["enzyme_keff"].astype(float) != 0]
)
print(f"Number of non-zero enzyme rate constants: {n_enzyme_keff}")


# Write every final table as a tab-separated file under `data_path`. The file
# name embeds the string form of `pcmodel` and the table type.
for table_type, df_table in final_pcmodel_tables.items():
    table_filepath = f"{data_path}/pcmodel_{pcmodel}_{table_type}.tsv"
    df_table.to_csv(table_filepath, sep="\t", index=False)

Number of metabolites: 72
Number of proteins: 103
Number of complexes: 137
Number of enzymes: 174
Forward variable: 58
Reverse variable: 58
Summation variable : 58
Number of proteome budget: 1

Number of reactions: 85
Number of protein dilutions: 103
Number of complex formation reactions: 137
Number of complex dilutions: 137
Number of enzyme formation reactions: 296
Number of enzyme dilutions: 174
Forward variable: 58
Reverse variable: 58
Summation variable : 58
Number of proteome budget demand: 1

Number of non-zero complex rate constants: 296
Number of non-zero enzyme rate constants: 116


### Formulate additional protein constraints
#### Address isoforms and compartments with additional constraints
For isoforms and/or compartments, place an additional constraint such that the total sum of all isoforms does not exceed the measured concentration value.

In [14]:
mapping_key = "uniprot"
protein_table = pcmodel_tables["proteins"]
# Proteins that share their UniProt accession with at least one other row
shares_accession = protein_table[mapping_key].duplicated(keep=False)
# One row per (gene, accession); every other column becomes a list of values
df_additional_constraints = (
    protein_table[shares_accession]
    .groupby(["genes", mapping_key], as_index=False)
    .agg(lambda x: list(x))
)
if not df_additional_constraints.empty:
    # When bound columns are present, reduce each list to a single value:
    # the smallest lower bound and the largest upper bound.
    for bound_column, reducer in (("lower_bound", min), ("upper_bound", max)):
        if bound_column in df_additional_constraints.columns:
            df_additional_constraints[bound_column] = df_additional_constraints[
                bound_column
            ].apply(reducer)
data = {}
for idx, row in df_additional_constraints.iterrows():
    # Technically, always one gene but refers to genes attribute
    genes = row["genes"]
    uniprot = model.genes.get_by_id(genes).annotation.get(mapping_key, "")
    proteins = split_string(row.get("proteins"))
    proteins = pcmodel.metabolites.get_by_any(proteins)
    is_compartment = len({p.compartment for p in proteins}) > 1
    is_isoform = (
        len(
            {
                p.id.replace(f"_{p.compartment}", "").split(
                    "_",
                )[-1]
                for p in proteins
                if p.id.replace(f"_{p.compartment}", "")
                .split(
                    "_",
                )[-1]
                .isnumeric()
            }
        )
        > 1
    )
    if is_compartment and not is_isoform:
        default_prefix = DEFAULT_COMPARTMENT_CONSTRAINT_PREFIX
    elif is_isoform and not is_compartment:
        default_prefix = DEFAULT_ISOFORM_CONSTRAINT_PREFIX
    else:
        default_prefix = DEFAULT_CONSTRAINT_PREFIX
    constraint_id = row.get("constraints", f"{default_prefix}{genes}")
    lower_bound = float(row.get("lower_bound")) if row.get("lower_bound") else 0
    upper_bound = (
        float(row.get("upper_bound"))
        if row.get("upper_bound")
        else DEFAULT_CONCENTRATION_BOUND
    )
    protein_dilutions = [
        reaction
        for protein in proteins
        for reaction in list(protein.reactions)
        if reaction.id.endswith(protein.id)
    ]
    # "ISOCONS" is short for "ISOFORM CONSTRAINT"
    # "COMPCONS" is short for "COMPARTMENT CONSTRAINT"
    # "CONS" for general constraint
    data[idx] = {
        "constraints": constraint_id,
        "genes": genes,
        "proteins": build_string([p.id for p in proteins]),
        "reactions": build_string([p.id for p in protein_dilutions]),
        # Assume sum of isoforms is a constant, works well with proteomic measurements that do not distinguish
        "coefficients": ";".join([str(1) for p in protein_dilutions]),
        "lower_bound": lower_bound,
        "upper_bound": upper_bound,
        "unit": "nmol / gDW",
        mapping_key: uniprot,
    }
df_additional_constraints = pd.DataFrame.from_dict(data, orient="index")
df_additional_constraints.to_csv(
    f"{model_dirpath}/constraints_proteins_{pcmodel.id}.tsv", sep="\t", index=False
)
df_additional_constraints

## Add additional protein constraints to model

In [15]:
def _read_bound(row, column, default):
    """Read a constraint bound from a table row.

    Parameters
    ----------
    row : pandas.Series
        Row of the constraints table.
    column : str
        Name of the bound column to read.
    default : float
        Value to use when the column is absent or the cell is blank.

    Returns
    -------
    float
        ``default`` when the value is ``None``, NaN, or an empty string;
        otherwise ``float(value)``. An explicit ``0`` is kept as ``0.0``.
    """
    value = row.get(column)
    # Blank TSV cells are read back as NaN, which is truthy; test for it explicitly.
    if value is None or pd.isna(value) or value == "":
        return default
    return float(value)


try:
    df_additional_constraints = pd.read_csv(
        f"{model_dirpath}/constraints_proteins_{pcmodel.id}.tsv",
        sep="\t",
        index_col=None,
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No constraint file, or an empty one: there is nothing to add to the model.
    df_additional_constraints = pd.DataFrame()
else:
    if not df_additional_constraints.empty:
        for constraint_id, row in df_additional_constraints.set_index(
            "constraints"
        ).iterrows():
            reactions = pcmodel.reactions.get_by_any(row["reactions"].split(";"))
            # ``str`` guards against pandas inferring a numeric dtype for the column
            # (e.g. when every row holds a single coefficient).
            coefficients = str(row["coefficients"]).split(";")
            if len(coefficients) != len(reactions):
                # ``zip`` would silently drop the unmatched entries otherwise.
                raise ValueError(
                    f"Constraint '{constraint_id}' lists {len(reactions)} reactions "
                    f"but {len(coefficients)} coefficients."
                )
            # ``float`` accepts both integer and fractional coefficients.
            expression = sum(
                float(coeff) * reaction.flux_expression
                for reaction, coeff in zip(reactions, coefficients)
            )
            lower_bound = _read_bound(row, "lower_bound", 0)
            upper_bound = _read_bound(row, "upper_bound", DEFAULT_CONCENTRATION_BOUND)
            if constraint_id in pcmodel.constraints:
                # TODO warn
                # Replace an existing constraint of the same name instead of duplicating it.
                pcmodel.remove_cons_vars(pcmodel.constraints[constraint_id])
            additional_constraint = pcmodel.problem.Constraint(
                expression,
                name=constraint_id,
                lb=lower_bound,
                ub=upper_bound,
            )
            pcmodel.add_cons_vars(additional_constraint)

df_additional_constraints

### Add other additional constraints

In [16]:
# constraints_ratios_filepath = f"{data_path}/constraints_additional.tsv"
# df_constraints_additional = pd.read_csv(
#     constraints_ratios_filepath,
#     sep="\t",
#     index_col="constraints",
# )

# ratio_ids = set()
# skipped_constraints = set()
# not_found = set()
# for constraint_id, row in df_constraints_additional.iterrows():
#     subs_dict = {}
#     lhs = parse_expr(row["lhs"])
#     rhs = parse_expr(row["rhs"])

#     csense = row["csense"]
#     lb=None if csense == "<" else 0
#     ub=None if csense == ">" else 0
#     reactions = row["reactions"].split(";")
#     for reaction in reactions:
#         try:
#             reaction = model.reactions.get_by_id(reaction)
#         except Exception:
#             if reaction == str(rhs) or reaction == str(lhs):
#                 skipped_constraints.add(constraint_id)
#                 continue
#             else:
#                 not_found.add(reaction)
#                 subs_dict[reaction] = 0
#         else:
#             subs_dict[reaction.id] = reaction.flux_expression
#     if (str(rhs) == "0" or str(lhs) == "0") and len([r for r in reactions if r not in not_found]) <= 1:
#         skipped_constraints.add(constraint_id)
#     if constraint_id in skipped_constraints:
#         continue
#     expression = lhs - rhs
#     expression = expression.subs(subs_dict)
#     if str(expression) == "0":
#         print(f"{constraint_id} is always equal to 0, not including.")
#         skipped_constraints.add(constraint_id)
#         continue
#     try:
#         constraint = model.constraints[constraint_id]
#     except Exception:
#         pass
#     else:
#         model.remove_cons_vars(constraint)
#     constraint = model.problem.Constraint(
#         expression=expression,
#         name=constraint_id,
#         lb=float(lb) if lb is not None else lb,
#         ub=float(ub) if ub is not None else ub,
#     )
#     model.add_cons_vars(constraint)
#     # Convert units
#     if constraint.lb is not None:
#         constraint.lb = convert_L_to_gDW(float(constraint.lb))
#     if constraint.ub is not None:
#         constraint.ub = convert_L_to_gDW(float(constraint.ub))
#     df_constraints_additional.loc[constraint_id, "reactions"] = ";".join([r for r in reactions if r not in not_found])
#     print(constraint)
#     df_constraints_additional.loc[constraint_id, "lhs"] = str(lhs)
#     df_constraints_additional.loc[constraint_id, "rhs"] = str(rhs)


# df_constraints_additional = df_constraints_additional.loc[~df_constraints_additional.index.isin(not_found.union(skipped_constraints))]
# df_constraints_additional.to_csv(f"{model_dirpath}/constraints_additional.tsv", sep="\t", index=False)
# df_constraints_additional

### Annotate proteins with UniProt IDs and sequences

In [17]:
# Lookup tables keyed by protein ID.
proteins_by_id = protein_table.set_index("proteins")
protein_to_uniprot = proteins_by_id["uniprot"].to_dict()
protein_to_sequence = proteins_by_id["sequence"].to_dict()

# Protein dilution reactions: the protein ID is the reaction ID without "PROTDL_".
for protein_dilution in pcmodel.reactions.query(
    lambda x: isinstance(x, ProteinDilution)
):
    diluted_protein_id = str(protein_dilution.id).replace("PROTDL_", "")
    if not protein_to_uniprot.get(diluted_protein_id):
        continue
    protein_dilution.annotation["uniprot"] = protein_to_uniprot[diluted_protein_id]
    protein_dilution.annotation["uniprot.aa_sequence"] = protein_to_sequence[
        diluted_protein_id
    ]

# Protein metabolites are keyed directly by their own ID.
for protein in pcmodel.metabolites.query(lambda x: isinstance(x, Protein)):
    if not protein_to_uniprot.get(protein.id):
        continue
    protein.annotation["uniprot"] = protein_to_uniprot[protein.id]
    protein.annotation["uniprot.aa_sequence"] = protein_to_sequence[protein.id]


# Record the associated reaction on enzyme dilutions and enzymes.
enzyme_to_reaction = enzyme_table.set_index("enzymes")["reactions"].to_dict()
for enzyme_dilution in pcmodel.reactions.query(lambda x: isinstance(x, EnzymeDilution)):
    diluted_enzyme_id = str(enzyme_dilution.id).replace("ENZDL_", "")
    if not enzyme_to_reaction.get(diluted_enzyme_id):
        continue
    # Make it easier to use later
    enzyme_dilution.annotation["reaction"] = enzyme_to_reaction[diluted_enzyme_id]

for enzyme in pcmodel.metabolites.query(lambda x: isinstance(x, Enzyme)):
    if not enzyme_to_reaction.get(enzyme.id):
        continue
    enzyme.annotation["reaction"] = enzyme_to_reaction[enzyme.id]

### Set proteome budget constraints for low abundance and high-abundance proteomes
* RBCs are enucleated, terminally differentiated cells that are composed of 95% to 98% Hb by dry mass (mass of all the constituents of a cell in the absence of water)
    * PMID: 13429433, PMID: 13999462, PMID: 21796773, **PMID: 34378368**
* Therefore, remove hemoglobin from the low abundance proteome budget constraint and create a new constraint specific to hemoglobin abundance.
* Assume that hemoglobin makes up at least 90% of the dry mass and that all other proteins make up at most 10%.

In [18]:
# Proteome budget for low abundance proteins
PBDL_proteome_budget = pcmodel.reactions.get_by_id("PBDL_proteome_budget")
PBDL_proteome_budget.id = "PBDL_proteome_budget"
PBDL_proteome_budget.name = "Proteome budget demand (Low abundance)"

proteome_budget = pcmodel.metabolites.get_by_id("proteome_budget")
proteome_budget.id = "proteome_budget"
proteome_budget.name = "Proteome Budget Constraint (Low abundance)"

# Proteome budget for hemoglobin
PBDL_hemoglobin_budget = PBDL_proteome_budget.copy()
hemoglobin_budget = list(PBDL_hemoglobin_budget.metabolites).pop()
PBDL_hemoglobin_budget.id = "PBDL_hemoglobin_budget"
PBDL_hemoglobin_budget.name = "Proteome budget demand (Hemoglobin)"

hemoglobin_budget.id = "hemoglobin_budget"
hemoglobin_budget.name = "Hemoglobin Budget Constraint"

# Budget for generic model
PBDL_hemoglobin_budget.bounds = (900, 1000)
PBDL_proteome_budget.bounds = (0.0, 100)

pcmodel.add_reactions([PBDL_hemoglobin_budget])

remove_from_low_abundance_budget = [
    "HBA",
    "HBB",
    "HBD",
    "HBE1",
    "HBG1",
    "HBG2",
    "HBM",
    "HBQ1",
    "HBZ",
]
for reaction in proteome_budget.reactions:
    if any(
        [f"protein_{gid}" in reaction.id for gid in remove_from_low_abundance_budget]
    ):
        reaction.add_metabolites(
            {hemoglobin_budget: reaction.get_coefficient(proteome_budget)}
        )
        reaction.subtract_metabolites(
            {proteome_budget: reaction.get_coefficient(proteome_budget)}
        )

for reaction in sorted(
    pcmodel.metabolites.get_by_id("hemoglobin_budget").reactions, key=lambda x: x.id
):
    print(reaction)

# Add total budget constraint for hemoglobin and protein
pcmodel.add_metabolites(
    [
        ProteomeBudget(
            id="total_budget",
            name="Total Budget Constraint",
            compartment=proteome_budget.compartment,
        )
    ]
)
total_budget = pcmodel.metabolites.get_by_id("total_budget")
PBDL_total_budget = f"PBDL_{total_budget.id}"
pcmodel.add_reactions(
    [
        ProteomeBudgetDilution(
            id=PBDL_total_budget,
            name="Total budget demand",
            lower_bound=0,
            upper_bound=1000,
        )
    ]
)
PBDL_total_budget = pcmodel.reactions.get_by_id(PBDL_total_budget)
print()
for reaction in pcmodel.reactions.query(
    lambda x: isinstance(x, ProteomeBudgetDilution)
):
    if PBDL_total_budget.id == reaction.id:
        reaction.add_metabolites({total_budget: -1}, combine=False)
    else:
        reaction.add_metabolites({total_budget: 1}, combine=False)
    print(reaction)

PBDL_hemoglobin_budget: hemoglobin_budget --> 
PROTDL_protein_HBA_pc:  --> 0.015259421999999979 hemoglobin_budget + protein_HBA_pc
PROTDL_protein_HBB_pc:  --> 0.015997243999999983 hemoglobin_budget + protein_HBB_pc
PROTDL_protein_HBD_pc:  --> 0.016056335999999984 hemoglobin_budget + protein_HBD_pc
PROTDL_protein_HBE1_pc:  --> 0.016204680999999985 hemoglobin_budget + protein_HBE1_pc
PROTDL_protein_HBG1_pc:  --> 0.016127253999999987 hemoglobin_budget + protein_HBG1_pc
PROTDL_protein_HBG2_pc:  --> 0.016125281999999987 hemoglobin_budget + protein_HBG2_pc
PROTDL_protein_HBM_pc:  --> 0.015614928999999987 hemoglobin_budget + protein_HBM_pc
PROTDL_protein_HBQ1_pc:  --> 0.015507637999999985 hemoglobin_budget + protein_HBQ1_pc
PROTDL_protein_HBZ_pc:  --> 0.015637878999999983 hemoglobin_budget + protein_HBZ_pc

PBDL_proteome_budget: proteome_budget --> total_budget
PBDL_hemoglobin_budget: hemoglobin_budget --> total_budget
PBDL_total_budget: total_budget --> 


### Ensure model can be optimized (objective: NaKt flux)

In [19]:
# Objective: sum of the flux expressions of the listed reactions.
objective_rxns = ["NaKt"]
pcmodel.objective = sum(
    pcmodel.reactions.get_by_id(rid).flux_expression for rid in objective_rxns
)
pcsol = pcmodel.optimize()

# Build the set of reactions carrying flux once, instead of re-filtering the
# flux series for every reaction in the comprehension below.
nonzero_flux_ids = set(pcsol.fluxes[pcsol.fluxes != 0].index)

# Non-zero fluxes restricted to reactions that also exist in `model`.
pcsol.fluxes.loc[
    [r.id for r in model.reactions if r.id in nonzero_flux_ids]
].sort_index()

CO2t              -69.970153
DPGM              -17.492538
ENO                17.492538
EX_co2_e          -69.970153
EX_h2o_e          -17.492538
EX_h_e            -69.970153
EX_k_e            -69.970153
EX_na1_e          104.955229
EX_pi_e            34.985076
EX_pyr_e           17.492538
H2Ot              -17.492538
HB23DPGB          -17.492538
HBO2B1             69.970153
HBO2B2            -69.970153
Ht                -87.462691
NaKt               34.985076
PGK                17.492538
PGM                17.492538
PIt                34.985076
PYK                17.492538
PYRt2              17.492538
SK_hb4_23dpg_c    -17.492538
SK_hb_hco2_c       69.970153
SK_k_c             69.970153
SK_na1_c         -104.955229
Name: fluxes, dtype: float64

In [20]:
# Build the set of reactions carrying flux once, instead of re-filtering the
# flux series for every reaction in the comprehension below.
nonzero_flux_ids = set(pcsol.fluxes[pcsol.fluxes != 0].index)

# Non-zero fluxes through protein dilution reactions.
pcsol.fluxes.loc[
    [
        r.id
        for r in pcmodel.reactions.query(lambda x: isinstance(x, ProteinDilution))
        if r.id in nonzero_flux_ids
    ]
].sort_index()

PROTDL_protein_AQP1_pc         283.665747
PROTDL_protein_ATP1A3_pc       168.208208
PROTDL_protein_ATP1B3_pc       168.208208
PROTDL_protein_ENO3_pc         122.620530
PROTDL_protein_HBE1_pc       55400.771946
PROTDL_protein_HBZ_pc          143.763964
PROTDL_protein_PGAM1_pc        169.895153
PROTDL_protein_PGK1_pc          35.067271
PROTDL_protein_PKM_pc          482.618658
PROTDL_protein_RHAG_pc         468.449524
PROTDL_protein_SLC16A7_pc      132.715752
Name: fluxes, dtype: float64

In [21]:
# Build the set of reactions carrying flux once, instead of re-filtering the
# flux series for every reaction in the comprehension below.
nonzero_flux_ids = set(pcsol.fluxes[pcsol.fluxes != 0].index)

# Non-zero fluxes through enzyme dilution reactions.
pcsol.fluxes.loc[
    [
        r.id
        for r in pcmodel.reactions.query(lambda x: isinstance(x, EnzymeDilution))
        if r.id in nonzero_flux_ids
    ]
].sort_index()

ENZDL_enzyme_CO2t_fwd_pc          299.017747
ENZDL_enzyme_CO2t_total_pc        299.017747
ENZDL_enzyme_DPGM_fwd_pc           74.754437
ENZDL_enzyme_DPGM_total_pc         74.754437
ENZDL_enzyme_ENO_rev_pc            74.754437
ENZDL_enzyme_ENO_total_pc          74.754437
ENZDL_enzyme_H2Ot_fwd_pc           74.754437
ENZDL_enzyme_H2Ot_total_pc         74.754437
ENZDL_enzyme_HB23DPGB_fwd_pc       74.754437
ENZDL_enzyme_HB23DPGB_total_pc     74.754437
ENZDL_enzyme_HBO2B1_rev_pc        299.017747
ENZDL_enzyme_HBO2B1_total_pc      299.017747
ENZDL_enzyme_HBO2B2_fwd_pc        299.017747
ENZDL_enzyme_HBO2B2_total_pc      299.017747
ENZDL_enzyme_NaKt_rev_pc          149.508874
ENZDL_enzyme_NaKt_total_pc        149.508874
ENZDL_enzyme_PGK_rev_pc            74.754437
ENZDL_enzyme_PGK_total_pc          74.754437
ENZDL_enzyme_PGM_rev_pc            74.754437
ENZDL_enzyme_PGM_total_pc          74.754437
ENZDL_enzyme_PYK_rev_pc            74.754437
ENZDL_enzyme_PYK_total_pc          74.754437
ENZDL_enzy

### Export model

In [22]:
# Export the regular model first, then the protein-constrained model (without
# curated keffs), each as both .xml and .json.
# File names use the explicit model ID, matching how the constraint tables are
# named elsewhere in this notebook, rather than the implicit string form of the model.
for model_to_export in (model, pcmodel):
    for file_extension in ("xml", "json"):
        write_cobra_model(
            model_to_export,
            filename=f"{model_dirpath}/{model_to_export.id}.{file_extension}",
        )

### Update rate constants

In [22]:
# pcmodel_curated = load_overlay_model(filename=f"{model_dirpath}/{pcmodel.id}.xml")
# pcmodel_curated.id += "_keff_curated"
# df_complex_keffs = pd.read_csv(f"{data_path}/pcmodel_complex_keffs.tsv", sep="\t", index_col=None)
# df_enzyme_keffs = pd.read_csv(f"{data_path}/pcmodel_enzyme_keffs.tsv", sep="\t", index_col=None)

# cf = 1 / 1e6  # Conversion factor from nmol to mmol
# if simplify_compartments:
#     df_enzyme_keffs["compartment"] = "pc"
#     df_complex_keffs["compartment"] = "pc"

# df_enzyme_keffs["enzymes"] = df_enzyme_keffs[["enzyme", "compartment"]].apply(lambda x: "_".join(x.values), axis=1)
# df_complex_keffs["enzymes"] = df_complex_keffs[["enzyme", "compartment"]].apply(lambda x: "_".join(x.values), axis=1)
# df_complex_keffs["complexes"] = df_complex_keffs[["complex", "compartment"]].apply(lambda x: "_".join(x.values), axis=1)
# df_complex_keffs = df_complex_keffs[df_complex_keffs["complexes"].isin(
#     pcmodel_curated.metabolites.query(
#         lambda x: x.id.startswith("cplx_")).list_attr("id")
# )]

# df_complex_keffs["complex_keff"] = df_complex_keffs["complex_keff"].astype(float)
# df_enzyme_keffs["enzyme_keff"] = df_enzyme_keffs["enzyme_keff"].astype(float)

# df_enzyme_keffs = df_enzyme_keffs.drop_duplicates()
# df_complex_keffs = df_complex_keffs.drop_duplicates()
# for _, row in df_complex_keffs.iterrows():
#     enz = row["enzymes"]
#     try:
#         enz = pcmodel_curated.metabolites.get_by_id(enz)
#     except KeyError:
#         if enz.replace(DEFAULT_ENZYME_REVERSE_SUFFIX, "") in pcmodel_curated.metabolites:
#             enz_other_dir = pcmodel_curated.metabolites.get_by_id(enz.replace(DEFAULT_ENZYME_REVERSE_SUFFIX, ""))
#             missing_enz = enz_other_dir.copy()
#             missing_enz.id = enz
#             pcmodel_curated.add_metabolites([missing_enz])

#             enzyme_keff = df_enzyme_keffs[df_enzyme_keffs["enzymes"] == missing_enz.id]["enzyme_keff"].item()
#             df = df_complex_keffs[df_complex_keffs["enzymes"] == missing_enz.id].copy()
#             for _, (enzyme, cplx, complex_keff) in df[["enzyme", "complexes", "complex_keff"]].iterrows():
#                 if complex_keff == 0 or enzyme_keff == 0:
#                     continue

#                 keff = float(complex_keff) / float(enzyme_keff)
#                 formation_rxn = add_complex_formation_reaction(
#                     pcmodel_curated,
#                     missing_enz,
#                     "enzyme",
#                     coeff_map=f"{cplx}({keff})",
#                 )
#             if enzyme_keff != 0:
#                 dilution_rxn = add_dilution_reaction(
#                     pcmodel_curated,
#                     missing_enz,
#                     "enzyme",
#                 )

#                 for r in enz_other_dir.reactions:
#                     if not r.id in model.reactions:
#                         continue
#                     sign = -1 if enz_other_dir in r.reactants else 1
#                     pcmodel_curated.reactions.get_by_id(r.id).add_metabolites(
#                         {missing_enz: sign * (1 / enzyme_keff / cf)}, combine=False
#                     )
#     else:
#         enzyme_keff = df_enzyme_keffs[df_enzyme_keffs["enzymes"] == enz.id]["enzyme_keff"].item()
#         df = df_complex_keffs[df_complex_keffs["enzymes"] == enz.id].copy()
#         for _, (enzyme, cplx, complex_keff) in df[["enzyme", "complexes", "complex_keff"]].iterrows():
#             if complex_keff == 0 or enzyme_keff == 0:
#                 continue
#             keff = float(complex_keff) / float(enzyme_keff)
#             try:
#                 formation_rxn = pcmodel_curated.reactions.get_by_id(f"ENZFM_{enzyme}_{cplx}")
#             except KeyError:
#                 formation_rxn = add_complex_formation_reaction(
#                     pcmodel_curated,
#                     enz,
#                     "enzyme",
#                     coeff_map=f"{cplx}({keff})",
#                 )
#             else:
#                 formation_rxn.add_metabolites({cplx: -keff}, combine=False)
#         if enzyme_keff != 0:
#             for r in enz.reactions:
#                 if not r.id in model.reactions:
#                     continue
#                 sign = 1 if enz in r.products else -1
#                 pcmodel_curated.reactions.get_by_id(r.id).add_metabolites(
#                     {enz: sign * (1 / enzyme_keff / cf)}, combine=False
#                 )

# pcmodel_curated.remove_metabolites(pcmodel_curated.metabolites.query(lambda x: not x.reactions))
# df_complex_keffs.to_csv(f"{model_dirpath}/complex_keffs_{pcmodel_curated.id}.tsv", sep="\t", index=False)
# df_enzyme_keffs.to_csv(f"{model_dirpath}/enzyme_keffs_{pcmodel_curated.id}.tsv", sep="\t", index=False)

# # Print summary
# for attr, subclass_dict in ATTR_SUBCLASS_DICT.items():
#     n = len(getattr(pcmodel_curated, attr).query(lambda x: not isinstance(x, tuple(subclass_dict.values()))))
#     print(f"Number of {attr}: {n}")
#     for key, subcls in subclass_dict.items():
#         n = len(getattr(pcmodel_curated, attr).query(lambda x: isinstance(x, subcls)))
#         print(f"Number of {key}: {n}")
#     print()

# # Print summary
# for attr, subclass_dict in ATTR_SUBCLASS_DICT.items():
#     n = len(getattr(pcmodel_curated, attr).query(lambda x: not isinstance(x, tuple(subclass_dict.values()))))
#     print(f"Number of {attr}: {n}")
#     for key, subcls in subclass_dict.items():
#         n = len(getattr(pcmodel_curated, attr).query(lambda x: isinstance(x, subcls)))
#         print(f"Number of {key}: {n}")
#     print()


# try:
#     df_curated_complex_keffs = final_pcmodel_tables["complex_keffs"].set_index("enzymes")[["complexes", "complex_keff"]].copy()
# except KeyError:
#     print(f"Number of non-zero complex rate constants (curated): 0")
# else:
#     df_curated_complex_keffs = df_curated_complex_keffs.explode(["complexes", "complex_keff"]).reset_index(drop=False).drop_duplicates()
#     df_curated_complex_keffs = df_curated_complex_keffs[df_curated_complex_keffs["complex_keff"].astype(float) != 0.]
#     print(f"Number of non-zero complex rate constants (curated): {len(df_curated_complex_keffs)}")
# finally:
#     print(f"Number of non-zero complex rate constants (total): {n_cplx_keff}")
# formation_rxn
# try:
#     df_curated_enzyme_keffs = final_pcmodel_tables["enzyme_keffs"][["enzymes", "enzyme_keff"]].copy()
# except KeyError:
#     print(f"Number of non-zero enzyme rate constants (curated): 0")
# else:
#     df_curated_enzyme_keffs = df_curated_enzyme_keffs[df_curated_enzyme_keffs["enzyme_keff"].astype(float) != 0.]
#     print(f"Number of non-zero enzyme rate constants (curated): {len(df_curated_enzyme_keffs)}")
# finally:
#     print(f"Number of non-zero enzyme rate constants (total): {n_enzyme_keff}")

# write_cobra_model(pcmodel_curated, filename=f"{model_dirpath}/{pcmodel_curated}.xml")
# write_cobra_model(pcmodel_curated, filename=f"{model_dirpath}/{pcmodel_curated}.json")
# pcmodel_curated

In [23]:
# Show the summary table of the regular (non-protein-constrained) model.
model

0,1
Name,RBC_GEM
Memory address,1054486d0
Number of metabolites,2157
Number of reactions,3275
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space"


In [24]:
# Show the summary table of the protein-constrained model.
pcmodel

0,1
Name,RBC_GEM_PC
Memory address,156af4650
Number of metabolites,10410
Number of reactions,18799
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space, protein compartment"
