# Create representative Proteome-Constrained RBC model for dataset
## Setup
### Import packages

In [1]:
import itertools
from collections import defaultdict
from pathlib import Path

import gurobipy as gp
import pandas as pd
from rbc_gem_utils import (COBRA_CONFIGURATION, DATABASE_PATH, GEM_NAME,
                           PROCESSED_PATH, ROOT_PATH, build_string,
                           get_annotation_df, read_cobra_model, show_versions,
                           split_string, write_cobra_model)
from rbc_gem_utils.analysis.overlay import (
    ATTR_SUBCLASS_DICT, DEFAULT_COMPARTMENT_CONSTRAINT_PREFIX,
    DEFAULT_CONCENTRATION_BOUND, DEFAULT_CONSTRAINT_PREFIX,
    DEFAULT_ENZYME_FORWARD_SUFFIX, DEFAULT_ENZYME_REVERSE_SUFFIX,
    DEFAULT_ENZYME_TOTAL_SUFFIX, DEFAULT_ISOFORM_CONSTRAINT_PREFIX,
    DEFAULT_KEFF, ComplexDilution, Enzyme, EnzymeDilution, Protein,
    ProteinDilution, ProteomeBudget, ProteomeBudgetDilution,
    add_relaxation_budget, construct_pcmodel_from_tables, create_complex_table,
    create_enzyme_table, create_protein_table, create_sequence_table,
    load_overlay_model)
from rbc_gem_utils.database.uniprot import UNIPROT_DB_TAG, UNIPROT_PATH
from rbc_gem_utils.util import convert_gDW_to_L, convert_L_to_gDW, strip_plural

# Switch off the Gurobi "OutputFlag" and "LogToConsole" parameters
for gurobi_parameter in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_parameter, 0)

# Report the package versions this notebook was run with
show_versions()

Set parameter Username

Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Informat

### Define configuration
#### COBRA Configuration

In [2]:
# Magnitude of the symmetric default reaction bounds
default_flux_bound = 1e3

COBRA_CONFIGURATION.solver = "gurobi"
COBRA_CONFIGURATION.bounds = (-default_flux_bound, default_flux_bound)
# Display the resulting configuration as the cell output
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Load RBC model

In [3]:
model_id = "RBC_GEM"
dataset_name = "RBComics"

# Settings for saving figures (not used in this cell)
imagetype = "svg"
transparent = True
save_figures = True

sample_prefix, time_prefix = ("S", "D")
# Integers are easier to work with for time points
timepoints = [10, 23, 42]

# Directory with the input models and directory that receives the results
data_path = Path(f"{ROOT_PATH}/data/analysis/OVERLAY").resolve()
results_path = Path(f"{ROOT_PATH}{PROCESSED_PATH}/{model_id}/OVERLAY")
results_path.mkdir(exist_ok=True, parents=True)

pcmodel_dirpath = data_path / model_id
dataset_path = results_path / dataset_name
dataset_models_dirpath = dataset_path / "pcmodels"

# Load the metabolic model and its proteome-constrained (PC) counterpart
model = read_cobra_model(filename=f"{pcmodel_dirpath}/{model_id}.xml")
pcmodel = load_overlay_model(filename=f"{pcmodel_dirpath}/{model_id}_PC.xml")

# For this workflow, shut off complex dilution reactions at the start
for pc_reaction in pcmodel.reactions:
    if isinstance(pc_reaction, ComplexDilution):
        pc_reaction.bounds = (0, 0)

pcmodel

0,1
Name,RBC_GEM_PC
Memory address,1523377d0
Number of metabolites,10410
Number of reactions,18799
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space, protein compartment"


### Create PC-model representative of dataset
Can only be done after pcFVA results are generated

In [4]:
# Load the per-reaction flux bounds stored for this dataset and rename the
# "minimum"/"maximum" columns to "lower_bound"/"upper_bound".
reaction_bounds_filepath = (
    f"{dataset_path}/{pcmodel.id}_{dataset_name}_reaction_bounds.tsv"
)
df_reaction_bounds = pd.read_csv(
    reaction_bounds_filepath, sep="\t", index_col="reactions"
).rename(columns={"minimum": "lower_bound", "maximum": "upper_bound"})
df_reaction_bounds

Unnamed: 0_level_0,lower_bound,upper_bound
reactions,Unnamed: 1_level_1,Unnamed: 2_level_1
13DAMPPOX,0.000000,0.108715
15KPGE1Ry,0.000000,0.000000
15KPGE2Ry,0.000000,0.000000
15KPGE3Ry,0.000000,0.000000
15KPGF1Ry,0.000000,0.000000
...,...,...
YYYTPAP,0.000000,0.000000
ZN2Htex2,0.000000,1.901283
ZN2_2HCO3t,0.000000,2.282722
ZN2_HCO3_SELNIt,0.000000,2.282722


In [5]:
# Build the dataset-level PC-model: copy the base PC-model, add the relaxation
# budget, then apply the reaction bounds loaded for the dataset.
pcmodel_dataset_parameterized = pcmodel.copy()
pcmodel_dataset_parameterized.id += f"_{dataset_name}"
add_relaxation_budget(pcmodel_dataset_parameterized, 0, verbose=False)
for rid, bounds in df_reaction_bounds.iterrows():
    reaction = pcmodel_dataset_parameterized.reactions.get_by_id(rid)
    # Select the two columns by name so that any additional column in the
    # table cannot break (or silently reorder) the bound assignment.
    reaction.bounds = (bounds["lower_bound"], bounds["upper_bound"])

# Keep the relaxation budget bounds as they are before any later modification
original_relaxation_bounds = pcmodel_dataset_parameterized.reactions.get_by_id(
    "PBDL_relaxation_budget"
).bounds
# Save the dataset-parameterized model (not the unmodified base PC-model)
# under the file name derived from the parameterized model's ID.
write_cobra_model(
    pcmodel_dataset_parameterized,
    filename=f"{pcmodel_dirpath}/{pcmodel_dataset_parameterized}.xml",
)
pcmodel_dataset_parameterized

0,1
Name,RBC_GEM_PC_RBComics
Memory address,15a641850
Number of metabolites,10411
Number of reactions,19620
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space, protein compartment"


#### Generate results for subset of PC model reactions
##### Reactions necessary for all flux-expression correlation computations.
To reduce computation time, a subset of reactions can be defined. 
For flux-expression correlations, the minimum reaction set consists of the gene-associated reactions and, for each of them, the corresponding enzyme dilution reaction for total enzyme.

In [6]:
enzyme_total_suffix = DEFAULT_ENZYME_TOTAL_SUFFIX

# IDs of all reactions in the metabolic model that have a gene-reaction rule
min_reaction_list = [
    reaction.id for reaction in model.reactions if reaction.gene_reaction_rule
]

# IDs of all PC-model reactions, collected once and reused for the lookups below
pcmodel_reaction_ids = pcmodel.reactions.list_attr("id")

# Enzyme dilution reactions whose ID contains the total-enzyme suffix
enzymes_list = [
    pc_rid
    for pc_rid in pcmodel_reaction_ids
    if pc_rid.startswith("ENZDL_enzyme_") and enzyme_total_suffix in pc_rid
]

# Reaction ID -> IDs of the enzyme dilution reactions named after that reaction
reaction_enzymes_map = {}
for rid in min_reaction_list:
    enzyme_prefix = f"ENZDL_enzyme_{rid}_"
    reaction_enzymes_map[rid] = tuple(
        pc_rid for pc_rid in pcmodel_reaction_ids if pc_rid.startswith(enzyme_prefix)
    )

# Inverse lookup: enzyme dilution reaction ID -> reaction ID
enzyme_reaction_map = {}
for rid, enzymes in reaction_enzymes_map.items():
    for enzyme in enzymes:
        enzyme_reaction_map[enzyme] = rid

# Fall back to every mapped enzyme when no total-enzyme reaction was found
if not enzymes_list:
    gene_associated_rids = set(min_reaction_list)
    enzymes_list = [
        enzyme
        for enzyme, rid in enzyme_reaction_map.items()
        if rid in gene_associated_rids
    ]

min_reaction_list.extend(enzymes_list)
print(
    f"Number of reactions minimize/maximize (minimum): {len(min_reaction_list)} / {len(pcmodel.reactions)}"
)

Number of reactions minimize/maximize (minimum): 4371 / 18799


In [7]:
# Cap the relaxation budget at its lower bound
relaxation_budget = pcmodel_dataset_parameterized.reactions.get_by_id(
    "PBDL_relaxation_budget"
)
relaxation_budget.upper_bound = relaxation_budget.lower_bound

In [8]:
# Solve the parameterized model for its current objective; the returned
# solution is shown as the cell output.
pcmodel_dataset_parameterized.optimize()

Unnamed: 0,fluxes,reduced_costs
UNK3,0.0,0.000000
4PYRDXABCte,0.0,-0.305073
5AOPt2,0.0,0.000000
EX_ade_e,0.0,-0.000000
EX_adn_e,0.0,-0.000000
...,...,...
RELAX_protein_YES1_pc,0.0,-0.179939
RELAX_protein_ZDHHC2_pc,0.0,-0.124392
RELAX_protein_ZDHHC20_pc,0.0,-0.125131
RELAX_protein_ZDHHC3_pc,0.0,-0.101149


## Run pcFVA

In [9]:
from cobra.flux_analysis import flux_variability_analysis

In [10]:
verbose = True
# Values passed as `fraction_of_optimum` to each pcFVA run
optimum_percents = [0, 0.5, 0.9, 0.99]
reaction_list = min_reaction_list.copy()

pcfva_results_dirpath = Path(f"{dataset_path}/pcFVA")
# The results file and the error log are both written here, so the directory
# has to exist before the simulations start.
pcfva_results_dirpath.mkdir(exist_ok=True, parents=True)

# pcFVA results keyed by model ID. Defined here so the assignment below does
# not raise a NameError that would be misreported as a failed simulation.
pcfva_solutions = {}

index_cols = ["reactions", "optimum", "model"]
print("================================================")
print(f"Computing pcFVA results for {pcmodel_dataset_parameterized}")
print("================================================")
try:
    optimum_solutions = []
    if verbose:
        print(f"Starting simulations for {pcmodel_dataset_parameterized}")
    for optimum_percent in optimum_percents:
        pcfva_sol_at_optimum = flux_variability_analysis(
            pcmodel_dataset_parameterized,
            reaction_list=reaction_list,
            loopless=False,
            fraction_of_optimum=optimum_percent,
            processes=COBRA_CONFIGURATION.processes,
        )
        # Re-index by (reaction, optimum, model) so the runs can be stacked
        pcfva_sol_at_optimum.index = pd.MultiIndex.from_tuples(
            [
                (rid, optimum_percent, pcmodel_dataset_parameterized.id)
                for rid in pcfva_sol_at_optimum.index
            ],
            names=index_cols,
        )
        optimum_solutions.append(pcfva_sol_at_optimum)
        if verbose:
            print(f"Finished pcFVA for percent optimum: {optimum_percent}.")
    pcfva_sol = pd.concat(optimum_solutions, axis=0)
    filepath = Path(
        f"{pcfva_results_dirpath}/{pcmodel_dataset_parameterized}_FVAresults.tsv"
    )
    pcfva_sol.to_csv(filepath, sep="\t", index=True)
    pcfva_solutions[str(pcmodel_dataset_parameterized)] = pcfva_sol
    if verbose:
        print(f"Finished all simulations for {pcmodel_dataset_parameterized}")
except Exception as e:
    # Best effort: report the failure and append it to the error log instead of
    # stopping the notebook. `pcfva_sol` may be undefined or incomplete after this.
    error_message = (
        f"{pcmodel_dataset_parameterized} failed due to an exception. {str(e)}\n"
    )
    if verbose:
        print(error_message)
    with open(f"{pcfva_results_dirpath}/pcFVA-errors.log", "a") as file:
        file.write(error_message)

Computing pcFVA results for RBC_GEM_PC_RBComics
Starting simulations for RBC_GEM_PC_RBComics
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Finished pcFVA for percent optimum: 0.
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Username
Set parameter Use

In [11]:
groupby_list = ["model", "reactions"]

# Reaction ID prefixes used for separating the pcFVA results into DataFrames.
# A prefix of None selects the reactions that are part of the metabolic model.
dataframe_prefixes = {
    "reactions": None,
    "proteins": "PROTDL",
    # "complexes": "CPLXFM",
    # "complex_dilutions": "CPLXDL",
    "enzymes": "ENZDL",
    # "enzyme_formation": "ENZFM",
    "budgets": "PBDL",
    "relaxation": "RELAX",
}
df_pcfva_all = pcfva_sol.reset_index(drop=False)
dict_of_dataframes_types = {}
for key, prefix in dataframe_prefixes.items():
    if prefix:
        row_mask = df_pcfva_all["reactions"].str.startswith(prefix)
    else:
        row_mask = df_pcfva_all["reactions"].apply(
            lambda rid: rid in model.reactions
        )
    dict_of_dataframes_types[key] = df_pcfva_all[row_mask].copy()

df_reaction_results = dict_of_dataframes_types["reactions"]
df_enzyme_results = dict_of_dataframes_types["enzymes"]

# Get the maximum value of the reaction flux in each direction, regardless of percent optimum
df_max_flux_per_model = (
    df_reaction_results.groupby(groupby_list)[["minimum", "maximum"]]
    .agg({"minimum": "min", "maximum": "max"})
    .abs()
    .max(axis=1)
    .rename("Flux")
)

# Determine flux range
df_flux_range_per_model = (
    df_reaction_results.assign(
        Range=df_reaction_results["maximum"] - df_reaction_results["minimum"]
    )
    .groupby(groupby_list)["Range"]
    .max()
)

# Largest enzyme maximum, attributed to the reaction each enzyme maps to
df_max_enzyme_per_model = (
    df_enzyme_results.assign(
        reactions=df_enzyme_results["reactions"].apply(
            lambda enzyme: enzyme_reaction_map[enzyme]
        )
    )
    .groupby(groupby_list)["maximum"]
    .max()
    .rename("Expression")
)

# Combine flux, range and expression on the shared (model, reactions) index
df_reaction_flux_expression = (
    df_max_flux_per_model.to_frame()
    .merge(df_flux_range_per_model, left_index=True, right_index=True)
    .merge(df_max_enzyme_per_model, left_index=True, right_index=True)
    .reset_index(drop=False)
)
df_reaction_flux_expression

Unnamed: 0,model,reactions,Flux,Range,Expression
0,RBC_GEM_PC_RBComics,15KPGE1Ry,0.000000,0.000000,2.677617
1,RBC_GEM_PC_RBComics,15KPGE2Ry,0.000000,0.000000,2.677617
2,RBC_GEM_PC_RBComics,15KPGE3Ry,0.000000,0.000000,2.677617
3,RBC_GEM_PC_RBComics,15KPGF1Ry,0.000000,0.000000,2.677617
4,RBC_GEM_PC_RBComics,15KPGF2Ry,0.000000,0.000000,2.677617
...,...,...,...,...,...
2176,RBC_GEM_PC_RBComics,YYYTPAP,0.000000,0.000000,3.568388
2177,RBC_GEM_PC_RBComics,ZN2Htex2,0.164737,0.164737,1.286602
2178,RBC_GEM_PC_RBComics,ZN2_2HCO3t,0.164737,0.164737,1.445040
2179,RBC_GEM_PC_RBComics,ZN2_HCO3_SELNIt,0.164737,0.164737,1.445040
