# Create Proteome-Constrained RBC model via OVERLAY workflow 
This notebook facilitates the construction of a proteome constrained model ("pcModel") via the OVERLAY methodology.
## Setup
### Import packages

In [None]:
import itertools
from collections import defaultdict
from pathlib import Path

import gurobipy as gp
import pandas as pd
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    build_string,
    get_annotation_df,
    read_cobra_model,
    show_versions,
    split_string,
    write_cobra_model,
)
from rbc_gem_utils.analysis.overlay import (
    ATTR_SUBCLASS_DICT,
    DEFAULT_COMPARTMENT_CONSTRAINT_PREFIX,
    DEFAULT_CONCENTRATION_BOUND,
    DEFAULT_CONSTRAINT_PREFIX,
    DEFAULT_ENZYME_FORWARD_SUFFIX,
    DEFAULT_ENZYME_REVERSE_SUFFIX,
    DEFAULT_ENZYME_TOTAL_SUFFIX,
    DEFAULT_ISOFORM_CONSTRAINT_PREFIX,
    DEFAULT_KEFF,
    Enzyme,
    EnzymeDilution,
    Protein,
    ProteinDilution,
    ProteomeBudget,
    ProteomeBudgetDilution,
    add_relaxation_budget,
    construct_pcmodel_from_tables,
    create_complex_table,
    create_enzyme_table,
    create_protein_table,
    create_sequence_table,
)
from rbc_gem_utils.util import strip_plural

# Set both Gurobi output parameters to 0 so the solver stays quiet.
for gurobi_param in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_param, 0)

# Report the package versions used by this notebook
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Solve with Gurobi and use (-1000, 1000) as the configured bounds.
COBRA_CONFIGURATION.solver = "gurobi"
COBRA_CONFIGURATION.bounds = (-1000.0, 1000.0)
# Trailing expression: shows the resulting configuration as the cell output
COBRA_CONFIGURATION

## Load RBC model

In [None]:
# Figure export settings
imagetype, transparent, save_figures = "svg", True, True

# Working directories, resolved to absolute paths
dataset_name = "RBComics"
data_path, models_path, figures_path, dataset_path = (
    Path(folder).resolve() for folder in ("data", "models", "figures", dataset_name)
)

# Read the base model; hyphens in GEM_NAME become underscores in the filename
ftype = "xml"
model = read_cobra_model(models_path / ".".join((GEM_NAME.replace("-", "_"), ftype)))
model

In [None]:
annotation_type = "genes"
mapping_key = "uniprot"
annotation_cols = [mapping_key]

# Gene annotations from the model, without genes that lack a UniProt value
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Split every cell with split_string and expand the pieces to one value per row
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col)
    df_model_mappings = df_model_mappings.drop_duplicates().dropna()

df_model_mappings = df_model_mappings.sort_values(annotation_type)
print(df_model_mappings.nunique(dropna=True))
df_model_mappings = df_model_mappings.reset_index(drop=True)
df_model_mappings

## Assemble data for PC-model
### Load protein data
#### Protein amino acid sequences


In [None]:
# Isoform sequence table; missing values are filled with pd.NA
df_isoforms_sequences = pd.read_csv(
    data_path / "uniprot_isoforms_sequences.tsv", sep="\t", index_col=None
)
df_isoforms_sequences = df_isoforms_sequences.fillna(pd.NA)

# UniProt IDs that have at least one row flagged as erythroid
print(
    df_isoforms_sequences.loc[df_isoforms_sequences["erythroid"], "uniprot"].unique()
)
df_isoforms_sequences

#### Determine protein isoforms and associated sequences

In [None]:
# Select which isoform sequence(s) represent each UniProt entry in the model.
# Erythroid first, then canonical (then backup) to enable drop_duplicates to keep
# erythroid over canonical
df_model_isoforms_sequences = (
    pd.concat(
        (
            df_isoforms_sequences[df_isoforms_sequences["erythroid"]],
            df_isoforms_sequences[df_isoforms_sequences["canonical"]],
            df_isoforms_sequences[df_isoforms_sequences["backup"]],
        ),
        axis=0,
    )
    .fillna(pd.NA)
    .drop_duplicates()
    .sort_values(
        ["uniprot", "erythroid", "uniprot.isoform"], ascending=[True, False, True]
    )
)

# Counts per flag before filtering
print(
    df_model_isoforms_sequences[["canonical", "erythroid", "backup", "avoid"]].sum(
        axis=0
    )
)
print(f"Total: {len(df_model_isoforms_sequences)}")
# Keep only the columns used below; the index is reset so that the positional
# labels collected in `to_avoid` can be used with .loc
df_model_isoforms_sequences = df_model_isoforms_sequences.loc[
    :,
    [
        "uniprot",
        "uniprot.isoform",
        "sequence.id",
        "sequence",
        "sequence.length",
        "canonical",
        "erythroid",
        "backup",
        "avoid",
    ],
].reset_index(drop=True)


df_model_isoforms_sequences = df_model_isoforms_sequences.copy()
# "keep" starts out equal to the canonical flag ...
df_model_isoforms_sequences["keep"] = df_model_isoforms_sequences["canonical"].values
# ... rows flagged "avoid" are un-kept (to_avoid maps row label -> UniProt ID) ...
to_avoid = df_model_isoforms_sequences[df_model_isoforms_sequences["avoid"]][
    "uniprot"
].to_dict()
df_model_isoforms_sequences.loc[
    list(to_avoid),
    "keep",
] = False

# ... non-avoided rows that share a UniProt ID with an avoided row are kept instead ...
df_possible_backups = df_model_isoforms_sequences[
    df_model_isoforms_sequences["uniprot"].isin(list(to_avoid.values()))
]
df_possible_backups = df_possible_backups[~df_possible_backups["avoid"]]
df_model_isoforms_sequences.loc[
    list(df_possible_backups.index),
    "keep",
] = True
# ... and erythroid rows are always kept (this also overrides "avoid").
df_model_isoforms_sequences.loc[
    df_model_isoforms_sequences[df_model_isoforms_sequences["erythroid"]].index,
    "keep",
] = True
df_model_isoforms_sequences = df_model_isoforms_sequences[
    df_model_isoforms_sequences["keep"]
]

# UniProt IDs that lost every row during filtering get their canonical row(s)
# added back from the full table. `lost_ids` is re-bound from a set of IDs to
# the matching DataFrame rows.
lost_ids = set(df_isoforms_sequences["uniprot"].unique()).difference(
    set(df_model_isoforms_sequences["uniprot"].unique())
)
if lost_ids:
    lost_ids = df_isoforms_sequences[df_isoforms_sequences["uniprot"].isin(lost_ids)]
    df_model_isoforms_sequences = pd.concat(
        (df_model_isoforms_sequences, lost_ids[lost_ids["canonical"]]), axis=0
    )
# Counts per flag after filtering
print()
print(
    df_model_isoforms_sequences[["canonical", "erythroid", "backup", "avoid"]].sum(
        axis=0
    )
)
print(f"Total: {len(df_model_isoforms_sequences)}")

# Reduce to the sequence columns and attach them to the model gene mappings
# (default inner merge on "uniprot")
df_model_isoforms_sequences = df_model_isoforms_sequences.loc[
    :, ["uniprot", "sequence.id", "sequence"]
].copy()
df_sequence_data = (
    df_model_mappings.merge(
        df_model_isoforms_sequences, left_on="uniprot", right_on="uniprot"
    )
    .loc[:, ["genes", "uniprot", "sequence.id", "sequence"]]
    .copy()
)
df_sequence_data

In [None]:
# Build the per-protein sequence table used to create the protein table.
mapping_key = "uniprot"
protein_id_key = (
    "sequence.id.genes"  # genes, uniprot, sequence.id, or sequence.id.genes are best,
)
unique_gene_to_protein_map = True
isoform_transform = False
df_copy_numbers_data = None
# NOTE(review): the output of create_sequence_table is assumed to contain the
# "genes", "uniprot", "sequence.id", and "sequence" columns used below - confirm.
df_protein_data = create_sequence_table(
    df_sequences=df_sequence_data,
    mapping_key=mapping_key,
    isoform_transform=isoform_transform,
)
# Sequence IDs of UniProt entries that appear more than once, i.e. have several isoforms
ordered_isoform_ids = df_sequence_data[df_sequence_data["uniprot"].duplicated(False)][
    "sequence.id"
]
df_isoforms = df_protein_data[
    df_protein_data["sequence.id"].isin(ordered_isoform_ids)
].copy()
print(f"Number of proteins: {len(df_isoforms[mapping_key].unique())}")
print(f"Number of isoforms: {len(df_isoforms['sequence.id'].unique())}")
# Reorder rows: multi-isoform entries first (in `ordered_isoform_ids` order),
# followed by all remaining sequence IDs
df_protein_data = df_protein_data.set_index("sequence.id")
df_protein_data = pd.concat(
    (
        df_protein_data.loc[ordered_isoform_ids],
        df_protein_data.loc[df_protein_data.index.difference(ordered_isoform_ids)],
    ),
    axis=0,
)
df_protein_data = df_protein_data.reset_index(drop=False)
df_protein_data = df_protein_data.loc[
    :, ["genes", "uniprot", "sequence.id", "sequence"]
].copy()
# print(df_isoforms[mapping_key])
if protein_id_key == "sequence.id.genes":
    # Derive "protein.id" from "sequence.id": split on "-", replace any part that
    # is a UniProt ID with its gene ID, and join the parts with "_"
    protein_id_key = "protein.id"
    sequence_id_updates = df_model_mappings.set_index("uniprot")["genes"].to_dict()
    df_protein_data["protein.id"] = df_protein_data["sequence.id"].apply(
        lambda seq_id: "_".join(
            [sequence_id_updates.get(x, x) for x in seq_id.split("-")]
        )
    )
    df_isoforms["protein.id"] = df_isoforms["sequence.id"].apply(
        lambda seq_id: "_".join(
            [sequence_id_updates.get(x, x) for x in seq_id.split("-")]
        )
    )
    # Entries that are not multi-isoform keep only the text before the first "_"
    ids_to_fix = df_protein_data[
        ~df_protein_data["sequence.id"].isin(df_isoforms["sequence.id"])
    ].index
    df_protein_data.loc[ids_to_fix, "protein.id"] = df_protein_data.loc[
        ids_to_fix, "protein.id"
    ].apply(lambda x: x.split("_")[0])

# Use to remove duplicates
# NOTE: this keeps the first row per UniProt ID and re-binds protein_id_key to
# "genes", overriding the "protein.id" choice made above.
if unique_gene_to_protein_map:
    df_protein_data = df_protein_data.drop_duplicates(
        subset=["uniprot"],
        keep="first",
    )
    protein_id_key = "genes"

df_protein_data

###  List all unique proteins, complexes, and enzymes
#### Option 1: Initialize draft tables
1. The draft tables are created and used to initialize the draft PC-model.
    * The protein table can be used to initialize proteins and their molar weight ($\textbf{d}$ vector).
    * The complex table can be used to initialize complexes with their subunit stoichiometry ($\textbf{C}$ matrix).
        * All stoichiometric coefficients are initialized at a value of one.
    * The enzyme table can be used to initialize enzymes with their effective rate constants ($\textbf{K}_\mathrm{eff}$ matrix).
        * All $k_\mathrm{eff}$ values are initialized at an average rate constant of 65 $s^{-1}$ (or equivalently, 234000 $hr^{-1}$).

2. The draft tables are made to facilitate curation and data replacement. Therefore, the draft PC-model is exported with the draft tables.
3. A refined PC-model can be created using the curated tables. 

#### Option 2: Load tables from files
4. The formation of a draft model can be skipped if the curated tables already exist. They can be loaded.

In [None]:
# Container for the protein, complex, and enzyme tables built in the cells below
pcmodel_tables = dict()

# Compartment remapping: cytosol:extracellular ("ce") --> plasma membrane ("pm").
# Most extracellular reactions that occur are due to proteins bound to the
# external side of the membrane, so "e" is left as is.
replace_compartments = dict(c="c", ce="pm", e="e")

# Convert all protein compartments to one compartment
simplify_compartments = True
prefix, optional_columns = True, True

# Enzyme values for new tables
max_weight_fraction = 100
enzyme_keff_base = DEFAULT_KEFF
enzyme_forward_suffix, enzyme_reverse_suffix, enzyme_total_suffix = (
    DEFAULT_ENZYME_FORWARD_SUFFIX,
    DEFAULT_ENZYME_REVERSE_SUFFIX,
    DEFAULT_ENZYME_TOTAL_SUFFIX,
)

# Column used as the identifier for each table type
dict_of_id_keys = dict(proteins=protein_id_key, complexes=None, enzymes="reactions")

# Provide filepaths to speed up the creation process by loading existing files;
# leave an entry out to generate that table from scratch.
# Entries left disabled:
# "complex_keffs": data_path / "pcmodel_complex_keffs.tsv",
# "enzyme_keffs": data_path / "pcmodel_enzyme_keffs.tsv",
filepaths = {
    table: data_path / f"pcmodel_{table}.tsv"
    for table in ("proteins", "complexes", "enzymes")
}

##### Proteins

In [None]:
# Build the protein table: load it from file when available, otherwise draft it
# from the model, then (optionally) collapse all compartments into "pc".
table_type = "proteins"
# Try the existing RBC-GEM protein file first; a missing file or a missing
# `filepaths` entry falls through to creating the table from scratch.
try:
    df_proteins = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    print("Loaded from main RBC-GEM file")
except (FileNotFoundError, KeyError):
    # Otherwise, make from scratch
    df_proteins = create_protein_table(
        model,
        df_protein_data=df_protein_data,
        id_key=dict_of_id_keys.get(table_type),
        prefix=prefix,
        optional_columns=optional_columns,
        annotation_columns=[
            "uniprot",
        ],
        replace_compartments=replace_compartments,
    )
    print("Created new table")
    # A discarded `df_proteins[df_proteins[table_type].duplicated(False)]`
    # expression used to follow here. It had no effect, and could raise a
    # KeyError from inside this handler if the "proteins" column was absent.
else:
    # Loaded table: only its uniprot -> compartment mapping is reused; all other
    # columns are taken from the freshly assembled `df_protein_data`.
    df_proteins = df_protein_data.merge(
        df_proteins[["uniprot", "compartment"]],
        left_on="uniprot",
        right_on="uniprot",
        how="left",
    )
    df_proteins["protein"] = df_proteins[protein_id_key].apply(lambda x: f"protein_{x}")
    # "protein.id" only exists when the identifier key was "sequence.id.genes";
    # tolerate its absence instead of failing.
    df_proteins = df_proteins.drop(columns="protein.id", errors="ignore")
    # Keep only proteins whose gene is present in the model
    df_proteins = df_proteins[
        df_proteins["genes"].isin(model.genes.list_attr("id"))
    ].reset_index(drop=True)

if simplify_compartments:
    # One row per (gene, protein): the unique non-null values of every other
    # column are joined with ";" and the compartment is replaced by "pc".
    df_proteins = df_proteins.groupby(["genes", "protein"]).agg(
        lambda values: ";".join(
            [str(value) for value in list(values.dropna().unique())]
        )
    )
    df_proteins["compartment"] = "pc"
    df_proteins = df_proteins.reset_index(drop=False)

# Final identifier: "<protein>_<compartment>"
df_proteins["proteins"] = df_proteins[[strip_plural(table_type), "compartment"]].apply(
    lambda x: "_".join(x.values), axis=1
)
df_proteins = df_proteins.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_proteins.copy()
df_proteins

##### Complexes

In [None]:
table_type = "complexes"
try:
    df_complexes = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    print("Loaded from main RBC-GEM file")
except (FileNotFoundError, KeyError):
    # No usable file: draft the complex table from the model.
    # Each gene maps to the string built from all of its protein identifiers.
    genes_to_proteins = (
        pcmodel_tables["proteins"]
        .groupby(["genes"], as_index=True)["proteins"]
        .agg(lambda protein_ids: build_string(list(protein_ids)))
        .to_dict()
    )
    cofactor_genes = {}
    df_complexes = create_complex_table(
        model,
        genes_to_proteins=genes_to_proteins,
        cofactor_genes=cofactor_genes,
        id_key=dict_of_id_keys.get(table_type),
        prefix=prefix,
        optional_columns=optional_columns,
        annotation_columns=[
            # "uniprot"
        ],
        replace_compartments=replace_compartments,
    )
    print("Created new table")

else:
    # Loaded table: the molar mass column is discarded
    df_complexes = df_complexes.drop(columns="molar_mass")
    # Keep complexes whose genes are all present in the model
    df_complexes = df_complexes[
        df_complexes["genes"].apply(
            lambda gene_ids: all(
                model.genes.has_id(gene_id) for gene_id in gene_ids.split(";")
            )
        )
    ]
    # Restrict each complex to reactions that exist in the model, then drop
    # complexes left without any reaction
    df_complexes["reactions"] = df_complexes["reactions"].apply(
        lambda reaction_ids: ";".join(
            filter(model.reactions.has_id, reaction_ids.split(";"))
        )
    )
    df_complexes = df_complexes[df_complexes["reactions"] != ""]
    df_complexes = df_complexes.loc[
        :,
        [
            "complex",
            "subunits",
            "compartment",
            "reactions",
            "genes",
            "coefficients",
            "cofactors",
            "notes",
        ],
    ]
# Address isoform mapping to complexes: every complex containing a protein with
# several isoforms is replaced by one row per isoform combination.
# isoforms_map: protein ID without its last "_<token>" -> full protein IDs
# complex_name_update: same mapping with the "protein_" text removed, used to
# rename the complexes
isoforms_map = defaultdict(list)
complex_name_update = defaultdict(list)
for x in df_proteins[df_proteins["genes"].duplicated(False)].index:
    isoforms_map[x.rsplit("_", maxsplit=1)[0]].append(x)
    complex_name_update[x.rsplit("_", maxsplit=1)[0].replace("protein_", "")].append(
        x.replace("protein_", "")
    )
# Complexes with at least one subunit that has isoforms
df_isoforms_complexes = df_complexes[
    df_complexes["subunits"].apply(
        lambda proteins: bool(set(isoforms_map).intersection(proteins.split(";")))
    )
]
df_updated_rows = []
for _, row in df_isoforms_complexes.iterrows():
    # All combinations of replacement name parts ("_"-separated complex name) ...
    complex_names = [
        complex_name
        for complex_name in itertools.product(
            *[complex_name_update.get(c, [c]) for c in row["complex"].split("_")]
        )
    ]
    # ... and all combinations of isoform subunits (";"-separated subunits)
    combos = [
        list(combo)
        for combo in itertools.product(
            *[
                isoforms_map.get(protein, [protein])
                for protein in row["subunits"].split(";")
            ]
        )
    ]

    # NOTE(review): zip pairs names and subunit combinations by position and
    # silently truncates to the shorter list. This assumes the name parts list
    # the isoform-bearing genes in the same order as the subunits - confirm.
    for complex_name, combo in zip(complex_names, combos):
        new_row = row.to_dict()
        new_row["complex"] = "_".join(complex_name)
        new_row["subunits"] = ";".join(combo)
        df_updated_rows.append(new_row)

# Replace the original isoform-containing rows with the expanded rows
df_complexes = pd.concat(
    (
        df_complexes[~df_complexes.index.isin(df_isoforms_complexes.index)],
        pd.DataFrame(df_updated_rows),
    ),
    axis=0,
)


if simplify_compartments:
    # One row per (subunits, complex): the unique non-null values of every other
    # column are joined with ";" and the compartment becomes "pc"
    df_complexes = df_complexes.groupby(["subunits", "complex"]).agg(
        lambda column: ";".join(map(str, column.dropna().unique()))
    )
    df_complexes["compartment"] = "pc"
    df_complexes = df_complexes.reset_index()

# Final identifier: "<complex>_<compartment>"
df_complexes["complexes"] = df_complexes[
    [strip_plural(table_type), "compartment"]
].apply(lambda row: "_".join(row.tolist()), axis=1)

# Append the compartment to every subunit that does not already end in "_pc"
df_complexes["subunits"] = df_complexes[["subunits", "compartment"]].apply(
    lambda row: ";".join(
        subunit
        if subunit.endswith("_pc")
        else "_".join((subunit, row["compartment"]))
        for subunit in row["subunits"].split(";")
    ),
    axis=1,
)
df_complexes = df_complexes.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_complexes.copy()
df_complexes

##### Enzymes

In [None]:
table_type = "enzymes"
try:
    df_enzymes = pd.read_csv(filepaths[table_type], sep="\t", index_col=None)
    print("Loaded from main RBC-GEM file")
except (FileNotFoundError, KeyError):
    # No usable file: draft the enzyme table from the complex table
    complexes_to_reactions = (
        pcmodel_tables["complexes"].set_index("complexes")["reactions"].to_dict()
    )
    df_enzymes = create_enzyme_table(
        model,
        complexes_to_reactions=complexes_to_reactions,
        enzyme_keff_base=enzyme_keff_base,
        enzyme_forward_suffix=enzyme_forward_suffix,
        enzyme_reverse_suffix=enzyme_reverse_suffix,
        id_key=dict_of_id_keys.get(table_type),
        prefix=prefix,
        optional_columns=optional_columns,
        annotation_columns=[
            # "uniprot"
        ],
        replace_compartments=replace_compartments,
    )
    print("Created new table")
    if replace_compartments:
        # Apply the compartment remapping to the drafted table as well
        remapped = df_enzymes["compartment"].replace(replace_compartments)
        df_enzymes["compartment"] = remapped
else:
    # Loaded table: keep only the columns needed to rebuild the enzymes
    df_enzymes = df_enzymes.loc[
        :, ["complexes", "compartment", "reactions", "enzyme", "direction"]
    ]

if simplify_compartments:
    # One row per (complexes, enzyme): the unique non-null values of every other
    # column are joined with ";" and the compartment becomes "pc"
    df_enzymes = df_enzymes.groupby(["complexes", "enzyme"]).agg(
        lambda column: ";".join(map(str, column.dropna().unique()))
    )
    df_enzymes["compartment"] = "pc"
    df_enzymes = df_enzymes.reset_index()

# Final identifier: "<enzyme>_<compartment>"
df_enzymes[table_type] = df_enzymes[[strip_plural(table_type), "compartment"]].apply(
    lambda row: "_".join(row.tolist()), axis=1
)
df_enzymes = df_enzymes.set_index(strip_plural(table_type))
pcmodel_tables[table_type] = df_enzymes.copy()
df_enzymes

## Create PC-model

In [None]:
# Turn the indexed tables back into plain-column tables for model construction
protein_table, complex_table, enzyme_table = (
    pcmodel_tables[name].reset_index() for name in ("proteins", "complexes", "enzymes")
)
max_weight_fraction = 100

pcmodel, final_pcmodel_tables = construct_pcmodel_from_tables(
    model,
    protein_table=protein_table,
    complex_table=complex_table,
    enzyme_table=enzyme_table,
    max_weight_fraction=max_weight_fraction,
    enzyme_keff_base=enzyme_keff_base,
    enzyme_forward_suffix=enzyme_forward_suffix,
    enzyme_reverse_suffix=enzyme_reverse_suffix,
    enzyme_total_suffix=enzyme_total_suffix,
    # Relaxes constraints around complexes. Recommended to start with; can be
    # set to zero later or removed entirely.
    include_complex_dilutions=True,
    # Set as None to ignore, a small number to keep in the model, or 0 to remove
    # from the complex-enzyme mapping.
    irrev_rxn_complex_keff=0,
)
if simplify_compartments:
    pcmodel.compartments = {"pc": "protein compartment"}
# Print a summary: for each model attribute, the number of plain objects,
# followed by the number of objects of each listed subclass
for attr, subclass_dict in ATTR_SUBCLASS_DICT.items():
    members = getattr(pcmodel, attr)
    subclass_types = tuple(subclass_dict.values())
    n = len(members.query(lambda x: not isinstance(x, subclass_types)))
    print(f"Number of {attr}: {n}")
    for key, subcls in subclass_dict.items():
        obj_list = members.query(lambda x: isinstance(x, subcls))
        n = len(obj_list)
        print(f"Number of {key}: {n}")
        if subcls not in (Enzyme, EnzymeDilution):
            continue
        # Enzyme objects are further broken down by the suffix found in their ID
        for label, suffix in (
            ("Forward variable", enzyme_forward_suffix),
            ("Reverse variable", enzyme_reverse_suffix),
            ("Summation variable ", enzyme_total_suffix),
        ):
            print(f"{label}: {len(obj_list.query(lambda x: suffix in x.id))}")
    print()

# Derive the complex- and enzyme-level rate constant (keff) tables from the
# final enzyme table, report how many are non-zero, and export every table.


def _strip_compartment_suffix(row):
    """Return the complex ID in *row* without its trailing ``_<compartment>``.

    Only the suffix is removed. A plain ``str.replace`` would also delete any
    other occurrence of ``_<compartment>`` inside the identifier.
    """
    suffix = f"_{row['compartment']}"
    identifier = row["complexes"]
    return identifier[: -len(suffix)] if identifier.endswith(suffix) else identifier


keff_table = final_pcmodel_tables["enzymes"].copy()
# Reaction string of the catalyzed reaction, written as forward-only ...
keff_table["direction"] = keff_table["reactions"].apply(
    lambda rid: model.reactions.get_by_id(rid).reaction
)
keff_table["direction"] = keff_table["direction"].apply(
    lambda x: x.replace("<=>", "-->")
)
# ... and flipped for reverse enzymes. The configured suffix is used so that this
# stays consistent with the suffix passed to the model construction.
keff_table["direction"] = keff_table[["enzyme", "direction"]].apply(
    lambda x: (
        x["direction"].replace("-->", "<--")
        if x["enzyme"].endswith(enzyme_reverse_suffix)
        else x["direction"]
    ),
    axis=1,
)
# One row per (enzyme, complex): the ";"-separated complexes and their keff
# values are split and exploded in lockstep
keff_table["complexes"] = keff_table["complexes"].apply(lambda x: x.split(";"))
keff_table["complex_keff"] = keff_table["complex_keff"].apply(lambda x: x.split(";"))
keff_table = keff_table.explode(["complexes", "complex_keff"])
keff_table["complex"] = keff_table[["complexes", "compartment"]].apply(
    _strip_compartment_suffix, axis=1
)
# Keep the first value of every column within each (enzyme, complex) pair
keff_table = keff_table.groupby(["enzyme", "complex"], as_index=False).agg(
    lambda x: list(x.unique())[0]
)
keff_table = keff_table.loc[
    :,
    [
        "enzyme",
        "enzyme_keff",
        "complex",
        "complex_keff",
        "compartment",
        "reactions",
        "direction",
    ],
]
complex_keff_table = keff_table.drop("enzyme_keff", axis=1).drop_duplicates()
enzyme_keff_table = (
    keff_table.groupby(["enzyme", "enzyme_keff"], as_index=False)[
        ["reactions", "direction"]
    ]
    .agg(lambda x: list(x.unique())[0])
    .drop_duplicates()
)
final_pcmodel_tables["complex_keffs"] = complex_keff_table
final_pcmodel_tables["enzyme_keffs"] = enzyme_keff_table

n_cplx_keff = len(
    complex_keff_table[complex_keff_table["complex_keff"].astype(float) != 0]
)
print(f"Number of non-zero complex rate constants: {n_cplx_keff}")

n_enzyme_keff = len(
    enzyme_keff_table[enzyme_keff_table["enzyme_keff"].astype(float) != 0]
)
print(f"Number of non-zero enzyme rate constants: {n_enzyme_keff}")


# Export every final table as "pcmodel_<table type>.tsv" in the data directory
for table_type, df_table in final_pcmodel_tables.items():
    df_table.to_csv(data_path / f"pcmodel_{table_type}.tsv", sep="\t", index=False)

### Formulate additional protein constraints
#### Address isoforms and compartments with additional constraints
For isoforms and/or compartments, place an additional constraint such that the total sum of all isoforms does not exceed the measured concentration value.

In [None]:
# Build one additional constraint per UniProt ID that maps to several model
# proteins (isoforms and/or compartments) and write the table to file.
protein_table = pcmodel_tables["proteins"]
mapping_key = "uniprot"
# Proteins whose UniProt ID occurs more than once, collected into lists per
# (gene, UniProt ID)
df_additional_constraints = protein_table[protein_table[mapping_key].duplicated(False)]
df_additional_constraints = df_additional_constraints.groupby(
    [
        "genes",
        mapping_key,
    ],
    as_index=False,
).agg(lambda x: list(x))
if not df_additional_constraints.empty:
    # Collapse the bound lists: smallest lower bound, largest upper bound
    if "lower_bound" in df_additional_constraints.columns:
        df_additional_constraints["lower_bound"] = df_additional_constraints[
            "lower_bound"
        ].apply(min)
    if "upper_bound" in df_additional_constraints.columns:
        df_additional_constraints["upper_bound"] = df_additional_constraints[
            "upper_bound"
        ].apply(max)

data = {}
for idx, row in df_additional_constraints.iterrows():
    # Technically, always one gene but refers to genes attribute
    genes = row["genes"]
    uniprot = model.genes.get_by_id(genes).annotation.get(mapping_key, "")
    # NOTE(review): row["proteins"] is a list after the aggregation above;
    # split_string is assumed to accept it - confirm.
    proteins = split_string(row.get("proteins"))
    proteins = pcmodel.metabolites.get_by_any(proteins)
    # Several compartments among the proteins -> compartment constraint
    is_compartment = len({p.compartment for p in proteins}) > 1
    # Several distinct numeric trailing tokens (after removing "_<compartment>")
    # -> isoform constraint.
    # NOTE(review): str.replace removes every "_<compartment>" occurrence in the
    # ID, not only the suffix - confirm this cannot affect the trailing token.
    is_isoform = (
        len(
            {
                p.id.replace(f"_{p.compartment}", "").split(
                    "_",
                )[-1]
                for p in proteins
                if p.id.replace(f"_{p.compartment}", "")
                .split(
                    "_",
                )[-1]
                .isnumeric()
            }
        )
        > 1
    )
    # Prefix by constraint type; both or neither falls back to the general prefix
    if is_compartment and not is_isoform:
        default_prefix = DEFAULT_COMPARTMENT_CONSTRAINT_PREFIX
    elif is_isoform and not is_compartment:
        default_prefix = DEFAULT_ISOFORM_CONSTRAINT_PREFIX
    else:
        default_prefix = DEFAULT_CONSTRAINT_PREFIX
    constraint_id = row.get("constraints", f"{default_prefix}{genes}")
    # Missing or falsy bounds (including 0) fall back to 0 and to
    # DEFAULT_CONCENTRATION_BOUND, respectively
    lower_bound = float(row.get("lower_bound")) if row.get("lower_bound") else 0
    upper_bound = (
        float(row.get("upper_bound"))
        if row.get("upper_bound")
        else DEFAULT_CONCENTRATION_BOUND
    )
    # Reactions of each protein whose ID ends with the protein ID
    # (presumably the protein dilution reactions - confirm)
    protein_dilutions = [
        reaction
        for protein in proteins
        for reaction in list(protein.reactions)
        if reaction.id.endswith(protein.id)
    ]
    # "ISOCONS" is short for "ISOFORM CONSTRAINT"
    # "COMPCONS" is short for "COMPARTMENT CONSTRAINT"
    # "CONS" for general constraint
    data[idx] = {
        "constraints": constraint_id,
        "genes": genes,
        "proteins": build_string([p.id for p in proteins]),
        "reactions": build_string([p.id for p in protein_dilutions]),
        # Assume sum of isoforms is a constant, works well with proteomic measurements that do not distinguish
        "coefficients": ";".join([str(1) for p in protein_dilutions]),
        "lower_bound": lower_bound,
        "upper_bound": upper_bound,
        "unit": "nmol / gDW",
        mapping_key: uniprot,
    }
# An empty `data` dict yields an empty table; it is still written to file
df_additional_constraints = pd.DataFrame.from_dict(data, orient="index")
df_additional_constraints.to_csv(
    data_path / f"constraints_proteins_{pcmodel.id}.tsv", sep="\t", index=False
)
df_additional_constraints

## Add additional protein constraints to model

In [None]:
# Load the additional protein constraints and add each one to the PC-model as a
# bounded weighted sum of reaction fluxes.
try:
    df_additional_constraints = pd.read_csv(
        data_path / f"constraints_proteins_{pcmodel.id}.tsv",
        sep="\t",
        index_col=None,
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No file, or a file written from an empty table: nothing to add
    df_additional_constraints = pd.DataFrame()
else:
    if not df_additional_constraints.empty:
        for constraint_id, row in df_additional_constraints.set_index(
            "constraints"
        ).iterrows():
            reactions = pcmodel.reactions.get_by_any(row["reactions"].split(";"))
            # str(): read_csv parses a column holding only single coefficients
            # as numeric, which has no .split
            coefficients = str(row["coefficients"]).split(";")
            # float(): curated coefficients are not required to be integers
            abundance = sum(
                float(coeff) * reaction.flux_expression
                for reaction, coeff in zip(reactions, coefficients)
            )
            # Only missing values (absent column or NaN) fall back to the
            # defaults; an explicit bound of 0 is honored.
            lower_bound = row.get("lower_bound")
            lower_bound = float(lower_bound) if pd.notna(lower_bound) else 0
            upper_bound = row.get("upper_bound")
            upper_bound = (
                float(upper_bound)
                if pd.notna(upper_bound)
                else DEFAULT_CONCENTRATION_BOUND
            )
            if constraint_id in pcmodel.constraints:
                # TODO warn
                # Replace an existing constraint with the same name
                pcmodel.remove_cons_vars(pcmodel.constraints[constraint_id])
            additional_constraint = pcmodel.problem.Constraint(
                abundance,
                name=constraint_id,
                lb=lower_bound,
                ub=upper_bound,
            )
            pcmodel.add_cons_vars(additional_constraint)

df_additional_constraints

### Annotate proteins with UniProt IDs and sequences

In [None]:
# Lookups from protein ID to UniProt ID and to amino acid sequence
proteins_by_id = protein_table.set_index("proteins")
protein_to_uniprot = proteins_by_id["uniprot"].to_dict()
protein_to_sequence = proteins_by_id["sequence"].to_dict()

# Protein dilution reactions: the protein ID is the reaction ID without "PROTDL_"
for protein_dilution in pcmodel.reactions.query(
    lambda x: isinstance(x, ProteinDilution)
):
    protein_id = str(protein_dilution.id).replace("PROTDL_", "")
    if not protein_to_uniprot.get(protein_id):
        continue
    protein_dilution.annotation["uniprot"] = protein_to_uniprot[protein_id]
    protein_dilution.annotation["uniprot.aa_sequence"] = protein_to_sequence[
        protein_id
    ]

# Protein metabolites are looked up by their own ID
for protein in pcmodel.metabolites.query(lambda x: isinstance(x, Protein)):
    if not protein_to_uniprot.get(protein.id):
        continue
    protein.annotation["uniprot"] = protein_to_uniprot[protein.id]
    protein.annotation["uniprot.aa_sequence"] = protein_to_sequence[protein.id]


# Lookup from enzyme ID to the reaction(s) listed for it in the enzyme table
enzyme_to_reaction = enzyme_table.set_index("enzymes")["reactions"].to_dict()

# Enzyme dilution reactions: the enzyme ID is the reaction ID without "ENZDL_"
for enzyme_dilution in pcmodel.reactions.query(lambda x: isinstance(x, EnzymeDilution)):
    enzyme_id = str(enzyme_dilution.id).replace("ENZDL_", "")
    if not enzyme_to_reaction.get(enzyme_id):
        continue
    # Make it easier to use later
    enzyme_dilution.annotation["reaction"] = enzyme_to_reaction[enzyme_id]

for enzyme in pcmodel.metabolites.query(lambda x: isinstance(x, Enzyme)):
    if not enzyme_to_reaction.get(enzyme.id):
        continue
    enzyme.annotation["reaction"] = enzyme_to_reaction[enzyme.id]

### Set proteome budget constraints for low abundance and high-abundance proteomes
* RBCs are enucleated, terminally differentiated cells that are composed of 95% to 98% Hb by dry mass (mass of all the constituents of a cell in the absence of water)
    * PMID: 13429433, PMID: 13999462, PMID: 21796773, **PMID: 34378368**
* Therefore, remove hemoglobin from the low abundance proteome budget constraint and create a new constraint specific to hemoglobin abundance.
* Assume 90% minimum of dry mass is hemoglobin, and up to 10% of dry mass are other proteins

In [None]:
# Split the proteome budget into a low-abundance budget and a hemoglobin budget,
# then tie both to a new total budget.

# Proteome budget for low abundance proteins
# (the ID assignments re-set the IDs the objects were just fetched by; only the
# names change)
PBDL_proteome_budget = pcmodel.reactions.get_by_id("PBDL_proteome_budget")
PBDL_proteome_budget.id = "PBDL_proteome_budget"
PBDL_proteome_budget.name = "Proteome budget demand (Low abundance)"

proteome_budget = pcmodel.metabolites.get_by_id("proteome_budget")
proteome_budget.id = "proteome_budget"
proteome_budget.name = "Proteome Budget Constraint (Low abundance)"

# Proteome budget for hemoglobin: copy the budget reaction and take a metabolite
# from the copy as the hemoglobin budget
# NOTE(review): list(...).pop() assumes the copied reaction has a single
# metabolite (the budget) - confirm.
PBDL_hemoglobin_budget = PBDL_proteome_budget.copy()
hemoglobin_budget = list(PBDL_hemoglobin_budget.metabolites).pop()
PBDL_hemoglobin_budget.id = "PBDL_hemoglobin_budget"
PBDL_hemoglobin_budget.name = "Proteome budget demand (Hemoglobin)"

hemoglobin_budget.id = "hemoglobin_budget"
hemoglobin_budget.name = "Hemoglobin Budget Constraint"

# Hemoglobin demand bounded to [900, 1000], low-abundance demand to [0, 100]
# (presumably the 90% / 10% dry mass split described above - confirm units)
PBDL_hemoglobin_budget.bounds = (900, 1000)
PBDL_proteome_budget.bounds = (0.0, 100)

pcmodel.add_reactions([PBDL_hemoglobin_budget])

# Gene IDs whose proteins are moved to the hemoglobin budget. Matching is by
# substring, so "HBA" matches any reaction ID containing "protein_HBA".
remove_from_low_abundance_budget = [
    "HBA",
    "HBB",
    "HBD",
    "HBE1",
    "HBG1",
    "HBG2",
    "HBM",
    "HBQ1",
    "HBZ",
]
for reaction in proteome_budget.reactions:
    if any(
        [f"protein_{gid}" in reaction.id for gid in remove_from_low_abundance_budget]
    ):
        # Move the reaction's budget coefficient from the low-abundance budget
        # to the hemoglobin budget (add first, while the coefficient is still set)
        reaction.add_metabolites(
            {hemoglobin_budget: reaction.get_coefficient(proteome_budget)}
        )
        reaction.subtract_metabolites(
            {proteome_budget: reaction.get_coefficient(proteome_budget)}
        )

# List the reactions now attached to the hemoglobin budget
for reaction in sorted(
    pcmodel.metabolites.get_by_id("hemoglobin_budget").reactions, key=lambda x: x.id
):
    print(reaction)

# Add total budget constraint for hemoglobin and protein
pcmodel.add_metabolites(
    [
        ProteomeBudget(
            id="total_budget",
            name="Total Budget Constraint",
            compartment=proteome_budget.compartment,
        )
    ]
)
total_budget = pcmodel.metabolites.get_by_id("total_budget")
# PBDL_total_budget holds the reaction ID here and is re-bound to the reaction
# object once the reaction has been added to the model
PBDL_total_budget = f"PBDL_{total_budget.id}"
pcmodel.add_reactions(
    [
        ProteomeBudgetDilution(
            id=PBDL_total_budget,
            name="Total budget demand",
            lower_bound=0,
            upper_bound=1000,
        )
    ]
)
PBDL_total_budget = pcmodel.reactions.get_by_id(PBDL_total_budget)
print()
# Every other budget dilution reaction gets total_budget with coefficient +1 and
# the total budget demand gets -1; combine=False replaces any existing coefficient
for reaction in pcmodel.reactions.query(
    lambda x: isinstance(x, ProteomeBudgetDilution)
):
    if PBDL_total_budget.id == reaction.id:
        reaction.add_metabolites({total_budget: -1}, combine=False)
    else:
        reaction.add_metabolites({total_budget: 1}, combine=False)
    print(reaction)

### Ensure model can be optimized for glucose uptake

In [None]:
# Optimize the PC-model for the sum of the objective reaction fluxes and show
# the non-zero fluxes of the reactions that also exist in the base model.
# NOTE(review): the section heading mentions glucose uptake, but the objective
# set here is "NaKt" - confirm which is intended.
objective_rxns = ["NaKt"]
pcmodel.objective = sum(
    [pcmodel.reactions.get_by_id(rid).flux_expression for rid in objective_rxns]
)
pcsol = pcmodel.optimize()
# Compute the non-zero flux index once instead of once per reaction
nonzero_flux_ids = pcsol.fluxes[pcsol.fluxes != 0].index
pcsol.fluxes.loc[
    [r.id for r in model.reactions if r.id in nonzero_flux_ids]
].sort_index()

In [None]:
# Non-zero fluxes through the protein dilution reactions.
# The non-zero flux index is computed once instead of once per reaction.
nonzero_flux_ids = pcsol.fluxes[pcsol.fluxes != 0].index
pcsol.fluxes.loc[
    [
        r.id
        for r in pcmodel.reactions.query(lambda x: isinstance(x, ProteinDilution))
        if r.id in nonzero_flux_ids
    ]
].sort_index()

In [None]:
# Non-zero fluxes through the enzyme dilution reactions.
# The non-zero flux index is computed once instead of once per reaction.
nonzero_flux_ids = pcsol.fluxes[pcsol.fluxes != 0].index
pcsol.fluxes.loc[
    [
        r.id
        for r in pcmodel.reactions.query(lambda x: isinstance(x, EnzymeDilution))
        if r.id in nonzero_flux_ids
    ]
].sort_index()

### Export model

In [None]:
# Regular model
write_cobra_model(model, filename=models_path / f"{model}.xml")
write_cobra_model(model, filename=models_path / f"{model}.json")

# Protein constrained  without curated keffs
write_cobra_model(pcmodel, filename=models_path / f"{pcmodel}.xml")
write_cobra_model(pcmodel, filename=models_path / f"{pcmodel}.json")

In [None]:
# Trailing expression: shows the base model as the cell output
model

In [None]:
# Trailing expression: shows the protein-constrained model as the cell output
pcmodel

### Create PC-model representative of RBC-Omics
Can only be done after pcFVA results are generated

In [None]:
# Create a copy of the PC-model whose reaction bounds are set from the
# dataset-specific bounds file, and export it.
df_reaction_bounds = pd.read_csv(
    dataset_path / f"{pcmodel.id}_{dataset_name}_reaction_bounds.tsv",
    sep="\t",
    index_col="reactions",
)
df_reaction_bounds = df_reaction_bounds.rename(
    {"minimum": "lower_bound", "maximum": "upper_bound"}, axis=1
)

pcmodel_dataset_parameterized = pcmodel.copy()
pcmodel_dataset_parameterized.id += f"_{dataset_name}"
add_relaxation_budget(pcmodel_dataset_parameterized, 0, verbose=False)
for rid, bounds in df_reaction_bounds.iterrows():
    reaction = pcmodel_dataset_parameterized.reactions.get_by_id(rid)
    # Select the bounds by column name: unpacking the whole row positionally
    # would depend on the file having exactly two columns in (min, max) order.
    reaction.bounds = (float(bounds["lower_bound"]), float(bounds["upper_bound"]))


# Protein constrained model parameterized with the dataset bounds
write_cobra_model(
    pcmodel_dataset_parameterized,
    filename=models_path / f"{pcmodel_dataset_parameterized}.xml",
)
write_cobra_model(
    pcmodel_dataset_parameterized,
    filename=models_path / f"{pcmodel_dataset_parameterized}.json",
)
pcmodel_dataset_parameterized