# Simulate models using pcFVA - Mouse G6PD Variants Omics
## Setup
### Import packages

In [None]:
import re

import gurobipy as gp
import pandas as pd
from cobra.exceptions import OptimizationError
from cobra.flux_analysis.variability import flux_variability_analysis
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    get_dirpath,
    read_cobra_model,
    show_versions,
)
from rbc_gem_utils.analysis.overlay import (
    DEFAULT_PREFIX_SUFFIX_VALUES,
    DEFAULT_PROTEOME_COMPARTMENT,
    BudgetDilution,
    ProteinDilution,
    add_relaxation_budget,
    load_overlay_model,
)

# Turn off both Gurobi output parameters
for gurobi_param in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_param, 0)

# Show versions of notebook
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
COBRA_CONFIGURATION.solver = "gurobi"
# Set bound defaults much larger to prevent model loading issues.
# Both bounds use the same large magnitude; the previous lower bound of
# -1e-8 was effectively zero and contradicted the intent stated above.
COBRA_CONFIGURATION.bounds = (-1e8, 1e8)
COBRA_CONFIGURATION.tolerance = 1e-7
# Display the resulting configuration as the cell output
COBRA_CONFIGURATION

### Define organism, model, and dataset

In [None]:
# Identifiers used below to build directory and file names
model_id, organism, dataset_name = "RBC_GEM", "Mouse", "G6PDvariants"

### Set variables for sample identification

In [None]:
# For sample IDs
timepoints = ["Pre", "Post", "TD"]
phenotypes = ["HumCan", "A", "MED"]

# Alternation fragments reused by the patterns below
_phenotype_alts = "|".join(phenotypes)
_timepoint_alts = "|".join(timepoints)

# Donor = phenotype label followed by digits; time and phenotype match a bare label
donor_re = re.compile(r"(?P<donor>(" + _phenotype_alts + r")\d+)")
time_re = re.compile(r"(?P<time>" + _timepoint_alts + r")")
phenotype_re = re.compile(r"(?P<phenotype>(" + _phenotype_alts + r"))")

# Capitalized operation names ("Mean", "Median") joined as a regex alternation
operations = "|".join(name.capitalize() for name in ("mean", "median"))

# "<Operation>_<group>" identifiers
operation_re = re.compile(rf"(?P<op>{operations})\_(?P<group>\w+)")
# "<donor>_<time>" identifiers that do not begin with an operation name
sample_id_re = re.compile(rf"(?!{operations}){donor_re.pattern}\_{time_re.pattern}")

### Set computation options

In [None]:
# --- Run-time switches ---
verbose = True
# In our experience, SBML/XML loads faster, but will take up to 4x more space
# uncompressed as compared to JSON
ftype = "xml"
# Keep off to use previously computed results
run_computations = True
# Whether to allow overwriting of previous simulation results
overwrite = True
objective_reactions = ["NaKt"]
minimal_relaxation_budget = True
use_only_necessary_reactions = False
relaxation_proteins_to_restrict = []
optimum_percents = [0.0, 0.5, 0.9, 0.99]

# --- Identifier prefixes/suffixes taken from the overlay defaults ---
_protein_defaults = DEFAULT_PREFIX_SUFFIX_VALUES["proteins"]
_enzyme_defaults = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]
_budget_defaults = DEFAULT_PREFIX_SUFFIX_VALUES["budgets"]

protein_rxn_prefix = _protein_defaults["prefix.dilution"]
protein_met_prefix = _protein_defaults["prefix.metabolite"]
relaxation_rxn_prefix = _protein_defaults["prefix.relaxation"]
enzyme_met_suffix_total = _enzyme_defaults["suffix.total"]
enzyme_rxn_prefix = _enzyme_defaults["prefix.dilution"]
enzyme_met_prefix = _enzyme_defaults["prefix.metabolite"]
budget_rxn_prefix = _budget_defaults["prefix.dilution"]
budget_met_prefix = _budget_defaults["prefix.metabolite"]
comp_suffix = "_" + f"{DEFAULT_PROTEOME_COMPARTMENT}"

### Set figure options

In [None]:
# Figure export settings
save_figures, transparent, imagetype = True, False, "svg"

### Set paths

In [None]:
# Set paths
overlay_dirpath = get_dirpath("analysis") / "OVERLAY" / organism
model_dirpath = overlay_dirpath / model_id
results_dirpath = get_dirpath(use_temp="processed").joinpath(
    model_id, "OVERLAY", organism, dataset_name
)

sample_pcmodels_dirpath = results_dirpath / "sample_pcmodels"
# Results are grouped in a folder named after the objective reaction(s)
objective_dirname = "_".join(["OBJ", *objective_reactions])
pcfva_results_dirpath = results_dirpath / "pcFVA" / objective_dirname
# Ensure directory exists
pcfva_results_dirpath.mkdir(parents=True, exist_ok=True)

## Load RBC-GEM model

In [None]:
# Base model and its PC (overlay) counterpart are read from the same directory
model_filepath = model_dirpath / f"{model_id}.xml"
pcmodel_filepath = model_dirpath / f"{model_id}_PC.xml"
model = read_cobra_model(filename=model_filepath)
pcmodel = load_overlay_model(filename=pcmodel_filepath)

# Add relaxation budget to initial PC model to get names of relaxation reactions
add_relaxation_budget(pcmodel, 0, verbose=False)
pcmodel

### Define list of PC-models to load for simulation

In [None]:
# Unique base names (file name with its suffix removed) of the stored models
# whose file name starts with the PC-model ID
pcmodel_names = sorted(
    {
        path.name.replace(path.suffix, "")
        for path in sample_pcmodels_dirpath.iterdir()
        if path.name.startswith(pcmodel.id)
    }
)
sample_prefix = f"{pcmodel.id}_"
all_sample_ids = [name.replace(sample_prefix, "") for name in pcmodel_names]
operation_ids = [sid for sid in all_sample_ids if operation_re.match(sid)]
sample_ids = [sid for sid in all_sample_ids if sample_id_re.match(sid)]

print(f"Total number of models: {len(all_sample_ids)}")
print(f"Number of measured samples: {len(sample_ids)}")
print(f"Number of operation samples: {len(operation_ids)}")


models_to_simulate = set(pcmodel_names)

# Possible differences performed on datasets
exclude_donors = []
exclude_by_timepoints = []
exclude_by_phenotype = []
exclude_by_operation = []

# Each exclusion list is paired with the pattern whose first group it is compared to
exclusion_rules = (
    (exclude_donors, donor_re),
    (exclude_by_timepoints, time_re),
    (exclude_by_phenotype, phenotype_re),
    (exclude_by_operation, operation_re),
)
for to_exclude, to_search_re in exclusion_rules:
    if not to_exclude:
        continue
    dropped = set()
    for candidate in models_to_simulate:
        found = to_search_re.search(candidate.replace(sample_prefix, ""))
        if found and found.group(1) in to_exclude:
            dropped.add(candidate)
    models_to_simulate = models_to_simulate.difference(dropped)

models_to_simulate = sorted(models_to_simulate)
print(f"Number of models to simulate: {len(models_to_simulate)}")
models_to_simulate;

### Generate results using pcFVA for context specific models
Note that this can take a significant amount of time depending on the number of models and their sizes. It is best to use a targeted approach when generating results.
Alternatively, skip result generation and load the previously generated results.

In [None]:
# Reactions in addition to the minimum for flux-abundance correlations
list_of_reactions = []
# # Use to get ALL reactions in the original model
list_of_reactions += model.reactions.list_attr("id")
# # Use to get ALL reactions in the PC model
# list_of_reactions += pcmodel.reactions.list_attr("id")

#### Generate results for subset of PC model reactions
##### Reactions necessary for all flux-abundance correlation computations.
To reduce computation time, a subset of reactions can be defined. 
For flux-abundance correlations, the minimum reaction set consists of the reactions that have gene associations and the corresponding total-enzyme dilution reactions.

In [None]:
# Start from every reaction of the original model that has a gene reaction rule
gene_associated_reactions = model.reactions.query(lambda rxn: rxn.gene_reaction_rule)
min_reaction_list = gene_associated_reactions.list_attr("id")

# Add protein dilutions to see effective protein concentrations used,
# followed by the budget dilutions
for dilution_type in (ProteinDilution, BudgetDilution):
    min_reaction_list.extend(
        rxn.id for rxn in pcmodel.reactions if isinstance(rxn, dilution_type)
    )

# Already limited to reactions with gene reaction rules
enzyme_totals_list = pcmodel.metabolites.query(
    lambda met: met.id.startswith(f"{enzyme_met_prefix}")
    and enzyme_met_suffix_total in met.id
)

# Map each enzyme dilution reaction ID to the enzyme ID with the metabolite
# prefix and the "<total suffix>_<compartment>" ending stripped
enzyme_reaction_map = {}
for enzyme_total in enzyme_totals_list:
    enzyme_id = enzyme_total.id.replace(f"{enzyme_met_prefix}", "")
    enzyme_id = enzyme_id.replace(
        f"{enzyme_met_suffix_total}_{enzyme_total.compartment}", ""
    )
    enzyme_reaction_map[f"{enzyme_rxn_prefix}{enzyme_total}"] = enzyme_id

# Combine lists
min_reaction_list.extend(enzyme_reaction_map)

print(
    f"Minimum number of reactions minimize/maximize (minimum): {len(min_reaction_list)} / {len(pcmodel.reactions)}"
)

##### Refined set of PC model reactions

In [None]:
if not use_only_necessary_reactions:
    # Normalize entries to plain IDs (objects carrying ``_id`` are replaced by it),
    # then merge with the minimum set and sort
    list_of_reactions = [getattr(entry, "_id", entry) for entry in list_of_reactions]
    combined_reactions = set(min_reaction_list) | set(list_of_reactions)
    reaction_list = sorted(getattr(entry, "_id", entry) for entry in combined_reactions)
else:
    # Restrict to the minimum reaction set only
    reaction_list = min_reaction_list.copy()
print(
    f"Number of reactions minimize/maximize (chosen): {len(reaction_list)} / {len(pcmodel.reactions)}"
)

## Run pcFVA
### Define functions for running FVA

In [None]:
def run_simulations_at_optimums_for_sample(
    pcmodel_sample,
    objective_reactions,
    reaction_list,
    optimum_percents,
    pcfva_results_dirpath,
    processes,
    **pcfva_kwargs,
):
    """Run pcFVA for one sample model at every requested fraction of optimum.

    The model objective is set to the sum of the flux expressions of
    ``objective_reactions``, pcFVA is run once per entry of
    ``optimum_percents``, and the successful results are concatenated and
    written to ``<pcfva_results_dirpath>/<pcmodel_sample>_FVAsol.csv``.

    Parameters
    ----------
    pcmodel_sample
        Sample model to simulate; its objective is modified in place.
    objective_reactions
        Reaction identifiers whose summed flux becomes the objective.
    reaction_list
        Reactions to minimize/maximize.
    optimum_percents
        Iterable of ``fraction_of_optimum`` values, one pcFVA run each.
    pcfva_results_dirpath
        Directory receiving the results CSV and the ``pcFVA-errors.log`` file.
    processes
        Number of processes forwarded to the FVA call.
    **pcfva_kwargs
        Optional ``loopless`` (default ``False``) and ``pfba_factor``
        (default ``None``) settings forwarded to the FVA call.

    Returns
    -------
    pandas.DataFrame
        Concatenated results of all successful runs. If every run failed, an
        empty frame with the result columns is returned and no CSV is written.

    Notes
    -----
    Progress messages depend on the module-level ``verbose`` flag.
    """
    pcmodel_sample.objective = sum(
        [
            r.flux_expression
            for r in pcmodel_sample.reactions.get_by_any(objective_reactions)
        ]
    )
    if verbose:
        print(f"Starting simulations for {pcmodel_sample}")
    optimum_solutions = [
        run_pcfva_at_optimum_for_sample(
            pcmodel_sample,
            reaction_list=reaction_list,
            fraction_of_optimum=fraction_of_optimum,
            loopless=pcfva_kwargs.get("loopless", False),
            pfba_factor=pcfva_kwargs.get("pfba_factor"),
            processes=processes,
            error_log=pcfva_results_dirpath / "pcFVA-errors.log",
            verbose=verbose,
        )
        for fraction_of_optimum in optimum_percents
    ]
    # A failed run yields None. ``pd.concat`` raises ValueError when *all*
    # entries are None, which would abort the whole batch for one bad model,
    # so failed runs are dropped explicitly and the all-failed case is handled.
    successful_solutions = [sol for sol in optimum_solutions if sol is not None]
    if not successful_solutions:
        if verbose:
            print(f"No pcFVA solutions obtained for {pcmodel_sample}; nothing saved.")
        # Same columns as a successful per-optimum result, so callers can
        # still select columns from the returned frame.
        return pd.DataFrame(columns=["reactions", "min", "max", "model", "optimum"])

    pcfva_sols = pd.concat(successful_solutions, axis=0)
    pcfva_sols.to_csv(
        pcfva_results_dirpath / f"{pcmodel_sample}_FVAsol.csv", index=True
    )
    return pcfva_sols


def run_pcfva_at_optimum_for_sample(
    pcmodel_sample,
    reaction_list,
    fraction_of_optimum,
    loopless=False,
    pfba_factor=None,
    processes=1,
    error_log=None,
    verbose=False,
):
    """Run a single pcFVA at one fraction of optimum and tidy the result.

    Parameters
    ----------
    pcmodel_sample
        Model passed to ``flux_variability_analysis``.
    reaction_list
        Reactions to minimize/maximize.
    fraction_of_optimum
        Fraction of the optimal objective value required during FVA.
    loopless, pfba_factor, processes
        Forwarded unchanged to ``flux_variability_analysis``.
    error_log
        Optional path; failure messages are appended to it.
    verbose
        Whether to print progress/failure messages.

    Returns
    -------
    pandas.DataFrame or None
        Frame with ``reactions``, ``min``, ``max``, ``model`` and ``optimum``
        columns, or ``None`` when an ``OptimizationError`` was raised.
    """
    try:
        fva_frame = flux_variability_analysis(
            pcmodel_sample,
            reaction_list=reaction_list,
            loopless=loopless,
            pfba_factor=pfba_factor,
            fraction_of_optimum=fraction_of_optimum,
            processes=processes,
        )
    except OptimizationError as e:
        # Report the failure and signal it to the caller with None
        failure_msg = f"{pcmodel_sample.id} failed due to an exception."
        if verbose:
            print(failure_msg)
        if error_log is not None:
            with open(error_log, "a") as log_handle:
                log_handle.write(f"{failure_msg} {str(e)}\n")
        return None

    # Move the reaction index into a column, tag the rows with the model and
    # optimum they belong to, and shorten the bound column names
    fva_frame = (
        fva_frame.rename_axis("reactions")
        .reset_index(drop=False)
        .assign(model=pcmodel_sample.id, optimum=fraction_of_optimum)
        .rename(columns={"minimum": "min", "maximum": "max"})
    )
    if verbose:
        print(f"Finished pcFVA for fraction of optimum: {fraction_of_optimum}.")

    return fva_frame

In [None]:
# Fail early if any protein to restrict is not a gene of the PC-model
invalid = sorted({x for x in relaxation_proteins_to_restrict if x not in pcmodel.genes})
if invalid:
    raise ValueError(f"Genes/Proteins not found in model: {invalid}")

pcfva_solutions = {}
columns = ["model", "reactions", "optimum", "min", "max"]
# models_to_simulate = [x for x in models_to_simulate if "Mean" in x and "Pre" in x]
if run_computations:
    for idx, pcmodel_sample_id in enumerate(models_to_simulate, start=1):
        pcfva_results_filepath = (
            pcfva_results_dirpath / f"{pcmodel_sample_id}_FVAsol.csv"
        )
        # Skip models that already have saved results unless overwriting is allowed
        if pcfva_results_filepath.exists() and not overwrite:
            if verbose:
                print(
                    f"{idx}) Already finished {pcmodel_sample_id}, will load results after finishing remaining models."
                )
            continue

        if verbose:
            print("====================================================")
            print(f"Computing pcFVA results for {pcmodel_sample_id}")
            print("====================================================")
            print(f"Loading PC-model for {pcmodel_sample_id}")
        pcmodel_sample = load_overlay_model(
            sample_pcmodels_dirpath / f"{pcmodel_sample_id}.{ftype}"
        )
        if minimal_relaxation_budget:
            # Pin the relaxation budget reaction at its lower bound
            budget_rxn_relaxation = pcmodel_sample.reactions.get_by_id(
                f"{budget_rxn_prefix}{budget_met_prefix}relaxation"
            )
            budget_rxn_relaxation.upper_bound = budget_rxn_relaxation.lower_bound

        # Block the relaxation reaction of each restricted protein
        for protein in relaxation_proteins_to_restrict:
            protein_met = pcmodel_sample.metabolites.get_by_id(
                f"{protein_met_prefix}{protein}{comp_suffix}"
            )
            relax_prot_rxn = pcmodel_sample.reactions.get_by_id(
                f"{relaxation_rxn_prefix}{protein_met.id}"
            )
            relax_prot_rxn.bounds = (0, 0)

        pcfva_sols = run_simulations_at_optimums_for_sample(
            pcmodel_sample,
            objective_reactions=objective_reactions,
            reaction_list=reaction_list,
            optimum_percents=optimum_percents,
            pcfva_results_dirpath=pcfva_results_dirpath,
            # NOTE(review): hard-coded worker count; adjust to the machine in use
            processes=55,
            # Placeholder for optional pcFVA settings ("loopless", "pfba_factor")
            **{},
        )
        pcfva_solutions[pcmodel_sample_id] = pcfva_sols

# Load missing solutions if interruptions occurred
for pcmodel_sample_id in models_to_simulate:
    if pcmodel_sample_id in pcfva_solutions:
        continue
    pcfva_results_filepath = pcfva_results_dirpath / f"{pcmodel_sample_id}_FVAsol.csv"
    # A model without saved results is skipped rather than raising
    # FileNotFoundError, so the filter and empty-DataFrame fallback below
    # can actually take effect.
    if not pcfva_results_filepath.exists():
        if verbose:
            print(f"No saved pcFVA results found for {pcmodel_sample_id}, skipping.")
        continue
    pcfva_solutions[pcmodel_sample_id] = pd.read_csv(
        pcfva_results_filepath,
        index_col=None,
    )

# Keep only the standard columns, ordered by model name
pcfva_solutions = {
    pcmodel_sample_id: pcfva_solutions[pcmodel_sample_id][columns]
    for pcmodel_sample_id in sorted(models_to_simulate)
    if pcmodel_sample_id in pcfva_solutions
}
if pcfva_solutions:
    df_pcfva_all = pd.concat(list(pcfva_solutions.values()), axis=0)
    # Regroup solutions
    df_pcfva_all = (
        df_pcfva_all.sort_values(by=columns)[columns].reset_index(drop=True).copy()
    )
    df_pcfva_all.to_csv(
        pcfva_results_dirpath / f"{pcmodel.id}_All_FVAsols.csv",
        index=False,
    )
else:
    df_pcfva_all = pd.DataFrame()

## Load pcFVA generated results

In [None]:
# Test to see if results were recently generated in this run, otherwise load DataFrame of generated results.
# An explicit check is used instead of ``assert`` so it still works when Python runs with -O.
try:
    needs_reload = df_pcfva_all.empty
except NameError:
    # The computation cell was not executed in this session
    needs_reload = True

if needs_reload:
    df_pcfva_all = pd.read_csv(
        pcfva_results_dirpath / f"{pcmodel.id}_All_FVAsols.csv",
        index_col=None,
    )
    # Filter out results for models not in the desired model list.
    # ``isin`` takes the collection of names itself; wrapping it in another
    # list would compare each model name against the list object.
    df_pcfva_all = df_pcfva_all[df_pcfva_all["model"].isin(models_to_simulate)]

df_pcfva_all