# Create context-specific models using proteomic data
## Setup
### Import packages

In [None]:
from pathlib import Path

import gurobipy as gp
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_venn as mpl_venn
import numpy as np
import pandas as pd
import seaborn as sns
import sympy
from cobra.flux_analysis import find_blocked_reactions
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    read_cobra_model,
    show_versions,
    write_cobra_model,
)
from rbc_gem_utils.analysis.overlay import (
    ProteinDilution,
    add_relaxation_budget,
    load_overlay_model,
    update_slack_value,
)
from rbc_gem_utils.util import AVOGADRO_NUMBER, DEFAULT_DRY_MASS_PER_CELL
from sklearn.metrics import r2_score

# Switch off Gurobi's "OutputFlag" and "LogToConsole" parameters
for gurobi_param in ("OutputFlag", "LogToConsole"):
    gp.setParam(gurobi_param, 0)

# Report the package versions used by this notebook
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Select Gurobi as the solver on the shared COBRA configuration object
COBRA_CONFIGURATION.solver = "gurobi"
# (lower, upper) bounds stored on the configuration object
COBRA_CONFIGURATION.bounds = (-1e8, 1e8)
# Bare expression so the notebook displays the resulting configuration
COBRA_CONFIGURATION

## Load RBC-GEM model

In [None]:
# Working directories, resolved against the current working directory
data_path, models_path, figures_path = (
    Path(dirname).resolve() for dirname in ("data", "models", "figures")
)

# The dataset lives in a directory named after the dataset itself
dataset_name = "RBComics"
dataset_path = Path(dataset_name).resolve()

# Figure export settings shared by every plotting cell
imagetype, transparent, save_figures = "svg", True, True

# Read the base model first; its ID determines the overlay model's filename
ftype = "xml"
base_model_filename = f"{GEM_NAME.replace('-', '_')}.{ftype}"
model = read_cobra_model(models_path / base_model_filename)
overlay_model_filename = f"{model.id}_PC.{ftype}"
pcmodel = load_overlay_model(filename=models_path / overlay_model_filename)

pcmodel

## Load RBC Proteomics

In [None]:
# Prefixes that precede the sample number and the time point inside sample IDs
sample_prefix = "S"
time_prefix = "D"
# Time points are kept as integers because they are easier to compare and sort
timepoints = [10, 23, 42]
# The file is transposed after reading so that the index holds sample IDs
# and the columns hold protein IDs
copy_number_file = dataset_path / f"{dataset_name}_CopyNumbers.tsv"
df_copy_number_samples = pd.read_csv(
    copy_number_file, sep="\t", index_col=0
).transpose()
sample_ids = df_copy_number_samples.index
df_copy_number_samples

In [None]:
def get_sample_from_id(model_id, sample_prefix=""):
    """Extract the sample token (second-to-last ``_``-separated field) of an ID.

    Every occurrence of ``sample_prefix`` is removed from the token and the
    remainder is converted to ``int``. When that conversion fails, the
    untouched token is returned as a string instead.
    """
    token = model_id.rsplit("_", maxsplit=2)[-2]
    try:
        number = int(token.replace(sample_prefix, ""))
    except ValueError:
        # Non-numeric token (e.g. a summary label): hand it back unchanged
        return token
    return number


def get_time_from_id(model_id, time_prefix=""):
    """Extract the time token (last ``_``-separated field) of an ID.

    Every occurrence of ``time_prefix`` is removed from the token and the
    remainder is converted to ``int``. When that conversion fails, the
    untouched token is returned as a string instead.
    """
    token = model_id.rsplit("_", maxsplit=2)[-1]
    try:
        number = int(token.replace(time_prefix, ""))
    except ValueError:
        # Non-numeric token: hand it back unchanged
        return token
    return number

### Get data subsets using operations on data

In [None]:
# Names of pandas reductions (DataFrame methods accepting ``axis=1``) that are
# applied across all samples sharing a time point to create summary "samples".
operations = [
    "mean",
    "median",
    # "min",
    # "max",
]
# Quantiles given as fractions in [0, 1], e.g. 0.25 for the 25th percentile.
percentiles = []

# One (protein x summary) frame per time point and per kind of summary
summary_frames = []
for time in timepoints:
    time_sample_ids = [
        x
        for x in df_copy_number_samples.index
        if get_time_from_id(x, time_prefix) == time
    ]
    # Transpose to (protein x sample) so reductions run across samples
    df_time = df_copy_number_samples.loc[time_sample_ids].T
    # Guard each kind separately: pd.concat raises ValueError on an empty
    # list, which previously broke the "percentiles only" configuration.
    if operations:
        df_op = pd.concat(
            [getattr(df_time, op.lower())(axis=1) for op in operations], axis=1
        )
        df_op.columns = [f"{op.capitalize()}_{time_prefix}{time}" for op in operations]
        summary_frames.append(df_op)
    if percentiles:
        df_op = pd.concat(
            [df_time.quantile(percent, axis=1) for percent in percentiles], axis=1
        )
        df_op.columns = [
            f"Percentile{int(round(percent * 100, 0))}_{time_prefix}{time}"
            for percent in percentiles
        ]
        summary_frames.append(df_op)

if summary_frames:
    df_operations = pd.concat(summary_frames, axis=1)
    operation_ids = df_operations.columns
    # Append the summary columns, then return to (sample x protein) orientation
    df_copy_number_all = pd.concat((df_copy_number_samples.T, df_operations), axis=1).T
else:
    # Keep the same (sample x protein) orientation as the branch above; the
    # previous code transposed here, leaving protein IDs on the index so the
    # later column-aligned multiplication by molecular weight could not match.
    df_copy_number_all = df_copy_number_samples.copy()
    operation_ids = []
df_copy_number_all

## Integrate proteomics with model
### Convert copy numbers to mg / gDW

In [None]:
df_protein_data = pd.read_csv(
    f"{dataset_path}/{dataset_name}_protein_data.tsv",
    sep="\t",
    index_col="Entry",
)
# Molecular weights: g / mol --> kg / mol
df_uniprot_to_mw = df_protein_data["Mass"] / 1000

# Conversion factors, applied strictly in this order
conversion_factors = (
    1 / DEFAULT_DRY_MASS_PER_CELL,  # cell / pgDW
    1e12 / 1,  # pgDW / gDW
    1 / AVOGADRO_NUMBER,  # mol / protein copies
    df_uniprot_to_mw,  # kg / mol (aligned on the protein ID columns)
    1e6 / 1,  # mg / kg
)
# Starts as protein copies / cell and ends as mg / gDW
df_mg_prot_per_gDW = df_copy_number_all
for factor in conversion_factors:
    df_mg_prot_per_gDW = df_mg_prot_per_gDW * factor
df_mg_prot_per_gDW = df_mg_prot_per_gDW.copy()
df_mg_prot_per_gDW

### Scale measurements for proteome budget
Note that this step helps ensure that a perfect fit is theoretically possible.

In [None]:
# Upper bounds later assigned to the model's budget reactions.
# A value of None means "keep the upper bound already stored in the model".
# Later cells report each value as a percentage of 1000.
proteome_budget_value = 50  # applied to PBDL_proteome_budget
hemoglobin_budget_value = 950  # applied to PBDL_hemoglobin_budget
total_budget_value = None  # applied to PBDL_total_budget

In [None]:
# Partition the measurements into the hemoglobin and low abundance proteomes
hb_proteins = {
    # Model Gene: UniProt ID
    "HBA": "P69905",
    "HBB": "P68871",
    "HBD": "P02042",
    "HBM": "Q6B0K9",
    "HBE1": "P02100",
    "HBG1": "P69891",
    "HBG2": "P69892",
    "HBQ1": "P09105",
    "HBZ": "P02008",
}
hb_uniprot_ids = list(hb_proteins.values())
la_uniprot_ids = [x for x in df_mg_prot_per_gDW.columns if x not in hb_uniprot_ids]
df_mg_prot_per_gDW_hb = df_mg_prot_per_gDW.loc[:, hb_uniprot_ids]
df_mg_prot_per_gDW_la = df_mg_prot_per_gDW.loc[:, la_uniprot_ids]

# Budget reactions whose upper bounds cap each proteome
PBDL_proteome_budget = pcmodel.reactions.get_by_id("PBDL_proteome_budget")
PBDL_hemoglobin_budget = pcmodel.reactions.get_by_id("PBDL_hemoglobin_budget")
PBDL_total_budget = pcmodel.reactions.get_by_id("PBDL_total_budget")

# Any budget left as None falls back to the bound already stored in the model
proteome_budget_value = (
    PBDL_proteome_budget.upper_bound
    if proteome_budget_value is None
    else proteome_budget_value
)
hemoglobin_budget_value = (
    PBDL_hemoglobin_budget.upper_bound
    if hemoglobin_budget_value is None
    else hemoglobin_budget_value
)
total_budget_value = (
    PBDL_total_budget.upper_bound
    if total_budget_value is None
    else total_budget_value
)

# The two partial budgets must fit inside the total budget
assert total_budget_value >= (proteome_budget_value + hemoglobin_budget_value)

PBDL_proteome_budget.upper_bound = proteome_budget_value
PBDL_hemoglobin_budget.upper_bound = hemoglobin_budget_value
PBDL_total_budget.upper_bound = total_budget_value

# Rescale each sample (row) of the low abundance proteome to sum to its budget
budget_value = proteome_budget_value
la_row_totals = df_mg_prot_per_gDW_la.sum(axis=1)
df_mg_prot_per_gDW_la = budget_value * df_mg_prot_per_gDW_la.div(
    la_row_totals, axis=0
)
print(f"Low abundance budget:\t{budget_value}\t(={budget_value * 100 / 1000}%)")

# Rescale each sample (row) of the hemoglobin proteome to sum to its budget
budget_value = hemoglobin_budget_value
hb_row_totals = df_mg_prot_per_gDW_hb.sum(axis=1)
df_mg_prot_per_gDW_hb = budget_value * df_mg_prot_per_gDW_hb.div(
    hb_row_totals, axis=0
)
print(f"Hemoglobin budget:\t{budget_value}\t(={budget_value * 100 / 1000}%)")

# Whatever part of the total budget is not claimed by either proteome
budget_value = total_budget_value - (proteome_budget_value + hemoglobin_budget_value)
print(f"Remaining budget:\t{budget_value}\t(={budget_value * 100 / 1000}%)")


# Recombine both proteomes (hemoglobin columns first) into a single frame
df_mg_prot_per_gDW_normalized = pd.concat(
    [df_mg_prot_per_gDW_hb, df_mg_prot_per_gDW_la], axis=1
)
df_mg_prot_per_gDW_normalized.sum(axis=1)

### Convert mg / gDW to nmol / gDW

In [None]:
# mg / gDW times (mol / kg == umol / mg) gives umol / gDW
df_nmol_prot_per_gDW = df_mg_prot_per_gDW_normalized * (1 / df_uniprot_to_mw)
# umol / gDW --> nmol / gDW
df_nmol_prot_per_gDW = df_nmol_prot_per_gDW * (1e3 / 1)
# Restrict to the normalized frame's columns (in its order), then flip so that
# protein IDs form the index
df_nmol_prot_per_gDW = df_nmol_prot_per_gDW.loc[
    :, df_mg_prot_per_gDW_normalized.columns
].T
df_nmol_prot_per_gDW

## Create DataFrame for protein dilution reactions

In [None]:
# UniProt annotation -> gene ID, for every gene of the base model
genes_by_uniprot = pd.Series(
    {g.annotation.get("uniprot"): g.id for g in model.genes}, name="genes"
)
# UniProt annotation -> reaction ID, for every ProteinDilution reaction
protein_dilution_reactions = pcmodel.reactions.query(
    lambda x: isinstance(x, ProteinDilution)
)
protdl_by_uniprot = pd.Series(
    {protdl.annotation.get("uniprot"): protdl.id for protdl in protein_dilution_reactions},
    name="PROTDL",
)

# Align both mappings side by side on their shared UniProt index
df_model_protein_dilutions = pd.concat((genes_by_uniprot, protdl_by_uniprot), axis=1)
df_model_protein_dilutions.index.name = "uniprot"

# Keep only rows whose gene ID exists in the base model, ordered by reaction ID
gene_in_model = df_model_protein_dilutions["genes"].isin(model.genes.list_attr("id"))
df_model_protein_dilutions = df_model_protein_dilutions[gene_in_model].sort_values(
    "PROTDL"
)
df_model_protein_dilutions

## Organize samples

In [None]:
# Select, filter and order the sample columns that will be fitted to the model.
df_samples = df_nmol_prot_per_gDW.copy()

# Best to sort by donor: if a sample fails solving at any point, only the
# time points associated with that sample need to be redone.
sort_samples_by_donor = True
merge_key = "uniprot"
run_computations = False

# Inclusive range of sample numbers to keep. Useful for picking up where the
# notebook may have prematurely stopped. Summary (operation) samples are
# always kept regardless of this range.
# Set as 0 or less to start from the first sample.
start_from_sample_id = 0
# Set as 0 or less to use only operation samples
# end_on_sample_id = 0
# NOTE(review): df_samples is indexed by protein here, so len(df_samples) is
# the number of proteins, not samples. It only works as an "include
# everything" bound while the largest sample number stays below it.
end_on_sample_id = len(df_samples)

# Time points to keep. Comment out to keep all original points defined above.
timepoints = [10, 23, 42]
df_samples.index.name = merge_key

# Filter out time points
df_samples = df_samples.loc[
    :, [x for x in df_samples.columns if get_time_from_id(x, time_prefix) in timepoints]
]

# Filter out irrelevant samples. Candidates are taken from the columns that
# survived the time filter; iterating the unfiltered `sample_ids` raised a
# KeyError as soon as `timepoints` was a subset of the measured time points.
operation_columns = [x for x in df_samples.columns if x in operation_ids]
donor_columns = [
    x
    for x in df_samples.columns
    if x not in operation_ids
    and start_from_sample_id
    <= get_sample_from_id(x, sample_prefix)
    <= end_on_sample_id
]

if sort_samples_by_donor:
    # Measured samples ordered by (sample, time), then summaries ordered by time
    ordered_columns = sorted(
        donor_columns,
        key=lambda x: (
            get_sample_from_id(x, sample_prefix),
            get_time_from_id(x, time_prefix),
        ),
    ) + sorted(operation_columns, key=lambda x: get_time_from_id(x, time_prefix))
else:
    # Grouped by time point: measured samples ordered by sample number,
    # followed by that time point's summaries in alphabetical order
    ordered_columns = []
    for time in timepoints:
        ordered_columns += sorted(
            [x for x in donor_columns if get_time_from_id(x, time_prefix) == time],
            key=lambda x: get_sample_from_id(x, sample_prefix),
        )
        ordered_columns += sorted(
            x for x in operation_columns if get_time_from_id(x, time_prefix) == time
        )

df_samples = df_samples.loc[:, ordered_columns]

### Map samples to model

In [None]:
# Attach the sample values to each protein dilution reaction. A left merge
# keeps reactions that have no measurement (their row is all-NaN).
df_model = df_model_protein_dilutions[["PROTDL"]].merge(
    df_samples, left_index=True, right_index=True, how="left"
)
df_model = df_model.set_index("PROTDL").sort_index()

# Reactions for which every sample value is missing
is_unmeasured = df_model.isna().all(axis=1)
no_experimental_measurements = list(df_model.index[is_unmeasured])
n_unmeasured = len(no_experimental_measurements)
print(f"Model proteins mapped to measurements: {len(df_model) - n_unmeasured}")
print(f"Model proteins without measurements: {n_unmeasured}")
df_model.dropna()

#### Summarize mapping

In [None]:
# Protein ID sets used to compare dataset coverage against the model
dataset_proteins = set(df_samples.index)
model_proteins = set(df_model_protein_dilutions.index)

# Re-split the normalized masses into hemoglobin and low abundance proteomes
hb_ids = list(hb_proteins.values())
la_ids = [x for x in df_mg_prot_per_gDW.columns if x not in hb_ids]
df_mg_prot_per_gDW_hb = df_mg_prot_per_gDW_normalized.loc[:, hb_ids]
df_mg_prot_per_gDW_la = df_mg_prot_per_gDW_normalized.loc[:, la_ids]

# Per-sample mass carried by proteins inside ("mapped") and outside
# ("unmapped") the model, for each proteome
la_in_model = df_mg_prot_per_gDW_la.columns.isin(model_proteins)
hb_in_model = df_mg_prot_per_gDW_hb.columns.isin(model_proteins)
df_mapped_mass_la = df_mg_prot_per_gDW_la.loc[:, la_in_model].sum(axis=1)
df_unmapped_mass_la = df_mg_prot_per_gDW_la.loc[:, ~la_in_model].sum(axis=1)
df_mapped_mass_hb = df_mg_prot_per_gDW_hb.loc[:, hb_in_model].sum(axis=1)
df_unmapped_mass_hb = df_mg_prot_per_gDW_hb.loc[:, ~hb_in_model].sum(axis=1)

# Mean modeled / remaining mass across samples, rounded to `round_int` decimals
proteomes = {}
round_int = 6
for label, df in (
    ("hemoglobin", df_mg_prot_per_gDW_hb),
    ("low abundance", df_mg_prot_per_gDW_la),
):
    is_modeled = df.columns.isin(model_proteins)
    df_modeled = df.loc[:, is_modeled].sum(axis=1)
    df_remaining = df.loc[:, ~is_modeled].sum(axis=1)
    means = (df_modeled.mean(), df_remaining.mean())
    stdevs = (df_modeled.std(), df_remaining.std())
    proteomes[(label, "modeled")] = round(means[0], round_int)
    proteomes[(label, "remaining")] = round(means[1], round_int)

proteomes = pd.Series(proteomes, name="Mean value across samples")
# Flatten the (proteome, status) keys into readable labels
proteomes.index = [f"Mean {kind} mass {status}" for kind, status in proteomes.index]
print(proteomes.head())
# Entries equal to zero are dropped before plotting
proteomes = proteomes[proteomes != 0]

In [None]:
# Two stacked panels: a Venn diagram of protein coverage (top) and a pie chart
# of the mean proteome mass split (bottom).
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(3, 6))
# Sizes passed to venn2: dataset count, model count, and their intersection
subsets = (
    len(dataset_proteins),
    len(model_proteins),
    len(dataset_proteins.intersection(model_proteins)),
)


venn = mpl_venn.venn2(
    subsets=subsets,
    set_labels=(dataset_name, GEM_NAME),
    set_colors=("red", "blue"),
    alpha=0.5,
    ax=ax1,
)
# Draw black outlines on top of the filled Venn regions
circles = mpl_venn.venn2_circles(
    subsets=subsets, linestyle="-", color="black", ax=ax1, linewidth=1
)
for text in venn.set_labels:
    text.set_fontsize("x-large")
for text in venn.subset_labels:
    text.set_fontsize("x-large")
ax1.set_title("Modeled proteome", fontsize="xx-large")


# Maps each label in `proteomes.index` to (legend label, wedge color).
# NOTE(review): there is no entry for "Mean hemoglobin mass remaining"; a
# KeyError is raised below if that value is non-zero in `proteomes`.
label_color_map = {
    "Mean hemoglobin mass modeled": ("Hemoglobin", "xkcd:dark red"),
    "Mean low abundance mass modeled": ("Low abundance", "xkcd:light blue"),
    "Mean low abundance mass remaining": ("Not modeled", "xkcd:green"),
}
edgecolor = "black"
linewidth = 1
ax2.pie(
    x=proteomes.values,
    colors=[label_color_map[k][1] for k in proteomes.index],
    pctdistance=1.35,
    counterclock=False,
    # Rescales the wedge percentage by 1000/100, i.e. reports it out of 1000
    autopct=lambda pct: f"{pct * 1000/100:.2f}\n",
    textprops=dict(fontsize="large", ha="center", va="top"),
    wedgeprops=dict(edgecolor=edgecolor, linewidth=linewidth),
)
# Legend patches built by hand so they match the wedge colors and outlines
handles = [
    mpl.patches.Patch(
        edgecolor=edgecolor,
        linewidth=linewidth,
        label=label_color_map[k][0],
        facecolor=label_color_map[k][1],
    )
    for k in proteomes.index
]
ax2.legend(
    handles=handles,
    ncols=1,
    bbox_to_anchor=(0.5, 0),
    loc="upper center",
    fontsize="large",
    frameon=False,
)
ax2.set_xlabel("Mass (mg/gDW)", fontsize="large", labelpad=-10)
fig.tight_layout()
if save_figures:
    fig.savefig(
        figures_path / f"Fig5_PanelEF_ModeledProteome.{imagetype}",
        transparent=transparent,
        format=imagetype,
    )
# Trailing semicolon suppresses the notebook's echo of the figure object
fig;

### Reduce model using maximum values from proteomics
Create a reduced model using the maximum value measured of each protein across all data samples. By finding the reactions that always remain blocked prior to analysis, the computation time may be reduced.
Also ensure that proteins not found in the dataset do not significantly sway the results.

Note: May be problematic if proteins essential for catalysis are not measured. Keep protein bounds unchanged to remove only normal blocked reactions.

In [None]:
# Toggle: when True, reactions found to be blocked are removed from `pcmodel`.
remove_blocked_reactions = False

if remove_blocked_reactions:
    # Work on a copy so that the blocked-reaction search does not alter pcmodel
    pcmodel_blocked = pcmodel.copy()
    # Maximum value of each protein dilution reaction across all samples
    df = df_model.max(axis=1)
    for protein_dilution, bound in df.items():
        protein_dilution = pcmodel_blocked.reactions.get_by_id(protein_dilution)
        # The bound-setting below is disabled, so this loop currently only
        # looks up each reaction and protein bounds stay unchanged.
        # if not np.isnan(bound):
        #     protein_dilution.bounds = (0, bound)
        # else:
        #     protein_dilution.bounds = (0, 0)
    blocked = find_blocked_reactions(pcmodel_blocked)
    pcmodel.remove_reactions(blocked, remove_orphans=True)

# Check that the model still solves for each objective reaction and print the
# optimal value. The last objective set here remains on `pcmodel`.
for objective_reaction in ["NaKt"]:
    objective_reaction = pcmodel.reactions.get_by_id(objective_reaction)
    pcmodel.objective = objective_reaction.flux_expression
    print(pcmodel.slim_optimize())

pcmodel

## Create QP model for each sample

In [None]:
def solve_qp(pcmodel, df):
    """Minimize the weighted quadratic objective for one sample.

    ``df`` is indexed by reaction ID; each row unpacks into a data value and
    a weight. The objective is ``0.5 * x.T * F * x - c.T * x``, where ``x``
    holds the reactions' flux expressions, ``F`` is the diagonal matrix of
    weights and ``c`` holds ``weight * data_value``.

    The objective, objective direction and tolerance of ``pcmodel`` are
    overwritten in place. Returns the solution from ``pcmodel.optimize()``.
    """
    flux_terms = []  # Variables
    weighted_targets = []  # Data * Weights
    weights = []  # Weights
    for reaction_id, (data_value, weight) in df.iterrows():
        reaction = pcmodel.reactions.get_by_id(reaction_id)
        flux_terms.append(reaction.flux_expression)
        weighted_targets.append(weight * data_value)
        weights.append(weight)

    x = sympy.Matrix(flux_terms)
    c = sympy.Matrix(weighted_targets)
    F = sympy.DiagMatrix(sympy.Matrix(weights))

    # The product is a 1x1 matrix; its single entry is the scalar objective
    quadratic_form = 0.5 * x.T * F * x - c.T * x
    pcmodel.objective = quadratic_form[0]
    pcmodel.objective_direction = "min"
    pcmodel.tolerance = 1e-9

    return pcmodel.optimize()


def solve_qp_for_samples(pcmodel, df_samples, df_weights=None, verbose=False):
    """Solve the QP fitting problem for every column of ``df_samples``.

    Parameters
    ----------
    pcmodel
        Model passed to ``solve_qp``. Each solve runs inside ``with pcmodel:``.
    df_samples
        DataFrame iterated column by column; each column is one sample.
        Its index is merged against the index of the module-level
        ``df_model_protein_dilutions``.
    df_weights
        Optional DataFrame with one column of weights per sample ID. When
        ``None``, weights are ``1 / value`` (zeros replaced by 1 first),
        divided by their mean.
    verbose
        When True, print the R^2 and objective value of each sample.

    Returns
    -------
    dict
        Maps each sample ID to ``(df_qp_sol, r2, objective_value)``, where
        ``df_qp_sol`` has the columns "Measured Proteome" and
        "Best-Fitted Proteome".

    Notes
    -----
    Reads ``df_model_protein_dilutions`` from the enclosing module rather
    than taking it as a parameter.
    """
    qp_solutions_dict = {}
    for sample_id, data_series in df_samples.items():
        # Name the measured values so the merged column is called "Data"
        data_series.name = "Data"
        if df_weights is None:
            # Default weights: inverse of the measurement, normalized by mean
            data_weights = 1 / data_series.replace(0, 1)
            data_weights = data_weights / data_weights.mean()
        else:
            data_weights = df_weights.loc[:, sample_id]
        # Name the weights so the merged column is called "Weights"
        data_weights.name = "Weights"

        # Map to model, currently model mapping DataFrame generated outside scope of function
        df_model_data_weights = (
            df_model_protein_dilutions[["PROTDL"]]
            .merge(data_series, left_index=True, right_index=True, how="left")
            .merge(data_weights, left_index=True, right_index=True, how="left")
            .set_index("PROTDL")
            .sort_index()
        )

        # Drop only the rows in which both the data and the weight are missing
        df = df_model_data_weights.loc[:, [data_series.name, data_weights.name]].dropna(
            axis=0, how="all"
        )

        # Solve inside the model context manager
        with pcmodel:
            qp_sol = solve_qp(pcmodel, df)

        # Fluxes of all protein dilution reactions in the solution
        df_qp_sol = qp_sol.fluxes.loc[
            pcmodel.reactions.query(lambda x: isinstance(x, ProteinDilution)).list_attr(
                "id"
            )
        ]
        # Align data, weights and fluxes; rows with any missing value are dropped
        df_qp_sol = (
            pd.concat((df_model_data_weights, df_qp_sol), axis=1).dropna().sort_index()
        )
        # data_weights = df_qp_sol.loc[:, "Weights"]

        df_qp_sol = df_qp_sol.rename(
            {"Data": "Measured Proteome", "fluxes": "Best-Fitted Proteome"}, axis=1
        )
        df_qp_sol = df_qp_sol.loc[:, ["Measured Proteome", "Best-Fitted Proteome"]]

        # R^2 with the measured values as the reference and the fitted
        # values as the prediction
        r2 = r2_score(
            df_qp_sol.iloc[:, 0].values,
            df_qp_sol.iloc[:, 1].values,
            multioutput="uniform_average",
        )
        qp_solutions_dict[sample_id] = (df_qp_sol, r2, qp_sol.objective_value)
        if verbose:
            # The objective minimizes the fitting error, so an R^2 of 1 (fitted
            # values equal to the measured ones) is a possible outcome.
            print(
                "R^2 (objective) value for Sample '{}': {:.9f} ({:.5f})".format(
                    sample_id, r2, qp_sol.objective_value
                )
            )
        # TODO catch bad fits

    return qp_solutions_dict

### Set weightings for QP problem

In [None]:
# Weights must be laid out as (Protein IDs x Sample IDs) and are derived from
# the original copy numbers rather than from the budget-normalized values.
measured_copy_numbers = df_copy_number_all.T.loc[:, df_samples.columns]
# Inverse of each measurement; zeros are swapped for 1 to avoid dividing by zero
inverse_copy_numbers = 1 / measured_copy_numbers.replace(0, 1)
# Divide every sample column by its own mean
df_weights = inverse_copy_numbers / inverse_copy_numbers.mean()

In [None]:
# Either solve the QP for every sample and cache the results as TSV files, or
# rebuild the same `qp_solutions_dict` structure from previously cached files.
if run_computations:
    qp_solutions_dict = solve_qp_for_samples(
        pcmodel, df_samples, df_weights=df_weights, verbose=True
    )
    # Collected as {sample_id: {row label: value}} and turned into frames below
    df_measured = {}
    df_best_fit = {}
    df_r2_objective = {}
    for sample_id, (df_qp_sol, r2, objective_value) in qp_solutions_dict.items():
        df_measured[sample_id] = df_qp_sol["Measured Proteome"].to_dict()
        df_best_fit[sample_id] = df_qp_sol["Best-Fitted Proteome"].to_dict()
        df_r2_objective[sample_id] = {"R2": r2, "Objective": objective_value}

    df_measured = pd.DataFrame.from_dict(df_measured, orient="columns")
    df_measured.to_csv(f"{dataset_path}/proteome_measured.tsv", sep="\t", index=True)

    df_best_fit = pd.DataFrame.from_dict(df_best_fit, orient="columns")
    df_best_fit.to_csv(f"{dataset_path}/proteome_best_fit.tsv", sep="\t", index=True)

    df_r2_objective = pd.DataFrame.from_dict(df_r2_objective, orient="columns")
    df_r2_objective.to_csv(
        f"{dataset_path}/proteome_r2_objective.tsv", sep="\t", index=True
    )
else:
    df_measured = pd.read_csv(
        f"{dataset_path}/proteome_measured.tsv", sep="\t", index_col=0
    )
    df_best_fit = pd.read_csv(
        f"{dataset_path}/proteome_best_fit.tsv", sep="\t", index_col=0
    )
    df_r2_objective = pd.read_csv(
        f"{dataset_path}/proteome_r2_objective.tsv", sep="\t", index_col=0
    )

    qp_solutions_dict = {}
    for sample_id in list(sample_ids) + list(operation_ids):
        # Skip IDs that were filtered out of df_samples (an optional
        # "<model id>_" prefix is stripped before the comparison)
        if not sample_id.replace(f"{pcmodel.id}_", "") in df_samples.columns:
            continue
        df_qp_sol = pd.concat(
            (
                df_measured.loc[:, sample_id],
                df_best_fit.loc[:, sample_id],
            ),
            axis=1,
        )
        df_qp_sol.columns = ["Measured Proteome", "Best-Fitted Proteome"]
        # Relies on the cached file listing the "R2" row before "Objective",
        # which is the order in which the branch above writes them
        r2, objective_value = df_r2_objective.loc[:, sample_id].values
        qp_solutions_dict[sample_id] = (df_qp_sol, r2, objective_value)
print(f"Number of QP solutions: {len(qp_solutions_dict)}")

### Plot fitting for the mean and median samples

In [None]:
# Scatter plots of measured vs. best-fitted proteome for the summary samples:
# one row of panels per statistic, one column per matching operation ID.
samples_to_plot = np.array(
    [
        # Best for 1 or 3 columns
        [x for x in operation_ids if "Mean" in x],
        [x for x in operation_ids if "Median" in x],
    ]
)

# Where the R^2 annotation goes: "lower right", "upper left", or anything
# else to omit it
r2_text_loc = "lower right"

length = 4
nrows, ncols = samples_to_plot.shape
fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(length * ncols, length * nrows),
    sharex=True,
    sharey=True,
)
sns.despine(fig)

# Loop invariants. `labels` and `category_colors` are parallel to the order in
# which the point categories are drawn inside the loop.
labels = [
    "Perfect fit",
    "Adjusted abundance",
    "Unexpressed to expressed",
    "Expressed to unexpressed",
    "Never expressed",
]
category_colors = [
    "xkcd:blue",
    "xkcd:yellow",
    "xkcd:green",
    "xkcd:red",
    "xkcd:black",
]
ticks = np.geomspace(1e-6, 1e6, 5)
fontdict = {"size": "xx-large"}
r2_format = " = {:.4f}"
handles = []

for idx, (sample_id, ax) in enumerate(zip(samples_to_plot.flatten(), axes.flatten())):
    df_qp_sol, r2, objective_value = qp_solutions_dict[sample_id]
    xlabel, ylabel = df_qp_sol.columns

    # Diagonal reference line where fitted == measured
    perfect_fit_line = ax.plot(
        [0, ticks[-1]],
        [0, ticks[-1]],
        linestyle=":",
        color="black",
        linewidth=1,
        alpha=1,
    )

    # Rows in which the measured or the fitted value is (close to) zero
    df_zeros = df_qp_sol[(df_qp_sol.apply(lambda x: np.isclose(x, 0))).any(axis=1)]
    # Non-zero rows, split by whether the fit reproduces the measurement
    is_unchanged = np.isclose(
        abs(df_qp_sol["Measured Proteome"] - df_qp_sol["Best-Fitted Proteome"]), 0
    )
    df_perfect = df_qp_sol[is_unchanged]
    df_perfect = df_perfect[~df_perfect.index.isin(df_zeros.index)]
    df_altered = df_qp_sol[~is_unchanged]
    df_altered = df_altered[~df_altered.index.isin(df_zeros.index)]
    # Zero rows, split into: exactly zero in both, zero only when measured,
    # zero only when fitted
    df_always_zero = df_zeros[(df_zeros == 0).all(axis=1)]
    df_zeros = df_zeros[~df_zeros.index.isin(df_always_zero.index)]
    df_from_zeros = df_zeros[np.isclose(df_zeros["Measured Proteome"], 0)]
    df_to_zeros = df_zeros[np.isclose(df_zeros["Best-Fitted Proteome"], 0)]

    # Exact zeros are moved to the smallest tick so they show on the log axes
    handles = [
        ax.scatter(
            data=df_category.replace(0, ticks[0]),
            x=xlabel,
            y=ylabel,
            color=color,
            alpha=0.5,
            edgecolors="black",
            linewidths=1,
        )
        for df_category, color in zip(
            (df_perfect, df_altered, df_from_zeros, df_to_zeros, df_always_zero),
            category_colors,
        )
    ]

    # Human-readable panel label, e.g. "Day 10 Mean"
    donor, day = sample_id.split("_")
    if not ("Mean" == donor or "Median" == donor):
        donor = donor.replace("S", "; Donor ")
    day = day.replace("D", "Day ")
    fancy_sample_id = " ".join((day, donor))

    ax.set_xscale("log")
    ax.set_yscale("log")

    # x label only on the middle panel of the bottom row, y label only on the
    # first column
    if idx == samples_to_plot.size - np.ceil(ncols / 2):
        ax.set_xlabel(xlabel, fontdict=fontdict)
    if idx % ncols == 0:
        ax.set_ylabel(ylabel, fontdict=fontdict)

    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.xaxis.set_tick_params(labelsize="x-large")
    ax.yaxis.set_tick_params(labelsize="x-large")

    if r2_text_loc == "lower right":
        ax.text(
            0.95,
            0.1,
            r"$R^{2}$" + r2_format.format(r2) + ("\n{}".format(fancy_sample_id)),
            transform=ax.transAxes,
            color="black",
            fontsize="large",
            ha="right",
        )
    elif r2_text_loc == "upper left":
        ax.text(
            0.05,
            0.9,
            ("{}\n".format(fancy_sample_id)) + r"$R^{2}$" + r2_format.format(r2),
            transform=ax.transAxes,
            color="black",
            fontsize="large",
            ha="left",
        )

# One shared legend for the whole figure. It used to be created inside the
# loop, which stacked an identical legend on the figure for every panel.
if handles:
    fig.legend(
        handles=handles,
        labels=labels,
        loc="lower center",
        ncols=len(labels),
        frameon=False,
        fontsize="large",
        markerscale=2,
        bbox_to_anchor=(0.5, -0.05),
    )
fig.tight_layout()
if save_figures:
    fig.savefig(
        figures_path / f"S2Fig_Panel_QPfitting.{imagetype}",
        transparent=transparent,
        format=imagetype,
    )

### Determine best value for slack variables

In [None]:
# Scan a range of slack values for each summary-sample model and record the
# objective, the objective reaction flux and the fraction of the relaxation
# budget used, either by computing them or by loading cached results.
list_of_pcmodels = []
verbose = True
objective_rxns = ["NaKt"]

slack_min = 1e-5  # Slack %
slack_max = 1.5
if run_computations:
    for sample_id in operation_ids:
        df_qp_sol, r2, objective_value = qp_solutions_dict[sample_id]
        # Create a copy of the model
        pcmodel_sample = pcmodel.copy()
        pcmodel_sample.id = f"{pcmodel.id}_{sample_id}"
        # Pin every protein dilution reaction to its best-fitted value, or to
        # zero when the reaction has no fitted value
        for protdl in pcmodel_sample.reactions.query(
            lambda x: isinstance(x, ProteinDilution)
        ):
            if protdl.id in df_qp_sol.index:
                prot_bound = df_qp_sol.loc[protdl.id]["Best-Fitted Proteome"]
            else:
                prot_bound = 0
            protdl.bounds = (float(prot_bound), float(prot_bound))
        # Add the relaxation budget with slack = 0 first
        add_relaxation_budget(pcmodel_sample, 0, verbose)
        list_of_pcmodels += [pcmodel_sample]

    # One record of parallel lists per model; converted to DataFrames below
    solutions = {
        pcmodel_sample.id: {
            "model": [],
            "slack": [],
            "objective": [],
            "relaxation": [],
            "_".join(objective_rxns): [],
        }
        for pcmodel_sample in list_of_pcmodels
    }
    # Same values as assigned before the `if`; repeated here
    slack_min = 1e-5
    slack_max = 1.5
    # 251 logarithmically spaced slack values between slack_min and slack_max
    for slack_value in np.geomspace(slack_min, slack_max, 251):
        print(f"Updating slack variable to {100 * slack_value:.4f}%.")
        for pcmodel_sample in list_of_pcmodels:
            update_slack_value(pcmodel_sample, slack_value, verbose=False)
            relaxation_demand = pcmodel_sample.reactions.get_by_id(
                f"PBDL_relaxation_budget"
            )
            # Maximize the summed objective reaction flux minus the relaxation
            # budget flux
            pcmodel_sample.objective = (
                sum(
                    [
                        r.flux_expression
                        for r in pcmodel_sample.reactions.get_by_any(objective_rxns)
                    ]
                )
                - relaxation_demand.flux_expression
            )
            pcmodel_sample.objective_direction = "max"
            sol = pcmodel_sample.optimize()
            obj_value = sol.objective_value
            # NOTE(review): `not obj_value` is also True for an objective of
            # exactly 0.0, so such points are skipped along with None/NaN.
            # Confirm whether that is intended.
            if not obj_value or np.isnan(obj_value):
                continue
            else:
                demand = relaxation_demand.flux
                budget = relaxation_demand.upper_bound
            solutions[pcmodel_sample.id]["model"].append(pcmodel_sample.id)
            solutions[pcmodel_sample.id]["slack"].append(slack_value)
            solutions[pcmodel_sample.id]["objective"].append(obj_value)
            # Adding the demand back recovers the objective reaction flux alone
            solutions[pcmodel_sample.id]["_".join(objective_rxns)].append(
                obj_value + demand
            )
            # Fraction of the relaxation budget that was actually used
            solutions[pcmodel_sample.id]["relaxation"].append(demand / budget)
    solutions = {
        pcmodel_sample: pd.DataFrame.from_dict(sol)
        for pcmodel_sample, sol in solutions.items()
    }

    df_relaxation = pd.concat(list(solutions.values()), axis=0)
    # NOTE(review): this writes "SlackPercentDeterminationData_.tsv" (trailing
    # underscore) while the else-branch reads
    # "SlackPercentDeterminationData.tsv". Confirm whether the mismatch is a
    # deliberate guard against overwriting the cached file.
    df_relaxation.to_csv(
        f"{dataset_path}/SlackPercentDeterminationData_.tsv", sep="\t", index=False
    )
else:
    df_relaxation = pd.read_csv(
        f"{dataset_path}/SlackPercentDeterminationData.tsv", sep="\t", index_col=None
    )
    # Split the cached table back into one DataFrame per model ID. Unlike the
    # computed branch, these frames do not keep the "model" column.
    solutions = {
        mid: df_relaxation[df_relaxation["model"] == mid].drop("model", axis=1)
        for mid in df_relaxation["model"].unique()
    }
df_relaxation

In [None]:
fig, axes = plt.subplots(
    3, 1, figsize=(4, 10), sharex=True, gridspec_kw=dict(hspace=0.05)
)
axes = axes.flatten()

# ax3d = fig.add_subplot(2, 2, 4, projection="3d")
sns.despine(fig)

handles = []
labels = []
chosen_slack_var = 0.03
colors = {
    "D10": "xkcd:green",
    "D23": "xkcd:gold",
    "D42": "xkcd:red",
}
linestyles = {
    "Mean": "-",
    "Median": "-.",
}
use_percents = True
for pcmodel_sample in list(solutions):
    donor, day = str(pcmodel_sample).split("_")[-2:]
    linestyle = linestyles.get(donor, ":")
    color = colors.get(day, "xkcd:light blue")
    if not ("Mean" == donor or "Median" == donor):
        donor = donor.replace("S", "; Donor ")
    day = day.replace("D", "Day ")
    fancy_sample_id = " ".join((day, donor))

    labels.append(fancy_sample_id)
    s_values = solutions[str(pcmodel_sample)]["slack"].values
    r_values = solutions[str(pcmodel_sample)]["relaxation"].values * (
        100 if use_percents else 1
    )
    o_values = solutions[str(pcmodel_sample)]["objective"].values
    rxn_values = solutions[str(pcmodel_sample)]["_".join(objective_rxns)].values

    zorder = 1
    lw = 2
    axes[0].plot(
        s_values,
        r_values,
        label=str(pcmodel_sample),
        color=color,
        linestyle=linestyle,
        linewidth=lw,
        zorder=zorder,
    )
    axes[1].plot(
        s_values,
        o_values,
        label=str(pcmodel_sample),
        color=color,
        linestyle=linestyle,
        linewidth=lw,
        zorder=zorder,
    )
    axes[2].plot(
        s_values,
        rxn_values,
        label=str(pcmodel_sample),
        color=color,
        linestyle=linestyle,
        linewidth=lw,
        zorder=zorder,
    )

    # index = list(s_values).index(s_values[s_values >= chosen_slack_var][0])
    # spt = s_values[index]
    # rpt = r_values[index]
    # opt = o_values[index]
    # rxnpt = rxn_values[index]
    # c = "black"
    # ls = ""
    # marker = "o"
    # markersize = 8
    # axes[0].plot(spt, rpt, color=c, linestyle=ls, marker=marker, markersize=markersize)
    # axes[1].plot(spt, opt, color=c, linestyle=ls, marker=marker, markersize=markersize)
    # axes[2].plot(spt, rxnpt, color=c, linestyle=ls, marker=marker, markersize=markersize)
    # ax3d.plot(spt, rpt, opt, color=c, linestyle=ls, marker=marker, markersize=markersize)
    # print(f"Elbow point for {pcmodel_sample}: ({spt:.5f}, {rpt:.5f}, {opt:.5f})")
# Decorate the three stacked panels: shared x-limits and log scale, per-panel
# y-limits, vertical guide lines, a shaded "Infeasible" region and text
# annotations. Only the top panel receives the guide-line text annotations.
fontdict = {"size": "x-large"}
axes[-1].set_xlabel(r"Slack variable $s$", fontdict=fontdict)

zorder = 2
alpha = 0.7
limit_pad_sclar = 1.2

# NOTE(review): ``s_values``/``r_values``/``o_values``/``rxn_values`` hold the
# arrays of the LAST sample plotted by the loop above, so all limits below are
# derived from that single sample -- confirm this is representative.

# Slack value at the first position where the objective is <= 0; it is used as
# the left guide line and as the edge of the shaded region. Raises IndexError
# when no objective value is <= 0.
smin = s_values[np.flatnonzero(o_values <= 0)[0]]

# (ymin, ymax) for the relaxation, objective and reaction panels, in that order.
panel_ylimits = (
    (-0.001 * (100 if use_percents else 1), max(r_values) * limit_pad_sclar),
    (min(o_values) * limit_pad_sclar, max(o_values) * limit_pad_sclar),
    (0, max(rxn_values) * limit_pad_sclar),
)

for i, (ymin, ymax) in enumerate(panel_ylimits):
    panel_ax = axes[i]

    # Dotted line at the chosen slack value, solid lines at ``smin`` and s = 1.
    panel_ax.vlines(
        chosen_slack_var, ymin=ymin, ymax=ymax, color="black", linestyle=":"
    )
    for boundary in (smin, 1):
        panel_ax.vlines(
            boundary,
            ymin=ymin,
            ymax=ymax,
            color="black",
            linestyle="-",
            zorder=zorder,
            alpha=alpha,
        )

    panel_ax.set_xlim(smin / 2, slack_max)
    panel_ax.set_ylim(ymin, ymax)
    panel_ax.set_xscale("log")

    if i == 0:
        # Text for the guide lines is drawn once, on the top panel only.
        panel_ax.annotate(
            rf"$s = {chosen_slack_var}$",
            xy=(chosen_slack_var, 0),
            xycoords="data",
            xytext=(5, 0),
            textcoords="offset points",
            ha="left",
            fontsize=fontdict["size"],
        )
        for text, xpos in ((r"$s > 0$", smin), (r"$s \leq 1$", 1)):
            panel_ax.annotate(
                text,
                xy=(xpos, ymax),
                xycoords="data",
                xytext=(10, 5),
                textcoords="offset points",
                ha="center",
                fontsize=fontdict["size"],
            )

    # Grey out everything left of ``smin`` and label it.
    panel_ax.fill_between((smin / 2, smin), ymin, ymax, color="xkcd:light grey")
    panel_ax.annotate(
        "Infeasible",
        xy=(smin, (ymax + ymin) / 2),
        xycoords="data",
        rotation=90,
        xytext=(-10, 0),
        textcoords="offset points",
        va="center",
        ha="right",
        fontsize=fontdict["size"],
    )

# Build the legend for the bottom panel from the curves already drawn on it.
handles, labels = axes[2].get_legend_handles_labels()

# Map raw label -> handle, then rebuild the mapping in the order given by
# sorting the raw labels with ``get_time_from_id`` as the key.
handles_labels = dict(zip(labels, handles))
handles_labels = {
    raw_label: handles_labels[raw_label]
    for raw_label in sorted(handles_labels, key=get_time_from_id)
}
handles = list(handles_labels.values())

# Rewrite each raw label: drop the "<pcmodel.id>_" prefix, reverse the
# remaining "_"-separated tokens, join them with spaces, strip the first
# character and prepend "Day " (e.g. "..._S001_D10" -> "Day 10 S001",
# assuming that token layout -- TODO confirm).
model_prefix = f"{pcmodel.id}_"
labels = []
for raw_label in handles_labels:
    tokens = raw_label.replace(model_prefix, "").split("_")
    tokens.reverse()
    labels.append("Day " + " ".join(tokens)[1:])

axes[2].legend(
    handles=handles,
    labels=labels,
    ncols=1,
    frameon=False,
    loc="upper center",
    fontsize="large",
    bbox_to_anchor=(0.6, 0.7),
)


# Label the y-axis of each panel. The relaxation values are multiplied by 100
# only when ``use_percents`` is true, so the unit in the label follows the same
# switch instead of always claiming percent.
relaxation_unit = "%" if use_percents else "fraction"
axes[0].set_ylabel(f"Relaxation budget used ({relaxation_unit})", fontdict=fontdict)
axes[1].set_ylabel("Objective value", fontdict=fontdict)
# NOTE(review): this label is hard-coded while the plotted column is
# "_".join(objective_rxns) -- confirm they refer to the same reaction.
axes[2].set_ylabel("NaKt (mmol/gDW/hr)", fontdict=fontdict)

# Line up the y-axis labels across the stacked panels, then optionally export.
fig.align_labels()
if save_figures:
    fig.savefig(
        figures_path / f"S2Fig_PanelD_SlackPercentDetermination.{imagetype}",
        transparent=transparent,
        format=imagetype,
    )

### Formulate models from QP solutions

In [None]:
# Build one sample-specific model per QP solution and write it to
# ``<dataset_path>/pcmodels/<pcmodel.id>_<sample id>.<ftype>``. Samples whose
# files all exist already are skipped.
list_of_relaxed_models = []
slack_value = chosen_slack_var  # Slack %
verbose = True

# In our experience, SBML/XML loads faster, but will take up to 4x more space uncompressed as compared to JSON
ftypes = {
    "xml"
    # "json",
}

model_values = {}
# Accept either a single extension string or a collection of extensions.
ftypes = {ftypes} if isinstance(ftypes, str) else set(ftypes)

# Make sure the output directory exists before any expensive model building;
# ``write_cobra_model`` is not assumed to create missing directories.
pcmodels_path = dataset_path / "pcmodels"
pcmodels_path.mkdir(parents=True, exist_ok=True)

for sample_id, (df_qp_sol, r2, objective_value) in qp_solutions_dict.items():
    # Prefix the sample with the base model ID to form the new model's ID.
    sample_id = f"{pcmodel.id}_{sample_id}"
    filenames = [pcmodels_path / f"{sample_id}.{ftype}" for ftype in ftypes]
    if all(filename.exists() for filename in filenames):
        print(f"Model already created for {sample_id}")
        continue

    # Create a copy of the model
    pcmodel_sample = pcmodel.copy()
    pcmodel_sample.id = sample_id

    # Pin every protein dilution reaction to its fitted value from the QP
    # solution; reactions missing from the solution are fixed at zero.
    for protdl in pcmodel_sample.reactions.query(
        lambda x: isinstance(x, ProteinDilution)
    ):
        if protdl.id in df_qp_sol.index:
            prot_bound = df_qp_sol.loc[protdl.id, "Best-Fitted Proteome"]
        else:
            prot_bound = 0
        protdl.bounds = (float(prot_bound), float(prot_bound))

    # Add the relaxation budget
    add_relaxation_budget(pcmodel_sample, slack_value, verbose)
    # # Store model for later use
    # list_of_relaxed_models += [pcmodel_sample]
    for filename in filenames:
        # Might as well overwrite all files, especially if model needed to be regenerated anyways
        write_cobra_model(
            pcmodel_sample,
            filename=filename,
        )