# Derive base model
## Setup
### Import packages

In [None]:
from warnings import warn

import pandas as pd
from cobra.core import Group
from cobra.flux_analysis import find_blocked_reactions
from cobra.manipulation import remove_genes
from cobra.util.array import create_stoichiometric_matrix, nullspace
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    GEM_NAME,
    get_annotation_df,
    get_dirpath,
    read_cobra_model,
    show_versions,
    split_string,
    write_cobra_model,
)
from rbc_gem_utils.database.mgi import MGI_DB_TAG
from rbc_gem_utils.database.uniprot import UNIPROT_DB_TAG

# Show floating-point values in displayed DataFrames with 6 digits of precision.
pd.set_option("display.precision", 6)

import gurobipy as gp

# Set Gurobi's global "OutputFlag" and "LogToConsole" parameters to 0
# (presumably to keep solver logging out of the cell outputs; see the Gurobi
# parameter reference).
gp.setParam("OutputFlag", 0)
gp.setParam("LogToConsole", 0)

# Report the package versions used when this notebook was run.
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Use Gurobi as the solver for every optimization in this notebook.
COBRA_CONFIGURATION.solver = "gurobi"
# Default (lower, upper) flux bounds; the model-loading cell rewrites reaction
# bounds in terms of these values.
# NOTE(review): this setting was originally described as making the defaults
# "much larger to prevent model loading issues", but +/-1e3 equals the +/-1000
# values hard-coded in the model-loading cell; confirm the intended magnitude.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Numerical tolerance; also passed as the zero cutoff when searching for
# blocked reactions.
COBRA_CONFIGURATION.tolerance = 1e-9
COBRA_CONFIGURATION

### Define organism

In [None]:
# Organism name; used as a directory component for input and output paths.
organism = "Human"
# ID of a pre-defined reduced model to extract. With None (or the ID of the
# loaded RBC-GEM model) the full reconstruction is used unchanged.
reduced_model_id = None
# When True, blocked reactions are found and removed from the model.
remove_blocked_reaction = True

### Set paths

In [None]:
# Directory holding the UniProt-derived sequence tables.
sequence_dirpath = get_dirpath("database", UNIPROT_DB_TAG)
# Directory holding the RBC-GEM model files.
model_dirpath = get_dirpath("model")
# Organism-specific output directory for this workflow; created (including
# any missing parents) if it does not exist yet.
overlay_dirpath = get_dirpath("analysis") / "OVERLAY" / organism
overlay_dirpath.mkdir(exist_ok=True, parents=True)

## Load RBC-GEM model

In [None]:
# Read the RBC-GEM reconstruction from its XML file.
rbc_gem = read_cobra_model(model_dirpath / f"{GEM_NAME}.xml")

# Re-express every reaction's bounds through the configured defaults:
# forward-only and reverse-only reactions keep their direction, and every
# other reaction receives the full default (lower, upper) pair.
forward_only = (0.0, 1000.0)
reverse_only = (-1000.0, 0.0)
for rxn in rbc_gem.reactions:
    current_bounds = rxn.bounds
    if current_bounds == forward_only:
        new_bounds = (0, COBRA_CONFIGURATION.upper_bound)
    elif current_bounds == reverse_only:
        new_bounds = (COBRA_CONFIGURATION.lower_bound, 0.0)
    else:
        new_bounds = COBRA_CONFIGURATION.bounds
    rxn.bounds = new_bounds


rbc_gem

### Extract model
Use the full reconstruction as-is, or, when `reduced_model_id` is set, extract the pre-defined reduced RBC model listed in its reaction table.

In [None]:
# Work on a copy so the loaded reconstruction (`rbc_gem`) stays untouched.
model = rbc_gem.copy()

# Boundary (pseudo)reaction types mapped to the ID prefix that identifies them.
boundary_types_dict = {
    "exchange": "EX_",
    "sink": "SK_",
    "demand": "DM_",
}


if reduced_model_id is not None and reduced_model_id != rbc_gem.id:
    # Table of reactions (index) belonging to the reduced model, including the
    # gene reaction rule to apply to each of them.
    # NOTE(review): this path is <analysis>/<organism>/reduced_models, whereas
    # outputs go to <analysis>/OVERLAY/<organism>; confirm this is intended.
    df_reactions = pd.read_csv(
        get_dirpath("analysis")
        / organism
        / "reduced_models"
        / f"model_reactions_{reduced_model_id}.tsv",
        sep="\t",
        index_col=0,
        dtype=str,
    ).fillna("")

    # Determine pseudoreactions if any. For each boundary type, map the
    # reaction ID to the metabolite ID obtained by stripping only the leading
    # prefix (a plain `str.replace` would also delete later occurrences).
    pseudoreactions = set()
    boundaries = {}
    for btype, prefix in boundary_types_dict.items():
        boundaries[btype] = {
            rid: rid[len(prefix) :]
            for rid in df_reactions.index
            if rid.startswith(prefix)
        }
        pseudoreactions.update(boundaries[btype])

    # Extract reactions from model that are not pseudoreactions
    reactions = sorted(df_reactions.index.difference(pseudoreactions))
    reactions = model.reactions.get_by_any(reactions)
    # Remove reactions not in list (ID set gives constant-time membership tests)
    reaction_ids_to_keep = {reaction.id for reaction in reactions}
    model.remove_reactions(
        [x for x in model.reactions if x.id not in reaction_ids_to_keep],
        remove_orphans=True,
    )
    # Remove orphaned groups not in list
    model.remove_groups([x for x in model.groups if not x.members])
    # Cleanup gene reaction rules
    for reaction in reactions:
        reaction.gene_reaction_rule = df_reactions.loc[
            reaction.id, "gene_reaction_rule"
        ]
    to_remove = model.genes.query(lambda x: len(x.reactions) < 1)
    remove_genes(model, gene_list=to_remove, remove_reactions=False)

    # Add pseudoreactions, defining new ones if needed.
    for btype, boundary_dict in boundaries.items():
        for reaction_id, met_id in boundary_dict.items():
            try:
                met = model.metabolites.get_by_id(met_id)
            except KeyError:
                # The metabolite did not survive the reduction; skip its
                # boundary reaction but tell the user.
                warn(f"{met_id} not in model")
                continue
            try:
                model.add_boundary(met, type=btype)
            except ValueError:
                # The boundary could not be added; make sure a reaction with
                # the expected ID is present (raises KeyError otherwise).
                model.reactions.get_by_id(reaction_id)

    for reaction in model.boundary:
        reaction.subsystem = "Pseudoreactions"

    model.id = reduced_model_id
model

### Remove blocked reactions

In [None]:
if remove_blocked_reaction:
    # Use at most 50 worker processes for the blocked-reaction search.
    n_processes = min(50, COBRA_CONFIGURATION.processes)
    # Search all reactions with exchanges opened, using the configured
    # tolerance as the zero cutoff, and keep the result in sorted order.
    blocked_reactions = sorted(
        find_blocked_reactions(
            model=model,
            reaction_list=None,
            zero_cutoff=COBRA_CONFIGURATION.tolerance,
            open_exchanges=True,
            processes=n_processes,
        )
    )
    # Drop the blocked reactions together with anything they leave orphaned.
    model.remove_reactions(blocked_reactions, remove_orphans=True)
model

### Set objective(s)

In [None]:
# Sanity-check on a copy so the bound changes below never reach the model
# that is exported.
test_model = model.copy()
# Close uptake through every boundary reaction, then allow a glucose uptake
# of at most 1 flux unit.
for r in test_model.boundary:
    r.bounds = (0, COBRA_CONFIGURATION.upper_bound)
test_model.reactions.get_by_id("EX_glc__D_e").bounds = (-1, 0)


# Baseline solution with the objective the model was loaded with.
sol = test_model.optimize()
sol.fluxes[sol.fluxes != 0]
objective_rxns = ["NaKt"]
# Build each model's objective from its OWN reactions: a flux expression is
# tied to the solver variables of the model it came from, so `model` must not
# reuse expressions taken from `test_model`. The objective is also set on
# `test_model`, otherwise the optimization below would still use the old
# objective and would not test the new one.
for target_model in (model, test_model):
    target_model.objective = sum(
        target_model.reactions.get_by_id(rid).flux_expression
        for rid in objective_rxns
    )
sol = test_model.optimize()
print(sol[objective_rxns])
sol.fluxes[sol.fluxes != 0]

### Get dimension of nullspace

In [None]:
# Stoichiometric matrix of the current model.
S = create_stoichiometric_matrix(model)
# Null space of S (presumably returned as a basis with one vector per
# column; confirm against the cobra.util.array.nullspace documentation).
ns = nullspace(S)
# Display the shape of the null-space array.
ns.shape

### Reset subsystems

In [None]:
# Rebuild the groups from scratch so they mirror the reactions' current
# `subsystem` attributes.
# Pass a snapshot list: handing `model.groups` itself to `remove_groups` would
# shrink the container while it is being iterated, which can skip entries and
# leave stale groups behind.
model.remove_groups(list(model.groups))
for subsystem in sorted(set(model.reactions.list_attr("subsystem"))):
    reaction_list = model.reactions.query(lambda x: x.subsystem == subsystem)
    if subsystem not in model.groups:
        group = Group(id=subsystem, name=subsystem, members=reaction_list)
        model.add_groups([group])
    else:
        # Defensive fallback: extend a group that already carries this ID.
        model.groups.get_by_id(subsystem).add_members(reaction_list)

### Assemble protein data
#### Get model mappings

In [None]:
annotation_type = "genes"
mapping_key = "uniprot"
annotation_cols = [mapping_key]

# Table of gene IDs and their UniProt annotation; genes without a UniProt
# annotation are dropped.
df_model_mappings = get_annotation_df(model.genes, annotation_cols)
df_model_mappings = df_model_mappings.rename(columns={"id": annotation_type})
df_model_mappings = df_model_mappings.dropna(subset=[mapping_key])

# Expand every column so that each row holds a single value per column
# (`split_string` presumably turns a delimited string into a list; confirm
# against rbc_gem_utils), discarding duplicate and incomplete rows.
for col in df_model_mappings.columns:
    df_model_mappings[col] = df_model_mappings[col].apply(split_string)
    df_model_mappings = df_model_mappings.explode(col).drop_duplicates().dropna()

# Order by gene and renumber the rows; the unique counts printed below do not
# depend on the index.
df_model_mappings = df_model_mappings.sort_values(annotation_type).reset_index(
    drop=True
)
print(df_model_mappings.nunique(dropna=True))
df_model_mappings

#### Load protein amino acid sequences

In [None]:
# Tab-separated table of isoform sequences from the UniProt database folder.
sequences_filepath = sequence_dirpath / f"{UNIPROT_DB_TAG}_isoforms_sequences.tsv"
df_isoforms_sequences = pd.read_csv(sequences_filepath, sep="\t", index_col=None)
df_isoforms_sequences = df_isoforms_sequences.fillna(pd.NA)

# Report the UniProt IDs of the rows flagged in the "erythroid" column.
erythroid_mask = df_isoforms_sequences["erythroid"]
print(df_isoforms_sequences[erythroid_mask]["uniprot"].unique())


# Attach sequences to the model's gene mappings; the left join keeps every
# mapped gene, and rows that found no sequence are dropped afterwards.
df_sequences = pd.merge(
    df_model_mappings,
    df_isoforms_sequences,
    how="left",
    left_on=mapping_key,
    right_on=mapping_key,
)
sequence_columns = ["genes", "sequence.id", "sequence"]
df_sequences = df_sequences[sequence_columns]
df_sequences = df_sequences.dropna(subset=["sequence"], axis=0)
df_sequences

### Remove genes without sequence data

In [None]:
# Build the lookup set once; constructing it inside the lambda would rebuild
# it for every gene in the model.
genes_with_sequence = set(df_sequences["genes"])
# Genes lacking sequence data are removed from the model, but the reactions
# they are associated with are kept.
genes_to_remove = model.genes.query(lambda x: x.id not in genes_with_sequence)
remove_genes(model, gene_list=genes_to_remove, remove_reactions=False)

### Export model

In [None]:
# Each exported model gets its own folder, named after the model ID.
new_model_dirpath = overlay_dirpath / model.id
# `parents=True` matches how `overlay_dirpath` is created, so this cell also
# works when the parent directory does not exist yet.
new_model_dirpath.mkdir(exist_ok=True, parents=True)
# Write the derived model as <model id>.xml inside that folder.
write_cobra_model(model, filename=new_model_dirpath / f"{model.id}.xml")
model