# Compute statistically significant fluxes between groups - REDS Recall
## Setup
### Import packages

In [1]:
import re
import textwrap
import warnings
from collections import defaultdict
from itertools import combinations

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    ensure_iterable,
    get_dirpath,
    read_cobra_model,
    show_versions,
)
from rbc_gem_utils.analysis.overlay import (
    DEFAULT_PREFIX_SUFFIX_VALUES,
    DEFAULT_PROTEOME_COMPARTMENT,
    EnzymeDilution,
    add_relaxation_budget,
    load_overlay_model,
    plot_correlations,
)
from rbc_gem_utils.visualization import cmap_map
from scipy.stats import false_discovery_control, kruskal, mannwhitneyu, spearmanr

plt.rcParams["font.family"] = "Arial"

show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

### Define configuration
#### COBRA Configuration

In [2]:
COBRA_CONFIGURATION.solver = "gurobi"
# Set bound defaults much larger to prevent model loading issues
COBRA_CONFIGURATION.bounds = (-1e-8, 1e8)
COBRA_CONFIGURATION.tolerance = 1e-9
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1e-08
upper_bound,Default reaction upper bound,100000000.0
processes,Number of parallel processes,127
cache_directory,Path for the model cache,C:\Users\P7875\AppData\Local\opencobra\cobrapy\Cache
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


### Define organism, model, and dataset

In [3]:
organism = "Human"
model_id = "RBC_GEM"
dataset_name = "REDSRecall"
grouped_data_key = "Sample"

### Set variables for sample identification

In [4]:
# For sample IDs
sample_key = "SAMPLE ID"
donor_key = "PUBLIC RECALL DONOR ID"
time_key = "DAY"
timepoints = ["D10", "D23", "D42"]
genotypes = ["G6PD_V68M", "ATP11C_V972M"]
donor_re = re.compile(rf"(?P<donor>S(?P<num>\d+))")
time_re = re.compile(rf"(?P<time>{'|'.join(timepoints)})")
genotype_re = re.compile(rf"(?P<genotype>({'|'.join(genotypes)}))")

operations = "|".join([x.capitalize() for x in ["mean", "median"]])

operation_re = re.compile(r"(?P<op>" + operations + r")\_(?P<group>\w+)")
sample_id_re = re.compile(
    r"(?!" + operations + r")" + donor_re.pattern + r"\_" + time_re.pattern
)

### Set computation options

In [5]:
ftype = "xml"  # In our experience, SBML/XML loads faster, but will take up to 4x more space uncompressed as compared to JSON
run_computations = True  # Keep off to use previously computed results
overwrite = False  # Whether to allow overwriting of previous simulation results
verbose = True

# Objective reactions
objective_reactions = ["NaKt"]
# Reactions that must have the capability to carry flux, sort for consistency
required_flux_reactions = []  # Add reactions to this list
required_flux_reactions = sorted(set(objective_reactions + required_flux_reactions))


enzyme_rxn_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.dilution"]
enzyme_met_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.metabolite"]
enzyme_met_suffix_total = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["suffix.total"]
comp_suffix = f"_{DEFAULT_PROTEOME_COMPARTMENT}"

#### Set prefixes/suffixes to expect

In [1]:
enzyme_rxn_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.dilution"]
enzyme_met_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.metabolite"]
enzyme_met_suffix_total = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["suffix.total"]
comp_suffix = f"_{DEFAULT_PROTEOME_COMPARTMENT}"

NameError: name 'DEFAULT_PREFIX_SUFFIX_VALUES' is not defined

### Set figure options

In [6]:
save_figures = True
transparent = False
imagetype = "svg"

### Set paths

In [7]:
# Set paths
processed_data_dirpath = get_dirpath(use_temp="processed") / organism / dataset_name
overlay_dirpath = get_dirpath("analysis") / "OVERLAY" / organism
model_dirpath = overlay_dirpath / model_id
results_dirpath = (
    get_dirpath(use_temp="processed")
    / model_id
    / "OVERLAY"
    / organism
    / dataset_name
    / grouped_data_key
)
pcfva_results_dirpath = (
    results_dirpath
    / "pcFVA"
    / "_".join(("REQ", *required_flux_reactions))
    / "_".join(("OBJ", *objective_reactions))
)

# Objective reaction does not matter since correlations are computed
# based on min and max fluxes and abundance, which are obtained when optimum is 0.
corr_results_dirpath = results_dirpath / "correlations"
# Ensure directory  exists
corr_results_dirpath.mkdir(exist_ok=True, parents=True)

## Load RBC-GEM model

In [8]:
model = read_cobra_model(filename=model_dirpath / f"{model_id}.xml")
pcmodel = load_overlay_model(filename=model_dirpath / f"{model_id}_PC.xml")

# Add relaxation budget to initial PC model to get names of relaxation reactions
add_relaxation_budget(pcmodel, 0, verbose=False)
pcmodel

Set parameter Username
Set parameter LicenseID to value 2664191
Academic license - for non-commercial use only - expires 2026-05-12


0,1
Name,RBC_GEM_PC
Memory address,25671cdca10
Number of metabolites,10411
Number of reactions,19619
Number of genes,820
Number of groups,78
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space, protein compartment"


## Load pcFVA generated results

In [9]:
# Test to see if results were recently generated in this run, otherwise load DataFrame of generated results
df_pcfva_all = pd.read_csv(
    pcfva_results_dirpath / f"{pcmodel.id}_All_FVAsols.zip",
    index_col=None,
)

df_pcfva_all

Unnamed: 0,model,reactions,optimum,min,max
0,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.00,0.0,0.0
1,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.50,0.0,0.0
2,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.90,0.0,0.0
3,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.99,0.0,0.0
4,RBC_GEM_PC_Mean_ATP11C_V972M_0,15KPGE1Ry,0.00,0.0,0.0
...,...,...,...,...,...
54755171,RBC_GEM_PC_S650_D42,ZN2_HCO3_SELNIt,0.99,0.0,0.0
54755172,RBC_GEM_PC_S650_D42,ZN2t,0.00,0.0,0.0
54755173,RBC_GEM_PC_S650_D42,ZN2t,0.50,0.0,0.0
54755174,RBC_GEM_PC_S650_D42,ZN2t,0.90,0.0,0.0


## Create DataFrame for calculations and visualizations
### Get maximum reaction fluxes and associated abundance values
#### Get maximum reaction fluxes and ranges

In [10]:
rxns = model.reactions.query(lambda x: len(x.genes))
df_max_flux_per_model = df_pcfva_all[df_pcfva_all["reactions"].isin(rxns)].copy()
df_max_flux_per_model = df_max_flux_per_model.groupby(
    ["model", "reactions", "optimum"]
)[["min", "max"]].agg(
    {
        "min": "min",  # Minimum reaction flux per model
        "max": "max",  # Maximum reaction flux per model
    }
)
# Address issues possibly caused by floating point precision, ideally a value that prevents any negative ranges
df_max_flux_per_model.loc[
    df_max_flux_per_model["max"] < df_max_flux_per_model["min"], ["max", "min"]
] = [0, 0]
atol = COBRA_CONFIGURATION.tolerance
df_max_flux_per_model["max"] = df_max_flux_per_model["max"].apply(
    lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol)))
)
df_max_flux_per_model["min"] = df_max_flux_per_model["min"].apply(
    lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol)))
)
df_max_flux_per_model["range"] = (
    df_max_flux_per_model["max"] - df_max_flux_per_model["min"]
)
# Ensure no negative values, if results appear then tolerance should be adjusted
df_max_flux_per_model[df_max_flux_per_model["range"] < 1e-9]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min,max,range
model,reactions,optimum,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


#### Get maximum "enzyme" abundances

In [11]:
rxns = pcmodel.reactions.query(
    lambda x: isinstance(x, EnzymeDilution)
    and x.id.endswith(f"{enzyme_met_suffix_total}{comp_suffix}")
).list_attr("id")
df_max_abundance_per_model = df_pcfva_all[df_pcfva_all["reactions"].isin(rxns)].copy()
# Rename dilution reactions to match
reaction_enzyme_map = {
    enzyme_rid: enzyme_rid.replace(
        f"{enzyme_rxn_prefix}{enzyme_met_prefix}", ""
    ).replace(f"{enzyme_met_suffix_total}{comp_suffix}", "")
    for enzyme_rid in df_max_abundance_per_model["reactions"]
}
df_max_abundance_per_model["reactions"] = df_max_abundance_per_model[
    "reactions"
].replace(reaction_enzyme_map)
df_max_abundance_per_model = df_max_abundance_per_model.groupby(
    ["model", "reactions", "optimum"]
)[["max"]].max()
# Address issues possibly caused by floating point precision, atol is ideally a value that prevents any negative ranges
atol = COBRA_CONFIGURATION.tolerance
df_max_abundance_per_model["max"] = df_max_abundance_per_model["max"].apply(
    lambda x: 0 if x < 0 else x
)
df_max_abundance_per_model["max"] = df_max_abundance_per_model["max"].apply(
    lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol)))
)
df_max_abundance_per_model = df_max_abundance_per_model.rename(
    {"max": "abundance"}, axis=1
)
# Ensure no negative values, if results appear then tolerance should be adjusted
df_max_abundance_per_model[(df_max_abundance_per_model < 0).any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abundance
model,reactions,optimum,Unnamed: 3_level_1


#### Merge DataFrames

In [12]:
df_data_all = pd.merge(
    df_max_flux_per_model,
    df_max_abundance_per_model,
    left_index=True,
    right_index=True,
    how="left",
)
df_data_all = df_data_all.reset_index(drop=False)
df_data_all

Unnamed: 0,model,reactions,optimum,min,max,range,abundance
0,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.00,0.0,0.0,0.0,
1,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.50,0.0,0.0,0.0,
2,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.90,0.0,0.0,0.0,
3,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.99,0.0,0.0,0.0,
4,RBC_GEM_PC_Mean_ATP11C_V972M_0,15KPGE1Ry,0.00,0.0,0.0,0.0,0.005077
...,...,...,...,...,...,...,...
25249083,RBC_GEM_PC_S650_D42,ZN2_HCO3_SELNIt,0.99,0.0,0.0,0.0,0.000000
25249084,RBC_GEM_PC_S650_D42,ZN2t,0.00,0.0,0.0,0.0,0.000000
25249085,RBC_GEM_PC_S650_D42,ZN2t,0.50,0.0,0.0,0.0,0.000000
25249086,RBC_GEM_PC_S650_D42,ZN2t,0.90,0.0,0.0,0.0,0.000000


### Identify donor, timepoints, and genotypes for results

In [14]:
df_metadata = pd.read_csv(
    processed_data_dirpath / f"{dataset_name}_Metadata.csv",
    index_col=[sample_key],
).convert_dtypes()
df_metadata = df_metadata.loc[:, genotypes]

df = df_metadata.reset_index(drop=False)
df[sample_key] = df[sample_key].str.split("_", expand=True).iloc[:, 0]
df = df.drop_duplicates().set_index(sample_key)
for col, series in df.items():
    print(series.value_counts().sort_index())
    print()

G6PD_V68M
0    621
1      7
2     11
Name: count, dtype: Int64

ATP11C_V972M
0    625
1     12
2      2
Name: count, dtype: Int64



In [15]:
metadata_columns = ["sample", "donor", "time", "genotype"]
for key, search_re in zip(
    metadata_columns, [sample_id_re, donor_re, time_re, genotype_re]
):
    if key == "sample":
        df_data_all[key] = df_data_all["model"].apply(
            lambda x: x.replace(f"{pcmodel.id}_", "")
        )
    else:
        df_data_all[key] = df_data_all["model"].apply(
            lambda x: search_re.search(x).group(1) if search_re.search(x) else pd.NA
        )

df_data_all = df_data_all.merge(
    df_metadata, left_on="sample", right_index=True, how="left"
)

# Add genotypes alleles
for genotype in genotypes:
    df = df_data_all[df_data_all["genotype"] == genotype]["sample"]
    df = df.str.rsplit("_", n=1, expand=True)
    if df.empty:
        continue
    df = df.iloc[:, -1]
    df_data_all.loc[df.index, genotype] = df.values
df_data_all = df_data_all.drop("genotype", axis=1)
metadata_columns = metadata_columns[:-1] + genotypes
df_data_all

Unnamed: 0,model,reactions,optimum,min,max,range,abundance,sample,donor,time,G6PD_V68M,ATP11C_V972M
0,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.00,0.0,0.0,0.0,,Mean_ATP11C_V972M_0,,,,0
1,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.50,0.0,0.0,0.0,,Mean_ATP11C_V972M_0,,,,0
2,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.90,0.0,0.0,0.0,,Mean_ATP11C_V972M_0,,,,0
3,RBC_GEM_PC_Mean_ATP11C_V972M_0,13DAMPPOX,0.99,0.0,0.0,0.0,,Mean_ATP11C_V972M_0,,,,0
4,RBC_GEM_PC_Mean_ATP11C_V972M_0,15KPGE1Ry,0.00,0.0,0.0,0.0,0.005077,Mean_ATP11C_V972M_0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25249083,RBC_GEM_PC_S650_D42,ZN2_HCO3_SELNIt,0.99,0.0,0.0,0.0,0.000000,S650_D42,S650,D42,0,0
25249084,RBC_GEM_PC_S650_D42,ZN2t,0.00,0.0,0.0,0.0,0.000000,S650_D42,S650,D42,0,0
25249085,RBC_GEM_PC_S650_D42,ZN2t,0.50,0.0,0.0,0.0,0.000000,S650_D42,S650,D42,0,0
25249086,RBC_GEM_PC_S650_D42,ZN2t,0.90,0.0,0.0,0.0,0.000000,S650_D42,S650,D42,0,0


## Compute statistically significant results between groups
### Remove models based on data operations

In [16]:
df_data_samples = df_data_all[
    [not bool(operation_re.search(x)) for x in df_data_all["model"]]
].reset_index(drop=True)
df_data_samples

Unnamed: 0,model,reactions,optimum,min,max,range,abundance,sample,donor,time,G6PD_V68M,ATP11C_V972M
0,RBC_GEM_PC_S001_D10,13DAMPPOX,0.00,0.0,0.0,0.0,,S001_D10,S001,D10,0,0
1,RBC_GEM_PC_S001_D10,13DAMPPOX,0.50,0.0,0.0,0.0,,S001_D10,S001,D10,0,0
2,RBC_GEM_PC_S001_D10,13DAMPPOX,0.90,0.0,0.0,0.0,,S001_D10,S001,D10,0,0
3,RBC_GEM_PC_S001_D10,13DAMPPOX,0.99,0.0,0.0,0.0,,S001_D10,S001,D10,0,0
4,RBC_GEM_PC_S001_D10,15KPGE1Ry,0.00,0.0,0.0,0.0,0.0,S001_D10,S001,D10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25013355,RBC_GEM_PC_S650_D42,ZN2_HCO3_SELNIt,0.99,0.0,0.0,0.0,0.0,S650_D42,S650,D42,0,0
25013356,RBC_GEM_PC_S650_D42,ZN2t,0.00,0.0,0.0,0.0,0.0,S650_D42,S650,D42,0,0
25013357,RBC_GEM_PC_S650_D42,ZN2t,0.50,0.0,0.0,0.0,0.0,S650_D42,S650,D42,0,0
25013358,RBC_GEM_PC_S650_D42,ZN2t,0.90,0.0,0.0,0.0,0.0,S650_D42,S650,D42,0,0


### Combine sample results based on donor genotype

In [30]:
id_key = "donor"
groupby_operation = "mean"

In [31]:
if groupby_operation == "mean":
    df_data_for_analyses = df_data_samples.groupby(
        ["reactions", id_key, "optimum"], as_index=False
    )[["min", "max", "range", "abundance"] + genotypes].mean()
elif groupby_operation == "median":

    df_data_for_analyses = df_data_samples.groupby(
        ["reactions", id_key, "optimum"], as_index=False
    )[["min", "max", "range", "abundance"] + genotypes].median()
else:
    raise ValueError(f"Unrecognized operation to perform: '{groupby_operation}'")
df_data_for_analyses = df_data_for_analyses.dropna(subset=genotypes, axis=0, how="any")
df_data_for_analyses[genotypes] = df_data_for_analyses[genotypes].astype(int)

df_metadata = (
    df_data_for_analyses[[id_key] + genotypes].drop_duplicates().set_index(id_key)
)
df_data_for_analyses

Unnamed: 0,reactions,donor,optimum,min,max,range,abundance,G6PD_V68M,ATP11C_V972M
0,13DAMPPOX,S001,0.00,0.0,0.0,0.0,,0,0
1,13DAMPPOX,S001,0.50,0.0,0.0,0.0,,0,0
2,13DAMPPOX,S001,0.90,0.0,0.0,0.0,,0,0
3,13DAMPPOX,S001,0.99,0.0,0.0,0.0,,0,0
4,13DAMPPOX,S002,0.00,0.0,0.0,0.0,,0,0
...,...,...,...,...,...,...,...,...,...
8512395,ZN2t,S649,0.99,0.0,0.0,0.0,0.0,0,0
8512396,ZN2t,S650,0.00,0.0,0.0,0.0,0.0,0,0
8512397,ZN2t,S650,0.50,0.0,0.0,0.0,0.0,0,0
8512398,ZN2t,S650,0.90,0.0,0.0,0.0,0.0,0,0


### Create groups of models

In [32]:
all_key = "ALL"
model_groups = {all_key: list(df_data_for_analyses[id_key].unique())}


def create_group_of_models(df, id_key, groupby, verbose=False):
    grouped = df.groupby(groupby)[id_key].agg(lambda x: list(x.unique()))
    grouped = {
        "_".join([str(x) for x in ensure_iterable(k)]): v
        for k, v in grouped.to_dict().items()
    }
    if verbose:
        max_name_len = max([len(group_name) for group_name in list(grouped)])
        for group_name, model_list in grouped.items():
            spacepad = "".join([" "] * (max_name_len - len(group_name)))
            print(f"{group_name}:{spacepad}\t{len(model_list)} samples")
    return grouped

#### Based on genotype

In [33]:
for genotype in genotypes:
    grouped = create_group_of_models(
        df_data_for_analyses, id_key=id_key, groupby=genotype, verbose=False
    )
    grouped = {f"{genotype}_{str(k)}": v for k, v in grouped.items()}
    if verbose:
        max_name_len = max([len(group_name) for group_name in list(grouped)])
        for group_name, model_list in grouped.items():
            spacepad = "".join([" "] * (max_name_len - len(group_name)))
            print(f"{group_name}:{spacepad}\t{len(model_list)} samples")
    model_groups.update(grouped)
    print()

G6PD_V68M_0:	620 samples
G6PD_V68M_1:	7 samples
G6PD_V68M_2:	11 samples

ATP11C_V972M_0:	624 samples
ATP11C_V972M_1:	12 samples
ATP11C_V972M_2:	2 samples



### View groups

In [34]:
print("Possible groups for analyses\n============================")
max_name_len = max([len(group_name) for group_name in list(model_groups)])
for group_name, model_list in model_groups.items():
    spacepad = "".join([" "] * (max_name_len - len(group_name)))
    print(f"{group_name}:{spacepad}\t{len(model_list)} samples")

df_data_for_analyses = df_data_for_analyses.set_index(["reactions", id_key])
df_data_for_analyses

Possible groups for analyses
ALL:           	638 samples
G6PD_V68M_0:   	620 samples
G6PD_V68M_1:   	7 samples
G6PD_V68M_2:   	11 samples
ATP11C_V972M_0:	624 samples
ATP11C_V972M_1:	12 samples
ATP11C_V972M_2:	2 samples


Unnamed: 0_level_0,Unnamed: 1_level_0,optimum,min,max,range,abundance,G6PD_V68M,ATP11C_V972M
reactions,donor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13DAMPPOX,S001,0.00,0.0,0.0,0.0,,0,0
13DAMPPOX,S001,0.50,0.0,0.0,0.0,,0,0
13DAMPPOX,S001,0.90,0.0,0.0,0.0,,0,0
13DAMPPOX,S001,0.99,0.0,0.0,0.0,,0,0
13DAMPPOX,S002,0.00,0.0,0.0,0.0,,0,0
...,...,...,...,...,...,...,...,...
ZN2t,S649,0.99,0.0,0.0,0.0,0.0,0,0
ZN2t,S650,0.00,0.0,0.0,0.0,0.0,0,0
ZN2t,S650,0.50,0.0,0.0,0.0,0.0,0,0
ZN2t,S650,0.90,0.0,0.0,0.0,0.0,0,0


#### Ensure groups exist and setup directory structure

In [35]:
groups_dict = defaultdict(dict)
item_list = genotypes
groups_dict[all_key].update({item: {} for item in item_list})

header = "Expected directory structure"
print("\n".join((header, "=" * len(header), all_key)))
for idx, (group_name, subgroups) in enumerate(sorted(groups_dict[all_key].items())):
    print("\u2514\u2500\u2500" + f" {group_name}")

group_results_dirpath_dict = {all_key: corr_results_dirpath}
for group_name, subgroups in groups_dict[all_key].items():
    group_results_dirpath_dict[group_name] = (
        group_results_dirpath_dict[all_key] / group_name
    )

Expected directory structure
ALL
└── ATP11C_V972M
└── G6PD_V68M


#### Load subsystems and metabolic categories to enrich results

In [36]:
subsystems_to_exclude = {"Pseudoreactions"}
use_abbrevs = True
abbreviations = {
    "Amino acid metabolism": "A",
    "Carbohydrate metabolism": "C",
    "Lipid metabolism": "L",
    "Metabolism of cofactors and vitamins": "V",
    "Nucleotide metabolism": "N",
    "Reactive species": "R",
    "Transport reactions": "T",
    "Other": "O",
}
categories_to_keep = list(abbreviations)

df_pathways = pd.read_csv(
    get_dirpath("curation") / "subsystems.tsv", sep="\t", dtype=str
).fillna("")

# Rename "name" to subsystem to match reaction attribute
df_pathways = df_pathways.rename({"name": "subsystem"}, axis=1)
# Group "Metabolism of other amino acids" with amino acids rather than treat as "other"
df_pathways["category"] = df_pathways["category"].replace(
    "Metabolism of other amino acids", "Amino acid metabolism"
)

df_pathways["category"] = df_pathways["category"].apply(
    lambda x: ("Other" if x not in categories_to_keep else x)
)
df_pathways = df_pathways[~df_pathways["subsystem"].isin(subsystems_to_exclude)].copy()
subsystem_to_category_dict = df_pathways.set_index("subsystem")["category"].to_dict()
df_pathways

Unnamed: 0,subsystem,category,kegg.pathway.name,kegg.pathway,notes
0,"Alanine, aspartate and glutamate metabolism",Amino acid metabolism,"Alanine, aspartate and glutamate metabolism",hsa00250,
1,Arginine and proline metabolism,Amino acid metabolism,Arginine and proline metabolism,hsa00330,
2,Cysteine and methionine metabolism,Amino acid metabolism,Cysteine and methionine metabolism,hsa00270,
3,"Glycine, serine and threonine metabolism",Amino acid metabolism,"Glycine, serine and threonine metabolism",hsa00260,
4,Histidine metabolism,Amino acid metabolism,Histidine metabolism,hsa00340,
...,...,...,...,...,...
73,Aminoacyl-tRNA biosynthesis,Other,Aminoacyl-tRNA biosynthesis,has00970,
74,"Transport, extracellular",Transport reactions,,,Representative subsystem for all transport rea...
75,5-fluorouracil metabolism,Other,Drug metabolism - other enzymes,hsa00983,"Subnetwork of KEGG pathway ""Drug metabolism - ..."
76,Azathioprine and 6-mercaptopurine metabolism,Other,Drug metabolism - other enzymes,hsa00983,"Subnetwork of KEGG pathway ""Drug metabolism - ..."


## Compute significant results between groups
#### Compare all subgroups at once

In [37]:
group_name = "G6PD_V68M"
optimum = 0
value_to_compare = "range"
group_timepoints_by = None
compare_pairwise = True
compare_all_groups = True
ordered_group_to_compare = [f"{group_name}_{alleles}" for alleles in [0, 1, 2]]
all_samples_for_comparison = [
    value for g in ordered_group_to_compare for value in np.array(model_groups[g])
]
df_data_for_correlations = df_data_for_analyses.loc[
    pd.IndexSlice[:, all_samples_for_comparison], :
]
df_data_for_correlations = df_data_for_correlations[
    df_data_for_correlations["optimum"] == optimum
].drop("optimum", axis=1)

print("Groups to compare\n=================")
pairwise_group_combos = []
if compare_all_groups:
    print(tuple(ordered_group_to_compare))
if compare_pairwise:
    pairwise_group_combos += list(combinations(ordered_group_to_compare, 2))
    for group in pairwise_group_combos:
        print(group)
df_data_for_correlations

Groups to compare
('G6PD_V68M_0', 'G6PD_V68M_1', 'G6PD_V68M_2')
('G6PD_V68M_0', 'G6PD_V68M_1')
('G6PD_V68M_0', 'G6PD_V68M_2')
('G6PD_V68M_1', 'G6PD_V68M_2')


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,range,abundance,G6PD_V68M,ATP11C_V972M
reactions,donor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13DAMPPOX,S001,0.0,0.0,0.0,,0,0
15KPGE1Ry,S001,0.0,0.0,0.0,0.000000,0,0
15KPGE2Ry,S001,0.0,0.0,0.0,0.000000,0,0
15KPGE3Ry,S001,0.0,0.0,0.0,0.000000,0,0
15KPGF1Ry,S001,0.0,0.0,0.0,0.000000,0,0
...,...,...,...,...,...,...,...
YYYTPAP,S605,0.0,0.0,0.0,0.312651,2,0
ZN2Htex2,S605,0.0,0.0,0.0,0.028201,2,0
ZN2_2HCO3t,S605,0.0,0.0,0.0,0.000000,2,0
ZN2_HCO3_SELNIt,S605,0.0,0.0,0.0,0.000000,2,0


### Kruskal Wallis H-test (3 or more groups)

In [None]:
ordered_group_to_compare = ["G6PD_V68M_0", "G6PD_V68M_1", "G6PD_V68M_2"]
results_dict = defaultdict(dict)

In [None]:
if len(ordered_group_to_compare) > 2:
    for rid in df_data_for_correlations.index.get_level_values("reactions").unique():
        df_data_rxn = df_data_for_correlations.loc[rid]
        df_data_rxn_opt_value = df_data_rxn[value_to_compare].copy()
        data_arrays = {
            group_name: df_data_rxn_opt_value.loc[model_groups[group_name]].values
            for group_name in ordered_group_to_compare
        }
        values = list(data_arrays.values())
        unique_values = set(
            [v for value_list in values for v in value_list if not np.isnan(v)]
        )
        if len(unique_values) <= 1:
            # Skip variables that do not have any differences
            results_dict[tuple(ordered_group_to_compare)][rid] = dict(
                zip(["statistic", "pvalue"], [pd.NA, pd.NA])
            )
        else:
            result = kruskal(*values, nan_policy="omit")
            results_dict[tuple(ordered_group_to_compare)][rid] = {
                attr: getattr(result, attr) for attr in ["statistic", "pvalue"]
            }

dataframes = {
    key: pd.DataFrame.from_dict(values, orient="index")
    for key, values in results_dict.items()
}
print(f"Number of different comparisons made: {len(dataframes)}")
print("Groups compared\n===============")
for key in list(dataframes):
    print(key)

### Mann Whiteney U test (2 groups)

In [None]:
if len(ordered_group_to_compare) == 2 or compare_pairwise:
    for rid in df_data_for_correlations.index.get_level_values("reactions").unique():
        df_data_rxn = df_data_for_correlations.loc[rid]
        df_data_rxn_opt_value = df_data_rxn[value_to_compare].copy()
        data_arrays = {
            group_name: df_data_rxn_opt_value.loc[model_groups[group_name]].values
            for group_name in ordered_group_to_compare
        }
        combos = (
            [tuple(ordered_group_to_compare)]
            if not pairwise_group_combos
            else pairwise_group_combos
        )
        for combo in combos:
            values = [data_arrays[group] for group in combo]
            unique_values = set(
                [v for value_list in values for v in value_list if not np.isnan(v)]
            )
            if len(unique_values) <= 1:
                # Skip variables that do not have any differences
                result = dict(zip(["statistic", "pvalue"], [pd.NA, pd.NA]))
            else:
                result = mannwhitneyu(*values, nan_policy="omit")
                result = {
                    attr: getattr(result, attr) for attr in ["statistic", "pvalue"]
                }
            results_dict[combo][rid] = result
dataframes = {
    key: pd.DataFrame.from_dict(values, orient="index")
    for key, values in results_dict.items()
}
print(f"Number of different comparisons made: {len(dataframes)}")
print("Groups compared\n===============")
for key in list(dataframes):
    print(key)

### Determine significance using p-values

In [None]:
pvalue_sig = 0.0002
enzyme_reactions_only = False
include_boundary_reactions = False
sort_by_subsystem = False
standardize_by = "mean"
use_group_means = False
fdr_method = None

significant_dataframes = {}
for met in model.metabolites.query(lambda x: x.compartment == "e"):
    met.name += " (extracellular)"
metadata_columns = [
    "name",
    "stoichiometry",
    "proteins",
    "pvalue" if not fdr_method else "adj_pvalue",
    "subsystem",
    "category",
]
for key, df in dataframes.items():
    df = df.dropna().copy()
    df["pvalue"] = df["pvalue"].astype(float)
    if fdr_method is not None and not fdr_method in {"bon", "bh", "by"}:
        raise ValueError(f"Unrecognized FDR correction method : {fdr_method}")
    elif fdr_method == "bon":
        pvalue_key = "adj_pvalue"
        df[pvalue_key] = df["pvalue"] * len(df["pvalue"])
    elif fdr_method in {"bh", "by"}:
        pvalue_key = "adj_pvalue"
        df[pvalue_key] = false_discovery_control(
            df["pvalue"].astype(float), method=fdr_method
        )
    else:
        pvalue_key = "pvalue"

    df = df[df[pvalue_key] <= pvalue_sig].drop("statistic", axis=1)
    if enzyme_reactions_only:
        df_pivot = df_data_for_correlations.loc[
            df.index, ["abundance", value_to_compare]
        ].dropna(subset="abundance")
        df_pivot = df_pivot.drop("abundance", axis=1)
    else:
        df_pivot = df_data_for_correlations.loc[df.index, value_to_compare]
    if not include_boundary_reactions:
        df_pivot = df_pivot[
            ~df_pivot.index.isin(
                model.reactions.query(lambda x: x.boundary).list_attr("id"),
                level="reactions",
            )
        ]
    df_pivot = df_pivot.reset_index(drop=False)
    df_pivot = df_pivot.pivot(
        columns=id_key, index="reactions", values=value_to_compare
    )
    df = pd.merge(df, df_pivot, left_index=True, right_index=True).sort_values(
        pvalue_key
    )
    df.index.name = "reactions"
    df = df.reset_index(drop=False).set_index(["reactions", pvalue_key]).T
    if df.empty:
        df = pd.DataFrame([], columns=metadata_columns)
    else:
        df = pd.concat(
            [
                # Sort index by donor number and subgroup while concatenating
                df.loc[model_groups[g]].sort_index(
                    key=lambda x: [
                        int(donor_re.search(v).group("num")) for v in x.values
                    ]
                )
                for g in key
            ],
            axis=0,
        )

        df = df.T.reset_index(drop=False)
        # Enrich results
        df["name"] = [
            r.name.replace(",", "")
            for r in model.reactions.get_by_any(list(df["reactions"].values))
        ]
        df["stoichiometry"] = [
            r.build_reaction_string(use_metabolite_names=True)
            for r in model.reactions.get_by_any(list(df["reactions"].values))
        ]
        df["subsystem"] = [
            r.subsystem
            for r in model.reactions.get_by_any(list(df["reactions"].values))
        ]
        df["category"] = df["subsystem"].replace(subsystem_to_category_dict)
        df["proteins"] = [
            ";".join(sorted([g.id for g in r.genes]))
            for r in model.reactions.get_by_any(list(df["reactions"].values))
        ]
        # Replace commas to prevent issues with CSV export
        df["subsystem"] = df["subsystem"].apply(lambda x: x.replace(",", ""))
        df["category"] = df["category"].apply(lambda x: x.replace(",", ""))
        df[pvalue_key] = df[pvalue_key].apply(lambda x: round(x, 5))

        df = df.set_index("reactions")
        if sort_by_subsystem:
            df = df.sort_values(by=["category", "subsystem", "proteins"])

        df_meta = df.loc[:, metadata_columns].copy()
        df_data = df.loc[:, ~df.columns.isin(df_meta.columns)].copy()
        if use_group_means:
            df_data = pd.concat(
                [df_data.loc[:, model_groups[g]].mean(axis=1) for g in key], axis=1
            )
            df_data.columns = list(key)

        if standardize_by == "mean":
            df_data = (
                df_data.sub(df_data.mean(axis=1), axis=0)
                .div(df_data.std(axis=1), axis=0)
                .dropna(how="all", axis=0)
            )
        elif standardize_by == "median":
            df_data = (
                (df_data.T - df_data.median(axis=1))
                / (df_data.quantile(q=0.75, axis=1) - df_data.quantile(q=0.25, axis=1))
            ).T
        else:
            pass
        # Put dataframes back together for custom reordering
        df = df_data.merge(df_meta, left_index=True, right_index=True)
    significant_dataframes[key] = df
    print(key)
    print(f"Min & Max values: ({df_data.min().min():.4f}, {df_data.max().max():.4f})")
    print()
key = tuple(ordered_group_to_compare)
df = significant_dataframes[key]
df

In [None]:
# df = significant_dataframes[key].copy()
# df = df.loc[:, ~df.columns.isin(["proteins", "pvalue", "subsystem", "category"])].T.merge(df_metadata[["G6PD_V68M"]], left_index=True, right_index=True).reset_index(drop=False)
# df = df.set_index(["index", "G6PD_V68M"]).groupby(level=1).mean().T
# print(df.min().min(), df.max().max())
# df = df.merge(df_meta, left_index=True, right_index=True)
# ordered_subsystems = [
#     "Pentose phosphate pathway",
#     "Glycolysis / Gluconeogenesis",
#     "Galactose metabolism",
#     "Transport extracellular",
#     "Glycerophospholipid metabolism",
#     "Ether lipid metabolism",
# ]
# df = pd.concat(
#     [
#         df[df["subsystem"] == subsystem].sort_values(by="pvalue", ascending=False if subsystem == "Glycolysis / Gluconeogenesis" else True)
#         for subsystem in ordered_subsystems
#     ],
#     axis=0,
# )
# significant_dataframes[key] = df.copy()
# df

## Export results

In [None]:
ftype = "csv"
for key, df_main in significant_dataframes.items():
    df_meta = df_main.loc[:, metadata_columns].copy()
    df_data = df_main.loc[:, ~df_main.columns.isin(df_meta.columns)].copy()
    for df_type, df in zip(
        ["data", "meta_row", "meta_col"], [df_data, df_meta, df_metadata]
    ):
        filename = "_".join(
            ["MannWhiteney" if len(key) == 2 else "Kruskal"]
            + [g.split("_")[-1] for g in key]
            + [id_key]
            + [df_type]
        )
        if use_group_means:
            filename += "_mean"
        group_results_dirpath_dict[group_name].mkdir(exist_ok=True)
        filename = group_results_dirpath_dict[group_name] / filename
        df.to_csv(
            f"{filename}.{ftype}", sep="\t" if ftype == "tsv" else ",", index=True
        )
    break