# Compute statistically significant fluxes between groups - REDS Recall, ATP11C V972M
## Setup
### Import packages

In [None]:
import re
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import kruskal, mannwhitneyu, false_discovery_control
from itertools import combinations
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    get_dirpath,
    read_cobra_model,
    show_versions,
    ensure_iterable,
)
from rbc_gem_utils.analysis.overlay import (
    DEFAULT_PROTEOME_COMPARTMENT,
    DEFAULT_PREFIX_SUFFIX_VALUES,
    add_relaxation_budget,
    load_overlay_model,
    EnzymeDilution,
)
plt.rcParams["font.family"] = "Arial"

show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

### Define configuration
#### COBRA Configuration

In [2]:
COBRA_CONFIGURATION.solver = "gurobi"
# Set bound defaults much larger to prevent model loading issues
COBRA_CONFIGURATION.bounds = (-1e-8, 1e8)
COBRA_CONFIGURATION.tolerance = 1e-9
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-09
lower_bound,Default reaction lower bound,-1e-08
upper_bound,Default reaction upper bound,100000000.0
processes,Number of parallel processes,127
cache_directory,Path for the model cache,C:\Users\P7875\AppData\Local\opencobra\cobrapy\Cache
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


### Define organism, model, and dataset

In [3]:
organism = "Human"
model_id = "RBC_GEM"
dataset_name = "REDSRecall"
genotype = "ATP11C_V972M"
grouped_data_key = f'{genotype}_Sample'
grouped_data_key

'ATP11C_V972M_Sample'

### Set variables for sample identification

In [4]:
sample_key = "SAMPLE ID"
donor_key = "PUBLIC RECALL DONOR ID"

# For sample IDs
donor_re = re.compile(rf"(?P<donor>S(?P<num>\d\d\d))")
operations = "|".join([x.capitalize() for x in ["mean", "median"]])
operation_re = re.compile(r"(?P<op>" + operations + r")\_(?P<group>\w+)")
sample_id_re = re.compile(
    r"(?!" + operations + r")" + donor_re.pattern + r"\_"
)

### Set computation options

In [5]:
ftype = "xml"  # In our experience, SBML/XML loads faster, but will take up to 4x more space uncompressed as compared to JSON
run_computations = True  # Keep off to use previously computed results
overwrite = False  # Whether to allow overwriting of previous simulation results
verbose = True

# Objective reactions
objective_reactions = ["NaKt"] 
# Reactions that must have the capability to carry flux, sort for consistency
required_flux_reactions = ["PSFLIPt"] # Add reactions to this list
required_flux_reactions = sorted(set(objective_reactions + required_flux_reactions))


#### Set prefixes/suffixes to expect

In [6]:
enzyme_rxn_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.dilution"]
enzyme_met_prefix = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["prefix.metabolite"]
enzyme_met_suffix_total = DEFAULT_PREFIX_SUFFIX_VALUES["enzymes"]["suffix.total"]
comp_suffix = f"_{DEFAULT_PROTEOME_COMPARTMENT}"

### Set figure options

In [7]:
save_figures = True
transparent = False
imagetype = "svg"

### Set paths

In [8]:
# Set paths
overlay_dirpath = get_dirpath("analysis") / "OVERLAY" / organism
model_dirpath = overlay_dirpath / model_id
results_dirpath = (get_dirpath(use_temp="processed") / model_id / "OVERLAY" / organism / dataset_name / grouped_data_key)
pcfva_results_dirpath = (results_dirpath / "pcFVA" / "_".join(("REQ", *required_flux_reactions)) /  "_".join(("OBJ", *objective_reactions)))
corr_results_dirpath = results_dirpath / "correlations"
# Ensure directory  exists
corr_results_dirpath.mkdir(exist_ok=True, parents=True)

## Load RBC-GEM model

In [9]:
model = read_cobra_model(filename=model_dirpath / f"{model_id}.xml")
pcmodel = load_overlay_model(filename=model_dirpath / f"{model_id}_PC.xml")

# Add relaxation budget to initial PC model to get names of relaxation reactions
add_relaxation_budget(pcmodel, 0, verbose=False)
pcmodel

Set parameter Username
Set parameter LicenseID to value 2664191
Academic license - for non-commercial use only - expires 2026-05-12


0,1
Name,RBC_GEM_PC
Memory address,1e2288ae350
Number of metabolites,7815
Number of reactions,15687
Number of genes,723
Number of groups,68
Objective expression,1.0*NaKt - 1.0*NaKt_reverse_db47e
Compartments,"cytosol, extracellular space, protein compartment"


## Load pcFVA generated results

In [10]:
# Load DataFrame of generated results
df_pcfva_all = pd.read_csv(
    pcfva_results_dirpath / f"{pcmodel.id}_All_FVAsols.zip",
    index_col=None,
).fillna(0)

df_pcfva_all

Unnamed: 0,reactions,model,optimum,min,max
0,1SGTH2OHE1ABCte,RBC_GEM_PC_Allele0_C0,0.0,0.000000,0.014688
1,1SGTH2OHE1ABCte,RBC_GEM_PC_Allele0_C1,0.0,0.000000,0.015410
2,1SGTH2OHE1ABCte,RBC_GEM_PC_Allele0_C10,0.0,0.000000,0.014478
3,1SGTH2OHE1ABCte,RBC_GEM_PC_Allele0_C11,0.0,0.000000,0.013896
4,1SGTH2OHE1ABCte,RBC_GEM_PC_Allele0_C12,0.0,0.000000,0.011391
...,...,...,...,...,...
123875,XYLU_Dt,RBC_GEM_PC_Mean_Allele1,0.0,-0.040705,0.040236
123876,XYLU_Dt,RBC_GEM_PC_Mean_Allele2,0.0,-0.053735,0.052435
123877,XYLU_Dt,RBC_GEM_PC_Median_Allele0,0.0,-0.037787,0.036772
123878,XYLU_Dt,RBC_GEM_PC_Median_Allele1,0.0,-0.040307,0.040021


## Create DataFrame for calculations and visualizations
### Get maximum reaction fluxes and associated abundance values
#### Get maximum reaction fluxes and ranges

In [11]:
rxns = model.reactions.list_attr("id")
df_max_flux_per_model = df_pcfva_all[df_pcfva_all["reactions"].isin(rxns)].copy()
df_max_flux_per_model = df_max_flux_per_model.groupby(["model", "reactions", "optimum"])[["min", "max"]].agg(
    {
        "min": "min", # Minimum reaction flux per model
        "max": "max", # Maximum reaction flux per model
    }
)
# Address issues possibly caused by floating point precision, ideally a value that prevents any negative ranges
df_max_flux_per_model.loc[df_max_flux_per_model["max"] < df_max_flux_per_model["min"], ["max", "min"]] = [0, 0]
atol = COBRA_CONFIGURATION.tolerance
df_max_flux_per_model["max"] = df_max_flux_per_model["max"].apply(lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol))))
df_max_flux_per_model["min"] = df_max_flux_per_model["min"].apply(lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol))))
df_max_flux_per_model["range"] = df_max_flux_per_model["max"] - df_max_flux_per_model["min"]
# Ensure no negative values, if results appear then tolerance should be adjusted
df_max_flux_per_model[df_max_flux_per_model["range"] < 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min,max,range
model,reactions,optimum,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


#### Get maximum "enzyme" abundances

In [12]:
rxns = pcmodel.reactions.query(lambda x: isinstance(x, EnzymeDilution) and x.id.endswith(f"{enzyme_met_suffix_total}{comp_suffix}")).list_attr("id")
df_max_abundance_per_model = df_pcfva_all[df_pcfva_all["reactions"].isin(rxns)].copy()
# Rename dilution reactions to match 
reaction_enzyme_map = {
    enzyme_rid: enzyme_rid.replace(
        f"{enzyme_rxn_prefix}{enzyme_met_prefix}", ""
    ).replace(
        f"{enzyme_met_suffix_total}{comp_suffix}", ""
    )
    for enzyme_rid in df_max_abundance_per_model["reactions"]
}
df_max_abundance_per_model["reactions"] = df_max_abundance_per_model["reactions"].replace(reaction_enzyme_map)
df_max_abundance_per_model = df_max_abundance_per_model.groupby(["model", "reactions", "optimum"])[["max"]].max()
# Address issues possibly caused by floating point precision, atol is ideally a value that prevents any negative ranges
atol = COBRA_CONFIGURATION.tolerance
df_max_abundance_per_model["max"] = df_max_abundance_per_model["max"].apply(lambda x: 0 if x < 0 else x)
df_max_abundance_per_model["max"] = df_max_abundance_per_model["max"].apply(lambda x: 0 if np.isclose(x, 0, atol=atol) else round(x, -int(np.log10(atol))))
df_max_abundance_per_model = df_max_abundance_per_model.rename({"max": "abundance"}, axis=1)
# Ensure no negative values, if results appear then tolerance should be adjusted
df_max_abundance_per_model[(df_max_abundance_per_model < 0).any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abundance
model,reactions,optimum,Unnamed: 3_level_1


#### Merge DataFrames

In [13]:
df_data_all = pd.merge(
    df_max_flux_per_model,
    df_max_abundance_per_model,
    left_index=True,
    right_index=True,
    how="left",
)
df_data_all = df_data_all.reset_index(drop=False)
df_data_all

Unnamed: 0,model,reactions,optimum,min,max,range,abundance
0,RBC_GEM_PC_Allele0_C0,1SGTH2OHE1ABCte,0.0,0.000000,0.014688,0.014688,0.062771
1,RBC_GEM_PC_Allele0_C0,1SGTH2OHE2ABCte,0.0,0.000000,0.014688,0.014688,0.062771
2,RBC_GEM_PC_Allele0_C0,23E1QN1GST,0.0,0.000000,0.014688,0.014688,21.663839
3,RBC_GEM_PC_Allele0_C0,23E1QN4GST,0.0,0.000000,0.014688,0.014688,21.663839
4,RBC_GEM_PC_Allele0_C0,23E1SQOX,0.0,0.000000,1.296063,1.296063,
...,...,...,...,...,...,...,...
63569,RBC_GEM_PC_Median_Allele2,XPPT,0.0,0.000000,0.001258,0.001258,1.137815
63570,RBC_GEM_PC_Median_Allele2,XYLK,0.0,0.000000,0.000000,0.000000,0.000000
63571,RBC_GEM_PC_Median_Allele2,XYLTD_Dx,0.0,-0.056075,0.056075,0.112151,0.239638
63572,RBC_GEM_PC_Median_Allele2,XYLTt,0.0,-0.056075,0.056075,0.112151,


### Identify genotypes for results

In [14]:
df_data_all["sample"] = df_data_all["model"].apply(lambda x: x.replace(f"{pcmodel.id}_", ""))
df_data_all[genotype] = df_data_all["sample"].apply(lambda x: x.split("_", 1)[0].replace("Allele", ""))
df_data_all

Unnamed: 0,model,reactions,optimum,min,max,range,abundance,sample,ATP11C_V972M
0,RBC_GEM_PC_Allele0_C0,1SGTH2OHE1ABCte,0.0,0.000000,0.014688,0.014688,0.062771,Allele0_C0,0
1,RBC_GEM_PC_Allele0_C0,1SGTH2OHE2ABCte,0.0,0.000000,0.014688,0.014688,0.062771,Allele0_C0,0
2,RBC_GEM_PC_Allele0_C0,23E1QN1GST,0.0,0.000000,0.014688,0.014688,21.663839,Allele0_C0,0
3,RBC_GEM_PC_Allele0_C0,23E1QN4GST,0.0,0.000000,0.014688,0.014688,21.663839,Allele0_C0,0
4,RBC_GEM_PC_Allele0_C0,23E1SQOX,0.0,0.000000,1.296063,1.296063,,Allele0_C0,0
...,...,...,...,...,...,...,...,...,...
63569,RBC_GEM_PC_Median_Allele2,XPPT,0.0,0.000000,0.001258,0.001258,1.137815,Median_Allele2,Median
63570,RBC_GEM_PC_Median_Allele2,XYLK,0.0,0.000000,0.000000,0.000000,0.000000,Median_Allele2,Median
63571,RBC_GEM_PC_Median_Allele2,XYLTD_Dx,0.0,-0.056075,0.056075,0.112151,0.239638,Median_Allele2,Median
63572,RBC_GEM_PC_Median_Allele2,XYLTt,0.0,-0.056075,0.056075,0.112151,,Median_Allele2,Median


## Compute statistically significant results between groups

In [15]:
df_data_for_analyses = df_data_all[[not bool(operation_re.search(x)) for x in df_data_all["model"]]].reset_index(drop=True)
df_data_for_analyses

Unnamed: 0,model,reactions,optimum,min,max,range,abundance,sample,ATP11C_V972M
0,RBC_GEM_PC_Allele0_C0,1SGTH2OHE1ABCte,0.0,0.000000,0.014688,0.014688,0.062771,Allele0_C0,0
1,RBC_GEM_PC_Allele0_C0,1SGTH2OHE2ABCte,0.0,0.000000,0.014688,0.014688,0.062771,Allele0_C0,0
2,RBC_GEM_PC_Allele0_C0,23E1QN1GST,0.0,0.000000,0.014688,0.014688,21.663839,Allele0_C0,0
3,RBC_GEM_PC_Allele0_C0,23E1QN4GST,0.0,0.000000,0.014688,0.014688,21.663839,Allele0_C0,0
4,RBC_GEM_PC_Allele0_C0,23E1SQOX,0.0,0.000000,1.296063,1.296063,,Allele0_C0,0
...,...,...,...,...,...,...,...,...,...
53531,RBC_GEM_PC_Allele2_S417_D42,XPPT,0.0,0.000000,0.000715,0.000715,1.189980,Allele2_S417_D42,2
53532,RBC_GEM_PC_Allele2_S417_D42,XYLK,0.0,0.000000,0.006065,0.006065,0.025918,Allele2_S417_D42,2
53533,RBC_GEM_PC_Allele2_S417_D42,XYLTD_Dx,0.0,-0.065284,0.065284,0.130568,0.278992,Allele2_S417_D42,2
53534,RBC_GEM_PC_Allele2_S417_D42,XYLTt,0.0,-0.065284,0.065284,0.130568,,Allele2_S417_D42,2


### Create groups of models

In [16]:
all_key = grouped_data_key
id_key = "sample"
model_groups = {all_key: list(df_data_for_analyses[id_key].unique())}

def create_group_of_models(df, id_key, groupby, verbose=False):
    grouped = df.groupby(groupby)[id_key].agg(lambda x: list(x.unique()))
    grouped = {"_".join([str(x) for x in ensure_iterable(k)]): v for k, v in grouped.to_dict().items()}
    if verbose:
        max_name_len = max([len(group_name) for group_name in list(grouped)])
        for group_name, model_list in grouped.items():
            spacepad = "".join([" "] * (max_name_len - len(group_name)))
            print(f"{group_name}:{spacepad}\t{len(model_list)} samples")
    return grouped

#### Based on allele count

In [17]:
grouped = create_group_of_models(df_data_for_analyses, id_key=id_key, groupby=genotype, verbose=False)
grouped = {f"{genotype}_{str(k)}": v for k, v in grouped.items()}
if verbose:
    max_name_len = max([len(group_name) for group_name in list(grouped)])
    for group_name, model_list in grouped.items():
        spacepad = "".join([" "] * (max_name_len - len(group_name)))
        print(f"{group_name}:{spacepad}\t{len(model_list)} samples")
model_groups.update(grouped)
print()

ATP11C_V972M_0:	18 samples
ATP11C_V972M_1:	8 samples
ATP11C_V972M_2:	6 samples



### View groups

In [18]:
print("Possible groups for analyses\n============================")
max_name_len = max([len(group_name) for group_name in list(model_groups)])
for group_name, model_list in model_groups.items():
    spacepad = "".join([" "] * (max_name_len - len(group_name)))
    print(f"{group_name}:{spacepad}\t{len(model_list)} samples")

df_data_for_analyses = df_data_for_analyses.set_index(["reactions", id_key])
df_data_for_analyses

Possible groups for analyses
ATP11C_V972M_Sample:	32 samples
ATP11C_V972M_0:     	18 samples
ATP11C_V972M_1:     	8 samples
ATP11C_V972M_2:     	6 samples


Unnamed: 0_level_0,Unnamed: 1_level_0,model,optimum,min,max,range,abundance,ATP11C_V972M
reactions,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1SGTH2OHE1ABCte,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.0,0.000000,0.014688,0.014688,0.062771,0
1SGTH2OHE2ABCte,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.0,0.000000,0.014688,0.014688,0.062771,0
23E1QN1GST,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.0,0.000000,0.014688,0.014688,21.663839,0
23E1QN4GST,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.0,0.000000,0.014688,0.014688,21.663839,0
23E1SQOX,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.0,0.000000,1.296063,1.296063,,0
...,...,...,...,...,...,...,...,...
XPPT,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.0,0.000000,0.000715,0.000715,1.189980,2
XYLK,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.0,0.000000,0.006065,0.006065,0.025918,2
XYLTD_Dx,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.0,-0.065284,0.065284,0.130568,0.278992,2
XYLTt,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.0,-0.065284,0.065284,0.130568,,2


#### Ensure groups exist and setup directory structure

In [19]:
group_results_dirpath_dict = {all_key: corr_results_dirpath}
header = "Expected directory structure"
print("\n".join((header, "=" * len(header), all_key)))
print(u"\u2514\u2500\u2500" + f" {group_results_dirpath_dict[all_key].name}")

Expected directory structure
ATP11C_V972M_Sample
└── correlations


#### Load subsystems and metabolic categories to enrich results

In [20]:
subsystems_to_exclude = {"Pseudoreactions"}
use_abbrevs = True
abbreviations = {
    "Amino acid metabolism": "A",
    "Carbohydrate metabolism": "C",
    "Lipid metabolism": "L",
    "Metabolism of cofactors and vitamins": "V",
    "Nucleotide metabolism": "N",
    "Reactive species": "R",
    "Transport reactions": "T",
    "Other": "O",
}
categories_to_keep = list(abbreviations)

df_pathways = pd.read_csv(
    get_dirpath("curation") / "subsystems.tsv", sep="\t", dtype=str
).fillna("")

# Rename "name" to subsystem to match reaction attribute
df_pathways = df_pathways.rename({"name": "subsystem"}, axis=1)
# Group "Metabolism of other amino acids" with amino acids rather than treat as "other"
df_pathways["category"] = df_pathways["category"].replace(
    "Metabolism of other amino acids", "Amino acid metabolism"
)

df_pathways["category"] = df_pathways["category"].apply(lambda x: ("Other" if x not in categories_to_keep else x))
df_pathways = df_pathways[~df_pathways["subsystem"].isin(subsystems_to_exclude)].copy()
subsystem_to_category_dict = df_pathways.set_index("subsystem")["category"].to_dict()
df_pathways

Unnamed: 0,subsystem,category,kegg.pathway.name,kegg.pathway,notes
0,"Alanine, aspartate and glutamate metabolism",Amino acid metabolism,"Alanine, aspartate and glutamate metabolism",hsa00250,
1,Arginine and proline metabolism,Amino acid metabolism,Arginine and proline metabolism,hsa00330,
2,Cysteine and methionine metabolism,Amino acid metabolism,Cysteine and methionine metabolism,hsa00270,
3,"Glycine, serine and threonine metabolism",Amino acid metabolism,"Glycine, serine and threonine metabolism",hsa00260,
4,Histidine metabolism,Amino acid metabolism,Histidine metabolism,hsa00340,
...,...,...,...,...,...
73,Aminoacyl-tRNA biosynthesis,Other,Aminoacyl-tRNA biosynthesis,has00970,
74,"Transport, extracellular",Transport reactions,,,Representative subsystem for all transport rea...
75,5-fluorouracil metabolism,Other,Drug metabolism - other enzymes,hsa00983,"Subnetwork of KEGG pathway ""Drug metabolism - ..."
76,Azathioprine and 6-mercaptopurine metabolism,Other,Drug metabolism - other enzymes,hsa00983,"Subnetwork of KEGG pathway ""Drug metabolism - ..."


## Compute significant results between groups
#### Compare all subgroups at once

In [21]:
compare_group = genotype
optimum = 0
value_to_compare = "range"
group_timepoints_by = None
compare_pairwise = True
compare_all_groups = True
ordered_group_to_compare = [f"{compare_group}_{alleles}" for alleles in [0, 1, 2]]
all_samples_for_comparison = [value for g in ordered_group_to_compare for value in np.array(model_groups[g])]
df_data_for_correlations = df_data_for_analyses.loc[pd.IndexSlice[:, all_samples_for_comparison], :]
df_data_for_correlations = df_data_for_correlations[df_data_for_correlations["optimum"] == optimum].drop("optimum", axis=1)

print("Groups to compare\n=================")
pairwise_group_combos = []
if compare_all_groups:
    print(tuple(ordered_group_to_compare))
if compare_pairwise:
    pairwise_group_combos += list(combinations(ordered_group_to_compare, 2))
    for group in pairwise_group_combos:
        print(group)
df_data_for_correlations

Groups to compare
('ATP11C_V972M_0', 'ATP11C_V972M_1', 'ATP11C_V972M_2')
('ATP11C_V972M_0', 'ATP11C_V972M_1')
('ATP11C_V972M_0', 'ATP11C_V972M_2')
('ATP11C_V972M_1', 'ATP11C_V972M_2')


Unnamed: 0_level_0,Unnamed: 1_level_0,model,min,max,range,abundance,ATP11C_V972M
reactions,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1SGTH2OHE1ABCte,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.000000,0.014688,0.014688,0.062771,0
1SGTH2OHE2ABCte,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.000000,0.014688,0.014688,0.062771,0
23E1QN1GST,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.000000,0.014688,0.014688,21.663839,0
23E1QN4GST,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.000000,0.014688,0.014688,21.663839,0
23E1SQOX,Allele0_C0,RBC_GEM_PC_Allele0_C0,0.000000,1.296063,1.296063,,0
...,...,...,...,...,...,...,...
XPPT,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.000000,0.000715,0.000715,1.189980,2
XYLK,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,0.000000,0.006065,0.006065,0.025918,2
XYLTD_Dx,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,-0.065284,0.065284,0.130568,0.278992,2
XYLTt,Allele2_S417_D42,RBC_GEM_PC_Allele2_S417_D42,-0.065284,0.065284,0.130568,,2


### Kruskal Wallis H-test (3 or more groups)

In [22]:
results_dict = defaultdict(dict)

In [23]:
if len(ordered_group_to_compare) > 2:
    for rid in df_data_for_correlations.index.get_level_values("reactions").unique():
        df_data_rxn = df_data_for_correlations.loc[rid]
        df_data_rxn_opt_value = df_data_rxn[value_to_compare].copy()
        data_arrays = {
            group_name: df_data_rxn_opt_value.loc[model_groups[group_name]].values 
            for group_name in ordered_group_to_compare
        }
        values = list(data_arrays.values())
        unique_values = set([v for value_list in values for v in value_list if not np.isnan(v)])
        if len(unique_values) <= 1:
            # Skip variables that do not have any differences
            results_dict[tuple(ordered_group_to_compare)][rid] = dict(zip(["statistic", "pvalue"], [pd.NA, pd.NA]))
        else:
            result = kruskal(*values, nan_policy="omit")
            results_dict[tuple(ordered_group_to_compare)][rid] = {
                attr: getattr(result, attr) 
                for attr in ["statistic", "pvalue"]
            }

dataframes = {key: pd.DataFrame.from_dict(values, orient="index") for key, values in results_dict.items()}

### Mann Whiteney U test (2 groups)

In [24]:
if len(ordered_group_to_compare) == 2 or compare_pairwise:
    for rid in df_data_for_correlations.index.get_level_values("reactions").unique():
        df_data_rxn = df_data_for_correlations.loc[rid]
        df_data_rxn_opt_value = df_data_rxn[value_to_compare].copy()
        data_arrays = {
            group_name: df_data_rxn_opt_value.loc[model_groups[group_name]].values 
            for group_name in ordered_group_to_compare
        }
        combos = [tuple(ordered_group_to_compare)] if (len(ordered_group_to_compare) == 2) else pairwise_group_combos
        for combo in combos:
            values = [data_arrays[group] for group in combo]
            unique_values = set([v for value_list in values for v in value_list if not np.isnan(v)])
            if len(unique_values) <= 1:
                # Skip variables that do not have any differences
                result = dict(zip(["statistic", "pvalue"], [pd.NA, pd.NA]))
            else:
                result = mannwhitneyu(*values, nan_policy="omit")
                result = {
                    attr: getattr(result, attr) 
                    for attr in ["statistic", "pvalue"]
                }
            results_dict[combo][rid] = result
dataframes = {key: pd.DataFrame.from_dict(values, orient="index") for key, values in results_dict.items()}


### View comparisons made

In [25]:
print(f"Number of different comparisons made: {len(dataframes)}")
print("Groups compared\n===============")
for key in list(dataframes):
    print(key)

Number of different comparisons made: 4
Groups compared
('ATP11C_V972M_0', 'ATP11C_V972M_1', 'ATP11C_V972M_2')
('ATP11C_V972M_0', 'ATP11C_V972M_1')
('ATP11C_V972M_0', 'ATP11C_V972M_2')
('ATP11C_V972M_1', 'ATP11C_V972M_2')


### Create ordered metadata for visualization of clustered samples

In [26]:
df_metadata = df_data_for_analyses.reset_index(drop=False)[["sample", "ATP11C_V972M"]].drop_duplicates()
df_metadata = df_metadata[~df_metadata["sample"].apply(lambda x: bool(operation_re.search(x)))]
df_metadata["ATP11C_V972M"] = df_metadata["ATP11C_V972M"].astype(int)

to_concat = []
for allele_count in df_metadata["ATP11C_V972M"].unique():
    df = df_metadata[df_metadata["ATP11C_V972M"] == allele_count]
    df.loc[:, "sample"] = df["sample"].apply(lambda x: (allele_count, x.split("_", maxsplit=1)[-1] if "_C" not in x else int(x.split("_C")[-1])) )
    to_concat += [df]
df_metadata = pd.concat(to_concat, axis=0).sort_values(by=["ATP11C_V972M", "sample"])
# Reformat as original clustered sample IDs
df_metadata["sample"] = df_metadata["sample"].apply(lambda x: "Allele{}_C{}".format(*x) if isinstance(x[1], int) else "Allele{}_{}".format(*x))
df_metadata = df_metadata.set_index("sample")
df_metadata

Unnamed: 0_level_0,ATP11C_V972M
sample,Unnamed: 1_level_1
Allele0_C0,0
Allele0_C1,0
Allele0_C2,0
Allele0_C3,0
Allele0_C4,0
Allele0_C5,0
Allele0_C6,0
Allele0_C7,0
Allele0_C8,0
Allele0_C9,0


### Determine significance using p-values

In [27]:
pvalue_sig = {
    ('ATP11C_V972M_0', 'ATP11C_V972M_1', 'ATP11C_V972M_2'): 0.0025,
    ('ATP11C_V972M_0', 'ATP11C_V972M_1'): 0.001,
    ('ATP11C_V972M_0', 'ATP11C_V972M_2'): 0.002,
    ('ATP11C_V972M_1', 'ATP11C_V972M_2'): 0.05,
}

enzyme_reactions_only = False
include_boundary_reactions = False
sort_by_subsystem = False
standardize_by = "mean"
use_group_means = False
fdr_method = None

significant_dataframes = {}
for met in model.metabolites.query(lambda x: x.compartment == "e"):
    met.name += " (extracellular)"
metadata_columns = [
    "name", 
    "stoichiometry", 
    "proteins", 
    "pvalue" if not fdr_method else "adj_pvalue",
    "subsystem", 
    "category",
]
for key, df in dataframes.items():
    df = df.dropna().copy()
    df["pvalue"] = df["pvalue"].astype(float)
    if fdr_method is not None and not fdr_method in {"bon", "bh", "by"}:
        raise ValueError(f"Unrecognized FDR correction method : {fdr_method}")
    elif fdr_method == "bon":
        pvalue_key = "adj_pvalue"
        df[pvalue_key] = df["pvalue"] * len(df["pvalue"])
    elif fdr_method in {"bh", "by"}:
        pvalue_key = "adj_pvalue"
        df[pvalue_key] = false_discovery_control(df["pvalue"].astype(float), method=fdr_method)
    else:
        pvalue_key = "pvalue"
    print(df.loc["PSFLIPt", "pvalue"])
    pvalue = pvalue_sig if isinstance(pvalue_sig, (float, int)) else pvalue_sig[key]
    df = df[df[pvalue_key] <= pvalue].drop("statistic", axis=1)
    if enzyme_reactions_only:
        df_pivot = df_data_for_correlations.loc[df.index, ["abundance", value_to_compare]].dropna(subset="abundance")
        df_pivot = df_pivot.drop("abundance", axis=1)
    else:
        df_pivot = df_data_for_correlations.loc[df.index, value_to_compare]
    if not include_boundary_reactions:
        df_pivot = df_pivot[~df_pivot.index.isin(model.reactions.query(lambda x: x.boundary).list_attr("id"), level="reactions")]
    df_pivot = df_pivot.reset_index(drop=False)
    df_pivot = df_pivot.pivot(columns=id_key, index="reactions", values=value_to_compare)
    df = pd.merge(df, df_pivot, left_index=True, right_index=True).sort_values(pvalue_key)
    df.index.name = "reactions"
    df = df.reset_index(drop=False).set_index(["reactions", pvalue_key]).T
    if df.empty:
        df = pd.DataFrame([], columns=metadata_columns)
    else:    
        df = pd.concat(
            [
                 # Sort index by donor number and subgroup while concatenating
                df.loc[model_groups[g]].sort_index() 
                for g in key
            ],
            axis=0
        )
    
        df = df.T.reset_index(drop=False)
        # Enrich results
        # df["name"] = [r.name.replace(",", "") for r in model.reactions.get_by_any(list(df["reactions"].values))]
        df["name"] = [r.name for r in model.reactions.get_by_any(list(df["reactions"].values))]
        df["stoichiometry"] = [r.build_reaction_string(use_metabolite_names=True) for r in model.reactions.get_by_any(list(df["reactions"].values))]
        df["subsystem"] = [r.subsystem for r in model.reactions.get_by_any(list(df["reactions"].values))]
        df["category"] = df["subsystem"].replace(subsystem_to_category_dict)
        df["proteins"] = [";".join(sorted([g.id for g in r.genes])) for r in model.reactions.get_by_any(list(df["reactions"].values))]
        # Replace commas to prevent issues with CSV export
        # df["subsystem"] = df["subsystem"].apply(lambda x: x.replace(",", ""))
        # df["category"] = df["category"].apply(lambda x: x.replace(",", ""))
        df[pvalue_key] = df[pvalue_key].apply(lambda x: round(x, 5))
    
        df = df.set_index("reactions")
        if sort_by_subsystem:
            df = df.sort_values(by=["category", "subsystem", "proteins"])
    
        df_meta = df.loc[:, metadata_columns].copy()
        df_data = df.loc[:, ~df.columns.isin(df_meta.columns)].copy()
        if use_group_means:
            df_data = pd.concat([df_data.loc[:, model_groups[g]].mean(axis=1) for g in key], axis=1)
            df_data.columns = list(key)
        if standardize_by == "mean":
            df_data = df_data.sub(df_data.mean(axis=1), axis=0).div(df_data.std(axis=1), axis=0).dropna(how="all", axis=0)
        elif standardize_by == "median":
            df_data = ((df_data.T - df_data.median(axis=1)) / (df_data.quantile(q=0.75, axis=1) - df_data.quantile(q=0.25, axis=1))).T
        else:
            df_data = df_data.loc[:, [x for x in df_metadata.index if x in df_data.columns]]
        # Put dataframes back together for custom reordering
        df = df_data.merge(df_meta, left_index=True, right_index=True)
    significant_dataframes[key] = df
    print(key)
    print(f"Min & Max values: ({df_data.min().min():.4f}, {df_data.max().max():.4f})")
    print()
key = tuple(ordered_group_to_compare)
key = ('ATP11C_V972M_0', 'ATP11C_V972M_2')
df = significant_dataframes[key]
df

0.0024778499408964127
('ATP11C_V972M_0', 'ATP11C_V972M_1', 'ATP11C_V972M_2')
Min & Max values: (-3.2095, 4.9987)

0.9783104767086463
('ATP11C_V972M_0', 'ATP11C_V972M_1')
Min & Max values: (-2.5928, 2.4905)

0.000357291836140036
('ATP11C_V972M_0', 'ATP11C_V972M_2')
Min & Max values: (-3.0402, 3.1123)

0.026472236641991963
('ATP11C_V972M_1', 'ATP11C_V972M_2')
Min & Max values: (-2.2483, 3.0754)



Unnamed: 0_level_0,Allele0_C0,Allele0_C1,Allele0_C10,Allele0_C11,Allele0_C12,Allele0_C13,Allele0_C14,Allele0_C15,Allele0_C16,Allele0_C17,...,Allele2_S379_D42,Allele2_S417_D10,Allele2_S417_D23,Allele2_S417_D42,name,stoichiometry,proteins,pvalue,subsystem,category
reactions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PROTSPP,-0.144614,-0.929996,-0.736722,0.108218,-1.304976,-0.760306,-0.360157,-0.514591,-1.023557,-0.292863,...,1.280868,1.801966,1.798204,1.699028,O-phospho-L-seryl-[protein] phosphatase,H2O + O-phospho-L-seryl-[protein] --> Orthopho...,CPPED1;DUSP23;PPP1CA;PPP1CB;PPP1CC;PPP2CA;PPP2...,0.00001,Protein modification,Other
PROTTPP,-0.144614,-0.929996,-0.736722,0.108218,-1.304976,-0.760306,-0.360157,-0.514591,-1.023557,-0.292863,...,1.280868,1.801966,1.798204,1.699028,O-phospho-L-threonyl-[protein] phosphatase,H2O + O-phospho-L-threonyl-[protein] --> Ortho...,CPPED1;DUSP23;PPP1CA;PPP1CB;PPP1CC;PPP2CA;PPP2...,0.00001,Protein modification,Other
GALK,0.960666,0.754957,0.203758,0.145185,0.481374,-0.453299,1.220174,0.461633,-0.026970,-0.336114,...,-0.775643,-3.040196,-0.616288,-0.824819,Galactokinase,ATP + galactose --> ADP + Alpha-D-Galactose 1-...,GALK1;GALK2,0.00001,Galactose metabolism,Carbohydrate metabolism
GAL1PP,0.496131,0.716346,0.071459,-0.543827,0.574790,1.354124,0.995176,0.076097,0.753239,-0.546921,...,-1.172473,-1.396024,-0.687117,-1.147408,alpha-D-galactose 1-phosphatase,Alpha-D-Galactose 1-phosphate + H2O --> galact...,IMPA1;IMPA2,0.00001,Galactose metabolism,Carbohydrate metabolism
SER_Cltex,0.728571,0.607369,-0.080595,1.297377,0.462513,0.247351,1.034612,0.085226,-0.255215,0.171550,...,-1.162790,-1.365677,-1.785034,-2.222901,Serine transport via chloride antiport,Chloride + L-serine (extracellular) <=> Chlori...,SLC4A1,0.00001,"Transport, extracellular",Transport reactions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SELGTHGTHR,1.464823,1.125985,0.046722,-1.050288,-0.405483,0.779571,1.042202,-1.076011,-0.240062,-0.223394,...,-0.011250,-0.062291,-1.890291,-1.773124,Hydrogen selenide:glutathione oxidoreductase,Reduced glutathione + Glutathioselenol --> Oxi...,GPX1,0.00186,Selenocompound metabolism,Amino acid metabolism
SELDIGTHOXy,1.464823,1.125985,0.046722,-1.050288,-0.405483,0.779571,1.042202,-1.076011,-0.240062,-0.223394,...,-0.011250,-0.062291,-1.890291,-1.773124,Glutathioselenol:NADP oxidoreductase,H+ + Nicotinamide adenine dinucleotide phospha...,GPX1,0.00186,Selenocompound metabolism,Amino acid metabolism
PPM,1.329435,0.861124,-1.025625,0.711761,0.634871,0.312986,0.888676,-0.544850,-0.521783,-0.923030,...,-1.768069,0.385402,-0.954955,-1.154751,Phosphopentomutase,Alpha-D-Ribose 1-phosphate <=> Alpha-D-Ribose ...,PGM2,0.00186,Pentose phosphate pathway,Carbohydrate metabolism
MGSA2,1.467234,-1.064964,-0.795996,1.360563,0.668676,1.329622,0.556424,0.702329,-0.164324,-0.466903,...,-1.530425,-0.294791,-0.736715,-0.396364,Erroneous formation of methylgloxal (glycerald...,Glyceraldehyde 3-phosphate --> Methylglyoxal +...,,0.00186,Pyruvate metabolism,Carbohydrate metabolism


In [None]:
key = ('ATP11C_V972M_0', 'ATP11C_V972M_2')
df = significant_dataframes[key].copy()

df.loc[["56DH5FUt", "56DTHMt", "56DURAt"], "subsystem"] = "Nucleotide transport"
df.loc[["PAFH", "PAFS"], "subsystem"] = "Ether lipid metabolism"
df.loc[["D5FURADy"], "category"] = "Nucleotide metabolism"
df.loc[["CYSTL2"], "subsystem"] = "Cysteine and methionine metabolism"
df.loc[[
    'ETHAt',
    'FAt_16_2E',
    "SPHGNSte", 
    "SPHGNte", 
    "SPHS1PABCte", 
    "PEFLIPt", 
    "PSFLIPt",
    'DHET1112te',
    'DHET1314te',
    'DHET56te',
    'DHET89te',
    'DIHDPA1011te',
    'DIHDPA1314te',
    'DIHDPA1617te',
    'DIHDPA1920te',
    'DIHDPA78te',
    'DIHETE1112te',
    'DIHETE1718te',
    'DIHETE56te',
    'DIHETE78te',
    'DIHETEt1314e',
    'DIHOME910te',
    'DIHOMEt1213e',
], "subsystem" ] = "Lipid transport"
df["subsystem"] = df["subsystem"].replace({
    '5-fluorouracil metabolism': 'Nucleotide metabolism',
    'Glycolysis / Gluconeogenesis': 'Glycolysis and pyruvate metabolism',
    'Pyruvate metabolism': 'Glycolysis and pyruvate metabolism',
    'Tricarboxylic acid cycle and glyoxylate/dicarboxylate metabolism': 'Glycolysis and pyruvate metabolism',
    'Purine metabolism': 'Nucleotide metabolism',
    'Pyrimidine metabolism': 'Nucleotide metabolism',
    'Transport, extracellular': 'Other transport'
})



df = df.sort_values(["category", "subsystem"])

ordered_subsystems = [
    'Alanine, aspartate and glutamate metabolism',
    'Glycine, serine and threonine metabolism',
    'Cysteine and methionine metabolism',
    'Selenocompound metabolism',
    'Amino sugar and nucleotide sugar metabolism',    
    'Galactose metabolism',
    'Glycolysis and pyruvate metabolism',
    'Pentose and glucuronate interconversions',
    'Pentose phosphate pathway',
    'Nicotinate and nicotinamide metabolism',
    'Pantothenate and CoA metabolism',
    'Thiamine metabolism',
    'Nucleotide metabolism',
    'Protein modification',
    'Linoleate metabolism',
    'Arachidonic acid metabolism',
    'Eicosapentaenoic acid metabolism',
    'Docosahexaenoic acid metabolism',
    'Ether lipid metabolism',
    'Glycerophospholipid metabolism',
    'Sphingolipid metabolism',
    "Lipid transport",
    "Nucleotide transport",
    "Other transport",
]
df = pd.concat(
    [
        df[df["subsystem"] == subsystem].sort_values(by="pvalue")
        for subsystem in ordered_subsystems
    ],
    axis=0,
)
significant_dataframes[key] = df.copy()
set(df["subsystem"]).difference(ordered_subsystems)

## Export results

In [None]:
ftype = "tsv"
for key, df_main in significant_dataframes.items(): 
    df_meta_row = df_main.loc[:, metadata_columns].copy()
    df_data = df_main.loc[:, ~df_main.columns.isin(df_meta.columns)].copy()
    df_meta_col = df_metadata.copy()
    if use_group_means:
        df_meta_col = df_meta_col.groupby(compare_group, as_index=False).mean()
        df_meta_col.index = [f"{genotype}_{x}" for x in df_meta_col["ATP11C_V972M"].values]
    
    df_meta_col = df_meta_col.loc[list(df_data.columns)].copy()
    for df_type, df in zip(["data", "meta_row", "meta_col"], [df_data, df_meta_row, df_meta_col]):
        filename = "_".join(
            ["MannWhiteney" if len(key) == 2 else "Kruskal"]
            + [g.split("_")[-1] for g in key]
            + [id_key]
            + [df_type]
        )
        if use_group_means:
            filename += "_mean"
        group_results_dirpath_dict[grouped_data_key].mkdir(exist_ok=True)
        filename = group_results_dirpath_dict[grouped_data_key] / filename
        df.to_csv(
            f"{filename}.{ftype}",
            sep="\t" if ftype == "tsv" else ',',
            index=True
        )
