# Prepare Proteomic Data - Intensities, REDS RBC Omics

1. Nemkov T, Stephenson D, Earley EJ, Keele GR, Hay A, Key A, Haiman ZB, Erickson C, Dzieciatkowska M, Reisz JA, Moore A, Stone M, Deng X, Kleinman S, Spitalnik SL, Hod EA, Hudson KE, Hansen KC, Palsson BO, Churchill GA, Roubinian N, Norris PJ, Busch MP, Zimring JC, Page GP, D'Alessandro A. Biological and genetic determinants of glycolysis: Phosphofructokinase isoforms boost energy status of stored red blood cells and transfusion outcomes. Cell Metab. 2024 Sep 3;36(9):1979-1997.e13. doi: 10.1016/j.cmet.2024.06.007. Epub 2024 Jul 3. PMID: 38964323; PMCID: PMC11374506.

2. D'Alessandro A, Culp-Hill R, Reisz JA, Anderson M, Fu X, Nemkov T, Gehrke S, Zheng C, Kanias T, Guo Y, Page G, Gladwin MT, Kleinman S, Lanteri M, Stone M, Busch M, Zimring JC; Recipient Epidemiology and Donor Evaluation Study-III (REDS-III). Heterogeneity of blood processing and storage additives in different centers impacts stored red blood cell metabolism as much as storage time: lessons from REDS-III-Omics. Transfusion. 2019 Jan;59(1):89-100. doi: 10.1111/trf.14979. Epub 2018 Oct 24. PMID: 30353560; PMCID: PMC6322946.

3. Josephson CD, Glynn S, Mathew S, Birch R, Bakkour S, Baumann Kreuziger L, Busch MP, Chapman K, Dinardo C, Hendrickson J, Hod EA, Kelly S, Luban N, Mast A, Norris P, Custer B, Sabino E, Sachais B, Spencer BR, Stone M, Kleinman S; National Heart, Lung, and Blood Institute (NHLBI) Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P). The Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P): A research program striving to improve blood donor safety and optimize transfusion outcomes across the lifespan. Transfusion. 2022 May;62(5):982-999. doi: 10.1111/trf.16869. Epub 2022 Apr 19. PMID: 35441384; PMCID: PMC9353062.

## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER, ensure_iterable

# Show versions of notebook
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.2

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.2
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                              3.4.2
notebook                              7.4.2
openpyxl                              3.1.5
pandas                                2.2.3
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.3
scikit-learn                          1.6.1
scipy                                1.15.3
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
organism = "Human"
dataset_name = "REDS_RBCOmics"
raw_data_dirpath = get_dirpath(use_temp="raw") / organism / dataset_name

# Ensure directory exists
processed_data_dirpath = get_dirpath(use_temp="processed") / organism / dataset_name
processed_data_dirpath.mkdir(exist_ok=True, parents=True)

## Set data value type and variables for columns keys 

In [3]:
protein_values_dtype = "Intensities"
sample_key = "SAMPLE ID"
donor_key = "PUBLIC DONOR ID"
time_key = "DAY"

time_abbrev = "D"

## Load RBC Proteomics
### Load protein data

In [4]:
df_protein_data = pd.read_csv(
    raw_data_dirpath / f"{dataset_name}_ProteinData.tsv",
    sep="\t",
    index_col=None,
)
# Check to see if expected columns are included. If so, then order columns as listed.
# Comes directly from UniProt if possible
df_protein_data = df_protein_data.loc[
    :,
    [
        "Entry",
        "Entry Name",
        "Protein",
        "Protein names",
        "Gene Names (primary)",
        "Length",
        "Mass",  # Should be in DA
    ],
]
# Sort the data via alphabetical order of protein IDs for consistency
df_protein_data = df_protein_data.set_index("Entry").sort_index()
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein names,Gene Names (primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A075B6I0,LV861_HUMAN,LV861,Immunoglobulin lambda variable 8-61,IGLV8-61,122,12814
A0A075B6I9,LV746_HUMAN,LV746,Immunoglobulin lambda variable 7-46,IGLV7-46,117,12468
A0A075B6J9,LV218_HUMAN,LV218,Immunoglobulin lambda variable 2-18,IGLV2-18,118,12412
A0A075B6K4,LV310_HUMAN,LV310,Immunoglobulin lambda variable 3-10,IGLV3-10,115,12441
A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332


#### Load proteomics and map to UniProt if necessary

In [5]:
df_proteomics = pd.read_csv(
    raw_data_dirpath / f"{dataset_name}_Protein{protein_values_dtype}.tsv",
    sep="\t",
    index_col=None,
)
original_ids_type = "Protein"

# Create sample IDs from donor and time points, then set as index
df_proteomics.index = pd.Index(
    df_proteomics[[donor_key, time_key]]
    .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
    .values,
    name=sample_key,
)

# Transform Protein IDs to UniProt IDs
if original_ids_type != "uniprot" and any(
    df_proteomics.columns.isin(df_protein_data[original_ids_type])
):
    mapping_dict = df_protein_data.reset_index(drop=False)
    mapping_dict = mapping_dict.set_index(original_ids_type)[df_protein_data.index.name]
    mapping_dict = mapping_dict.to_dict()
    df_proteomics = df_proteomics.rename(mapping_dict, axis=1)

# Sort for consistency
df_proteomics = df_proteomics.sort_index(axis=0)[
    [donor_key, time_key] + list(df_protein_data.index)
]
donor_ids = df_proteomics[donor_key].unique()
timepoints = df_proteomics[time_key].unique()
print(f"Number of donors: {len(donor_ids)}")
print(f"Number of timepoints: {len(timepoints)}")
print(f"Number of expected samples: {len(donor_ids) * len(timepoints)}")
print(f"Number of actual samples: {len(df_proteomics)}")
df_proteomics

Number of donors: 610
Number of timepoints: 3
Number of expected samples: 1830
Number of actual samples: 1830


Unnamed: 0_level_0,PUBLIC DONOR ID,DAY,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,S001,10,114.106690,61.188179,16.998806,1.0,240.439072,0.000000,8.726534,0.000000,...,4.806035,0.000000,44.643505,130.273163,0.000000,30.318407,22.992273,0.000000,38.119408,202.467422
S001_D23,S001,23,90.694664,18.855646,0.000000,0.0,234.999283,0.000000,30.381741,148.992706,...,3.503707,0.000000,28.674469,0.000000,0.000000,33.782383,159.744751,21.729137,38.118774,210.148560
S001_D42,S001,42,123.064743,49.431454,61.730526,0.0,221.155167,0.000000,25.704966,0.000000,...,7.449622,1.478644,26.478333,139.307129,7.024934,45.400738,19.826700,22.173162,28.154861,191.642059
S002_D10,S002,10,0.000000,0.000000,0.000000,0.0,267.438843,12.034687,24.897249,0.000000,...,12.975469,0.000000,53.475803,91.111374,0.000000,50.815300,0.000000,0.000000,113.735794,264.970459
S002_D23,S002,23,64.492546,0.000000,490.452606,0.0,518.987244,0.000000,0.000000,0.000000,...,10.578275,48.285789,48.756619,13.925104,0.000000,77.841980,0.000000,0.000000,129.577789,93.992775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S609_D23,S609,23,0.000000,0.000000,0.000000,0.0,264.445312,0.000000,10.985426,0.000000,...,15.497660,0.000000,19.545952,264.495727,0.000000,38.225861,22.002222,31.119677,83.591034,125.030083
S609_D42,S609,42,0.000000,0.000000,0.000000,0.0,141.632477,0.000000,4.872647,0.000000,...,12.780012,0.000000,16.991152,112.677589,0.000000,37.955391,53.309387,3830.305176,61.713692,171.356857
S610_D10,S610,10,61.607353,0.000000,18.211782,0.0,169.156128,17.369522,12.156874,46.037743,...,16.118538,0.000000,0.000000,45.524883,0.000000,45.985256,36.088440,33.556423,36.672478,101.863182
S610_D23,S610,23,52.985199,0.000000,0.000000,0.0,87.499588,0.000000,12.358866,83.549934,...,9.255105,0.000000,37.994469,140.407440,0.000000,30.095741,13.189644,21.580671,40.094452,206.146057


### Load metadata corresponding to samples (optional)
#### Genotype data

In [6]:
try:
    df_genotypes = pd.read_csv(
        raw_data_dirpath / f"{dataset_name}_Genotypes.tsv",
        sep="\t",
        index_col=[donor_key],
    )
except FileNotFoundError:
    df_genotypes = pd.DataFrame([])
df_genotypes

Unnamed: 0_level_0,G6PD_V68M
PUBLIC DONOR ID,Unnamed: 1_level_1
S001,0.0
S002,0.0
S003,0.0
S004,0.0
S005,0.0
...,...
S647,0.0
S648,0.0
S649,0.0
S650,0.0


#### Phenotype data

In [7]:
try:
    df_phenotypes = pd.read_csv(
        raw_data_dirpath / f"{dataset_name}_Phenotypes.tsv",
        sep="\t",
        index_col=[donor_key],
    )
except FileNotFoundError:
    df_phenotypes = pd.DataFrame([])
df_phenotypes

Unnamed: 0_level_0,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,Hemolysis.hct,...,Adjusted.Osmotic.Hemolysis,Adjusted.Oxidative.Hemolysis,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin
PUBLIC DONOR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,65.0,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
S002,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,67.0,...,72.000942,,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0
S003,AS1,F,O+,HIGH,51,23.128284,139,65,11.5,64.0,...,25.647109,,5.67,4.38,14.2,41.7,95.2,12.1,293.0,17.0
S004,AS1,F,A+,CAUCASIAN,48,23.294675,140,65,12.5,62.0,...,12.702554,,6.10,4.32,13.1,40.9,94.7,12.3,376.0,21.0
S005,AS1,M,A+,OTHER,52,36.042480,210,64,11.0,63.5,...,55.111380,,7.52,5.81,15.9,47.8,82.3,14.4,414.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S647,AS1,F,O+,HIGH,80,30.996473,175,63,11.0,64.0,...,16.894124,32.578442,7.02,4.58,13.8,43.9,95.9,14.9,253.0,13.0
S648,AS1,F,A+,HIGH,62,24.293018,146,65,10.0,64.0,...,37.482130,18.510241,4.61,4.44,13.3,41.0,92.3,13.3,303.0,9.0
S649,AS1,F,A-,CAUCASIAN,66,22.140338,125,63,11.5,61.0,...,25.770390,12.200239,5.63,4.68,14.3,41.9,89.5,13.3,220.0,51.0
S650,AS1,M,O+,HISPANIC,31,30.128571,210,70,10.0,55.0,...,17.757095,57.301236,4.93,4.92,14.3,42.6,86.6,13.2,269.0,118.0


##### Cut phenotype data into ranges if desired

In [8]:
cuts = {
    "BMI": [0, 25, 30, 40, 60],
    "Age": [0, 20, 40, 60, 80, 100],
}
if not df_phenotypes.empty:
    for col, bins in cuts.items():
        labels = []
        for idx in range(1, len(bins)):
            if idx == 1 or idx == len(bins) - 1:
                labels += [f"lt{bins[idx]:d}"]
            elif idx:
                labels += [f"gt{bins[idx - 1]:d}"]
            else:
                labels += [f"{bins[idx - 1]:d}to{bins[idx]:d}"]
        df_phenotypes[f"{col}_Range"] = pd.cut(
            df_phenotypes[col], bins=bins, labels=labels, right=False
        )
df_phenotypes

Unnamed: 0_level_0,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,Hemolysis.hct,...,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin,BMI_Range,Age_Range
PUBLIC DONOR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,65.0,...,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0,gt25,gt20
S002,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,67.0,...,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0,gt25,gt60
S003,AS1,F,O+,HIGH,51,23.128284,139,65,11.5,64.0,...,5.67,4.38,14.2,41.7,95.2,12.1,293.0,17.0,lt25,gt40
S004,AS1,F,A+,CAUCASIAN,48,23.294675,140,65,12.5,62.0,...,6.10,4.32,13.1,40.9,94.7,12.3,376.0,21.0,lt25,gt40
S005,AS1,M,A+,OTHER,52,36.042480,210,64,11.0,63.5,...,7.52,5.81,15.9,47.8,82.3,14.4,414.0,93.0,gt30,gt40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S647,AS1,F,O+,HIGH,80,30.996473,175,63,11.0,64.0,...,7.02,4.58,13.8,43.9,95.9,14.9,253.0,13.0,gt30,lt100
S648,AS1,F,A+,HIGH,62,24.293018,146,65,10.0,64.0,...,4.61,4.44,13.3,41.0,92.3,13.3,303.0,9.0,lt25,gt60
S649,AS1,F,A-,CAUCASIAN,66,22.140338,125,63,11.5,61.0,...,5.63,4.68,14.3,41.9,89.5,13.3,220.0,51.0,lt25,gt60
S650,AS1,M,O+,HISPANIC,31,30.128571,210,70,10.0,55.0,...,4.93,4.92,14.3,42.6,86.6,13.2,269.0,118.0,gt30,gt20


#### Combine into one DataFrame for MetaData

In [9]:
print(f"Proteomics: {df_proteomics[donor_key].nunique()} donors")
print(
    f"  Genomics: {df_genotypes.index.nunique() if not df_genotypes.empty else 0} donors"
)
print(
    f"Phenotypes: {df_phenotypes.index.nunique() if not df_phenotypes.empty else 0} donors"
)


df_metadata = pd.concat((df_genotypes, df_phenotypes), axis=1).convert_dtypes()

if not df_metadata.empty:
    df_metadata = df_metadata.reset_index(drop=False)
    # Ensure only metadata corresponds to the available omics data
    if not df_metadata[donor_key].isin(df_proteomics[donor_key]).all():
        df_metadata = df_metadata[df_metadata[donor_key].isin(df_proteomics[donor_key])]

    # If time was not included in metadata, add as a part of index to ensure index matches samples
    if time_key and time_key not in df_metadata.index:
        df_metadata = (
            pd.concat(
                (
                    df_metadata,
                    pd.Series(
                        [list(df_proteomics[time_key].unique())]
                        * len(df_metadata.index),
                        index=df_metadata.index,
                        name=time_key,
                    ),
                ),
                axis=1,
            )
            .explode(time_key)
            .reset_index(drop=True)
        )
    # Create sample IDs from donor and time points, then set as index
    df_metadata.index = pd.Index(
        df_metadata[[donor_key, time_key]]
        .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
        .values,
        name=sample_key,
    )
    print(f"\nFinal data: {df_metadata[donor_key].nunique()} donors")
    df_metadata = df_metadata.drop([donor_key, time_key], axis=1)
else:
    print(f"\nFinal Meta: 0 donors")

df_metadata.head()

Proteomics: 610 donors
  Genomics: 651 donors
Phenotypes: 651 donors

Final data: 610 donors


Unnamed: 0_level_0,G6PD_V68M,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,...,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin,BMI_Range,Age_Range
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,...,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0,gt25,gt20
S001_D23,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,...,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0,gt25,gt20
S001_D42,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,...,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0,gt25,gt20
S002_D10,0,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,...,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0,gt25,gt60
S002_D23,0,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,...,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0,gt25,gt60


### Get MCH per sample

In [10]:
# Provide in picograms. Set as None to use metadata if provided
mch_sample_value = None
if mch_sample_value is None:
    try:
        df_MCH_per_sample = pd.read_csv(
            raw_data_dirpath / f"{dataset_name}_Phenotypes.tsv",
            sep="\t",
            index_col=None,
        )

    except FileNotFoundError:
        raise ValueError(
            "Cannot determine MCH. No phenotype data provided and a default value is not provided"
        )

    # Ensure only metadata corresponds to the available omics data
    if not df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key]).all():
        df_MCH_per_sample = df_MCH_per_sample[
            df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key])
        ]

    if "CBC.MCH" not in df_MCH_per_sample.columns:
        if all([x in df_MCH_per_sample.columns for x in ["CBC.HGB", "CBC.RBC"]]):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.HGB"] / df_MCH_per_sample["CBC.RBC"]
            ) * 10
        elif all([x in df_MCH_per_sample.columns for x in ["CBC.MCHC", "CBC.MCV"]]):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.MCHC"] * df_MCH_per_sample["CBC.MCV"]
            ) / 100
        else:
            raise ValueError(
                "Cannot determine MCH, one of the following combinations is needed: (CBC.HGB and CBC.RBC) or (CBC.MCHC and CBC.MCV)"
            )
    df_MCH_per_sample = df_MCH_per_sample.set_index(donor_key)["CBC.MCH"]
    n_missing = len(df_MCH_per_sample[df_MCH_per_sample.isna()])
    print(f"Missing values for {n_missing} samples.")
    print(f"Mean MCH in pg: {df_MCH_per_sample.mean():.2f}")
    df_MCH_per_sample = df_MCH_per_sample.fillna(df_MCH_per_sample.mean())
else:
    print("Using default MCH value provided for all samples")
    df_MCH_per_sample = pd.Series(
        [mch_sample_value] * df_proteomics[donor_key].nunique(),
        index=pd.Index(df_proteomics[donor_key].unique(), name=donor_key),
        name="CBC.MCH",
    )
    print(f"Mean MCH in pg: {mch_sample_value:.2f}")

df_MCH_per_sample = df_MCH_per_sample.reset_index(drop=False)
# If time was not included in metadata, add as a part of index to ensure index matches samples
if time_key and time_key not in df_MCH_per_sample.index:
    df_MCH_per_sample = (
        pd.concat(
            (
                df_MCH_per_sample,
                pd.Series(
                    [list(df_proteomics[time_key].unique())]
                    * len(df_MCH_per_sample.index),
                    index=df_MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=True)
    )
# Create sample IDs from donor and time points, then set as index
df_MCH_per_sample.index = pd.Index(
    df_MCH_per_sample[[donor_key, time_key]]
    .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
    .values,
    name=sample_key,
)
df_MCH_per_sample = df_MCH_per_sample.loc[df_proteomics.index]
df_MCH_per_sample.head()

Missing values for 12 samples.
Mean MCH in pg: 29.51


Unnamed: 0_level_0,PUBLIC DONOR ID,CBC.MCH,DAY
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001_D10,S001,29.080675,10
S001_D23,S001,29.080675,23
S001_D42,S001,29.080675,42
S002_D10,S002,30.669145,10
S002_D23,S002,30.669145,23


### Get data subsets using operations

In [11]:
operations = [
    "mean",
    "median",
]
operation_dfs_proteomics = []
operation_dfs_MCH = []
fill_keys = set()


def group_data(df, operation, keys, columns, prefix_values=None, name_col=None):
    keys = ensure_iterable(keys)
    if not prefix_values:
        prefix_values = [""] * len(keys)
    if isinstance(prefix_values, dict):
        prefix_values = {k: prefix_values.get(k, "") for k in keys}
    else:
        prefix_values = dict(zip(keys, prefix_values))

    df = df.groupby(keys, as_index=False, observed=False)[columns]
    df = getattr(df, operation.lower())()
    labels = df[keys].apply(
        lambda x: "_".join([f"{prefix_values[key]}{x[key]}" for key in keys]),
        axis=1,
    )
    df[name_col] = [f"{operation.capitalize()}_{value}" for value in labels]
    return df

#### Group by donor only

In [12]:
keys = [donor_key]
prefix_values = {}

operation_dfs_proteomics += [
    group_data(
        df_proteomics.reset_index(drop=False),
        operation,
        keys=keys,
        columns=list(df_protein_data.index),
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

operation_dfs_MCH += [
    group_data(
        df_MCH_per_sample.reset_index(drop=False),
        operation,
        keys=keys,
        columns=["CBC.MCH"],
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

#### Group by time only

In [13]:
keys = [time_key]
prefix_values = {time_key: time_abbrev}

operation_dfs_proteomics += [
    group_data(
        df_proteomics.reset_index(drop=False),
        operation,
        keys=keys,
        columns=list(df_protein_data.index),
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

operation_dfs_MCH += [
    group_data(
        df_MCH_per_sample.reset_index(drop=False),
        operation,
        keys=keys,
        columns=["CBC.MCH"],
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

#### Group by metadata only

In [14]:
keys_prefixes = {
    "G6PD_V68M": "G6PD_V68M",
    # "Gender": "Sex",
    # "BMI_Range": "BMI",
    # "Age_Range": "Age",
}
keys_prefixes = {k: f"{v}_" for k, v in keys_prefixes.items()}
for key, prefix in keys_prefixes.items():

    operation_dfs_proteomics += [
        group_data(
            pd.merge(
                df_proteomics,
                df_metadata,
                left_index=True,
                right_index=True,
                how="left",
            ).reset_index(drop=False),
            operation,
            keys=[key],
            columns=list(df_protein_data.index),
            prefix_values={key: prefix},
            name_col=sample_key,
        )
        for operation in operations
    ]

    operation_dfs_MCH += [
        group_data(
            pd.merge(
                df_MCH_per_sample,
                df_metadata,
                left_index=True,
                right_index=True,
                how="left",
            ).reset_index(drop=False),
            operation,
            keys=[key],
            columns=["CBC.MCH"],
            prefix_values={key: prefix},
            name_col=sample_key,
        )
        for operation in operations
    ]

### Add to DataFrames

In [15]:
try:
    df_proteomics_op = pd.concat(operation_dfs_proteomics, axis=0).drop_duplicates()
except (KeyError, ValueError):
    df_proteomics_final = df_proteomics[df_protein_data.index].copy()
else:
    df_proteomics_final = pd.concat(
        (df_proteomics.reset_index(drop=False), df_proteomics_op), axis=0
    )
    df_proteomics_final = df_proteomics_final.set_index(sample_key)[
        df_protein_data.index
    ]

try:
    df_MCH_op = pd.concat(operation_dfs_MCH, axis=0).drop_duplicates()
except (KeyError, ValueError):
    df_MCH_final = df_MCH_per_sample["CBC.MCH"].copy()
else:
    df_MCH_final = pd.concat(
        (df_MCH_per_sample.reset_index(drop=False), df_MCH_op), axis=0
    )
    df_MCH_final = df_MCH_final.set_index(sample_key)["CBC.MCH"]

df_MCH_final.name = "MCH"
df_MCH_final
df_MCH_per_sample

Unnamed: 0_level_0,PUBLIC DONOR ID,CBC.MCH,DAY
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001_D10,S001,29.080675,10
S001_D23,S001,29.080675,23
S001_D42,S001,29.080675,42
S002_D10,S002,30.669145,10
S002_D23,S002,30.669145,23
...,...,...,...
S609_D23,S609,28.942116,23
S609_D42,S609,28.942116,42
S610_D10,S610,29.025424,10
S610_D23,S610,29.025424,23


### Transform data to copy numbers and expected format

In [16]:
# Convert Da to kDa
df_uniprot_to_mw = df_protein_data["Mass"] / 1000

df_concentrations = df_proteomics_final.copy()
# Sum intensities and convert to pmol / mgDW sample, then to nmol / gDW sample
df_concentrations = (
    df_concentrations.apply(lambda x: x / x.sum(), axis=1) / df_uniprot_to_mw
) * 1e6

# Conversion to copy numbers
df_copy_numbers = (df_concentrations * 1e-9).mul(
    df_MCH_final * 1e-12, axis=0
) * AVOGADRO_NUMBER
df_copy_numbers

Unnamed: 0_level_0,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,A0A087WSY6,A0A0A0MRZ8,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,114518.643899,63113.250142,17612.721227,1033.700010,250738.681069,0.000000,8828.312669,0.000000,323739.751835,61753.957120,...,1392.454161,0.000000,19954.371068,74758.901488,0.000000,7907.492758,4904.162257,0.000000,8864.838390,4551.934758
S001_D23,83733.413360,17891.485645,0.000000,0.000000,225441.984974,0.000000,28274.865365,138922.828686,323562.039664,63871.782876,...,933.842764,0.000000,11790.361238,0.000000,0.000000,8105.404114,31344.509782,5002.350276,8154.842476,4346.295603
S001_D42,124576.860801,51427.456139,64512.914721,0.000000,232622.665201,0.000000,26229.595680,0.000000,307083.059312,85379.086435,...,2177.042354,161.034294,11937.380407,80634.321987,1094.445937,11943.564971,4265.520169,5596.879103,6604.146775,4345.806113
S002_D10,0.000000,0.000000,0.000000,0.000000,246593.949891,10651.008268,22270.446851,0.000000,0.000000,11125.538105,...,3323.982341,0.000000,21133.854588,46229.818435,0.000000,11718.406394,0.000000,0.000000,23386.405820,5267.201961
S002_D23,52980.619738,0.000000,415956.112710,0.000000,443011.906351,0.000000,0.000000,0.000000,0.000000,0.000000,...,2508.714731,4267.543219,17838.394210,6541.070445,0.000000,16618.381242,0.000000,0.000000,24665.939736,1729.727805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mean_G6PD_V68M_1,105391.274570,30961.241620,42421.858769,1347.856658,187642.117500,3266.311969,21942.828686,43225.833913,74004.822036,56964.433640,...,1610.294893,151.853976,15048.630839,23448.243627,332.320707,10365.975350,3687.654083,71995.840098,10155.722769,2387.570017
Mean_G6PD_V68M_2,139180.644210,7950.349642,85820.061479,0.000000,639072.385934,10701.784802,43976.044731,41722.885395,88747.465745,116737.419453,...,2025.980309,253.815695,14385.658987,37387.074842,107.794642,11594.126106,2486.340090,96799.575979,11731.980883,3071.605902
Median_G6PD_V68M_0,85702.941435,0.000000,28496.588135,0.000000,306724.661456,970.371175,19672.126112,0.000000,0.000000,70081.727009,...,2116.268667,0.000000,15758.429842,18894.912616,0.000000,11043.627985,3038.755031,0.000000,11623.333803,2973.194234
Median_G6PD_V68M_1,67805.966392,0.000000,18151.603094,0.000000,147396.213243,4009.305687,20890.583383,0.000000,0.000000,57177.180828,...,1495.539858,0.000000,14377.500738,11142.815372,0.000000,10888.814440,3659.549756,3681.340190,10281.253240,2393.835423


### Export absolute quantitative data and metadata per sample

In [17]:
dataframes_dict = {
    "ProteinData": df_protein_data,
    "ProteinIntensities": df_proteomics_final,
    "ProteinConcentrations": df_concentrations,
    "ProteinCopyNumbers": df_copy_numbers,
    "MCH": df_MCH_final,
    "Metadata": df_metadata,
}
for data_type, df in dataframes_dict.items():
    df.to_csv(
        processed_data_dirpath / f"{dataset_name}_{data_type}.tsv", sep="\t", index=True
    )
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinIntensities
Saved data for ProteinConcentrations
Saved data for ProteinCopyNumbers
Saved data for MCH
Saved data for Metadata
