# Prepare Proteomic Data - Intensities, REDS Recall

1. Nemkov T, Stephenson D, Earley EJ, Keele GR, Hay A, Key A, Haiman ZB, Erickson C, Dzieciatkowska M, Reisz JA, Moore A, Stone M, Deng X, Kleinman S, Spitalnik SL, Hod EA, Hudson KE, Hansen KC, Palsson BO, Churchill GA, Roubinian N, Norris PJ, Busch MP, Zimring JC, Page GP, D'Alessandro A. Biological and genetic determinants of glycolysis: Phosphofructokinase isoforms boost energy status of stored red blood cells and transfusion outcomes. Cell Metab. 2024 Sep 3;36(9):1979-1997.e13. doi: 10.1016/j.cmet.2024.06.007. Epub 2024 Jul 3. PMID: 38964323; PMCID: PMC11374506.

2. D'Alessandro A, Culp-Hill R, Reisz JA, Anderson M, Fu X, Nemkov T, Gehrke S, Zheng C, Kanias T, Guo Y, Page G, Gladwin MT, Kleinman S, Lanteri M, Stone M, Busch M, Zimring JC; Recipient Epidemiology and Donor Evaluation Study-III (REDS-III). Heterogeneity of blood processing and storage additives in different centers impacts stored red blood cell metabolism as much as storage time: lessons from REDS-III-Omics. Transfusion. 2019 Jan;59(1):89-100. doi: 10.1111/trf.14979. Epub 2018 Oct 24. PMID: 30353560; PMCID: PMC6322946.

3. Josephson CD, Glynn S, Mathew S, Birch R, Bakkour S, Baumann Kreuziger L, Busch MP, Chapman K, Dinardo C, Hendrickson J, Hod EA, Kelly S, Luban N, Mast A, Norris P, Custer B, Sabino E, Sachais B, Spencer BR, Stone M, Kleinman S; National Heart, Lung, and Blood Institute (NHLBI) Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P). The Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P): A research program striving to improve blood donor safety and optimize transfusion outcomes across the lifespan. Transfusion. 2022 May;62(5):982-999. doi: 10.1111/trf.16869. Epub 2022 Apr 19. PMID: 35441384; PMCID: PMC9353062.

## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER, ensure_iterable

# Print the installed versions of rbc-gem-utils, its dependencies, and the build
# tools so the saved notebook records the environment it was run in.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
# Organism and dataset name select the sub-directory used for both inputs and outputs
organism = "Human"
dataset_name = "REDSRecall"
# Directory the raw input CSV files are read from (base location resolved by `get_dirpath`)
raw_data_dirpath = get_dirpath(use_temp="raw") / organism / dataset_name

# Directory the processed outputs are written to; create it (and any parents) if missing
processed_data_dirpath = get_dirpath(use_temp="processed") / organism / dataset_name
processed_data_dirpath.mkdir(exist_ok=True, parents=True)

## Set data value type and variables for column keys

In [3]:
# Value type of the measurements; selects the input file "Protein<dtype>.csv"
protein_values_dtype = "Intensities"
# Name given to the index of per-sample tables (built as "<donor>_<time_abbrev><time>")
sample_key = "SAMPLE ID"
# Column holding the donor identifier
donor_key = "PUBLIC RECALL DONOR ID"
# Column holding the time point of each sample
time_key = "DAY"

# Prefix placed before the time value when building sample IDs (e.g. "D10")
time_abbrev = "D"

## Load RBC Proteomics
### Load protein data

In [4]:
# Annotation columns expected in ProteinData.csv, listed in the order they should appear.
# These come directly from UniProt if possible; "Mass" should be in Da.
protein_data_columns = [
    "Entry",
    "Entry Name",
    "Protein",
    "Protein Names",
    "Gene Names (Primary)",
    "Length",
    "Mass",
]
protein_data_raw = pd.read_csv(
    raw_data_dirpath / "ProteinData.csv", index_col=None
).convert_dtypes()
# Selecting by label orders the columns as listed and raises a KeyError if one is missing
df_protein_data = (
    protein_data_raw.loc[:, protein_data_columns]
    # Index by protein ID and sort alphabetically for consistency
    .set_index("Entry")
    .sort_index()
)
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein Names,Gene Names (Primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A075B6I0,LV861_HUMAN,LV861,Immunoglobulin lambda variable 8-61,IGLV8-61,122,12814
A0A075B6I9,LV746_HUMAN,LV746,Immunoglobulin lambda variable 7-46,IGLV7-46,117,12468
A0A075B6J9,LV218_HUMAN,LV218,Immunoglobulin lambda variable 2-18,IGLV2-18,118,12412
A0A075B6K4,LV310_HUMAN,LV310,Immunoglobulin lambda variable 3-10,IGLV3-10,115,12441
A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332


#### Load proteomics and map to UniProt if necessary

In [5]:
# Read the measured values; the file name is derived from the value type set earlier
df_proteomics = pd.read_csv(
    raw_data_dirpath / f"Protein{protein_values_dtype}.csv", index_col=None
).convert_dtypes()
# Column of `df_protein_data` holding the identifiers used as column names in the raw file
original_ids_type = "Protein"

# Label every row "<donor>_<time_abbrev><time>" and use those labels as the index
sample_labels = df_proteomics[[donor_key, time_key]].apply(
    lambda row: f"{row[donor_key]}_{time_abbrev}{row[time_key]}", axis=1
)
df_proteomics.index = pd.Index(sample_labels.values, name=sample_key)

# Rename protein columns to UniProt IDs when the file uses another identifier type
if (
    original_ids_type != "uniprot"
    and df_proteomics.columns.isin(df_protein_data[original_ids_type]).any()
):
    mapping_dict = (
        df_protein_data.reset_index(drop=False)
        .set_index(original_ids_type)[df_protein_data.index.name]
        .to_dict()
    )
    df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Sort samples for consistency; keep donor/time columns followed by the proteins
# in the order of the annotation table (a missing protein column raises KeyError)
ordered_columns = [donor_key, time_key, *df_protein_data.index]
df_proteomics = df_proteomics.sort_index(axis=0)[ordered_columns]

donor_ids = df_proteomics[donor_key].unique()
timepoints = df_proteomics[time_key].unique()
# Counts every repeat of a sample ID after its first occurrence
n_duplicated = int(df_proteomics.index.duplicated().sum())
for message in (
    f"Number of donors: {len(donor_ids)}",
    f"Number of timepoints: {len(timepoints)}",
    f"Number of expected samples: {len(donor_ids) * len(timepoints)}",
    f"Number of actual samples: {len(df_proteomics)}",
    f"Number of duplicated IDs: {n_duplicated}",
):
    print(message)
df_proteomics

Number of donors: 651
Number of timepoints: 3
Number of expected samples: 1953
Number of actual samples: 1920
Number of duplicated IDs: 5


Unnamed: 0_level_0,PUBLIC RECALL DONOR ID,DAY,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,S001,10,114.10669,61.188179,16.998806,1.0,240.439072,0.0,8.726534,0.0,...,4.806035,0.0,44.643505,130.273163,0.0,30.318407,22.992273,0.0,38.119408,202.467422
S001_D23,S001,23,90.694664,18.855646,0.0,0.0,234.999283,0.0,30.381741,148.992706,...,3.503707,0.0,28.674469,0.0,0.0,33.782383,159.744751,21.729137,38.118774,210.14856
S001_D42,S001,42,123.064743,49.431454,61.730526,0.0,221.155167,0.0,25.704966,0.0,...,7.449622,1.478644,26.478333,139.307129,7.024934,45.400738,19.8267,22.173162,28.154861,191.642059
S002_D10,S002,10,0.0,0.0,0.0,0.0,267.438843,12.034687,24.897249,0.0,...,12.975469,0.0,53.475803,91.111374,0.0,50.8153,0.0,0.0,113.735794,264.970459
S002_D23,S002,23,64.492546,0.0,490.452606,0.0,518.987244,0.0,0.0,0.0,...,10.578275,48.285789,48.756619,13.925104,0.0,77.84198,0.0,0.0,129.577789,93.992775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S651_D10,S651,10,324.720825,0.0,104.024376,0.0,324.189484,9.646808,6.394954,75.480797,...,3.162988,8.217668,32.108208,194.34462,0.0,24.224831,26.392359,0.0,30.55924,869.670654
S651_D23,S651,23,269.245667,0.0,0.0,0.0,288.051331,9.713434,25.28808,0.0,...,1.0,0.0,0.0,0.0,0.0,38.350945,7.90285,0.0,41.832993,105.314491
S651_D23,S651,23,382.59378,13.661825,27.932926,0.0,229.929688,0.0,24.01726,62.770485,...,1.055674,0.0,6.279982,27.841568,0.0,30.486551,32.314617,44.323257,29.898968,104.690201
S651_D42,S651,42,309.82193,0.0,37.072918,0.0,251.007492,17.016117,20.869726,0.0,...,1.838209,0.0,11.911963,0.0,0.0,34.496513,0.0,0.0,42.288734,104.227791


### Load metadata corresponding to samples (optional)
#### Genotype data

In [6]:
# Genotype data is optional: fall back to an empty frame when the file is absent
genotypes_path = raw_data_dirpath / "Genotypes.csv"
try:
    df_genotypes = pd.read_csv(genotypes_path, index_col=[donor_key]).convert_dtypes()
except FileNotFoundError:
    df_genotypes = pd.DataFrame([])

# Print the value counts of every genotype column and remember the columns with a
# single observed value, since they cannot split the samples into groups.
# NOTE(review): -1 is only replaced by NA after this check, so a column holding
# just one genotype plus -1 entries is kept -- confirm that is intended.
single_valued_columns = []
for col, series in df_genotypes.items():
    counts = series.value_counts().sort_index()
    print(col)
    if len(counts) == 1:
        single_valued_columns.append(col)
    for k, v in counts.items():
        print(f"{k}: {v}")
    print()
df_genotypes = df_genotypes.drop(single_valued_columns, axis=1).replace(-1, pd.NA)
df_genotypes

ATP11C_V972M
0: 625
1: 12
2: 2

G6PD_V68M
0: 621
1: 7
2: 11

G6PD_S188F
0: 639



Unnamed: 0_level_0,ATP11C_V972M,G6PD_V68M
PUBLIC RECALL DONOR ID,Unnamed: 1_level_1,Unnamed: 2_level_1
S001,0,0
S002,0,0
S003,0,0
S004,0,0
S005,0,0
...,...,...
S647,0,0
S648,0,0
S649,0,0
S650,0,0


#### Phenotype data

In [7]:
# Phenotype data is optional: fall back to an empty frame when the file is absent
phenotypes_path = raw_data_dirpath / "Phenotypes.csv"
try:
    df_phenotypes = pd.read_csv(phenotypes_path, index_col=[donor_key]).convert_dtypes()
except FileNotFoundError:
    df_phenotypes = pd.DataFrame([])
df_phenotypes

Unnamed: 0_level_0,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,Hemolysis.hct,...,Adjusted.Osmotic.Hemolysis,Adjusted.Oxidative.Hemolysis,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin
PUBLIC RECALL DONOR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,65.0,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
S002,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,67.0,...,72.000942,,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0
S003,AS1,F,O+,HIGH,51,23.128284,139,65,11.5,64.0,...,25.647109,,5.67,4.38,14.2,41.7,95.2,12.1,293.0,17.0
S004,AS1,F,A+,CAUCASIAN,48,23.294675,140,65,12.5,62.0,...,12.702554,,6.1,4.32,13.1,40.9,94.7,12.3,376.0,21.0
S005,AS1,M,A+,OTHER,52,36.04248,210,64,11.0,63.5,...,55.11138,,7.52,5.81,15.9,47.8,82.3,14.4,414.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S647,AS1,M,O+,HISPANIC,31,30.128571,210,70,10.0,55.0,...,17.757095,57.301236,4.93,4.92,14.3,42.6,86.6,13.2,269.0,118.0
S648,AS1,M,B+,AFRAMRCN,36,47.988868,325,69,14.0,60.0,...,31.811182,52.646317,6.38,5.54,15.6,47.2,85.2,12.7,234.0,120.0
S649,AS3,F,A+,CAUCASIAN,61,23.170166,135,64,13.25,61.29,...,42.473012,,4.3,4.24,13.5,41.0,96.7,13.1,223.0,17.0
S650,AS1,F,A+,HIGH,62,24.293018,146,65,10.0,64.0,...,37.48213,18.510241,4.61,4.44,13.3,41.0,92.3,13.3,303.0,9.0


##### Cut phenotype data into ranges if desired

In [8]:
# NOTE: This cell is disabled (everything below is commented out).
# If enabled, it would add a "<col>_Range" column for every entry of `cuts` by
# binning the phenotype column with `pd.cut(..., right=False)`.
# NOTE(review): the label logic looks off -- `elif idx:` is always true for
# idx >= 1, so the final `else` ("<lo>to<hi>") branch is unreachable; with the BMI
# bins the labels would be lt25, gt25, gt30, lt60. Confirm the intended labels
# before enabling.
# cuts = {
#     "BMI": [0, 25, 30, 40, 60],
#     "Age": [0, 20, 40, 60, 80, 100],
# }
# if not df_phenotypes.empty:
#     for col, bins in cuts.items():
#         labels = []
#         for idx in range(1, len(bins)):
#             if idx == 1 or idx == len(bins) - 1:
#                 labels += [f"lt{bins[idx]:d}"]
#             elif idx:
#                 labels += [f"gt{bins[idx - 1]:d}"]
#             else:
#                 labels += [f"{bins[idx - 1]:d}to{bins[idx]:d}"]
#         df_phenotypes[f"{col}_Range"] = pd.cut(
#             df_phenotypes[col], bins=bins, labels=labels, right=False
#         )
# df_phenotypes

#### Combine into one DataFrame for MetaData

In [9]:
# Report how many donors each data source covers before combining them
print(f"Proteomics: {df_proteomics[donor_key].nunique()} donors")
print(
    f"  Genomics: {df_genotypes.dropna(how='all', axis=0).index.nunique() if not df_genotypes.empty else 0} donors"
)
print(
    f"Phenotypes: {df_phenotypes.dropna(how='all', axis=0).index.nunique() if not df_phenotypes.empty else 0} donors"
)

# Combine only the metadata tables that hold data. Concatenating with an empty
# placeholder frame (unnamed index) can drop the donor index name, after which
# `reset_index` would no longer produce the `donor_key` column used below.
metadata_frames = [
    frame for frame in (df_genotypes, df_phenotypes) if not frame.empty
]
if metadata_frames:
    df_metadata = (
        pd.concat(metadata_frames, axis=1)
        .rename_axis(index=donor_key)
        .convert_dtypes()
    )
else:
    df_metadata = pd.DataFrame([])

if not df_metadata.empty:
    df_metadata = df_metadata.reset_index(drop=False)
    # Ensure only metadata corresponds to the available omics data
    has_omics_data = df_metadata[donor_key].isin(df_proteomics[donor_key])
    if not has_omics_data.all():
        df_metadata = df_metadata[has_omics_data]

    # If time was not included in metadata, repeat every donor row once per time
    # point so the rows line up with the samples. The check is made against the
    # columns: after `reset_index` the row index only holds positions.
    if time_key and time_key not in df_metadata.columns:
        df_metadata = (
            pd.concat(
                (
                    df_metadata,
                    pd.Series(
                        [list(df_proteomics[time_key].unique())]
                        * len(df_metadata.index),
                        index=df_metadata.index,
                        name=time_key,
                    ),
                ),
                axis=1,
            )
            .explode(time_key)
            .reset_index(drop=True)
        )
    # Create sample IDs from donor and time points, then set as index
    df_metadata.index = pd.Index(
        df_metadata[[donor_key, time_key]]
        .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
        .values,
        name=sample_key,
    )
    print(f"\nFinal data: {df_metadata[donor_key].nunique()} donors")
    # Donor and time are now encoded in the index, so drop the helper columns
    df_metadata = df_metadata.drop([donor_key, time_key], axis=1)
else:
    print(f"\nFinal Meta: 0 donors")

df_metadata.head()

Proteomics: 651 donors
  Genomics: 639 donors
Phenotypes: 651 donors

Final data: 651 donors


Unnamed: 0_level_0,ATP11C_V972M,G6PD_V68M,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,...,Adjusted.Osmotic.Hemolysis,Adjusted.Oxidative.Hemolysis,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,0,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
S001_D23,0,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
S001_D42,0,0,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
S002_D10,0,0,AS3,M,A+,HIGH,71,27.891291,200,71,...,72.000942,,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0
S002_D23,0,0,AS3,M,A+,HIGH,71,27.891291,200,71,...,72.000942,,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0


In [10]:
# Count donors per sex and G6PD V68M genotype (one row per donor, not per sample)
df = df_metadata[["Gender", "G6PD_V68M"]].copy()
# Sample IDs are "<donor>_<time_abbrev><time>": strip only the final "_..." part
# so donor IDs that themselves contain an underscore are kept intact
df.index = [x.rsplit("_", 1)[0] for x in df.index]
# Collapse the repeated per-time-point rows of each donor to its unique values
df = (
    df.groupby(level=0).agg(lambda x: list(x.unique())).explode(["Gender", "G6PD_V68M"])
)
df.groupby("Gender").value_counts()

Gender  G6PD_V68M
F       0            291
        1              7
M       0            330
        2             11
Name: count, dtype: int64

### Get MCH per sample

In [11]:
# Provide in picograms. Set as None to use metadata if provided
mch_sample_value = None
if mch_sample_value is None:
    try:
        df_MCH_per_sample = pd.read_csv(
            raw_data_dirpath / "Phenotypes.csv",
            index_col=None,
        )

    except FileNotFoundError:
        raise ValueError(
            "Cannot determine MCH. No phenotype data provided and a default value is not provided"
        )

    # Ensure only metadata corresponds to the available omics data
    if not df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key]).all():
        df_MCH_per_sample = df_MCH_per_sample[
            df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key])
        ]

    if "CBC.MCH" not in df_MCH_per_sample.columns:
        if all([x in df_MCH_per_sample.columns for x in ["CBC.HGB", "CBC.RBC"]]):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.HGB"] / df_MCH_per_sample["CBC.RBC"]
            ) * 10
        elif all([x in df_MCH_per_sample.columns for x in ["CBC.MCHC", "CBC.MCV"]]):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.MCHC"] * df_MCH_per_sample["CBC.MCV"]
            ) / 100
        else:
            raise ValueError(
                "Cannot determine MCH, one of the following combinations is needed: (CBC.HGB and CBC.RBC) or (CBC.MCHC and CBC.MCV)"
            )
    df_MCH_per_sample = df_MCH_per_sample.set_index(donor_key)["CBC.MCH"]
    n_missing = len(df_MCH_per_sample[df_MCH_per_sample.isna()])
    print(f"Missing values for {n_missing} samples.")
    print(f"Mean MCH in pg: {df_MCH_per_sample.mean():.2f}")
    df_MCH_per_sample = df_MCH_per_sample.fillna(df_MCH_per_sample.mean())
else:
    print("Using default MCH value provided for all samples")
    df_MCH_per_sample = pd.Series(
        [mch_sample_value] * df_proteomics[donor_key].nunique(),
        index=pd.Index(df_proteomics[donor_key].unique(), name=donor_key),
        name="CBC.MCH",
    )
    print(f"Mean MCH in pg: {mch_sample_value:.2f}")

df_MCH_per_sample = df_MCH_per_sample.reset_index(drop=False)
# If time was not included in metadata, add as a part of index to ensure index matches samples
if time_key and time_key not in df_MCH_per_sample.index:
    df_MCH_per_sample = (
        pd.concat(
            (
                df_MCH_per_sample,
                pd.Series(
                    [list(df_proteomics[time_key].unique())]
                    * len(df_MCH_per_sample.index),
                    index=df_MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=True)
    )
# Create sample IDs from donor and time points, then set as index
df_MCH_per_sample.index = pd.Index(
    df_MCH_per_sample[[donor_key, time_key]]
    .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
    .values,
    name=sample_key,
)
df_MCH_per_sample.head()

Missing values for 12 samples.
Mean MCH in pg: 29.51


Unnamed: 0_level_0,PUBLIC RECALL DONOR ID,CBC.MCH,DAY
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001_D10,S001,29.080675,10
S001_D23,S001,29.080675,23
S001_D42,S001,29.080675,42
S002_D10,S002,30.669145,10
S002_D23,S002,30.669145,23


### Remove duplicated IDs in samples

In [12]:
# A sample ID that occurs more than once is ambiguous, so every copy of it is
# removed (keep=False) instead of keeping one occurrence
is_duplicated_id = df_proteomics.index.duplicated(keep=False)
df_proteomics = df_proteomics.loc[~is_duplicated_id]
# Keep the MCH rows of exactly the remaining samples, in the same order
df_MCH_per_sample = df_MCH_per_sample.loc[df_proteomics.index]
sample_ids = df_proteomics.index.tolist()
df_proteomics

Unnamed: 0_level_0,PUBLIC RECALL DONOR ID,DAY,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,S001,10,114.10669,61.188179,16.998806,1.0,240.439072,0.0,8.726534,0.0,...,4.806035,0.0,44.643505,130.273163,0.0,30.318407,22.992273,0.0,38.119408,202.467422
S001_D23,S001,23,90.694664,18.855646,0.0,0.0,234.999283,0.0,30.381741,148.992706,...,3.503707,0.0,28.674469,0.0,0.0,33.782383,159.744751,21.729137,38.118774,210.14856
S001_D42,S001,42,123.064743,49.431454,61.730526,0.0,221.155167,0.0,25.704966,0.0,...,7.449622,1.478644,26.478333,139.307129,7.024934,45.400738,19.8267,22.173162,28.154861,191.642059
S002_D10,S002,10,0.0,0.0,0.0,0.0,267.438843,12.034687,24.897249,0.0,...,12.975469,0.0,53.475803,91.111374,0.0,50.8153,0.0,0.0,113.735794,264.970459
S002_D23,S002,23,64.492546,0.0,490.452606,0.0,518.987244,0.0,0.0,0.0,...,10.578275,48.285789,48.756619,13.925104,0.0,77.84198,0.0,0.0,129.577789,93.992775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S648_D42,S648,42,77.322556,0.0,0.0,0.0,519.996765,0.0,22.033518,0.0,...,9.065331,0.0,68.654137,0.0,0.0,43.890713,0.0,0.0,62.910065,98.207466
S649_D10,S649,10,90.417908,0.0,64.904564,0.0,121.021606,0.0,8.485724,0.0,...,4.922802,0.0,18.535042,155.032333,0.0,29.087191,19.722677,0.0,35.296345,138.553513
S649_D42,S649,42,95.081497,0.0,0.0,0.0,86.734108,0.0,4.888623,32.385452,...,5.490839,13.410266,23.362236,132.913025,0.0,34.493198,15.165656,72.291283,39.360321,99.92926
S650_D10,S650,10,0.0,0.0,0.0,0.0,263.719025,8.240076,0.0,0.0,...,4.578097,0.0,0.0,18.804209,0.0,55.625256,34.652401,0.0,89.325157,337.018494


### Get data subsets using operations

In [13]:
# Aggregations applied to each group of samples; `group_data` looks each name up
# (lower-cased) as a method on the pandas GroupBy object
operations = [
    "mean",
    "median",
]
# Accumulators for the aggregated frames produced by the grouping cells below.
# Re-run this cell before re-running those cells, otherwise results are appended twice.
operation_dfs_proteomics = []
operation_dfs_MCH = []
# NOTE(review): not referenced anywhere in the visible notebook -- confirm before removing
fill_keys = set()


def group_data(df, operation, keys, columns, prefix_values=None, name_col=None):
    """Aggregate ``columns`` of ``df`` per group and label every aggregated row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the grouping ``keys`` and the value ``columns``.
    operation : str
        Name of the GroupBy method to apply (looked up lower-cased), e.g. ``"mean"``.
    keys : str or iterable of str
        Column(s) to group by; passed through ``ensure_iterable`` first
        (presumably wraps a single key into an iterable -- confirm).
    columns : list of str
        Columns to aggregate.
    prefix_values : dict or sequence, optional
        Text placed before each key's value in the row label. A dict maps
        key -> prefix (missing keys get ``""``); any other sequence is paired
        with ``keys`` by position. Empty or ``None`` means no prefixes.
    name_col : hashable, optional
        Name of the column that receives the row labels.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the key column(s), the aggregated columns,
        and ``name_col`` with labels of the form
        ``"<Operation>_<prefix><value>[_<prefix><value>...]"``.
    """
    keys = ensure_iterable(keys)
    if isinstance(prefix_values, dict) and prefix_values:
        prefix_by_key = {key: prefix_values.get(key, "") for key in keys}
    else:
        # Nothing given (or given empty) means a blank prefix for every key
        prefix_by_key = dict(zip(keys, prefix_values or [""] * len(keys)))

    grouped = df.groupby(keys, as_index=False, observed=False)[columns]
    aggregated = getattr(grouped, operation.lower())()

    def _row_label(row):
        # Join "<prefix><value>" of every key with underscores
        return "_".join([f"{prefix_by_key[key]}{row[key]}" for key in keys])

    row_labels = aggregated[keys].apply(_row_label, axis=1)
    aggregated[name_col] = [
        f"{operation.capitalize()}_{label}" for label in row_labels
    ]
    return aggregated

#### Group by donor only

In [14]:
# NOTE: Grouping by donor is disabled. Uncomment this whole cell to also aggregate
# each donor's samples with every operation.
# keys = [donor_key]
# prefix_values = {}

# operation_dfs_proteomics += [
#     group_data(
#         df_proteomics.reset_index(drop=False),
#         operation,
#         keys=keys,
#         columns=list(df_protein_data.index),
#         prefix_values=prefix_values,
#         name_col=sample_key,
#     )
#     for operation in operations
# ]

# operation_dfs_MCH += [
#     group_data(
#         df_MCH_per_sample.reset_index(drop=False),
#         operation,
#         keys=keys,
#         columns=["CBC.MCH"],
#         prefix_values=prefix_values,
#         name_col=sample_key,
#     )
#     for operation in operations
# ]

#### Group by time only

In [15]:
keys = [time_key]
prefix_values = {time_key: time_abbrev}

# Aggregate all samples of each time point once per operation; the resulting rows
# are labelled "<Operation>_<time_abbrev><time>" (e.g. "Mean_D10")
proteomics_by_sample = df_proteomics.reset_index(drop=False)
operation_dfs_proteomics.extend(
    [
        group_data(
            proteomics_by_sample,
            operation,
            keys=keys,
            columns=list(df_protein_data.index),
            prefix_values=prefix_values,
            name_col=sample_key,
        )
        for operation in operations
    ]
)

# Same grouping for the MCH values so every aggregated sample has a matching MCH
MCH_by_sample = df_MCH_per_sample.reset_index(drop=False)
operation_dfs_MCH.extend(
    [
        group_data(
            MCH_by_sample,
            operation,
            keys=keys,
            columns=["CBC.MCH"],
            prefix_values=prefix_values,
            name_col=sample_key,
        )
        for operation in operations
    ]
)

#### Group by metadata only

In [16]:
# Every genotype column is a grouping key, labelled with its own column name
keys_prefixes = {col: col for col in df_genotypes.columns}
# Further metadata columns can be grouped on by adding "column": "label" entries
keys_prefixes.update(
    {
        # "Gender": "Sex",
        # "BMI_Range": "BMI",
        # "Age_Range": "Age",
    }
)
keys_prefixes = {k: f"{v}_" for k, v in keys_prefixes.items()}
for key, prefix in keys_prefixes.items():
    # Merge only the grouping column. Merging all of `df_metadata` would suffix
    # any column it shares with the data (e.g. "CBC.MCH" becoming "CBC.MCH_x"),
    # and the column lookups below would then fail.
    key_metadata = df_metadata[[key]]
    # The merged frames do not depend on the operation, so build them once per key
    proteomics_with_key = pd.merge(
        df_proteomics,
        key_metadata,
        left_index=True,
        right_index=True,
        how="left",
    ).reset_index(drop=False)
    MCH_with_key = pd.merge(
        df_MCH_per_sample,
        key_metadata,
        left_index=True,
        right_index=True,
        how="left",
    ).reset_index(drop=False)

    operation_dfs_proteomics += [
        group_data(
            proteomics_with_key,
            operation,
            keys=[key],
            columns=list(df_protein_data.index),
            prefix_values={key: prefix},
            name_col=sample_key,
        )
        for operation in operations
    ]

    operation_dfs_MCH += [
        group_data(
            MCH_with_key,
            operation,
            keys=[key],
            columns=["CBC.MCH"],
            prefix_values={key: prefix},
            name_col=sample_key,
        )
        for operation in operations
    ]

### Add to DataFrames

In [17]:
# Append the aggregated ("Mean_...", "Median_...") rows to the per-sample data.
# The emptiness of the accumulators is tested explicitly, so a genuine error while
# combining the frames is raised instead of silently discarding the aggregates.
if operation_dfs_proteomics:
    # Re-running a grouping cell appends identical frames; drop the repeated rows
    df_proteomics_op = pd.concat(operation_dfs_proteomics, axis=0).drop_duplicates()
    df_proteomics_final = pd.concat(
        (df_proteomics.reset_index(drop=False), df_proteomics_op), axis=0
    )
    df_proteomics_final = df_proteomics_final.set_index(sample_key)[
        df_protein_data.index
    ]
else:
    # No aggregation was requested: keep only the protein columns of the samples
    df_proteomics_final = df_proteomics[df_protein_data.index].copy()

if operation_dfs_MCH:
    df_MCH_op = pd.concat(operation_dfs_MCH, axis=0).drop_duplicates()
    df_MCH_final = pd.concat(
        (df_MCH_per_sample.reset_index(drop=False), df_MCH_op), axis=0
    )
    df_MCH_final = df_MCH_final.set_index(sample_key)["CBC.MCH"]
elif isinstance(df_MCH_per_sample, pd.DataFrame):
    df_MCH_final = df_MCH_per_sample["CBC.MCH"].copy()
else:
    df_MCH_final = df_MCH_per_sample.copy()

df_MCH_final.name = "MCH"
df_MCH_final

SAMPLE ID
S001_D10              29.080675
S001_D23              29.080675
S001_D42              29.080675
S002_D10              30.669145
S002_D23              30.669145
                        ...    
Mean_G6PD_V68M_1      28.884994
Mean_G6PD_V68M_2      30.647021
Median_G6PD_V68M_0    29.655172
Median_G6PD_V68M_1    29.605263
Median_G6PD_V68M_2    30.371901
Name: MCH, Length: 1928, dtype: float64

### Normalize data by hemoglobin mass
#### Set percent for hemoglobin and low abundance proteomes

In [18]:
# Fractions of total protein assigned to hemoglobin (HB_PERCENT) and to the
# low abundance proteome (LA_PERCENT)
HB_PERCENT, LA_PERCENT = (0.95, 0.05)
# Divide every sample (row) by its summed intensity, giving each protein's
# fraction of that sample's total
df_percent_abundance = df_proteomics_final.apply(lambda x: x / x.sum(), axis=1)

#### Scale data

In [19]:
# The hemoglobin and low abundance fractions together cannot exceed the whole proteome
MODELED_PERCENT = HB_PERCENT + LA_PERCENT
assert MODELED_PERCENT <= 1
# UniProt IDs of the hemoglobin subunits, keyed by gene name
HB_PROTEINS = {
    "HBA": "P69905",  # Hemoglobin subunit alpha
    "HBB": "P68871",  # Hemoglobin subunit beta
    "HBD": "P02042",  # Hemoglobin subunit delta
    "HBE1": "P02100",  # Hemoglobin subunit epsilon
    "HBG1": "P69891",  # Hemoglobin subunit gamma-1
    "HBG2": "P69892",  # Hemoglobin subunit gamma-2
    "HBM": "Q6B0K9",  # Hemoglobin subunit mu
    "HBQ1": "P09105",  # Hemoglobin subunit theta-1
    "HBZ": "P02008",  # Hemoglobin subunit zeta
}

# Protein intensity / Total intensity --> Percent protein abundance / total protein
# Split the columns into hemoglobin subunits and everything else (low abundance)
is_hb_column = df_percent_abundance.columns.isin(list(HB_PROTEINS.values()))
df_percent_hb = df_percent_abundance.loc[:, is_hb_column]
df_percent_la = df_percent_abundance.loc[:, ~is_hb_column]


def _mean_sample_total(df_fractions):
    """Return the mean, over the measured samples, of each sample's summed fractions."""
    return df_fractions.loc[sample_ids].sum(axis=1).mean().item()


summary_fractions = {
    "Perfect total": 1.0,
    "Current total": _mean_sample_total(df_percent_abundance),
    "Hemoglobin total": _mean_sample_total(df_percent_hb),
    "Low abundance total": _mean_sample_total(df_percent_la),
}

# Rescaling to the configured fractions is currently disabled, so the "scaled"
# rows of the summary repeat the unscaled totals:
# # Scale hemoglobin and low abundance proteome percentages
# df_percent_hb = HB_PERCENT * df_percent_hb.div(df_percent_hb.sum(axis=1), axis=0)
# df_percent_la = LA_PERCENT * df_percent_la.div(df_percent_la.sum(axis=1), axis=0)

# Combine dataframes back into one (hemoglobin columns first)
df_percent_abundance = pd.concat((df_percent_hb, df_percent_la), axis=1)

summary_fractions["Hemoglobin scaled"] = _mean_sample_total(df_percent_hb)
summary_fractions["Low abundance scaled"] = _mean_sample_total(df_percent_la)
summary_fractions["Remaining scaled"] = 1 - (HB_PERCENT + LA_PERCENT)
# Right-align the row labels to 30 characters and format the values as percentages
df_summary = pd.DataFrame.from_dict(
    {
        label.rjust(30): [f"{fraction * 100:.1f}%"]
        for label, fraction in summary_fractions.items()
    },
    orient="index",
    columns=["Percentage"],
)
print(df_summary)

                     Percentage
       Perfect total     100.0%
       Current total     100.0%
    Hemoglobin total      42.9%
 Low abundance total      57.1%
   Hemoglobin scaled      42.9%
Low abundance scaled      57.1%
    Remaining scaled       0.0%


### Transform data to copy numbers and expected format

In [20]:
# Unit conversion factors
PG_PER_G = 1e12  # picograms per gram
NMOL_PER_MOL = 1e9  # nanomoles per mole

# Molecular weight per protein ID, taken from the "Mass" column (expected in Da)
df_uniprot_to_mw = df_protein_data["Mass"].astype(float)

# MCH (pgDW HB) --> gDW total protein, taking hemoglobin as HB_PERCENT of total protein
# NOTE(review): the rescaling of the abundances to HB_PERCENT is disabled earlier in
# the notebook, so the measured hemoglobin share need not equal HB_PERCENT -- confirm
# this conversion is still intended.
pg_total_per_pg_hb = 1 / HB_PERCENT
g_per_pg = 1 / PG_PER_G
gDW_total_protein = df_MCH_final * pg_total_per_pg_hb * g_per_pg

# Percent protein abundance / total protein --> Specific protein concentration / total protein
# (dividing by the molecular weight gives mol protein / gDW total protein)
df_mol_per_gDW = df_percent_abundance.div(df_uniprot_to_mw, axis=1)

# Convert from mol / gDW protein --> nmol / gDW protein
df_nmol_per_gDW = df_mol_per_gDW * NMOL_PER_MOL

# Convert from mol / gDW protein --> copy numbers / cell
mol_per_cell = df_mol_per_gDW.mul(gDW_total_protein, axis=0)
df_copy_numbers = mol_per_cell * AVOGADRO_NUMBER
df_copy_numbers

Unnamed: 0_level_0,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,A0A087WSY6,A0A0A0MRZ8,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001_D10,120545.940946,66435.00015,18539.706555,1088.105274,263935.453757,0.0,9292.960704,0.0,340778.686142,65004.165389,...,1465.741222,0.0,21004.601124,78693.580513,0.0,8323.676587,5162.27606,0.0,9331.408832,4791.510271
S001_D23,88140.435116,18833.142784,0.0,0.0,237307.352604,0.0,29763.016174,146234.556512,340591.620699,67233.455659,...,982.992383,0.0,12410.906566,0.0,0.0,8532.004331,32994.220823,5265.631869,8584.044712,4575.048004
S001_D42,131133.537685,54134.164357,67908.331286,0.0,244865.96337,0.0,27610.100716,0.0,323245.325592,89872.722563,...,2291.623531,169.509783,12565.663586,84878.233671,1152.048355,12572.173653,4490.02123,5891.451687,6951.733447,4574.532751
S002_D10,0.0,0.0,0.0,0.0,259572.578832,11211.587651,23442.575633,0.0,0.0,11711.092742,...,3498.92878,0.0,22246.162724,48662.966774,0.0,12335.164625,0.0,0.0,24617.269284,5544.423116
S002_D23,55769.073408,0.0,437848.539694,0.0,466328.322475,0.0,0.0,0.0,0.0,0.0,...,2640.752348,4492.150757,18777.257063,6885.337311,0.0,17493.032886,0.0,0.0,25964.147091,1820.766111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mean_G6PD_V68M_1,123227.21871,30522.694529,58462.148905,1315.555853,213515.551472,2731.069781,23129.033081,36142.527071,100112.917787,63718.907274,...,1710.317603,248.171843,15564.927208,25034.343654,349.810478,11044.140482,4803.173662,61150.33852,10998.05842,2557.286304
Mean_G6PD_V68M_2,146505.941274,8368.789097,90336.90682,0.0,672707.774667,11265.036633,46290.573401,43918.826731,93418.384995,122881.494161,...,2132.610852,267.174416,15142.798934,39354.815623,113.468044,12204.343269,2617.200095,101894.290504,12349.453562,3233.26937
Median_G6PD_V68M_0,89829.527908,0.0,29590.687924,0.0,321875.761525,1148.080808,20717.916048,0.0,0.0,73416.473851,...,2228.771629,0.0,16435.595881,19599.497851,0.0,11623.054056,3184.30544,0.0,12244.230813,3142.030841
Median_G6PD_V68M_1,91978.203017,0.0,31638.015081,0.0,244566.724537,0.0,22559.414374,0.0,0.0,71122.790824,...,1565.098398,0.0,14784.344069,26817.069148,0.0,11929.744577,3951.890565,4159.868013,11719.183273,2585.065446


## Export absolute quantitative data and metadata per sample

In [21]:
# Tables to export, keyed by the stem of the output file name
dataframes_dict = {
    "ProteinData": df_protein_data,
    "ProteinIntensities": df_proteomics_final,
    "ProteinConcentrations": df_nmol_per_gDW,
    "ProteinCopyNumbers": df_copy_numbers,
    "MCH": df_MCH_final,
    "Metadata": df_metadata,
}
for data_type, df in dataframes_dict.items():
    # Written as CSV with the index kept; for tab-separated output instead, write
    # to f"{data_type}.tsv" and pass sep="\t"
    output_path = processed_data_dirpath / f"{data_type}.csv"
    df.to_csv(output_path, index=True)
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinIntensities
Saved data for ProteinConcentrations
Saved data for ProteinCopyNumbers
Saved data for MCH
Saved data for Metadata
