# Prepare Proteomic Data - Intensities, REDS Index

1. Nemkov T, Stephenson D, Earley EJ, Keele GR, Hay A, Key A, Haiman ZB, Erickson C, Dzieciatkowska M, Reisz JA, Moore A, Stone M, Deng X, Kleinman S, Spitalnik SL, Hod EA, Hudson KE, Hansen KC, Palsson BO, Churchill GA, Roubinian N, Norris PJ, Busch MP, Zimring JC, Page GP, D'Alessandro A. Biological and genetic determinants of glycolysis: Phosphofructokinase isoforms boost energy status of stored red blood cells and transfusion outcomes. Cell Metab. 2024 Sep 3;36(9):1979-1997.e13. doi: 10.1016/j.cmet.2024.06.007. Epub 2024 Jul 3. PMID: 38964323; PMCID: PMC11374506.

2. D'Alessandro A, Culp-Hill R, Reisz JA, Anderson M, Fu X, Nemkov T, Gehrke S, Zheng C, Kanias T, Guo Y, Page G, Gladwin MT, Kleinman S, Lanteri M, Stone M, Busch M, Zimring JC; Recipient Epidemiology and Donor Evaluation Study-III (REDS-III). Heterogeneity of blood processing and storage additives in different centers impacts stored red blood cell metabolism as much as storage time: lessons from REDS-III-Omics. Transfusion. 2019 Jan;59(1):89-100. doi: 10.1111/trf.14979. Epub 2018 Oct 24. PMID: 30353560; PMCID: PMC6322946.

3. Josephson CD, Glynn S, Mathew S, Birch R, Bakkour S, Baumann Kreuziger L, Busch MP, Chapman K, Dinardo C, Hendrickson J, Hod EA, Kelly S, Luban N, Mast A, Norris P, Custer B, Sabino E, Sachais B, Spencer BR, Stone M, Kleinman S; National Heart, Lung, and Blood Institute (NHLBI) Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P). The Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P): A research program striving to improve blood donor safety and optimize transfusion outcomes across the lifespan. Transfusion. 2022 May;62(5):982-999. doi: 10.1111/trf.16869. Epub 2022 Apr 19. PMID: 35441384; PMCID: PMC9353062.

## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER, ensure_iterable

# Print the versions of rbc-gem-utils, its dependencies, and the build tools
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
organism, dataset_name = "Human", "REDSIndex"

# Directory from which the raw input files for this organism/dataset are read
raw_data_dirpath = get_dirpath(use_temp="raw") / organism / dataset_name

# Directory for the processed outputs; create it (and any missing parents) up front
processed_data_dirpath = get_dirpath(use_temp="processed") / organism / dataset_name
processed_data_dirpath.mkdir(parents=True, exist_ok=True)

## Set data value type and column key variables

In [3]:
# Kind of quantitative values to load; used to build the "Protein<dtype>.csv" file name
protein_values_dtype = "Intensities"
# Name given to the sample index of the tables, and the raw column holding donor IDs
sample_key, donor_key = "SAMPLE ID", "PUBLIC INDEX DONOR ID"

## Load RBC Proteomics
### Load protein data

In [4]:
# Read the protein annotation table and keep the expected columns in the listed order
# (a missing column raises a KeyError). Column names follow the UniProt export where
# possible. The table is then indexed by UniProt entry and sorted by it so that the
# protein ordering is consistent.
df_protein_data = (
    pd.read_csv(raw_data_dirpath / "ProteinData.csv", index_col=None)
    .convert_dtypes()[
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein Names",
            "Gene Names (Primary)",
            "Length",
            "Mass",  # Should be in daltons (Da)
        ]
    ]
    .set_index("Entry")
    .sort_index()
)
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein Names,Gene Names (Primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A075B6H7,KV37_HUMAN,KV37,Probable non-functional immunoglobulin kappa v...,IGKV3-7,116,12783
A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332
A0A075B6P5,KV228_HUMAN,KV228,Immunoglobulin kappa variable 2-28,IGKV2-28,120,12957
A0A075B6R9,KVD24_HUMAN,KVD24,Probable non-functional immunoglobulin kappa v...,IGKV2D-24,120,13079
A0A075B6S2,KVD29_HUMAN,KVD29,Immunoglobulin kappa variable 2D-29,IGKV2D-29,120,13143


#### Load proteomics and map to UniProt if necessary

In [5]:
# Identifier type used for the protein columns of the raw table
original_ids_type = "uniprot"

# Read the quantitative table, index it by donor ID, and label that index as the sample key
df_proteomics = (
    pd.read_csv(raw_data_dirpath / f"Protein{protein_values_dtype}.csv", index_col=None)
    .convert_dtypes()
    .set_index(donor_key)
    .rename_axis(sample_key)
)

# Translate the protein column labels to UniProt entries when another ID type is used
if original_ids_type != "uniprot" and (
    df_proteomics.columns.isin(df_protein_data[original_ids_type]).any()
):
    mapping_dict = (
        df_protein_data.reset_index(drop=False)
        .set_index(original_ids_type)[df_protein_data.index.name]
        .to_dict()
    )
    df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Report the number of distinct donors and of repeated donor IDs
donor_ids = df_proteomics.index.unique()
print(f"Number of donors: {len(donor_ids)}")
print(f"Number of duplicated IDs: {df_proteomics.index.duplicated().sum()}")
df_proteomics

Number of donors: 13062
Number of duplicated IDs: 0


Unnamed: 0_level_0,P68871,P32119,Q5BKX8,P69905,P00915,P02042,P02100,P02768,P11166,P02730,...,P61077,P62837,P61026,Q9Y5X4,A0A087WSY6,P05387,Q9H2M9,P57772,P78318,O43583
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S00001,7073.981445,8771.099609,6448.168457,6324.992188,5412.810547,4644.655762,3903.654541,3072.764648,5684.953125,3394.652588,...,48.514847,48.514847,60.15979,62.942181,68.603394,45.780422,80.064346,81.553123,111.679413,63.537769
S00002,9190.239258,5327.814941,7863.122559,6807.322754,4647.782227,3343.699219,5837.21582,4059.376221,2300.852295,3123.016357,...,127.883858,127.883858,90.135132,120.0952,55.360386,86.750679,63.579247,55.08456,73.463371,100.424057
S00003,8498.479492,4922.197754,126.768845,7736.603516,4992.395508,4374.737305,4242.23584,4505.963379,1433.8302,1862.586914,...,134.436707,134.436707,73.548569,51.715431,132.122818,85.769547,110.676659,53.000729,105.195908,64.226677
S00004,10630.88965,12136.83008,5000.621094,8911.09668,7699.40918,5738.85791,3954.778809,4777.587402,2832.1875,2285.200684,...,39.539875,39.539875,47.003647,61.498272,68.938332,43.723743,84.613365,78.729622,104.424522,56.544079
S00005,5723.407715,8565.396484,3377.380859,6979.328613,4816.725586,4077.465088,3848.742432,3599.222412,3777.494385,3049.085449,...,108.394714,108.394714,75.101746,106.529846,99.147247,101.750771,87.171074,57.418106,100.425819,125.826004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S13058,6586.478516,7412.302734,6501.297852,6532.356445,3902.338379,3500.957031,4225.030273,2557.459961,4362.724609,3635.246338,...,60.237164,60.237164,92.489388,63.391792,58.014748,55.713615,49.492943,93.865875,60.760597,121.360664
S13059,12357.73047,7735.660645,9155.633789,7385.98291,5878.56543,3710.473877,5086.031738,4291.113281,5325.365234,3931.783691,...,126.9338,126.9338,69.102386,137.057938,87.538391,52.625576,103.661774,51.053646,91.741119,60.273109
S13060,16872.25,14527.30371,8724.886719,9532.143555,9328.698242,5914.047852,4844.871582,2734.96875,5758.858887,3095.335938,...,74.558983,74.558983,77.828712,80.506981,73.264915,125.240219,59.302101,77.076576,68.149017,100.167213
S13061,9447.56543,8308.814453,4838.060059,6977.07959,5823.077148,3960.940186,3569.583008,3629.384277,3140.781006,1786.237671,...,104.796013,104.796013,65.034401,99.836937,107.264633,111.633064,121.998962,73.177971,87.983307,84.537773


### Load metadata corresponding to samples (optional)
#### Genotype data

In [6]:
# Genotype calls are optional: fall back to an empty table when the file is absent
try:
    df_genotypes = pd.read_csv(
        raw_data_dirpath / "Genotypes.csv",
        index_col=[donor_key],
    ).convert_dtypes()
except FileNotFoundError:
    df_genotypes = pd.DataFrame([])
else:
    df_genotypes.index.name = sample_key

# Print the value counts of every genotype column and collect the columns that carry
# no usable variation so they can be dropped in one step after the loop.
uninformative_cols = []
for col, series in df_genotypes.items():
    counts = series.value_counts().sort_index()
    print(col)
    for k, v in counts.items():
        print(f"{k}: {v}")
    print()
    # -1 is the missing-call code (converted to NA below), so it must not count as a
    # genotype class: a column holding a single real genotype plus -1 is as
    # uninformative as a column with a single value.
    n_called = sum(value != -1 for value in counts.index)
    if len(counts) > 0 and n_called <= 1:
        uninformative_cols.append(col)

df_genotypes = df_genotypes.drop(columns=uninformative_cols).replace(-1, pd.NA)
df_genotypes

ATP11C_V972M
-1: 9
0: 12317
1: 169
2: 75

G6PD_V68M
-1: 6
0: 12293
1: 168
2: 103

G6PD_S188F
-1: 4
0: 12554
1: 7
2: 5



Unnamed: 0_level_0,ATP11C_V972M,G6PD_V68M,G6PD_S188F
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S00001,0,0,0
S00002,0,0,0
S00003,0,0,0
S00004,0,0,0
S00005,0,0,0
...,...,...,...
S13086,0,0,0
S13087,0,0,0
S13088,0,0,0
S13089,0,0,0


#### Phenotype data

In [7]:
# Phenotype data are optional: use an empty table when the file is absent
try:
    df_phenotypes = (
        pd.read_csv(raw_data_dirpath / "Phenotypes.csv", index_col=[donor_key])
        .convert_dtypes()
        .rename_axis(sample_key)
    )
except FileNotFoundError:
    df_phenotypes = pd.DataFrame([])
df_phenotypes

Unnamed: 0_level_0,Blood Center,Additive,Sex,Blood group,RH Status,Ethnicity,Storage,Osmotic,Oxidative,Ferritin,BMI,Age,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
S00001,ARC,CP2D; AS3,F,A-,-,AFRAMRCN,0.343415,13.383662,29.400921,23.0,26.51795,24,7.38,4.58,11.6,37.3,81.4,14.5,259.0
S00002,ITxM,CPD; AS1,F,O+,+,CAUCASIAN_OTHER,0.281698,24.524673,46.526669,11.0,24.689126,21,5.3,4.27,13.0,38.4,89.8,12.4,255.0
S00003,ITxM,CPD; AS1,M,O+,+,CAUCASIAN_OTHER,0.178813,29.662541,35.858546,66.0,29.210526,45,6.4,4.96,16.1,47.0,94.9,12.7,201.0
S00004,ARC,CP2D; AS3,F,B-,-,CAUCASIAN_OTHER,0.173793,21.129382,0.0,11.0,25.845588,46,8.39,4.26,12.5,39.6,93.0,13.4,319.0
S00005,ARC,CP2D; AS3,M,O+,+,HIGH,0.146873,6.630983,16.078902,25.0,23.672449,76,7.15,4.84,14.8,44.8,92.6,13.7,264.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S13086,,,,,,,,,,,,,,,,,,,
S13087,,,,,,,,,,,,,,,,,,,
S13088,,,,,,,,,,,,,,,,,,,
S13089,,,,,,,,,,,,,,,,,,,


##### Cut phenotype data into ranges if desired

In [8]:
# Optional: bin continuous phenotypes into labelled ranges (disabled by default).
# NOTE(review): the label logic below was corrected while commented out; previously
# `elif idx:` was always true, so the "XtoY" branch was unreachable and the last bin
# was labelled "lt<upper edge>". Presumed intent: first bin "lt<edge>", last bin
# "gt<edge>", interior bins "<lower>to<upper>" -- confirm before enabling.
# cuts = {
#     "BMI": [0, 25, 30, 40, 60],
#     "Age": [0, 20, 40, 60, 80, 100],
# }
# if not df_phenotypes.empty:
#     for col, bins in cuts.items():
#         labels = []
#         for idx in range(1, len(bins)):
#             if idx == 1:
#                 labels += [f"lt{bins[idx]:d}"]
#             elif idx == len(bins) - 1:
#                 labels += [f"gt{bins[idx - 1]:d}"]
#             else:
#                 labels += [f"{bins[idx - 1]:d}to{bins[idx]:d}"]
#         df_phenotypes[f"{col}_Range"] = pd.cut(
#             df_phenotypes[col], bins=bins, labels=labels, right=False
#         )
# df_phenotypes

#### Combine into one DataFrame for MetaData

In [9]:
# Count the donors covered by each data source; rows that are entirely NA are ignored
n_genomics_donors = (
    0 if df_genotypes.empty else df_genotypes.dropna(how="all", axis=0).index.nunique()
)
n_phenotype_donors = (
    0 if df_phenotypes.empty else df_phenotypes.dropna(how="all", axis=0).index.nunique()
)
print(f"Proteomics: {df_proteomics.index.nunique()} donors")
print(f"  Genomics: {n_genomics_donors} donors")
print(f"Phenotypes: {n_phenotype_donors} donors")

# Place genotype and phenotype columns side by side, aligned on the sample index
df_metadata = pd.concat([df_genotypes, df_phenotypes], axis=1).convert_dtypes()

if not df_metadata.empty:
    # Keep only the metadata rows that have matching proteomic measurements
    has_proteomics = df_metadata.index.isin(df_proteomics.index)
    if not has_proteomics.all():
        df_metadata = df_metadata[has_proteomics]

    print(f"\nFinal data: {df_metadata.index.nunique()} donors")

df_metadata.head()

Proteomics: 13062 donors
  Genomics: 12570 donors
Phenotypes: 13062 donors

Final data: 13062 donors


Unnamed: 0_level_0,ATP11C_V972M,G6PD_V68M,G6PD_S188F,Blood Center,Additive,Sex,Blood group,RH Status,Ethnicity,Storage,...,Ferritin,BMI,Age,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S00001,0,0,0,ARC,CP2D; AS3,F,A-,-,AFRAMRCN,0.343415,...,23.0,26.51795,24,7.38,4.58,11.6,37.3,81.4,14.5,259.0
S00002,0,0,0,ITxM,CPD; AS1,F,O+,+,CAUCASIAN_OTHER,0.281698,...,11.0,24.689126,21,5.3,4.27,13.0,38.4,89.8,12.4,255.0
S00003,0,0,0,ITxM,CPD; AS1,M,O+,+,CAUCASIAN_OTHER,0.178813,...,66.0,29.210526,45,6.4,4.96,16.1,47.0,94.9,12.7,201.0
S00004,0,0,0,ARC,CP2D; AS3,F,B-,-,CAUCASIAN_OTHER,0.173793,...,11.0,25.845588,46,8.39,4.26,12.5,39.6,93.0,13.4,319.0
S00005,0,0,0,ARC,CP2D; AS3,M,O+,+,HIGH,0.146873,...,25.0,23.672449,76,7.15,4.84,14.8,44.8,92.6,13.7,264.0


### Get MCH per sample

In [10]:
# Provide in picograms. Set as None to derive MCH from the phenotype table instead
mch_sample_value = None
if mch_sample_value is not None:
    # A single user-supplied value is applied to every distinct sample
    print("Using default MCH value provided for all samples")
    df_MCH_per_sample = pd.Series(
        [mch_sample_value] * df_proteomics.index.nunique(),
        index=pd.Index(df_proteomics.index.unique(), name=sample_key),
        name="CBC.MCH",
    )
    print(f"Mean MCH in pg: {mch_sample_value:.2f}")
else:
    try:
        df_MCH_per_sample = pd.read_csv(
            raw_data_dirpath / "Phenotypes.csv",
            index_col=[donor_key],
        )
    except FileNotFoundError:
        raise ValueError(
            "Cannot determine MCH. No phenotype data provided and a default value is not provided"
        )
    df_MCH_per_sample.index.name = sample_key

    # Ensure only metadata corresponding to the available omics data is kept
    in_proteomics = df_MCH_per_sample.index.isin(df_proteomics.index)
    if not in_proteomics.all():
        df_MCH_per_sample = df_MCH_per_sample[in_proteomics]

    if "CBC.MCH" not in df_MCH_per_sample.columns:
        # Derive MCH from other blood-count columns when it is not given directly.
        # NOTE(review): the factors 10 and 1/100 presumably assume HGB and MCHC in g/dL,
        # RBC in 10^6 cells/uL and MCV in fL -- confirm against the data dictionary.
        available_columns = set(df_MCH_per_sample.columns)
        if {"CBC.HGB", "CBC.RBC"} <= available_columns:
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.HGB"] / df_MCH_per_sample["CBC.RBC"]
            ) * 10
        elif {"CBC.MCHC", "CBC.MCV"} <= available_columns:
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.MCHC"] * df_MCH_per_sample["CBC.MCV"]
            ) / 100
        else:
            raise ValueError(
                "Cannot determine MCH, one of the following combinations is needed: (CBC.HGB and CBC.RBC) or (CBC.MCHC and CBC.MCV)"
            )
    df_MCH_per_sample = df_MCH_per_sample["CBC.MCH"]

    # Report the gaps, then fill them with the mean of the observed values
    n_missing = int(df_MCH_per_sample.isna().sum())
    mean_mch = df_MCH_per_sample.mean()
    print(f"Missing values for {n_missing} samples.")
    print(f"Mean MCH in pg: {mean_mch:.2f}")
    df_MCH_per_sample = df_MCH_per_sample.fillna(mean_mch)

df_MCH_per_sample.head()

Missing values for 396 samples.
Mean MCH in pg: 29.49


SAMPLE ID
S00001    25.327511
S00002    30.444965
S00003    32.459677
S00004    29.342723
S00005    30.578512
Name: CBC.MCH, dtype: float64

### Remove duplicated IDs in samples

In [11]:
# Discard every sample whose ID occurs more than once (no copy is kept), then align
# the MCH values to the remaining samples
is_duplicated_id = df_proteomics.index.duplicated(keep=False)
df_proteomics = df_proteomics[~is_duplicated_id]
df_MCH_per_sample = df_MCH_per_sample.loc[df_proteomics.index]
sample_ids = df_proteomics.index.tolist()
df_proteomics

Unnamed: 0_level_0,P68871,P32119,Q5BKX8,P69905,P00915,P02042,P02100,P02768,P11166,P02730,...,P61077,P62837,P61026,Q9Y5X4,A0A087WSY6,P05387,Q9H2M9,P57772,P78318,O43583
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S00001,7073.981445,8771.099609,6448.168457,6324.992188,5412.810547,4644.655762,3903.654541,3072.764648,5684.953125,3394.652588,...,48.514847,48.514847,60.15979,62.942181,68.603394,45.780422,80.064346,81.553123,111.679413,63.537769
S00002,9190.239258,5327.814941,7863.122559,6807.322754,4647.782227,3343.699219,5837.21582,4059.376221,2300.852295,3123.016357,...,127.883858,127.883858,90.135132,120.0952,55.360386,86.750679,63.579247,55.08456,73.463371,100.424057
S00003,8498.479492,4922.197754,126.768845,7736.603516,4992.395508,4374.737305,4242.23584,4505.963379,1433.8302,1862.586914,...,134.436707,134.436707,73.548569,51.715431,132.122818,85.769547,110.676659,53.000729,105.195908,64.226677
S00004,10630.88965,12136.83008,5000.621094,8911.09668,7699.40918,5738.85791,3954.778809,4777.587402,2832.1875,2285.200684,...,39.539875,39.539875,47.003647,61.498272,68.938332,43.723743,84.613365,78.729622,104.424522,56.544079
S00005,5723.407715,8565.396484,3377.380859,6979.328613,4816.725586,4077.465088,3848.742432,3599.222412,3777.494385,3049.085449,...,108.394714,108.394714,75.101746,106.529846,99.147247,101.750771,87.171074,57.418106,100.425819,125.826004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S13058,6586.478516,7412.302734,6501.297852,6532.356445,3902.338379,3500.957031,4225.030273,2557.459961,4362.724609,3635.246338,...,60.237164,60.237164,92.489388,63.391792,58.014748,55.713615,49.492943,93.865875,60.760597,121.360664
S13059,12357.73047,7735.660645,9155.633789,7385.98291,5878.56543,3710.473877,5086.031738,4291.113281,5325.365234,3931.783691,...,126.9338,126.9338,69.102386,137.057938,87.538391,52.625576,103.661774,51.053646,91.741119,60.273109
S13060,16872.25,14527.30371,8724.886719,9532.143555,9328.698242,5914.047852,4844.871582,2734.96875,5758.858887,3095.335938,...,74.558983,74.558983,77.828712,80.506981,73.264915,125.240219,59.302101,77.076576,68.149017,100.167213
S13061,9447.56543,8308.814453,4838.060059,6977.07959,5823.077148,3960.940186,3569.583008,3629.384277,3140.781006,1786.237671,...,104.796013,104.796013,65.034401,99.836937,107.264633,111.633064,121.998962,73.177971,87.983307,84.537773


### Get data subsets using operations

In [12]:
# Aggregations applied to each group of samples
operations = ["mean", "median"]
# Accumulators for the aggregated proteomic and MCH tables
operation_dfs_proteomics, operation_dfs_MCH = [], []
fill_keys = set()


def group_data(df, operation, keys, columns, prefix_values=None, name_col=None):
    """Aggregate ``columns`` of ``df`` per group and label every aggregated row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the grouping ``keys`` and the value ``columns``.
    operation : str
        Name of the aggregation; its lowercase form is looked up as a method on the
        grouped data (e.g. ``"mean"``).
    keys : str or iterable of str
        Column(s) to group by, normalized with ``ensure_iterable``.
    columns : list
        Columns to aggregate.
    prefix_values : dict or sequence, optional
        Prefix placed before each key's value in the row label. A dict is read per
        key (absent keys get ``""``); a sequence is paired with ``keys`` in order.
        When empty or omitted, no prefixes are used.
    name_col : hashable, optional
        Name of the column that receives the row labels.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the key columns, the aggregated columns, and
        ``name_col`` with labels of the form ``"<Operation>_<prefix><value>"``, where
        the ``<prefix><value>`` parts of several keys are joined by ``"_"``.
    """
    keys = ensure_iterable(keys)

    # Resolve the prefix of every key
    if not prefix_values:
        prefix_values = [""] * len(keys)
    if isinstance(prefix_values, dict):
        prefix_by_key = {key: prefix_values.get(key, "") for key in keys}
    else:
        prefix_by_key = dict(zip(keys, prefix_values))

    grouped = df.groupby(keys, as_index=False, observed=False)[columns]
    aggregated = getattr(grouped, operation.lower())()

    def _row_label(row):
        # Join "<prefix><value>" of every key for a single aggregated row
        return "_".join(f"{prefix_by_key[key]}{row[key]}" for key in keys)

    row_labels = aggregated[keys].apply(_row_label, axis=1)
    aggregated[name_col] = [f"{operation.capitalize()}_{label}" for label in row_labels]
    return aggregated

#### Group by all donors

In [13]:
# for operation in operations:
#     df = getattr(df_proteomics, operation.lower())()
#     df = pd.DataFrame(df.values, columns=[f"{operation.capitalize()}_All"], index=list(df.index)).T
#     df.index.name = sample_key
#     operation_dfs_proteomics += [df.reset_index(drop=False)]

#     df = getattr(df_MCH_per_sample, operation.lower())()
#     df = pd.DataFrame(df, columns=[f"{operation.capitalize()}_All"], index=["CBC.MCH"]).T
#     df.index.name = sample_key
#     operation_dfs_MCH += [df.reset_index(drop=False)]

#### Group by metadata only

In [14]:
# Every genotype column is a grouping key; its name (plus "_") prefixes the group labels
keys_prefixes = dict(zip(list(df_genotypes.columns), list(df_genotypes.columns)))
# Optional extra grouping keys, given as {metadata column: label prefix}
keys_prefixes.update(
    {
        # "Gender": "Sex",
        # "BMI_Range": "BMI",
        # "Age_Range": "Age",
    }
)
keys_prefixes = {k: f"{v}_" for k, v in keys_prefixes.items()}

if keys_prefixes:
    # The merged tables depend on neither the key nor the operation, so build each
    # once rather than once per key and operation.
    df_proteomics_with_metadata = pd.merge(
        df_proteomics,
        df_metadata,
        left_index=True,
        right_index=True,
        how="left",
    ).reset_index(drop=False)
    df_MCH_with_metadata = pd.merge(
        df_MCH_per_sample,
        df_metadata,
        left_index=True,
        right_index=True,
        how="left",
    ).reset_index(drop=False)

    for key, prefix in keys_prefixes.items():
        operation_dfs_proteomics += [
            group_data(
                df_proteomics_with_metadata,
                operation,
                keys=[key],
                columns=list(df_protein_data.index),
                prefix_values={key: prefix},
                name_col=sample_key,
            )
            for operation in operations
        ]

        operation_dfs_MCH += [
            group_data(
                df_MCH_with_metadata,
                operation,
                keys=[key],
                columns=["CBC.MCH"],
                prefix_values={key: prefix},
                name_col=sample_key,
            )
            for operation in operations
        ]

### Add to DataFrames

In [15]:
# Append the aggregated group rows to the per-sample rows. pd.concat raises a
# ValueError when the list of aggregated tables is empty; in that case only the
# per-sample data are used.
try:
    df_proteomics_op = pd.concat(operation_dfs_proteomics, axis=0).drop_duplicates()
except (KeyError, ValueError):
    df_proteomics_final = df_proteomics[df_protein_data.index].copy()
else:
    df_proteomics_final = pd.concat(
        [df_proteomics.reset_index(drop=False), df_proteomics_op], axis=0
    ).set_index(sample_key)[df_protein_data.index]

# Same procedure for the MCH values
try:
    df_MCH_op = pd.concat(operation_dfs_MCH, axis=0).drop_duplicates()
except (KeyError, ValueError):
    mch_is_frame = isinstance(df_MCH_per_sample, pd.DataFrame)
    df_MCH_final = (
        df_MCH_per_sample["CBC.MCH"] if mch_is_frame else df_MCH_per_sample
    ).copy()
else:
    df_MCH_final = pd.concat(
        [df_MCH_per_sample.reset_index(drop=False), df_MCH_op], axis=0
    ).set_index(sample_key)["CBC.MCH"]

df_MCH_final.name = "MCH"
df_MCH_final

SAMPLE ID
S00001                 25.327511
S00002                 30.444965
S00003                 32.459677
S00004                 29.342723
S00005                 30.578512
                         ...    
Mean_G6PD_S188F_1      29.788787
Mean_G6PD_S188F_2      29.914978
Median_G6PD_S188F_0    29.647059
Median_G6PD_S188F_1    30.155211
Median_G6PD_S188F_2    29.824561
Name: MCH, Length: 13080, dtype: float64

### Normalize data by hemoglobin mass
#### Set percent for hemoglobin and low abundance proteomes

In [16]:
# Fractions (not percentages) assigned to hemoglobin and to the remaining
# "low abundance" proteome; HB_PERCENT is later used to convert hemoglobin mass
# into total protein mass.
HB_PERCENT, LA_PERCENT = (0.95, 0.05)
# Express every row (sample or aggregate) as fractions of that row's summed intensity
df_percent_abundance = df_proteomics_final.apply(lambda x: x / x.sum(), axis=1)

#### Scale data

In [17]:
# Split the abundance fractions into hemoglobin subunits and all other proteins, then
# print a summary of the mean fraction per sample in each part.
MODELED_PERCENT = HB_PERCENT + LA_PERCENT
# The two modeled fractions may not exceed the whole proteome
assert 1 >= MODELED_PERCENT
# Gene name --> UniProt entry for the hemoglobin subunits
HB_PROTEINS = {
    "HBA": "P69905",  # Hemoglobin subunit alpha
    "HBB": "P68871",  # Hemoglobin subunit beta
    "HBD": "P02042",  # Hemoglobin subunit delta
    "HBE1": "P02100",  # Hemoglobin subunit epsilon
    "HBG1": "P69891",  # Hemoglobin subunit gamma-1
    "HBG2": "P69892",  # Hemoglobin subunit gamma-2
    "HBM": "Q6B0K9",  # Hemoglobin subunit mu
    "HBQ1": "P09105",  # Hemoglobin subunit theta-1
    "HBZ": "P02008",  # Hemoglobin subunit zeta
}

# Protein intensity / Total intensity --> Percent protein abundance / total protein
df_percent_hb = df_percent_abundance.loc[
    :, df_percent_abundance.columns.isin(list(HB_PROTEINS.values()))
]
df_percent_la = df_percent_abundance.loc[
    :, ~df_percent_abundance.columns.isin(list(HB_PROTEINS.values()))
]

# Means are taken over the individual samples only (aggregated rows are excluded)
df_summary = {
    "Perfect total": 1.0,
    "Current total": df_percent_abundance.loc[sample_ids].sum(axis=1).mean().item(),
    "Hemoglobin total": df_percent_hb.loc[sample_ids].sum(axis=1).mean().item(),
    "Low abundance total": df_percent_la.loc[sample_ids].sum(axis=1).mean().item(),
}

# NOTE(review): the rescaling below is disabled, so the fractions stay as measured and
# the "scaled" rows of the summary repeat the unscaled totals. Confirm this is intended.
# # Scale hemoglobin and low abundance proteome percentages
# df_percent_hb = HB_PERCENT * df_percent_hb.div(df_percent_hb.sum(axis=1), axis=0)
# df_percent_la = LA_PERCENT * df_percent_la.div(df_percent_la.sum(axis=1), axis=0)

# Combine dataframes back into one
df_percent_abundance = pd.concat((df_percent_hb, df_percent_la), axis=1)

df_summary["Hemoglobin scaled"] = (
    df_percent_hb.loc[sample_ids].sum(axis=1).mean().item()
)
df_summary["Low abundance scaled"] = (
    df_percent_la.loc[sample_ids].sum(axis=1).mean().item()
)
df_summary["Remaining scaled"] = 1 - (HB_PERCENT + LA_PERCENT)
# Left-pad the row names to 30 characters and format the fractions as percentages
df_summary = pd.DataFrame.from_dict(
    {" " * max(30 - len(k), 0) + k: [f"{v * 100:.1f}%"] for k, v in df_summary.items()},
    orient="index",
    columns=["Percentage"],
)
print(df_summary)

                     Percentage
       Perfect total     100.0%
       Current total     100.0%
    Hemoglobin total       9.6%
 Low abundance total      90.4%
   Hemoglobin scaled       9.6%
Low abundance scaled      90.4%
    Remaining scaled       0.0%


### Transform data to copy numbers and expected format

In [18]:
# Molecular mass per UniProt entry, taken from the "Mass" column (expected in Da = g/mol)
df_uniprot_to_mw = df_protein_data["Mass"].astype(float)

# Total protein mass per cell, derived from MCH (pg of hemoglobin per cell) under the
# assumption that hemoglobin makes up HB_PERCENT of the total protein mass.
# NOTE(review): the abundance fractions used below are not rescaled to HB_PERCENT (the
# rescaling step is commented out), whereas this conversion assumes that hemoglobin
# share. Confirm the two are meant to be combined.
gDW_total_protein = (
    df_MCH_final  # pgDW HB
    * (1 / HB_PERCENT)  # pgDW total protein / pgDW HB
    * (1 / 1e12)  #  gDW total protein / pgDW total protein
)  #  gDW total protein

# Percent protein abundance / total protein --> Specific protein concentration / total protein
# (fraction of total protein divided by g/mol gives mol per gDW total protein;
# columns are aligned on the UniProt entry)
df_mol_per_gDW = df_percent_abundance.div(
    df_uniprot_to_mw, axis=1  # mol protein / gDW total protein
)
# Convert from mol / gDW protein --> nmol / gDW protein
df_nmol_per_gDW = df_mol_per_gDW * (1e9 / 1)  # nmol protein / mol protein

# Convert from mol / gDW protein --> copy numbers / cell
# (mol / gDW) * (gDW total protein per cell, aligned on the row index) * (copies / mol)
df_copy_numbers = df_mol_per_gDW.mul(gDW_total_protein, axis=0) * AVOGADRO_NUMBER
df_copy_numbers

Unnamed: 0_level_0,A0A075B6H7,A0A075B6K5,A0A075B6P5,A0A075B6R9,A0A075B6S2,A0A075B6S6,A0A087WSY6,A0A087WW87,A0A0A0MRZ7,A0A0A0MRZ8,...,Q9Y4E8,Q9Y4P8,Q9Y570,Q9Y5K8,Q9Y5P6,Q9Y5X1,Q9Y5X4,Q9Y5Z4,Q9Y617,Q9Y6I3
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S00001,297416.708329,1200596.487946,571957.470453,861549.627094,563863.116842,560790.990893,272366.105059,556788.350462,557332.702463,356569.454853,...,82759.706933,87423.664776,89245.747264,154583.629069,157639.016738,53965.950388,70082.429106,139916.603882,113808.928716,56759.310937
S00002,318470.500504,1052101.438684,422594.265147,908816.326671,416613.702618,414343.843625,294928.97506,411386.468333,411788.666128,345551.989003,...,95657.431328,133161.688549,129925.615415,627769.257966,149817.210408,84782.790044,179433.645813,249251.259569,119473.571213,97334.143011
S00003,605776.27962,1719442.751272,368117.904792,986343.3299,362908.292809,360931.039908,802705.133675,358354.898,358705.248732,703720.302422,...,118705.494463,125458.205523,153406.28311,369748.170877,171774.379055,101922.335254,88116.707847,270557.887202,235605.448982,117280.469278
S00004,330225.519553,1101531.083003,271669.327749,280512.636976,267824.657966,266365.454381,291981.625508,264464.273452,264722.830687,379900.893021,...,131184.429717,244944.90896,94097.299873,380102.54687,85104.625472,57091.093049,73049.553571,210383.068178,134467.548639,89179.215351
S00005,467800.267497,1653486.585711,475800.185675,505189.10147,469066.65189,466511.010654,521906.185127,463181.292697,463634.128434,337078.884028,...,131692.863282,99827.590641,132106.971553,154302.525479,162992.584279,71692.052816,157268.948428,262094.640459,126186.544632,110685.653503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mean_G6PD_S188F_1,554005.10243,980797.972214,442090.910205,613584.273351,435834.430764,433459.850437,374531.013223,430366.034825,430786.788263,377079.365403,...,116163.751528,129108.096246,117752.940538,225083.675829,119011.413564,76634.777816,118760.971404,209396.439006,141614.800958,86514.433076
Mean_G6PD_S188F_2,327829.941809,1098726.025799,498696.512731,925037.973134,491638.949666,488960.326557,399435.256789,485470.376819,485945.003794,646700.701366,...,122881.919668,116801.121771,263399.286723,250872.37419,129946.39714,71956.591444,106962.266374,203352.851048,87787.983926,78742.231614
Median_G6PD_S188F_0,411255.643585,890878.357682,412240.954544,442802.126945,406406.912274,404192.66349,404318.930224,401307.74215,401700.086337,458000.483898,...,121577.48609,112015.015537,125683.625559,204289.253226,136358.464804,78425.039655,113375.546174,227497.037057,126071.446074,85684.151626
Median_G6PD_S188F_1,500824.227577,981740.580603,359598.622784,541546.698006,354509.575851,352578.082135,434461.217795,350061.559385,350403.802017,365427.414849,...,124317.273631,116552.981818,117770.46769,204297.960668,123368.314308,78762.376745,128767.411597,214086.655495,153174.541973,90202.286299


## Export absolute quantitative data and metadata per sample

In [19]:
# Tables to export, keyed by the stem of the output file name
dataframes_dict = {
    "ProteinData": df_protein_data,
    "ProteinIntensities": df_proteomics_final,
    "ProteinConcentrations": df_nmol_per_gDW,
    "ProteinCopyNumbers": df_copy_numbers,
    "MCH": df_MCH_final,
    "Metadata": df_metadata,
}
# Write each table, index included, as "<name>.csv" in the processed-data directory.
# (For tab-separated output, use a ".tsv" suffix together with sep="\t".)
for data_type, df in dataframes_dict.items():
    output_path = processed_data_dirpath / f"{data_type}.csv"
    df.to_csv(output_path, index=True)
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinIntensities
Saved data for ProteinConcentrations
Saved data for ProteinCopyNumbers
Saved data for MCH
Saved data for Metadata
