# Prepare proteomic data - REDS RBC Omics
1. Nemkov T, Stephenson D, Earley EJ, Keele GR, Hay A, Key A, Haiman ZB, Erickson C, Dzieciatkowska M, Reisz JA, Moore A, Stone M, Deng X, Kleinman S, Spitalnik SL, Hod EA, Hudson KE, Hansen KC, Palsson BO, Churchill GA, Roubinian N, Norris PJ, Busch MP, Zimring JC, Page GP, D'Alessandro A. Biological and genetic determinants of glycolysis: Phosphofructokinase isoforms boost energy status of stored red blood cells and transfusion outcomes. Cell Metab. 2024 Sep 3;36(9):1979-1997.e13. doi: 10.1016/j.cmet.2024.06.007. Epub 2024 Jul 3. PMID: 38964323; PMCID: PMC11374506.

2. D'Alessandro A, Culp-Hill R, Reisz JA, Anderson M, Fu X, Nemkov T, Gehrke S, Zheng C, Kanias T, Guo Y, Page G, Gladwin MT, Kleinman S, Lanteri M, Stone M, Busch M, Zimring JC; Recipient Epidemiology and Donor Evaluation Study-III (REDS-III). Heterogeneity of blood processing and storage additives in different centers impacts stored red blood cell metabolism as much as storage time: lessons from REDS-III-Omics. Transfusion. 2019 Jan;59(1):89-100. doi: 10.1111/trf.14979. Epub 2018 Oct 24. PMID: 30353560; PMCID: PMC6322946.

3. Josephson CD, Glynn S, Mathew S, Birch R, Bakkour S, Baumann Kreuziger L, Busch MP, Chapman K, Dinardo C, Hendrickson J, Hod EA, Kelly S, Luban N, Mast A, Norris P, Custer B, Sabino E, Sachais B, Spencer BR, Stone M, Kleinman S; National Heart, Lung, and Blood Institute (NHLBI) Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P). The Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P): A research program striving to improve blood donor safety and optimize transfusion outcomes across the lifespan. Transfusion. 2022 May;62(5):982-999. doi: 10.1111/trf.16869. Epub 2022 Apr 19. PMID: 35441384; PMCID: PMC9353062.

## Setup
### Import packages

In [1]:
from pathlib import Path

import pandas as pd
from rbc_gem_utils import COBRA_CONFIGURATION, ROOT_PATH, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Print the versions of rbc-gem-utils, its dependencies, and the build tools
# so the environment used to run this notebook is recorded with the results.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

### Define configuration
#### COBRA Configuration

In [2]:
# Select Gurobi as the mathematical optimization solver.
COBRA_CONFIGURATION.solver = "gurobi"
# Default (lower, upper) bounds assigned to reactions.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Display the resulting configuration as the cell output.
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Set data path and dataset name

In [3]:
# Name of the dataset; used as a subdirectory and as a file-name prefix
dataset_name = "REDS_RBCOmics"
# Absolute path of the directory that contains the overlay analysis data
data_path = (Path(ROOT_PATH) / "data" / "analysis" / "OVERLAY").resolve()
data_path

PosixPath('/Users/zhaiman/opt/github/RBC-GEM/data/analysis/OVERLAY')

## Load RBC Proteomics

In [4]:
# Columns kept from the protein table, in the order they should appear.
# Selecting them raises a KeyError if any expected column is absent.
protein_columns = [
    "Entry",
    "Entry Name",
    "Protein",
    "Protein Names",
    "Gene Names (Primary)",
    "Length",
    "Mass",  # Should be in Da
]
df_protein_data = (
    pd.read_csv(
        Path(data_path, dataset_name, f"{dataset_name}_ProteinData.tsv"),
        sep="\t",
    )[protein_columns]
    # Order the rows alphabetically by protein ID (Entry) for consistency
    .sort_values(by="Entry")
)

df_protein_data.head()

Unnamed: 0,Entry,Entry Name,Protein,Protein Names,Gene Names (Primary),Length,Mass
0,A0A075B6I0,LV861_HUMAN,LV861,Immunoglobulin lambda variable 8-61,IGLV8-61,122,12814
1,A0A075B6I9,LV746_HUMAN,LV746,Immunoglobulin lambda variable 7-46,IGLV7-46,117,12468
2,A0A075B6J9,LV218_HUMAN,LV218,Immunoglobulin lambda variable 2-18,IGLV2-18,118,12412
3,A0A075B6K4,LV310_HUMAN,LV310,Immunoglobulin lambda variable 3-10,IGLV3-10,115,12441
4,A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332


### Set variables for column keys and protein value type

In [5]:
# Column holding the public donor identifier in the data tables
donor_key = "PUBLIC DONOR ID"
# Column holding the time point (day) of each sample
time_key = "DAY"
# First letter of the time column name ("D"), used as a prefix in sample labels
time_abbrev = time_key[0]
# Type of protein values to load; part of the "<dataset>_<type>Data.tsv" file name
protein_values_dtype = "Intensity"

### Load proteomic data - intensities

In [6]:
# Read the table of protein values (one row per donor and time point)
df_intensities = pd.read_csv(
    Path(data_path, dataset_name, f"{dataset_name}_{protein_values_dtype}Data.tsv"),
    sep="\t",
)

# If any column is labelled with a protein name, relabel columns with UniProt IDs
if df_intensities.columns.isin(df_protein_data["Protein"]).any():
    df_intensities = df_intensities.rename(
        columns=df_protein_data.set_index("Protein")["Entry"].to_dict()
    )

df_intensities = (
    df_intensities.set_index([donor_key, time_key])
    .sort_index(axis=0)  # rows ordered by donor ID, then time point
    .sort_index(axis=1)  # columns ordered alphabetically by protein ID
    .reset_index(drop=False)  # donor and time point become regular columns again
)

time_points = df_intensities[time_key].unique()
print(f"Number of time points per sample: {len(time_points)}")
df_intensities.head()

Number of time points per sample: 3


Unnamed: 0,PUBLIC DONOR ID,DAY,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
0,S001,10,114.10669,61.188179,16.998806,1.0,240.439072,0.0,8.726534,0.0,...,4.806035,0.0,44.643505,130.273163,0.0,30.318407,22.992273,0.0,38.119408,202.467422
1,S001,23,90.694664,18.855646,0.0,0.0,234.999283,0.0,30.381741,148.992706,...,3.503707,0.0,28.674469,0.0,0.0,33.782383,159.744751,21.729137,38.118774,210.14856
2,S001,42,123.064743,49.431454,61.730526,0.0,221.155167,0.0,25.704966,0.0,...,7.449622,1.478644,26.478333,139.307129,7.024934,45.400738,19.8267,22.173162,28.154861,191.642059
3,S002,10,0.0,0.0,0.0,0.0,267.438843,12.034687,24.897249,0.0,...,12.975469,0.0,53.475803,91.111374,0.0,50.8153,0.0,0.0,113.735794,264.970459
4,S002,23,64.492546,0.0,490.452606,0.0,518.987244,0.0,0.0,0.0,...,10.578275,48.285789,48.756619,13.925104,0.0,77.84198,0.0,0.0,129.577789,93.992775


In [7]:
# Read the donor metadata table
df_metadata = pd.read_csv(
    Path(data_path, dataset_name, f"{dataset_name}_MetaData.tsv"),
    sep="\t",
)
# Keep only metadata rows for donors that also appear in the omics data.
# The filter is applied only when the two sets of donor IDs differ.
donor_sets_match = set(df_metadata[donor_key].unique()) == set(
    df_intensities[donor_key].unique()
)
if not donor_sets_match:
    df_metadata = df_metadata[df_metadata[donor_key].isin(df_intensities[donor_key])]

df_metadata.head()

Unnamed: 0,PUBLIC DONOR ID,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,...,Adjusted.Osmotic.Hemolysis,Adjusted.Oxidative.Hemolysis,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin
0,S001,AS1,M,O+,CAUCASIAN,33,25.724339,195,73,14.0,...,51.412936,,5.56,5.33,15.5,46.6,87.4,12.7,274.0,19.0
1,S002,AS3,M,A+,HIGH,71,27.891291,200,71,14.0,...,72.000942,,7.32,5.38,16.5,47.9,89.0,12.7,222.0,23.0
2,S003,AS1,F,O+,HIGH,51,23.128284,139,65,11.5,...,25.647109,,5.67,4.38,14.2,41.7,95.2,12.1,293.0,17.0
3,S004,AS1,F,A+,CAUCASIAN,48,23.294675,140,65,12.5,...,12.702554,,6.1,4.32,13.1,40.9,94.7,12.3,376.0,21.0
4,S005,AS1,M,A+,OTHER,52,36.04248,210,64,11.0,...,55.11138,,7.52,5.81,15.9,47.8,82.3,14.4,414.0,93.0


### Quantify copy numbers per sample
#### Convert Da to kDa

In [8]:
# Molecular weight per UniProt entry, converted from Da to kDa
df_uniprot_to_mw = df_protein_data.set_index("Entry")["Mass"].div(1000)
df_uniprot_to_mw

Entry
A0A075B6I0     12.814
A0A075B6I9     12.468
A0A075B6J9     12.412
A0A075B6K4     12.441
A0A075B6K5     12.332
               ...   
Q9Y6E0         49.308
Q9Y6I3         60.293
Q9Y6M4         51.389
Q9Y6M5         55.300
Q9Y6R7        572.017
Name: Mass, Length: 1827, dtype: float64

#### Get approximate dry weight for each donor

In [9]:
df_mch_per_donor = (
    df_metadata.set_index(donor_key)
    .loc[:, ["CBC.HGB", "CBC.RBC"]]
    # Calculate MCH in pg using the CBC.HGB and CBC.RBC measurements
    .pipe(lambda cbc: cbc["CBC.HGB"].div(cbc["CBC.RBC"]).mul(10))
)
# Count donors for which MCH could not be computed
n_missing = int(df_mch_per_donor.isna().sum())
n_measured = len(df_mch_per_donor) - n_missing
# Mean over the donors that have a value (NaN entries are skipped)
mean_mch = df_mch_per_donor.mean()
print(f"Mean MCH in pg (n={n_measured}):\t{mean_mch:.2f}")
print(f"Missing values:\t\t{n_missing}")

# Donors without a value are assigned the mean of the available values
df_mch_per_donor = df_mch_per_donor.fillna(mean_mch)
df_mch_per_donor

Mean MCH in pg (n=598):	29.51
Missing values:		12


PUBLIC DONOR ID
S001    29.080675
S002    30.669145
S003    32.420091
S004    30.324074
S005    27.366609
          ...    
S606    30.000000
S607    30.152672
S608    32.041344
S609    28.942116
S610    29.025424
Length: 610, dtype: float64

#### Transform intensities to copy numbers and expected format

In [10]:
# Per-time-point results are collected here and combined in a later step
dict_of_dataframes = {
    "Concentrations": [],
    "CopyNumbers": [],
}
# Index by (time point, donor) once, so each time point can be sliced in the loop
df_intensities_by_tp = df_intensities.set_index([time_key, donor_key])
for tp in time_points:
    # Rows are the donors measured at this time point; columns are proteins
    df_intensities_per_tp = df_intensities_by_tp.loc[tp]
    # Normalize each sample (row) so that its intensities sum to 1
    df_intensities_per_tp = df_intensities_per_tp.div(
        df_intensities_per_tp.sum(axis=1), axis=0
    )
    # Divide the scaled fractions by the molecular weight (kDa) of each protein,
    # matched on protein ID: pmol / mgDW sample --> nmol / gDW sample
    df_concs_per_tp = (df_intensities_per_tp * 1e6) / df_uniprot_to_mw

    # Restrict MCH to the donors present at this time point. Arithmetic between
    # a DataFrame and a Series aligns labels with an outer join, so a donor
    # lacking a sample at this time point would otherwise show up as an extra
    # all-NaN row in the copy numbers but not in the concentrations.
    mch_per_tp = df_mch_per_donor.reindex(df_concs_per_tp.index)
    df_copies_per_tp = (
        df_concs_per_tp.mul(1e-9)  # nmol --> mol per gDW sample
        .mul(mch_per_tp * 1e-12, axis=0)  # MCH in pg --> g, matched by donor
        .mul(AVOGADRO_NUMBER)  # mol --> number of copies
    )

    # Restore the (donor, time point) index on both result tables
    for key, df in zip(list(dict_of_dataframes), [df_concs_per_tp, df_copies_per_tp]):
        df = (
            df.assign(**{time_key: tp})  # new frame; the loop inputs stay untouched
            .reset_index(drop=False)
            .set_index([donor_key, time_key])
        )
        dict_of_dataframes[key].append(df)

### Export absolute quantitative data

In [11]:
# One combined table per quantity, with one row per sample
dict_of_final_dfs = {}
for key, list_of_dfs in dict_of_dataframes.items():
    # Stack the per-time-point tables and order the rows by donor
    df_per_sample = pd.concat(list_of_dfs).sort_index(level=0)
    # Flatten the (donor, time point) index into labels such as "S001_D10"
    df_per_sample.index = [
        f"{donor}_{time_abbrev}{day}" for donor, day in df_per_sample.index
    ]
    dict_of_final_dfs[key] = df_per_sample

dict_of_final_dfs["CopyNumbers"]

Unnamed: 0,A0A075B6I0,A0A075B6I9,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6R2,A0A075B6S5,A0A075B6S9,A0A087WSY6,A0A0A0MRZ8,...,Q9Y639,Q9Y666,Q9Y696,Q9Y6B6,Q9Y6B7,Q9Y6E0,Q9Y6I3,Q9Y6M4,Q9Y6M5,Q9Y6R7
S001_D10,114518.643899,63113.250142,17612.721227,1033.70001,250738.681069,0.000000,8828.312669,0.000000,323739.751835,61753.957120,...,1392.454161,0.000000,19954.371068,74758.901488,0.000000,7907.492758,4904.162257,0.000000,8864.838390,4551.934758
S001_D23,83733.413360,17891.485645,0.000000,0.00000,225441.984974,0.000000,28274.865365,138922.828686,323562.039664,63871.782876,...,933.842764,0.000000,11790.361238,0.000000,0.000000,8105.404114,31344.509782,5002.350276,8154.842476,4346.295603
S001_D42,124576.860801,51427.456139,64512.914721,0.00000,232622.665201,0.000000,26229.595680,0.000000,307083.059312,85379.086435,...,2177.042354,161.034294,11937.380407,80634.321987,1094.445937,11943.564971,4265.520169,5596.879103,6604.146775,4345.806113
S002_D10,0.000000,0.000000,0.000000,0.00000,246593.949891,10651.008268,22270.446851,0.000000,0.000000,11125.538105,...,3323.982341,0.000000,21133.854588,46229.818435,0.000000,11718.406394,0.000000,0.000000,23386.405820,5267.201961
S002_D23,52980.619738,0.000000,415956.112710,0.00000,443011.906351,0.000000,0.000000,0.000000,0.000000,0.000000,...,2508.714731,4267.543219,17838.394210,6541.070445,0.000000,16618.381242,0.000000,0.000000,24665.939736,1729.727805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S609_D23,0.000000,0.000000,0.000000,0.00000,225635.007425,0.000000,9092.998956,0.000000,60754.867934,40445.049243,...,3673.791044,0.000000,7148.104570,124188.370607,0.000000,8157.254438,3839.757297,6371.901194,15905.183729,2299.903991
S609_D42,0.000000,0.000000,0.000000,0.00000,138221.840018,0.000000,4613.159040,0.000000,88833.464684,20512.080999,...,3465.156142,0.000000,7107.225657,60512.211739,0.000000,9264.103912,10641.041401,897037.392699,13430.857197,3605.287383
S610_D10,52483.854665,0.000000,16017.273082,0.00000,149738.028981,14758.092712,10439.649393,39609.443748,0.000000,20907.088540,...,3964.128687,0.000000,0.000000,22176.084986,0.000000,10180.734600,6533.999663,7128.257654,7239.240669,1943.954949
S610_D23,54229.883559,0.000000,0.000000,0.00000,93055.342976,0.000000,12750.686038,86361.815849,200688.996186,134584.998264,...,2734.603592,0.000000,17318.853805,82170.719021,0.000000,8004.910478,2869.029393,5507.615399,9508.849456,4726.448375
