# Prepare proteomic data - REDS RBC Omics
1. Nemkov T, Stephenson D, Earley EJ, Keele GR, Hay A, Key A, Haiman ZB, Erickson C, Dzieciatkowska M, Reisz JA, Moore A, Stone M, Deng X, Kleinman S, Spitalnik SL, Hod EA, Hudson KE, Hansen KC, Palsson BO, Churchill GA, Roubinian N, Norris PJ, Busch MP, Zimring JC, Page GP, D'Alessandro A. Biological and genetic determinants of glycolysis: Phosphofructokinase isoforms boost energy status of stored red blood cells and transfusion outcomes. Cell Metab. 2024 Sep 3;36(9):1979-1997.e13. doi: 10.1016/j.cmet.2024.06.007. Epub 2024 Jul 3. PMID: 38964323; PMCID: PMC11374506.

2. D'Alessandro A, Culp-Hill R, Reisz JA, Anderson M, Fu X, Nemkov T, Gehrke S, Zheng C, Kanias T, Guo Y, Page G, Gladwin MT, Kleinman S, Lanteri M, Stone M, Busch M, Zimring JC; Recipient Epidemiology and Donor Evaluation Study-III (REDS-III). Heterogeneity of blood processing and storage additives in different centers impacts stored red blood cell metabolism as much as storage time: lessons from REDS-III-Omics. Transfusion. 2019 Jan;59(1):89-100. doi: 10.1111/trf.14979. Epub 2018 Oct 24. PMID: 30353560; PMCID: PMC6322946.

3. Josephson CD, Glynn S, Mathew S, Birch R, Bakkour S, Baumann Kreuziger L, Busch MP, Chapman K, Dinardo C, Hendrickson J, Hod EA, Kelly S, Luban N, Mast A, Norris P, Custer B, Sabino E, Sachais B, Spencer BR, Stone M, Kleinman S; National Heart, Lung, and Blood Institute (NHLBI) Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P). The Recipient Epidemiology and Donor Evaluation Study-IV-Pediatric (REDS-IV-P): A research program striving to improve blood donor safety and optimize transfusion outcomes across the lifespan. Transfusion. 2022 May;62(5):982-999. doi: 10.1111/trf.16869. Epub 2022 Apr 19. PMID: 35441384; PMCID: PMC9353062.

## Setup
### Import packages

In [None]:
from pathlib import Path

import pandas as pd
from rbc_gem_utils import (
    COBRA_CONFIGURATION,
    EXTERNAL_PATH,
    ROOT_PATH,
    get_annotation_df,
    show_versions,
)
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Report the versions relevant to this notebook run.
# NOTE(review): presumably prints package/dependency versions for
# reproducibility -- confirm against `rbc_gem_utils.show_versions`.
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Select Gurobi as the configured solver.
COBRA_CONFIGURATION.solver = "gurobi"
# Set the configured (lower, upper) bounds to -1000 and +1000.
# NOTE(review): presumably the default reaction flux bounds -- confirm.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Bare expression: displays the resulting configuration as the cell output.
COBRA_CONFIGURATION

## Load RBC Proteomics
### Set paths

In [None]:
# Dataset identifier; also the filename prefix of every data file read or
# written for this dataset.
dataset_name = "REDS_RBCOmics"

# Absolute directory holding the dataset files. It is created (including any
# missing parents) when it does not exist yet.
proteomics_dirpath = Path(
    ROOT_PATH,
    EXTERNAL_PATH,
    "proteomics",
    "datasets",
    "HumanData",
    dataset_name,
).resolve()
proteomics_dirpath.mkdir(parents=True, exist_ok=True)

### Load protein data

In [None]:
# Columns expected in the protein data table, in the order they are kept.
# Selecting them below raises a KeyError if any of them is missing.
protein_data_columns = [
    "Entry",
    "Entry Name",
    "Protein",
    "Protein names",
    "Gene Names (primary)",
    "Length",
    "Mass",  # Should be in DA
]

protein_data_filepath = proteomics_dirpath / f"{dataset_name}_ProteinData.tsv"
df_protein_data = pd.read_csv(protein_data_filepath, sep="\t", index_col=None)

# Keep only the expected columns (in the listed order), then sort the rows
# alphabetically by the "Entry" protein ID for consistency.
df_protein_data = df_protein_data[protein_data_columns].sort_values(by="Entry")

df_protein_data.head()

### Load proteomics
#### Set variables for column keys and protein value type

In [None]:
# Kind of values stored in the proteomics table; used to build its filename.
protein_values_dtype = "Intensity"
# Column holding the sample (donor) identifier.
sample_key = "PUBLIC DONOR ID"

# Time-point settings. For a dataset without a time component, set all three
# of these to None instead.
time_key = "DAY"  # column holding the time point of each measurement
time_abbrev = time_key[0]  # first letter of the time column, used in sample labels
time_ordered = None  # optional explicit ordering of time labels; None sorts by value

#### Load data and map to UniProt

In [None]:
proteomics_filepath = (
    proteomics_dirpath / f"{dataset_name}_{protein_values_dtype}Data.tsv"
)
df_proteomics = pd.read_csv(proteomics_filepath, sep="\t", index_col=None)

# Relabel protein columns with their UniProt entry, but only when at least one
# column header matches a value in the "Protein" column of the protein data.
# Headers without a match are left untouched by the rename.
if df_proteomics.columns.isin(df_protein_data["Protein"]).any():
    protein_to_entry = dict(
        zip(df_protein_data["Protein"], df_protein_data["Entry"])
    )
    df_proteomics = df_proteomics.rename(columns=protein_to_entry)
df_proteomics

#### Sort data by samples and time for consistency

In [None]:
# Index columns: always the sample ID, plus the time column when one is set.
index_keys = [sample_key, time_key] if time_key else [sample_key]
# Distinct time points found in the data (empty when there is no time column).
time_points = df_proteomics[time_key].unique() if time_key else []

# Sort the protein columns alphabetically by ID for consistency. The index
# columns are set aside during the sort and restored as leading columns after.
df_proteomics = (
    df_proteomics.set_index(index_keys).sort_index(axis=1).reset_index(drop=False)
)

# Sort the rows by sample ID and then by time point.
if time_ordered:
    # Temporarily replace each time label by its position in `time_ordered`
    # so that the sort follows the requested order rather than label order.
    time_mapping = {label: position for position, label in enumerate(time_ordered)}
    df_proteomics[time_key] = df_proteomics[time_key].apply(
        lambda label: time_mapping[label]
    )
df_proteomics = df_proteomics.sort_values(by=index_keys, axis=0)
if time_ordered:
    # Sorting is done: turn the positions back into the original labels.
    time_mapping = {position: label for label, position in time_mapping.items()}
    df_proteomics[time_key] = df_proteomics[time_key].apply(
        lambda position: time_mapping[position]
    )

print(f"Number of time points per sample: {len(time_points)}")
df_proteomics.head()

## Transform intensities to copy numbers
If copy numbers are provided, skip this section.

In [None]:
# Mean Corpuscular Hemoglobin (MCH) to apply to every sample, in picograms.
# Leave as None to compute MCH per sample from the metadata file instead
# (requires the metadata to be available).
mch_sample_value = None

### Get MCH per sample
#### Load metadata corresponding to samples (optional)

In [None]:
metadata_filepath = proteomics_dirpath / f"{dataset_name}_MetaData.tsv"
try:
    df_metadata = pd.read_csv(metadata_filepath, sep="\t", index_col=None)
except FileNotFoundError:
    # Metadata is optional: fall back to an empty table when no file exists.
    df_metadata = pd.DataFrame()
else:
    # Keep only the metadata rows whose sample also appears in the omics data.
    has_omics_data = df_metadata[sample_key].isin(df_proteomics[sample_key])
    if not has_omics_data.all():
        df_metadata = df_metadata[has_omics_data]

df_metadata.head()

#### Get approximate dry weight (MCH) for each donor

In [None]:
# Build `MCH_per_sample`: a Series named "MCH" with the Mean Corpuscular
# Hemoglobin (in picograms) of each sample. A user-provided
# `mch_sample_value` takes precedence; otherwise the values are computed from
# the metadata. A ValueError is raised when neither source is available.
if not df_metadata.empty and mch_sample_value is None:
    # Prefer indexing by all index keys (sample and time); fall back to the
    # sample key alone when the metadata lacks one of them (e.g. no time column).
    try:
        MCH_per_sample = df_metadata.set_index(index_keys)
    except KeyError:
        MCH_per_sample = df_metadata.set_index(sample_key)

    # Calculate MCH in pg using CBC.HGB and CBC.RBC measurements
    # NOTE(review): the factor of 10 assumes HGB is reported in g/dL and RBC
    # in 10^6 cells/uL -- confirm against the metadata dictionary.
    MCH_per_sample = (
        MCH_per_sample["CBC.HGB"] / MCH_per_sample["CBC.RBC"] * 10
    ).rename("MCH")
    n_missing = int(MCH_per_sample.isna().sum())
    print(
        f"Mean MCH in pg (n={len(MCH_per_sample) - n_missing}):\t{MCH_per_sample.mean():.2f}"
    )
    print(f"Missing values:\t\t{n_missing}")
    # Impute missing values with the mean of the available measurements
    MCH_per_sample = MCH_per_sample.fillna(MCH_per_sample.mean())

elif mch_sample_value is not None:
    # Apply the same user-provided value to every sample in the omics data
    MCH_per_sample = pd.Series(
        mch_sample_value,
        index=pd.Index(df_proteomics[sample_key].unique(), name=sample_key),
        name="MCH",
    )
    print(f"Mean MCH in pg:\t{mch_sample_value:.2f}")
else:
    raise ValueError(
        "Must provide metadata containing the "
        "Mean Corpuscular Hemoglobin (MCH), or provide the value directly in picograms."
    )


# If time was not included in metadata, add as a part of index by repeating
# each sample's MCH value for every time point found in the omics data.
if time_key and time_key not in MCH_per_sample.index.names:
    MCH_per_sample = (
        pd.concat(
            (
                MCH_per_sample,
                pd.Series(
                    [time_points] * len(MCH_per_sample.index),
                    index=MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=False)
    )
    # Select the column by name rather than using `squeeze()`, which would
    # collapse a single-row result (one sample, one time point) to a scalar.
    MCH_per_sample = MCH_per_sample.set_index(index_keys)["MCH"]
MCH_per_sample.head()

#### Transform intensities to copy numbers and expected format

In [None]:
# Convert protein intensities into concentrations (nmol / gDW) and into copy
# numbers per cell, with one row per sample (and time point) and one column
# per protein.

# Convert Da to kDa (equivalently g/mol to g/mmol)
df_uniprot_to_mw = df_protein_data.set_index("Entry")["Mass"] / 1000

df_concentrations = df_proteomics.set_index(index_keys)
# Normalize each row to the fraction of that sample's total intensity.
# NOTE(review): treating this as a mass fraction assumes intensity is
# proportional to protein mass -- confirm for this dataset.
df_concentrations = df_concentrations.div(df_concentrations.sum(axis=1), axis=0)
# Fraction / MW in kDa --> mmol / gDW sample. The weights are aligned to the
# protein columns by label; proteins without a matching weight become NaN.
df_concentrations = df_concentrations / df_uniprot_to_mw
# mmol / gDW sample --> nmol / gDW sample
df_concentrations = df_concentrations * 1e6

# Conversion to copy numbers per cell:
#   (nmol / gDW --> mol / gDW) * (MCH in pg --> g per cell) * Avogadro's number
# Rows are aligned with `MCH_per_sample` on the index labels.
df_copy_numbers = (df_concentrations * 1e-9).mul(
    MCH_per_sample * 1e-12, axis=0
) * AVOGADRO_NUMBER

# Diagnostic cell output: rows whose index label repeats an earlier row.
# An empty table means every sample/time combination is unique.
df_concentrations[df_concentrations.index.duplicated()]

### Export absolute quantitative data per sample

In [None]:
# Write each computed table to "<dataset_name>_<data_type>.tsv" in the dataset
# directory, with one row per sample.
tables_to_export = {"Concentrations": df_concentrations}
try:
    tables_to_export["CopyNumbers"] = df_copy_numbers
except NameError:
    # Copy numbers are optional: export what is available rather than failing
    # after the concentrations have already been computed.
    print("Copy numbers were not computed; skipping export of CopyNumbers")

for data_type, df_export in tables_to_export.items():
    df_export = df_export.sort_index(level=0, sort_remaining=False)
    if time_key:
        # Flatten the (sample, time) index into labels such as "<sample>_D42".
        # Without a time key the index already holds plain sample IDs, so it
        # is exported unchanged.
        df_export.index = [
            f"{sample_id}_{time_abbrev}{time_point}"
            for sample_id, time_point in df_export.index
        ]
    df_export.to_csv(
        proteomics_dirpath / f"{dataset_name}_{data_type}.tsv", sep="\t", index=True
    )
    print(f"Saved data for {data_type}")