# Prepare Proteomic Data - Intensities, Mouse G6PD variants
## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Print the versions of rbc-gem-utils, its dependencies, and the build tools
# so the environment used to run this notebook is recorded alongside the results
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Load RBC Proteomics
### Set organism, dataset, and paths

In [2]:
# Identify the dataset; both values are used as sub-directory names below
organism = "Mouse"
dataset_name = "G6PDvariants"

# Resolve <proteomics dir>/<organism>/<dataset> to an absolute path.
# NOTE(review): the meaning of `use_temp="raw"` is defined by `get_dirpath` in
# rbc_gem_utils and is not visible in this notebook.
proteomics_dirpath = (
    get_dirpath("proteomics", use_temp="raw")
    .joinpath(organism, dataset_name)
    .resolve()
)
# Create the directory (and any missing parents) if it does not exist yet
proteomics_dirpath.mkdir(parents=True, exist_ok=True)

### Load protein data

In [3]:
# Columns expected in the protein table, listed in the order they should appear.
# Selecting them below raises a KeyError if any of them is missing from the file.
protein_data_columns = [
    "Entry",
    "Entry Name",
    "Protein",
    "Protein names",
    "Gene Names (primary)",
    "Length",
    "Mass",  # Should be in Da
]
protein_data_filepath = proteomics_dirpath / f"{dataset_name}_ProteinData.tsv"

df_protein_data = pd.read_csv(protein_data_filepath, sep="\t", index_col=None)
# Keep only the expected columns, then order rows alphabetically
# by protein ID ("Entry") for consistency
df_protein_data = df_protein_data[protein_data_columns].sort_values(by="Entry")

df_protein_data.head()

Unnamed: 0,Entry,Entry Name,Protein,Protein names,Gene Names (primary),Length,Mass
0,A2AAY5,SPD2B_MOUSE,SPD2B,SH3 and PX domain-containing protein 2B (Facto...,Sh3pxd2b,908,101517
1,A2ADY9,DDI2_MOUSE,DDI2,Protein DDI1 homolog 2 (EC 3.4.23.-),Ddi2,399,44591
2,A2AGT5,CKAP5_MOUSE,CKAP5,Cytoskeleton-associated protein 5,Ckap5,2032,225635
3,A2AN08,UBR4_MOUSE,UBR4,E3 ubiquitin-protein ligase UBR4 (EC 2.3.2.27)...,Ubr4,5180,572290
4,A2AQ07,TBB1_MOUSE,TBB1,Tubulin beta-1 chain,Tubb1,451,50441


### Set data value type and variables for column keys

In [4]:
# Kind of protein values being loaded; used to build the input file name
# "{dataset_name}_{protein_values_dtype}Data.tsv"
protein_values_dtype = "Intensity"
# Names of the identifier columns in the data tables.
# NOTE(review): `sample_key` is not referenced by any cell visible in this notebook;
# confirm whether it is still needed.
sample_key = "SAMPLE ID"
# Donor and time point columns form the row index of the data; a falsy key
# (e.g. None) is skipped when the index keys are assembled
donor_key = "MOUSE ID"
time_key = "TIME"

#### Load data and map to UniProt if necessary

In [5]:
values_filepath = proteomics_dirpath / f"{dataset_name}_{protein_values_dtype}Data.tsv"
df_proteomics = pd.read_csv(values_filepath, sep="\t", index_col=None)

# If any column header is a "Protein" identifier from the protein table,
# relabel those columns with the matching UniProt "Entry" ID
if df_proteomics.columns.isin(df_protein_data["Protein"]).any():
    protein_to_entry = df_protein_data.set_index("Protein")["Entry"].to_dict()
    df_proteomics = df_proteomics.rename(columns=protein_to_entry)
df_proteomics

Unnamed: 0,MOUSE ID,TIME,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
0,A1,Post,0.00000,125224.660,0.0,623638.90,172587.060,0.0,796035.200,43706.266,...,0.0000,0.000,72590.766,313902.34,594696.90,799950.06,416941.34,0.000,43627.234,0.0
1,A1,Pre,0.00000,146447.440,0.0,405435.94,447764.400,0.0,891963.560,447598.940,...,11330.2820,0.000,32236.828,282056.28,646163.94,1114011.10,428478.20,40141.754,47638.426,0.0
2,A1,TD,0.00000,320916.660,0.0,662447.80,611563.500,0.0,367201.400,311376.840,...,0.0000,0.000,89671.070,343245.66,483585.10,868526.25,355002.30,0.000,41105.300,0.0
3,A10,Post,318.46050,161438.160,0.0,302582.25,37780.363,0.0,96832.305,0.000,...,0.0000,24017.834,90886.750,346674.88,830986.44,1003041.30,287848.50,0.000,65348.203,0.0
4,A10,Pre,0.00000,51032.973,0.0,363544.70,97704.414,0.0,278689.800,43538.170,...,7095.1300,0.000,61173.242,367621.12,810137.06,1139476.00,431312.88,0.000,60206.770,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,MED8,Pre,0.00000,100151.140,0.0,322600.90,212596.700,0.0,1122667.900,74924.140,...,6502.6094,0.000,70870.730,282731.72,766091.60,1073668.90,250482.58,40331.793,54894.980,0.0
103,MED8,TD,0.00000,132591.530,0.0,357020.56,1709988.000,0.0,92423.266,90750.090,...,0.0000,0.000,73874.960,272972.72,541198.20,771852.00,338136.06,47113.010,46276.266,0.0
104,MED9,Post,0.00000,224659.750,0.0,420088.47,381507.940,0.0,20209.568,56303.030,...,0.0000,14465.219,59123.812,299184.10,546876.44,978658.30,209130.08,0.000,34989.305,0.0
105,MED9,Pre,0.00000,120813.750,0.0,289419.80,82233.790,0.0,222721.160,238749.810,...,7378.2730,0.000,76976.190,439084.10,720954.00,1209502.60,332436.47,0.000,75257.670,0.0


#### Sort and format data

In [6]:
# Collect the identifier columns that are set (falsy keys are skipped) and
# report how many distinct values each one has
index_keys = []
for label, key in (("donor", donor_key), ("timepoint", time_key)):
    if not key:
        continue
    index_keys.append(key)
    print(f"Number of {label}s: {df_proteomics[key].nunique()}")

df = (
    df_proteomics.set_index(index_keys)
    # Order the protein columns alphabetically by ID for consistency
    .sort_index(axis=1)
    .reset_index(drop=False)
    # Order the rows by the identifier columns (donor, then time point)
    .sort_values(by=index_keys, axis=0)
)

df.head()

Number of donors: 36
Number of timepoints: 3


Unnamed: 0,MOUSE ID,TIME,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
0,A1,Post,0.0,125224.66,0.0,623638.9,172587.06,0.0,796035.2,43706.266,...,0.0,0.0,72590.766,313902.34,594696.9,799950.06,416941.34,0.0,43627.234,0.0
1,A1,Pre,0.0,146447.44,0.0,405435.94,447764.4,0.0,891963.56,447598.94,...,11330.282,0.0,32236.828,282056.28,646163.94,1114011.1,428478.2,40141.754,47638.426,0.0
2,A1,TD,0.0,320916.66,0.0,662447.8,611563.5,0.0,367201.4,311376.84,...,0.0,0.0,89671.07,343245.66,483585.1,868526.25,355002.3,0.0,41105.3,0.0
3,A10,Post,318.4605,161438.16,0.0,302582.25,37780.363,0.0,96832.305,0.0,...,0.0,24017.834,90886.75,346674.88,830986.44,1003041.3,287848.5,0.0,65348.203,0.0
4,A10,Pre,0.0,51032.973,0.0,363544.7,97704.414,0.0,278689.8,43538.17,...,7095.13,0.0,61173.242,367621.12,810137.06,1139476.0,431312.88,0.0,60206.77,0.0


## Transform intensities to copy numbers
If copy numbers are provided, skip this section.

In [7]:
# Mean corpuscular hemoglobin (MCH) in picograms, applied to every donor.
# Set to None to compute MCH from the metadata table instead
# (uses its "CBC.HGB" and "CBC.RBC" columns), if metadata is provided.
mch_sample_value = 13.9

### Get MCH per sample
#### Load metadata corresponding to samples (optional)

In [8]:
metadata_filepath = proteomics_dirpath / f"{dataset_name}_MetaData.tsv"
try:
    df_metadata = pd.read_csv(metadata_filepath, sep="\t", index_col=None)
except FileNotFoundError:
    # Metadata is optional; fall back to an empty table when the file is absent
    df_metadata = pd.DataFrame()
else:
    # Keep only metadata rows whose donor also appears in the omics data
    omics_donors = set(df_proteomics[donor_key].unique())
    metadata_donors = set(df_metadata[donor_key].unique())
    if omics_donors != metadata_donors:
        has_omics_data = df_metadata[donor_key].isin(df_proteomics[donor_key])
        df_metadata = df_metadata[has_omics_data]

df_metadata.head()

#### Get approximate dry weight for each donor

In [9]:
# Build the mean corpuscular hemoglobin (MCH, in pg) for every sample, either
# computed from the metadata or taken from the user-provided `mch_sample_value`.
# The result is saved to "{dataset_name}_MCH.tsv" and kept as a Series indexed
# by `index_keys`.
if not df_metadata.empty and mch_sample_value is None:
    # Index by donor and time point when the metadata has both columns,
    # otherwise fall back to indexing by donor only
    try:
        MCH_per_sample = df_metadata.set_index(index_keys)
    except KeyError:
        MCH_per_sample = df_metadata.set_index(donor_key)

    MCH_per_sample = MCH_per_sample[["CBC.HGB", "CBC.RBC"]]
    MCH_per_sample = pd.Series(
        # Calculate MCH in pg using CBC.HGB and CBC.RBC measurements
        # NOTE(review): the factor of 10 assumes CBC.HGB is in g/dL and
        # CBC.RBC is in 10^6 cells/uL -- confirm against the metadata source
        MCH_per_sample["CBC.HGB"] / MCH_per_sample["CBC.RBC"] * 10,
        name="MCH",
    )
    n_missing = int(MCH_per_sample.isna().sum())
    print(
        f"Mean MCH in pg (n={len(MCH_per_sample) - n_missing}):\t{MCH_per_sample.mean():.2f}"
    )
    print(f"Missing values:\t\t{n_missing}")
    # Impute missing MCH values with the mean of the available ones
    MCH_per_sample = MCH_per_sample.fillna(MCH_per_sample.mean())

elif mch_sample_value is not None:
    # Use the same provided MCH value for every donor in the omics data
    MCH_per_sample = pd.Series(
        mch_sample_value,
        index=pd.Index(df_proteomics[donor_key].unique(), name=donor_key),
        name="MCH",
    )
    print(f"Mean MCH in pg:\t{mch_sample_value:.2f}")
else:
    raise ValueError(
        "Must provide metadata containing the "
        "Mean Corpuscular Hemoglobin (MCH), or provide the value directly in picograms."
    )

if time_key and time_key not in MCH_per_sample.index.names:
    # Time was not part of the MCH index: repeat each entry once per time point
    # found in the data, yielding one row per (donor, time point) combination
    MCH_per_sample = (
        pd.concat(
            (
                MCH_per_sample,
                pd.Series(
                    [list(df[time_key].unique())] * len(MCH_per_sample.index),
                    index=MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=False)
    )
else:
    # Time is already in the index (or no time key is used). Move the identifiers
    # into columns so that they are written to file and so that `set_index`
    # below works; a Series has no `set_index` and `index=False` would
    # otherwise drop the identifiers from the exported table.
    MCH_per_sample = MCH_per_sample.reset_index(drop=False)


MCH_per_sample.to_csv(
    proteomics_dirpath / f"{dataset_name}_MCH.tsv", sep="\t", index=False
)
# Back to a Series of MCH values indexed by the identifier columns
MCH_per_sample = MCH_per_sample.set_index(index_keys).squeeze()
MCH_per_sample.head()

Mean MCH in pg:	13.90


MOUSE ID  TIME
A1        Post    13.9
          Pre     13.9
          TD      13.9
A10       Post    13.9
          Pre     13.9
Name: MCH, dtype: float64

#### Transform intensities to copy numbers and expected format

In [10]:
# Molecular weight per UniProt entry, converted from Da to kDa (numerically g / mmol)
df_uniprot_to_mw = df_protein_data.set_index("Entry")["Mass"] / 1000

df_concentrations = df_proteomics.set_index(index_keys)
# Normalize each sample (row) by its summed intensity to get each protein's
# fraction of the total signal, then divide by molecular weight:
#   fraction / (g / mmol) --> mmol / gDW sample
# NOTE(review): treats the intensity fraction as a mass fraction of the sample
# dry weight -- confirm this assumption holds for the dataset.
df_concentrations = (
    df_concentrations.div(df_concentrations.sum(axis=1), axis=0) / df_uniprot_to_mw
)
# mmol / gDW sample --> nmol / gDW sample
df_concentrations = df_concentrations * 1e6

# Conversion to copy numbers:
#   (nmol / g) * 1e-9 --> mol / g
#   MCH (pg) * 1e-12  --> g per cell
#   mol per cell * Avogadro's number --> protein copies per cell
# MCH_per_sample may contain (donor, time point) combinations that were never
# measured. Because `mul` outer-aligns on the row index, those would show up as
# all-NaN rows, so restrict MCH to the samples present in the concentrations.
mch_in_grams = MCH_per_sample * 1e-12
if isinstance(mch_in_grams, pd.Series):
    # A scalar MCH (single-sample case after `squeeze`) needs no alignment
    mch_in_grams = mch_in_grams.reindex(df_concentrations.index)
df_copy_numbers = (df_concentrations * 1e-9).mul(
    mch_in_grams, axis=0
) * AVOGADRO_NUMBER
df_copy_numbers

Unnamed: 0_level_0,Unnamed: 1_level_0,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,B2RQC6,C0HKE1,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
MOUSE ID,TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A1,Post,0.000000,9099.150752,0.0,3530.817340,11086.201140,0.0,24644.153353,1040.458218,0.000000,72454.137921,...,0.000000,0.000000,4745.883310,36774.577733,69175.247708,98137.782656,25876.434275,0.000000,4704.200965,0.0
A1,Pre,0.000000,10027.408024,0.0,2163.018340,27103.163713,0.0,26021.039753,10040.743977,0.000000,266548.758649,...,323.355358,0.000000,1986.021210,31137.577311,70826.155729,128783.074125,25058.444451,2680.089261,4840.402703,0.0
A1,TD,0.000000,25219.707978,0.0,4056.304103,42486.687412,0.0,12294.836265,8016.855175,2026.030319,29929.529705,...,0.000000,0.000000,6340.519161,43490.558712,60836.574196,115237.308335,23828.539859,0.000000,4793.611134,0.0
A10,Post,10.625597,12262.965610,0.0,1790.868980,2536.991205,0.0,3133.864483,0.000000,0.000000,36762.642684,...,0.000000,1576.413568,6211.758494,42457.442242,101047.901207,128638.359637,18675.479277,0.000000,7366.143058,0.0
A10,Pre,0.000000,3736.458334,0.0,2073.949353,6323.928958,0.0,8693.626527,1044.357972,0.000000,20129.011988,...,216.522021,0.000000,4029.910945,43396.216809,94953.684834,140856.477436,26972.436274,0.000000,6541.416334,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MED8,Pre,0.000000,6644.844115,0.0,1667.729186,12469.497225,0.0,31735.901261,1628.623165,0.000000,129506.786719,...,179.824560,0.000000,4230.780397,30244.442074,81368.001536,120271.198886,14194.656976,2609.290542,5404.788693,0.0
MED8,TD,0.000000,10896.422088,0.0,2286.085228,124229.495791,0.0,3236.086176,2443.346957,1151.687735,23693.570961,...,0.000000,0.000000,5462.482667,36168.419010,71198.120381,107093.851176,23734.391286,3775.330687,5643.436372,0.0
MED9,Post,0.000000,18321.536839,0.0,2669.367972,27504.498703,0.0,702.205603,1504.313447,0.000000,38759.156436,...,0.000000,1019.315565,4338.341484,39338.453314,71395.344248,134750.405058,14567.049856,0.000000,4234.373209,0.0
MED9,Pre,0.000000,9163.371823,0.0,1710.402168,5513.822421,0.0,7197.324031,5932.693614,0.000000,26859.977708,...,233.252391,0.000000,5253.152799,53694.375744,87536.776401,154884.556856,21536.051481,0.000000,8470.458150,0.0


## Export absolute quantitative data per sample

In [11]:
# Tables to export, keyed by the label used in the output file name
# "{dataset_name}_{data_type}.tsv". The identifier index is moved back into
# columns so it is written out with `index=False`.
dataframes_dict = {
    "Concentrations": df_concentrations.reset_index(drop=False),
    "CopyNumbers": df_copy_numbers.reset_index(drop=False),
}
# Use a dedicated loop variable: reusing `df` here would overwrite the sorted
# proteomics table of the same name that earlier cells read when re-run.
for data_type, df_export in dataframes_dict.items():
    df_export.to_csv(
        proteomics_dirpath / f"{dataset_name}_{data_type}.tsv", sep="\t", index=False
    )
    print(f"Saved data for {data_type}")

Saved data for Concentrations
Saved data for CopyNumbers
