# Prepare proteomic data - RBC Omics
## Setup
### Import packages

In [1]:
from pathlib import Path

import pandas as pd
from rbc_gem_utils import COBRA_CONFIGURATION, ROOT_PATH, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Print the package, dependency, and build-tool versions in use for this notebook run
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

### Define configuration
#### COBRA Configuration

In [2]:
# Select Gurobi as the mathematical optimization solver
# NOTE(review): presumably requires a working Gurobi installation/license — confirm
COBRA_CONFIGURATION.solver = "gurobi"
# Default (lower, upper) reaction bounds
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Display the resulting configuration table
COBRA_CONFIGURATION

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,15
cache_directory,Path for the model cache,/Users/zhaiman/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Define dataset location

In [3]:
# Name shared by the dataset directory and by the files stored inside it
dataset_name = "RBComics"
# Absolute path of the dataset directory, resolved against the current working directory
dataset_path = Path(dataset_name).resolve()
dataset_path

PosixPath('/Users/zhaiman/opt/github/RBC-GEM/code/notebooks/studies/rbc1/RBComics')

## Load RBC Proteomics

In [4]:
# Raw omics table. Per the slicing below, the first two columns are kept as
# identifiers, columns 2-29 are treated as per-sample metadata, and the
# remaining columns as measurements.
df_omics = pd.read_csv(
    f"{dataset_path}/{dataset_name}.tsv",
    sep="\t",
    index_col=False,
)

# Per-protein annotation table, indexed by its "Entry" column
df_protein_data = pd.read_csv(
    f"{dataset_path}/{dataset_name}_protein_data.tsv",
    sep="\t",
    index_col="Entry",
)
# "Mass" divided by 1000 (presumably Da --> kDa; confirm against the source table)
df_uniprot_to_mw = df_protein_data["Mass"] / 1000

# One metadata row per "INDEX ID": keep the first distinct value of each metadata column
metadata_columns = df_omics.columns[2:30]
df_metadata = (
    df_omics.groupby("INDEX ID")[metadata_columns]
    .agg(lambda values: list(values.unique())[0])
    .reset_index(drop=False)
)

# Doubled CBC.HGB per sample, with missing values filled by the average
# (rounded to one decimal place).
# NOTE(review): the factor 2 presumably approximates hemoglobin mass per cell
# from the CBC hemoglobin value — confirm the intended conversion.
hb_doubled = df_metadata.set_index("INDEX ID")["CBC.HGB"] * 2
df_hb_per_sample = hb_doubled.fillna(round(hb_doubled.mean(), 1))

# Keep the two identifier columns plus every column after the metadata block
identifier_columns = list(df_omics.columns[:2])
measurement_columns = list(df_omics.columns[30:])
df_proteomics = df_omics.loc[:, identifier_columns + measurement_columns].copy()
# Discard columns whose name begins with a numeric character
kept_columns = [
    column for column in df_proteomics.columns if not str(column[0]).isnumeric()
]
df_proteomics = df_proteomics.loc[:, kept_columns]

### Transform sample intensities to copy numbers

In [5]:
# Per-day result tables, keyed by day and filled in by the per-day cells below
df_copy_numbers = dict()
df_conc_per_sample = dict()
# NOTE(review): the next two names are not referenced by the visible cells —
# confirm whether they are still needed.
uniprot_ids = set()
min_copy_number = 0

#### Day 10

In [6]:
# Day processed in this cell (one of 10, 23, or 42)
day = 10
# Select this day's rows (indexed by "INDEX ID"), rescale every sample (row)
# so its values sum to 1, then multiply by 1e6 and divide by the per-protein
# molecular weight.
# pmol / mgDW sample --> nmol / gDW sample
df_proteomics_day = (
    df_proteomics.set_index(["Day", "INDEX ID"])
    .loc[day]
    .apply(lambda sample: sample / sample.sum(), axis=1)
    .mul(1e6)
    .div(df_uniprot_to_mw)
)
# Keep an independent copy of the per-sample concentrations
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB: transpose to proteins x samples,
# then scale by 1e-9, by the per-sample hemoglobin value times 1e-12, and by
# Avogadro's number.
df_proteomics_day = (
    df_proteomics_day.T.mul(1e-9)
    .mul(df_hb_per_sample.mul(1e-12))
    .mul(AVOGADRO_NUMBER)
)
# Stored as samples x proteins; displayed as proteins x samples
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

INDEX ID,1,2,3,4,5,6,7,8,9,10,...,607,608,609,610,611,612,613,614,615,616
P69905,1.254607e+08,1.622846e+08,1.408849e+08,1.533373e+08,1.451599e+08,1.441721e+08,1.219426e+08,1.294649e+08,1.605849e+08,8.616039e+07,...,1.588897e+08,1.401338e+08,1.632786e+08,1.617655e+08,1.413980e+08,1.390063e+08,1.438014e+08,1.281982e+08,1.418535e+08,1.364156e+08
P68871,6.751875e+07,3.248112e+07,9.282055e+07,9.428727e+07,7.885271e+07,9.648761e+07,6.912440e+07,7.944885e+07,8.127288e+07,7.179953e+07,...,1.062677e+08,8.232853e+07,6.070213e+07,1.045888e+08,7.073972e+07,5.247461e+07,7.322772e+07,7.844957e+07,7.966189e+07,9.380119e+07
P02042,1.684139e+08,2.455682e+08,2.866462e+08,2.796149e+08,2.064930e+08,2.762197e+08,1.976066e+08,1.983371e+08,2.066775e+08,1.939045e+08,...,2.038007e+08,2.056432e+08,1.617469e+08,2.166825e+08,2.232084e+08,1.397474e+08,1.917944e+08,1.941655e+08,1.574790e+08,2.044559e+08
P02008,3.409157e+07,6.281825e+07,3.096045e+07,4.492695e+07,5.865059e+07,7.109292e+07,4.847243e+07,4.364584e+07,6.235367e+07,1.836647e+07,...,7.331362e+07,5.987539e+07,4.996389e+07,6.920626e+07,5.775719e+07,4.549118e+07,5.532827e+07,5.501459e+07,6.000069e+07,6.053364e+07
P02730,4.582424e+06,5.497925e+06,4.302025e+06,3.875573e+06,6.774951e+06,5.593933e+06,5.606950e+06,3.857835e+06,6.796955e+06,7.156441e+06,...,6.976515e+06,6.787051e+06,6.676400e+06,5.767936e+06,6.597180e+06,6.389398e+06,6.311433e+06,5.702726e+06,6.375189e+06,6.637305e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y5P4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,2.566093e+03,1.882475e+03,0.000000e+00,0.000000e+00,0.000000e+00,9.841318e+03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
Q9Y5Y2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.683285e+04,1.104328e+04,7.423386e+04,...,0.000000e+00,0.000000e+00,6.758014e+04,0.000000e+00,0.000000e+00,5.344251e+04,3.474968e+04,0.000000e+00,0.000000e+00,0.000000e+00
Q9Y666,0.000000e+00,4.567509e+03,2.406122e+03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.592869e+03,...,0.000000e+00,0.000000e+00,7.491786e+02,0.000000e+00,0.000000e+00,1.681270e+03,8.442787e+02,0.000000e+00,0.000000e+00,0.000000e+00
Q9Y6B7,1.088452e+03,0.000000e+00,0.000000e+00,1.195146e+03,0.000000e+00,0.000000e+00,0.000000e+00,1.028561e+03,0.000000e+00,0.000000e+00,...,2.610187e+03,7.896490e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


#### Day 23

In [7]:
# Day processed in this cell (one of 10, 23, or 42)
day = 23
# Select this day's rows (indexed by "INDEX ID"), rescale every sample (row)
# so its values sum to 1, then multiply by 1e6 and divide by the per-protein
# molecular weight.
# pmol / mgDW sample --> nmol / gDW sample
df_proteomics_day = (
    df_proteomics.set_index(["Day", "INDEX ID"])
    .loc[day]
    .apply(lambda sample: sample / sample.sum(), axis=1)
    .mul(1e6)
    .div(df_uniprot_to_mw)
)
# Keep an independent copy of the per-sample concentrations
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB: transpose to proteins x samples,
# then scale by 1e-9, by the per-sample hemoglobin value times 1e-12, and by
# Avogadro's number.
df_proteomics_day = (
    df_proteomics_day.T.mul(1e-9)
    .mul(df_hb_per_sample.mul(1e-12))
    .mul(AVOGADRO_NUMBER)
)
# Stored as samples x proteins; displayed as proteins x samples
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

INDEX ID,1,2,3,4,5,6,7,8,9,10,...,607,608,609,610,611,612,613,614,615,616
P69905,1.158886e+08,1.463969e+08,1.556543e+08,1.413781e+08,1.295201e+08,1.720411e+08,1.236445e+08,1.384543e+08,1.378411e+08,1.512476e+08,...,1.578808e+08,1.470635e+08,1.559194e+08,1.598851e+08,1.468404e+08,1.458896e+08,1.346782e+08,1.320135e+08,1.328703e+08,1.433065e+08
P68871,5.003845e+07,7.878593e+07,5.943647e+07,9.195358e+07,7.526566e+07,5.657667e+07,8.394623e+07,8.207305e+07,8.142419e+07,9.657883e+07,...,8.448350e+07,8.508321e+07,1.062013e+08,5.973372e+07,8.320421e+07,8.024789e+07,8.023202e+07,7.209723e+07,8.497104e+07,9.048335e+07
P02042,1.143545e+08,1.976684e+08,1.293589e+08,2.148368e+08,2.050091e+08,1.412694e+08,2.072890e+08,2.219010e+08,2.260540e+08,2.486136e+08,...,2.183707e+08,2.282119e+08,2.266639e+08,1.417010e+08,2.151044e+08,2.111469e+08,1.657192e+08,1.488954e+08,1.755779e+08,1.742910e+08
P02008,3.686555e+07,6.681194e+07,4.861565e+07,6.124425e+07,5.686679e+07,4.900507e+07,4.339189e+07,5.607363e+07,4.809051e+07,1.691351e+07,...,5.995301e+07,5.672085e+07,6.716319e+07,4.888708e+07,6.006616e+07,5.609873e+07,5.718542e+07,5.327334e+07,5.383020e+07,6.675829e+07
P02730,5.200932e+06,5.792947e+06,6.033778e+06,5.695618e+06,6.856259e+06,6.254549e+06,4.680096e+06,5.465201e+06,5.129800e+06,4.447073e+06,...,6.764635e+06,6.608708e+06,7.637633e+06,6.183320e+06,6.633854e+06,6.255158e+06,6.441324e+06,5.428161e+06,5.983915e+06,6.355986e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y5P4,0.000000e+00,0.000000e+00,1.511446e+03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.516860e+03,0.000000e+00
Q9Y5Y2,8.263187e+04,1.970891e+04,4.573432e+04,0.000000e+00,0.000000e+00,8.380638e+04,0.000000e+00,0.000000e+00,0.000000e+00,4.488416e+04,...,1.820955e+04,0.000000e+00,0.000000e+00,5.433243e+04,0.000000e+00,0.000000e+00,0.000000e+00,3.884316e+04,0.000000e+00,1.635814e+04
Q9Y666,0.000000e+00,0.000000e+00,3.324935e+02,0.000000e+00,0.000000e+00,2.182839e+03,1.073910e+03,3.253168e+02,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,9.144257e+02
Q9Y6B7,1.630876e+03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,9.451661e+02,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


#### Day 42

In [8]:
# Day processed in this cell (one of 10, 23, or 42)
day = 42
# Select this day's rows (indexed by "INDEX ID"), rescale every sample (row)
# so its values sum to 1, then multiply by 1e6 and divide by the per-protein
# molecular weight.
# pmol / mgDW sample --> nmol / gDW sample
df_proteomics_day = (
    df_proteomics.set_index(["Day", "INDEX ID"])
    .loc[day]
    .apply(lambda sample: sample / sample.sum(), axis=1)
    .mul(1e6)
    .div(df_uniprot_to_mw)
)
# Keep an independent copy of the per-sample concentrations
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB: transpose to proteins x samples,
# then scale by 1e-9, by the per-sample hemoglobin value times 1e-12, and by
# Avogadro's number.
df_proteomics_day = (
    df_proteomics_day.T.mul(1e-9)
    .mul(df_hb_per_sample.mul(1e-12))
    .mul(AVOGADRO_NUMBER)
)
# Stored as samples x proteins; displayed as proteins x samples
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

INDEX ID,1,2,3,4,5,6,7,8,9,10,...,607,608,609,610,611,612,613,614,615,616
P69905,1.405083e+08,1.411472e+08,1.450739e+08,1.522884e+08,1.486539e+08,1.651044e+08,1.295855e+08,1.214263e+08,1.572203e+08,1.248765e+08,...,1.580931e+08,1.453645e+08,1.772746e+08,1.485915e+08,1.627156e+08,1.479013e+08,1.502160e+08,1.367636e+08,1.452709e+08,1.508962e+08
P68871,2.283338e+07,8.148250e+07,8.452587e+07,6.200941e+07,7.918350e+07,8.612713e+07,7.008838e+07,7.897144e+07,7.931223e+07,7.644652e+07,...,9.806259e+07,1.109978e+08,1.209412e+08,8.370645e+07,1.018638e+08,9.034572e+07,1.017134e+08,7.392661e+07,8.787838e+07,9.040845e+07
P02042,1.905630e+08,2.093051e+08,2.523241e+08,1.304307e+08,1.879842e+08,2.181466e+08,1.864827e+08,1.758891e+08,2.106533e+08,2.118778e+08,...,2.854427e+08,2.406243e+08,2.307701e+08,2.025343e+08,2.941625e+08,1.777762e+08,1.650392e+08,1.359617e+08,1.717963e+08,1.687312e+08
P02008,5.119440e+07,6.122631e+07,6.451162e+07,3.801154e+06,5.975837e+07,5.402841e+07,5.491615e+07,5.632827e+07,6.245288e+07,5.876889e+07,...,6.294854e+07,6.315131e+07,8.224730e+07,5.469040e+07,5.737152e+07,5.760717e+07,7.026294e+07,5.370047e+07,5.531884e+07,5.736702e+07
P02730,4.741377e+06,5.748356e+06,6.164398e+06,6.499277e+06,6.574796e+06,5.803558e+06,5.850697e+06,5.912509e+06,6.876974e+06,6.894150e+06,...,6.263199e+06,6.612394e+06,6.830992e+06,6.407041e+06,6.048621e+06,6.396855e+06,6.076268e+06,5.640477e+06,6.005166e+06,6.505993e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y5P4,0.000000e+00,0.000000e+00,0.000000e+00,8.127691e+03,3.268759e+03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
Q9Y5Y2,0.000000e+00,8.951968e+03,0.000000e+00,5.046344e+04,5.421273e+03,0.000000e+00,0.000000e+00,0.000000e+00,2.596330e+04,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,9.715740e+03,0.000000e+00,1.452876e+04,0.000000e+00,1.547912e+04,0.000000e+00,2.385635e+04
Q9Y666,0.000000e+00,0.000000e+00,0.000000e+00,2.220167e+02,0.000000e+00,3.406521e+02,0.000000e+00,0.000000e+00,3.444904e+03,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.442080e+03,0.000000e+00,4.372160e+03,0.000000e+00,0.000000e+00
Q9Y6B7,5.176451e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


### Export absolute quantitative data

In [9]:
days = [10, 23, 42]
# Tag every per-day table with its day so the tables can be stacked together.
# Note: this adds a "Day" column to the stored frames in place.
for day in days:
    df_conc_per_sample[day]["Day"] = day
    df_copy_numbers[day]["Day"] = day

# --- Concentrations ---------------------------------------------------------
# Stack the per-day tables row-wise and index them by (sample, day)
df_concentrations_all = pd.concat([df_conc_per_sample[day] for day in days], axis=0)
df_concentrations_all = df_concentrations_all.reset_index(drop=False)
df_concentrations_all = df_concentrations_all.set_index(["INDEX ID", "Day"])
# Collapse the (sample, day) pairs into flat "S<sample>_D<day>" labels
df_concentrations_all.index = [
    f"S{sample_id}_D{sample_day}"
    for sample_id, sample_day in df_concentrations_all.index
]
# Written transposed, so the labelled samples become the columns of the file
df_concentrations_all.T.to_csv(
    f"{dataset_path}/{dataset_name}_Concentrations.tsv", sep="\t", index=True
)

# --- Copy numbers -----------------------------------------------------------
# Same stacking, labelling, and transposed export as for the concentrations
df_copy_number_all = pd.concat([df_copy_numbers[day] for day in days], axis=0)
df_copy_number_all = df_copy_number_all.reset_index(drop=False)
df_copy_number_all = df_copy_number_all.set_index(["INDEX ID", "Day"])
df_copy_number_all.index = [
    f"S{sample_id}_D{sample_day}"
    for sample_id, sample_day in df_copy_number_all.index
]
df_copy_number_all.T.to_csv(
    f"{dataset_path}/{dataset_name}_CopyNumbers.tsv", sep="\t", index=True
)
df_copy_number_all

Unnamed: 0,P69905,P68871,P02042,P02008,P02730,P30043,P32119,P02768,Q92902,P04040,...,Q9Y3E7,Q9Y4Y9,Q9Y508,Q9Y587,Q9Y5B8,Q9Y5P4,Q9Y5Y2,Q9Y666,Q9Y6B7,Q9Y6M4
S1_D10,1.254607e+08,6.751875e+07,1.684139e+08,3.409157e+07,4.582424e+06,1.956960e+07,7.461828e+06,4.729611e+06,3.697239e+06,3.742403e+06,...,0.0,0.000000,0.000000,0.000000,7378.473625,0.0,0.000000,0.000000,1088.451504,0.000000
S2_D10,1.622846e+08,3.248112e+07,2.455682e+08,6.281825e+07,5.497925e+06,2.819367e+07,1.184191e+07,3.586910e+06,2.223799e+06,2.496767e+06,...,0.0,0.000000,9634.822452,5263.615637,805.935440,0.0,0.000000,4567.509290,0.000000,0.000000
S3_D10,1.408849e+08,9.282055e+07,2.866462e+08,3.096045e+07,4.302025e+06,1.713969e+07,8.752663e+06,2.807233e+06,1.635281e+07,8.260527e+05,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,2406.122450,0.000000,0.000000
S4_D10,1.533373e+08,9.428727e+07,2.796149e+08,4.492695e+07,3.875573e+06,2.500399e+07,8.190457e+06,3.354456e+06,1.123342e+06,3.666083e+06,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1195.145799,0.000000
S5_D10,1.451599e+08,7.885271e+07,2.064930e+08,5.865059e+07,6.774951e+06,2.690653e+07,1.633960e+07,3.973826e+06,2.859506e+06,4.664673e+06,...,0.0,27830.662335,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S612_D42,1.479013e+08,9.034572e+07,1.777762e+08,5.760717e+07,6.396855e+06,2.158214e+07,1.311621e+07,4.412278e+06,2.390702e+06,4.028351e+06,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,14528.761376,3442.080339,0.000000,211244.957057
S613_D42,1.502160e+08,1.017134e+08,1.650392e+08,7.026294e+07,6.076268e+06,2.803835e+07,1.344956e+07,3.119734e+06,2.704957e+06,4.309456e+06,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
S614_D42,1.367636e+08,7.392661e+07,1.359617e+08,5.370047e+07,5.640477e+06,2.786735e+07,1.309307e+07,1.950198e+06,3.363896e+06,3.664767e+06,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,15479.120703,4372.160364,0.000000,5809.848633
S615_D42,1.452709e+08,8.787838e+07,1.717963e+08,5.531884e+07,6.005166e+06,2.895227e+07,1.179158e+07,2.199533e+06,1.464653e+06,4.171225e+06,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


### Export metadata

In [10]:
# Re-index the metadata by sample label ("S<INDEX ID>") and export it.
# The guard keeps this cell idempotent: after the first run "INDEX ID" is no
# longer a column, so an unguarded `set_index("INDEX ID")` would raise a
# KeyError when the cell is executed a second time.
if "INDEX ID" in df_metadata.columns:
    df_metadata = df_metadata.set_index("INDEX ID")
    df_metadata.index = [f"S{x}" for x in df_metadata.index]
# Write the metadata table, including the sample labels, as a tab-separated file
df_metadata.to_csv(f"{dataset_path}/{dataset_name}_Metadata.tsv", sep="\t", index=True)
df_metadata

Unnamed: 0,AS,Gender,DONDB.ABO_RH,RBCOmics.Race.Ethnicity.Group,Age,BMI,Weight,Height,Hemolysis.volume,Hemolysis.hct,...,Adjusted.Osmotic.Hemolysis,Adjusted.Oxidative.Hemolysis,CBC.WBC,CBC.RBC,CBC.HGB,CBC.HCT,CBC.MCV,CBC.RDW,CBC.PLT,Ferritin
S1,AS3,F,O+,CAUCASIAN,19,24.126627,145,65,11.50,62.50,...,25.115126,,7.87,4.200,12.2,36.40,86.70,12.8,261.0,34.0
S2,AS3,M,A-,HIGH,46,27.464398,186,69,11.00,65.87,...,53.047355,,8.48,4.680,14.0,41.50,88.70,14.3,208.0,26.0
S3,AS3,M,O+,CAUCASIAN,25,23.565095,155,68,9.50,64.13,...,5.462808,,6.40,4.530,14.9,43.00,94.90,13.6,255.0,154.0
S4,AS3,F,O+,CAUCASIAN,63,30.782249,185,65,11.75,66.28,...,14.441025,,6.26,4.230,14.2,41.70,98.60,13.4,269.0,37.0
S5,AS3,M,O+,CAUCASIAN,60,25.840160,175,69,12.00,65.14,...,56.239610,,5.11,4.740,14.5,43.00,90.70,13.6,207.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S612,AS3,M,O+,AFRAMRCN,48,27.316740,185,69,11.00,58.00,...,21.129382,29.199072,3.93,5.840,14.8,46.60,79.80,14.3,187.0,30.0
S613,AS3,M,B+,HIGH,55,25.724339,195,73,13.00,62.00,...,22.004209,19.286704,5.25,4.575,13.9,42.45,92.75,14.7,196.0,12.0
S614,AS3,F,B+,ASIAN,50,24.140479,132,62,13.00,65.56,...,67.911292,,7.38,4.190,12.0,36.70,87.60,13.7,226.0,7.0
S615,AS3,F,O+,AFRAMRCN,60,49.417234,279,63,11.00,64.95,...,10.849738,,8.24,4.610,12.8,40.80,88.50,17.7,327.0,102.0
