# Prepare proteomic data - RBC Omics
## Setup
### Import packages

In [None]:
from pathlib import Path

import pandas as pd
from rbc_gem_utils import COBRA_CONFIGURATION, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Report version information for this notebook via the rbc_gem_utils helper
# (presumably package versions, for reproducibility — confirm against the helper).
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Adjust the shared COBRA_CONFIGURATION object imported from rbc_gem_utils.
# NOTE(review): "gurobi" presumably requires a working Gurobi installation — confirm.
COBRA_CONFIGURATION.solver = "gurobi"
# Presumably the default (lower, upper) bounds used for reactions — TODO confirm.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Bare expression: displayed as the cell output when run in a notebook.
COBRA_CONFIGURATION

## Define dataset location

In [None]:
# Name shared by the dataset directory and the files stored inside it.
dataset_name = "RBComics"
# Absolute form of the dataset directory, resolved against the current
# working directory.
dataset_path = Path(dataset_name).resolve()
# Bare expression: displayed as the cell output when run in a notebook.
dataset_path

## Load RBC Proteomics

In [None]:
# Combined omics table. Columns 0-1 are kept with the measurements below,
# columns 2-29 are treated as per-sample metadata, and columns 30 onward are
# the measurement columns.
df_omics = pd.read_csv(
    dataset_path / f"{dataset_name}.tsv",
    sep="\t",
    index_col=False,
)

# Per-protein annotations keyed by the "Entry" column.
df_protein_data = pd.read_csv(
    dataset_path / f"{dataset_name}_protein_data.tsv", sep="\t", index_col="Entry"
)
# "Mass" scaled by 1/1000 (presumably Da -> kDa — confirm against the file).
df_uniprot_to_mw = df_protein_data["Mass"].div(1000)

# One metadata row per INDEX ID: for every metadata column take the first
# distinct value observed within that group.
df_metadata = (
    df_omics.groupby("INDEX ID")[df_omics.columns[2:30]]
    .agg(lambda values: values.unique()[0])
    .reset_index()
)

# Doubled CBC.HGB per sample; gaps are filled with the mean of the available
# values, rounded to one decimal place.
df_hb_per_sample = df_metadata.set_index("INDEX ID")["CBC.HGB"].mul(2)
df_hb_per_sample = df_hb_per_sample.fillna(round(df_hb_per_sample.mean(), 1))

# Measurement table: the two leading columns plus everything from column 30 on.
df_proteomics = df_omics.loc[
    :, [*df_omics.columns[:2], *df_omics.columns[30:]]
].copy()
# Discard every column whose name begins with a numeric character.
df_proteomics = df_proteomics.loc[
    :, [name for name in df_proteomics.columns if not str(name[0]).isnumeric()]
]

### Transform sample intensities to copy numbers

In [None]:
# Result tables for copy numbers and concentrations; both start empty and are
# filled with one entry per day by the cells that follow.
df_copy_numbers, df_conc_per_sample = {}, {}
# Empty identifier set and zero threshold, initialised here for later use.
uniprot_ids = set()
min_copy_number = 0

#### Day 10

In [None]:
# Day processed in this cell (the dataset contains days 10, 23, and 42).
day = 10

# Rows measured on this day, indexed by INDEX ID.
df_proteomics_day = df_proteomics.set_index(["Day", "INDEX ID"]).loc[day]
# Rescale every sample (row) so that its values sum to 1. Vectorized division
# by the row sums avoids a Python-level call per row.
df_proteomics_day = df_proteomics_day.div(df_proteomics_day.sum(axis=1), axis=0)
# pmol / mgDW sample --> nmol / gDW sample
# The division aligns the protein columns with the index of df_uniprot_to_mw;
# labels present on only one side end up as NaN columns.
df_proteomics_day = (df_proteomics_day * 1e6) / df_uniprot_to_mw
# Store a copy so the concentrations are unaffected by the steps below.
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB
# The table is transposed so that samples become columns and line up with the
# INDEX ID index of df_hb_per_sample.
# NOTE(review): the 1e-9 and 1e-12 factors look like nmol -> mol and pg -> g
# conversions, giving copies per cell after Avogadro's number — confirm units.
df_proteomics_day = (
    (df_proteomics_day.T * 1e-9) * (df_hb_per_sample * 1e-12) * AVOGADRO_NUMBER
)
# Stored with samples as rows; df_proteomics_day itself stays proteins x samples.
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

#### Day 23

In [None]:
# Day processed in this cell (the dataset contains days 10, 23, and 42).
day = 23

# Rows measured on this day, indexed by INDEX ID.
df_proteomics_day = df_proteomics.set_index(["Day", "INDEX ID"]).loc[day]
# Rescale every sample (row) so that its values sum to 1. Vectorized division
# by the row sums avoids a Python-level call per row.
df_proteomics_day = df_proteomics_day.div(df_proteomics_day.sum(axis=1), axis=0)
# pmol / mgDW sample --> nmol / gDW sample
# The division aligns the protein columns with the index of df_uniprot_to_mw;
# labels present on only one side end up as NaN columns.
df_proteomics_day = (df_proteomics_day * 1e6) / df_uniprot_to_mw
# Store a copy so the concentrations are unaffected by the steps below.
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB
# The table is transposed so that samples become columns and line up with the
# INDEX ID index of df_hb_per_sample.
# NOTE(review): the 1e-9 and 1e-12 factors look like nmol -> mol and pg -> g
# conversions, giving copies per cell after Avogadro's number — confirm units.
df_proteomics_day = (
    (df_proteomics_day.T * 1e-9) * (df_hb_per_sample * 1e-12) * AVOGADRO_NUMBER
)
# Stored with samples as rows; df_proteomics_day itself stays proteins x samples.
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

#### Day 42

In [None]:
# Day processed in this cell (the dataset contains days 10, 23, and 42).
day = 42

# Rows measured on this day, indexed by INDEX ID.
df_proteomics_day = df_proteomics.set_index(["Day", "INDEX ID"]).loc[day]
# Rescale every sample (row) so that its values sum to 1. Vectorized division
# by the row sums avoids a Python-level call per row.
df_proteomics_day = df_proteomics_day.div(df_proteomics_day.sum(axis=1), axis=0)
# pmol / mgDW sample --> nmol / gDW sample
# The division aligns the protein columns with the index of df_uniprot_to_mw;
# labels present on only one side end up as NaN columns.
df_proteomics_day = (df_proteomics_day * 1e6) / df_uniprot_to_mw
# Store a copy so the concentrations are unaffected by the steps below.
df_conc_per_sample[day] = df_proteomics_day.copy()

# Normalize using each individual's CBC.HGB
# The table is transposed so that samples become columns and line up with the
# INDEX ID index of df_hb_per_sample.
# NOTE(review): the 1e-9 and 1e-12 factors look like nmol -> mol and pg -> g
# conversions, giving copies per cell after Avogadro's number — confirm units.
df_proteomics_day = (
    (df_proteomics_day.T * 1e-9) * (df_hb_per_sample * 1e-12) * AVOGADRO_NUMBER
)
# Stored with samples as rows; df_proteomics_day itself stays proteins x samples.
df_copy_numbers[day] = df_proteomics_day.T
df_proteomics_day

### Export absolute quantitative data

In [None]:
# Days to combine. Each per-day table receives a "Day" column so its rows stay
# distinguishable once the tables are stacked.
days = [10, 23, 42]
for day in days:
    df_conc_per_sample[day]["Day"] = day
    df_copy_numbers[day]["Day"] = day

# Concentrations: stack the per-day tables, key rows by (INDEX ID, Day), then
# flatten that key into a single "S<INDEX ID>_D<Day>" label.
df_concentrations_all = pd.concat([df_conc_per_sample[d] for d in days])
df_concentrations_all = df_concentrations_all.reset_index().set_index(
    ["INDEX ID", "Day"]
)
df_concentrations_all.index = [
    f"S{sample_id}_D{sample_day}"
    for sample_id, sample_day in df_concentrations_all.index
]
# Written transposed, so the sample labels become the column headers.
df_concentrations_all.T.to_csv(
    dataset_path / f"{dataset_name}_Concentrations.tsv", sep="\t"
)

# Copy numbers: same stacking and labelling as for the concentrations.
df_copy_number_all = pd.concat([df_copy_numbers[d] for d in days])
df_copy_number_all = df_copy_number_all.reset_index().set_index(
    ["INDEX ID", "Day"]
)
df_copy_number_all.index = [
    f"S{sample_id}_D{sample_day}"
    for sample_id, sample_day in df_copy_number_all.index
]
df_copy_number_all.T.to_csv(
    dataset_path / f"{dataset_name}_CopyNumbers.tsv", sep="\t"
)
df_copy_number_all

### Export metadata

In [None]:
# Key the metadata by INDEX ID and prefix every label with "S" before export.
# Note: this rebinds df_metadata, so "INDEX ID" is no longer a column afterwards.
df_metadata = df_metadata.set_index("INDEX ID")
df_metadata.index = [f"S{sample_id}" for sample_id in df_metadata.index]
# Tab-separated export; the "S<INDEX ID>" labels are written as the first column.
df_metadata.to_csv(dataset_path / f"{dataset_name}_Metadata.tsv", sep="\t")
df_metadata