# Prepare proteomic data - DeepRed
## Setup
### Import packages

In [None]:
from pathlib import Path

import pandas as pd
from rbc_gem_utils import COBRA_CONFIGURATION, ROOT_PATH, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER

# Show versions of notebook
# NOTE(review): `show_versions` comes from rbc_gem_utils; presumably it reports
# the package versions in use so a run can be reproduced — confirm.
show_versions()

### Define configuration
#### COBRA Configuration

In [None]:
# Select Gurobi as the solver on the COBRA configuration object.
COBRA_CONFIGURATION.solver = "gurobi"
# Set the configuration's bounds pair to (-1000, 1000).
# NOTE(review): presumably the default (lower, upper) reaction flux bounds — confirm.
COBRA_CONFIGURATION.bounds = (-1e3, 1e3)
# Bare expression so the notebook cell displays the resulting configuration.
COBRA_CONFIGURATION

## Load RBC Proteomics
### Set paths

In [None]:
# Absolute path of the proteomics analysis directory under the repository root.
data_path = (Path(ROOT_PATH) / "data" / "analysis" / "proteomics").resolve()
# Name of the dataset; used both as a sub-directory and as a file-name prefix.
dataset_name = "DeepRedOmics"
# Display the resolved directory as the cell output.
data_path

### Load protein data

In [None]:
# Read the protein annotation table, keep only the expected columns (in the
# order listed), and sort rows by protein ID so the ordering is consistent.
df_protein_data = (
    pd.read_csv(
        data_path / dataset_name / f"{dataset_name}_ProteinData.tsv",
        sep="\t",
        index_col=None,
    )
    # Label-based selection doubles as the column check: in current pandas a
    # missing column raises a KeyError rather than being silently skipped.
    .loc[
        :,
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein names",
            "Gene Names (primary)",
            "Length",
            "Mass",  # Expected to be in daltons (Da)
        ],
    ]
    .sort_values(by="Entry")
)

# Preview the first rows as the cell output.
df_protein_data.head()

### Set variables for column keys and protein value type

In [None]:
# Label for the kind of protein quantity being processed; it is interpolated
# into the input file name as f"{dataset_name}_{protein_quantity_dtype}Data.tsv".
protein_quantity_dtype = "CopyNumbers"

### Load proteomic data - copy numbers per cell

In [None]:
# Read the per-protein copy-number table (tab-separated, no index column).
df_copy_numbers = pd.read_csv(
    data_path / dataset_name / f"{dataset_name}_{protein_quantity_dtype}Data.tsv",
    sep="\t",
    index_col=None,
)
# Sort by protein ID, matching the ordering applied to the protein table.
# Sorting the index here would be a no-op: right after loading with
# index_col=None the index is just the default positional 0..n-1 range.
df_copy_numbers = df_copy_numbers.sort_values(by="Entry")
# Transform data to expected format: a single row labelled with the dataset
# name and one column per protein ID holding its copy number per cell.
df_per_sample = df_copy_numbers.set_index("Entry")[["Copy Numbers / Cell"]].T
df_per_sample.index = [dataset_name]
# Preview the result as the cell output.
df_per_sample.head()

### Export absolute quantitative data

In [None]:
# Write the single-row table into the dataset's directory as a tab-separated
# file, keeping the row label (index) as the first column.
df_per_sample.to_csv(
    data_path.joinpath(dataset_name, f"{dataset_name}_CopyNumbers.tsv"),
    sep="\t",
    index=True,
)
# Display the exported table as the cell output.
df_per_sample