# Prepare Proteomic Data - Copy Numbers, DeepRed
## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions

# Print the package, dependency, build-tool, and platform versions used to run this
# notebook so that the environment can be reproduced.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.1

Dependency Information
----------------------
beautifulsoup4                       4.12.3
bio                                   1.6.2
cobra                                0.29.0
depinfo                               2.2.0
kaleido                               0.2.1
matplotlib                            3.8.2
memote                               0.17.0
networkx                              3.2.1
notebook                              7.0.7
openpyxl                              3.1.2
pandas                                2.2.0
pre-commit                            3.6.0
pyvis                                 0.3.2
rbc-gem-utils[database,network,vis] missing
requests                             2.31.0
scipy                                1.12.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip        23.3.1
setuptools 68.2.2
wheel      0.41.2

Platform Information
-------------------

## Load RBC Proteomics
### Set organism, dataset, and paths

In [2]:
# Organism and dataset identifiers; both are used as sub-directory names below.
organism = "Human"
dataset_name = "DeepRedOmics"

# Directory holding this dataset's proteomics files: <proteomics root>/<organism>/<dataset>.
proteomics_dirpath = get_dirpath("proteomics", use_temp="raw") / organism / dataset_name
proteomics_dirpath = proteomics_dirpath.resolve()

# Create the directory (and any missing parents) without failing if it already exists.
proteomics_dirpath.mkdir(parents=True, exist_ok=True)

### Load protein data

In [3]:
# Columns expected in the protein table, listed in the order they should appear.
protein_data_columns = [
    "Entry",
    "Entry Name",
    "Protein",
    "Protein names",
    "Gene Names (primary)",
    "Length",
    "Mass",  # Should be in Da (daltons)
]
protein_data_filepath = proteomics_dirpath / f"{dataset_name}_ProteinData.tsv"

# Label-based selection raises a KeyError if an expected column is missing and
# otherwise reorders the columns as listed. Rows are then sorted alphabetically
# by protein ID ("Entry") for consistency.
df_protein_data = (
    pd.read_csv(protein_data_filepath, sep="\t", index_col=None)
    .loc[:, protein_data_columns]
    .sort_values(by="Entry")
)

df_protein_data.head()

Unnamed: 0,Entry,Entry Name,Protein,Protein names,Gene Names (primary),Length,Mass
0,A0A024RBG1,NUD4B_HUMAN,NUD4B,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,181,20434
1,A0A075B6I9,LV746_HUMAN,LV746,Immunoglobulin lambda variable 7-46,IGLV7-46,117,12468
2,A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332
3,A0A075B6P5,KV228_HUMAN,KV228,Immunoglobulin kappa variable 2-28,IGKV2-28,120,12957
4,A0A075B6R9,KVD24_HUMAN,KVD24,Probable non-functional immunoglobulin kappa v...,IGKV2D-24,120,13079


### Set data value type and variables for column keys

In [4]:
# Label for the kind of protein values handled in this notebook; it is interpolated
# into the input and output file names (e.g., f"{dataset_name}_{protein_values_dtype}Data.tsv").
protein_values_dtype = "CopyNumbers"
# Name assigned to the sample index of the copy-number table; it becomes the header
# of the first column when the table is exported.
sample_key = "SAMPLE ID"

#### Load and format copy number data

In [5]:
copy_numbers_data_filepath = (
    proteomics_dirpath / f"{dataset_name}_{protein_values_dtype}Data.tsv"
)

# Reshape the long table (one row per protein) into a single-row wide table:
# protein IDs become the columns and the only row is labelled with the dataset name.
df_copy_numbers = (
    pd.read_csv(copy_numbers_data_filepath, sep="\t", index_col=None)
    .set_index("Entry")
    .loc[:, ["Copy Numbers / Cell"]]
    .T
    # Replace the "Copy Numbers / Cell" row label with the dataset name as the sample ID.
    .set_axis(pd.Index([dataset_name], name=sample_key), axis="index")
    # Drop the leftover "Entry" name from the column axis.
    .rename_axis(columns=None)
)
df_copy_numbers

Unnamed: 0_level_0,A0A024RBG1,A0A075B6I9,A0A075B6K5,A0A075B6P5,A0A075B6R9,A0A075B6S2,A0A0A0MRZ8,A0A0C4DH25,A0A0C4DH41,A0A1W2PR19,...,Q9Y6M1,Q9Y6M4,Q9Y6M5,Q9Y6N5,Q9Y6P5,Q9Y6R4,Q9Y6U3,Q9Y6W3,Q9Y6W5,Q9Y6Y8
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DeepRedOmics,4721.116599,15021.40978,609.550609,38381.33516,3124.21758,6557.865957,10352.39291,2136.1758,2177.580856,1706.116041,...,321.00004,29094.65457,307431.0413,689.048171,5653.931616,103.500561,655.143138,7630.705577,52631.87621,8313.38099


## Export absolute quantitative data per sample

In [6]:
# Destination file for the per-sample table, written next to the input files.
copy_numbers_filepath = proteomics_dirpath / f"{dataset_name}_{protein_values_dtype}.tsv"

# Move the sample index into a regular column so it is written as the first field,
# then write a tab-separated file without the positional index.
df_copy_numbers.reset_index().to_csv(copy_numbers_filepath, sep="\t", index=False)
print(f"Saved data for {protein_values_dtype}")

Saved data for CopyNumbers
