# Prepare Proteomic Data - Copy Numbers, DeepRed
## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions

# Print version information for the package, its dependencies, and build tools
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.2

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.2
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                              3.4.2
notebook                              7.4.2
openpyxl                              3.1.5
pandas                                2.2.3
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.3
scikit-learn                          1.6.1
scipy                                1.15.3
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
# Dataset identity: both directory trees are laid out as <organism>/<dataset_name>
organism = "Human"
dataset_name = "DeepRedOmics"

# Location of the raw input files
raw_data_dirpath = get_dirpath(use_temp="raw").joinpath(organism, dataset_name)

# Location of the processed output files; created (with parents) when absent
processed_data_dirpath = get_dirpath(use_temp="processed").joinpath(
    organism, dataset_name
)
processed_data_dirpath.mkdir(parents=True, exist_ok=True)

## Set data value type and variables for column keys

In [3]:
# Kind of quantitative values; used to build the name of the proteomics input file
protein_values_dtype = "CopyNumbers"
# Column of the proteomics table that identifies each sample (becomes the row index)
sample_key = "SAMPLE ID"

### Load protein data

In [4]:
# Read the protein annotation table, keep the expected columns in the listed
# order (a missing column raises a KeyError), and index it by UniProt entry.
# The annotations come directly from UniProt if possible.
df_protein_data = (
    pd.read_csv(raw_data_dirpath / f"{dataset_name}_ProteinData.tsv", sep="\t")
    .loc[
        :,
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein names",
            "Gene Names (primary)",
            "Length",
            "Mass",  # expected to be in daltons (Da)
        ],
    ]
    .set_index("Entry")
    # Alphabetical order of protein IDs keeps the output consistent between runs
    .sort_index()
)
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein names,Gene Names (primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A024RBG1,NUD4B_HUMAN,NUD4B,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,181,20434
A0A075B6I9,LV746_HUMAN,LV746,Immunoglobulin lambda variable 7-46,IGLV7-46,117,12468
A0A075B6K5,LV39_HUMAN,LV39,Immunoglobulin lambda variable 3-9,IGLV3-9,115,12332
A0A075B6P5,KV228_HUMAN,KV228,Immunoglobulin kappa variable 2-28,IGKV2-28,120,12957
A0A075B6R9,KVD24_HUMAN,KVD24,Probable non-functional immunoglobulin kappa v...,IGKV2D-24,120,13079


#### Load proteomics and map to UniProt if necessary

In [5]:
# ID type used by the protein columns of the proteomics table.
# Anything other than "uniprot" must name a column of the protein annotation table.
original_ids_type = "uniprot"

df_proteomics = pd.read_csv(
    raw_data_dirpath / f"{dataset_name}_Protein{protein_values_dtype}.tsv",
    sep="\t",
)

# Translate protein column labels to UniProt IDs, but only when they use another
# ID type and at least one label is found among the annotated IDs of that type.
if original_ids_type != "uniprot":
    if df_proteomics.columns.isin(df_protein_data[original_ids_type]).any():
        mapping_dict = (
            df_protein_data.reset_index(drop=False)
            .set_index(original_ids_type)[df_protein_data.index.name]
            .to_dict()
        )
        df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Index rows by sample, sort the samples for consistency, and order the protein
# columns exactly like the rows of the protein annotation table.
df_proteomics_final = df_proteomics.set_index(sample_key).sort_index(axis=0)[
    df_protein_data.index
]
print(f"Number of actual samples: {len(df_proteomics_final)}")
df_proteomics_final

Number of actual samples: 1


Unnamed: 0_level_0,A0A024RBG1,A0A075B6I9,A0A075B6K5,A0A075B6P5,A0A075B6R9,A0A075B6S2,A0A0A0MRZ8,A0A0C4DH25,A0A0C4DH41,A0A1W2PR19,...,Q9Y6M1,Q9Y6M4,Q9Y6M5,Q9Y6N5,Q9Y6P5,Q9Y6R4,Q9Y6U3,Q9Y6W3,Q9Y6W5,Q9Y6Y8
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DeepRedOmics,4721.116599,15021.40978,609.550609,38381.33516,3124.21758,6557.865957,10352.39291,2136.1758,2177.580856,1706.116041,...,321.00004,29094.65457,307431.0413,689.048171,5653.931616,103.500561,655.143138,7630.705577,52631.87621,8313.38099


## Export absolute quantitative data per sample

In [6]:
# Tables to export, keyed by the label used in the output file name.
# The proteomics label is derived from ``protein_values_dtype`` so the exported
# file name always mirrors the input file name instead of a hard-coded value type.
dataframes_dict = {
    "ProteinData": df_protein_data,
    f"Protein{protein_values_dtype}": df_proteomics_final,
}
# Write each table as a tab-separated file, keeping its index as the first column
for data_type, df in dataframes_dict.items():
    df.to_csv(
        processed_data_dirpath / f"{dataset_name}_{data_type}.tsv", sep="\t", index=True
    )
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinCopyNumbers
