# Prepare Proteomic Data - Copy Numbers, DeepRed
## Setup
### Import packages

In [None]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions

# Report version information for this notebook's environment
# (helper imported from rbc_gem_utils; presumably prints package versions).
show_versions()

## Set organism, dataset, and paths

In [None]:
# Identify which organism/dataset this notebook prepares.
organism = "Human"
dataset_name = "DeepRed"

# Raw inputs are read from the "raw" base directory, nested by organism and dataset.
raw_data_dirpath = get_dirpath(use_temp="raw").joinpath(organism, dataset_name)

# Outputs are written under the "processed" base directory with the same nesting;
# create the directory (and any missing parents) if it is not already there.
processed_data_dirpath = get_dirpath(use_temp="processed").joinpath(
    organism, dataset_name
)
processed_data_dirpath.mkdir(parents=True, exist_ok=True)

## Set data value type and variables for column keys

In [None]:
# Name of the column in the proteomics table that identifies each sample.
sample_key = "SAMPLE ID"
# Kind of quantitative value being prepared; used to build the raw file name
# "Protein{protein_values_dtype}.csv".
protein_values_dtype = "CopyNumbers"

### Load protein data

In [None]:
# Read the protein annotation table, keep only the expected columns in the
# order listed (selection raises a KeyError if any of them is missing), then
# index by UniProt entry and sort alphabetically by ID for consistency.
# The annotations come directly from UniProt where possible.
df_protein_data = (
    pd.read_csv(raw_data_dirpath / "ProteinData.csv", index_col=None)
    .loc[
        :,
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein Names",
            "Gene Names (Primary)",
            "Length",
            "Mass",  # Expected to be in daltons (Da)
        ],
    ]
    .set_index("Entry")
    .sort_index()
)
df_protein_data.head()

#### Load proteomics and map to UniProt if necessary

In [None]:
# Identifier type used for the protein columns of the raw proteomics table.
original_ids_type = "uniprot"
df_proteomics = pd.read_csv(
    raw_data_dirpath / f"Protein{protein_values_dtype}.csv", index_col=None
)

# When the columns use some other identifier that is present in the protein
# table, rename them to the protein table's index values (UniProt entries).
if (
    original_ids_type != "uniprot"
    and df_proteomics.columns.isin(df_protein_data[original_ids_type]).any()
):
    mapping_dict = (
        df_protein_data.reset_index(drop=False)
        .set_index(original_ids_type)[df_protein_data.index.name]
        .to_dict()
    )
    df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Index rows by sample ID, sort the samples, and arrange the protein columns
# in the same order as the protein table's index.
df_proteomics_final = (
    df_proteomics.set_index(sample_key)
    .sort_index(axis=0)
    .loc[:, df_protein_data.index]
)
print(f"Number of actual samples: {len(df_proteomics_final)}")
df_proteomics_final

## Export absolute quantitative data per sample

In [None]:
# Export the processed tables as CSV files into ``processed_data_dirpath``.
# The quantitative table's name is derived from ``protein_values_dtype`` so the
# exported file always matches the raw "Protein{protein_values_dtype}.csv" that
# was loaded, instead of being hard-coded to a single value type.
dataframes_dict = {
    "ProteinData": df_protein_data,
    f"Protein{protein_values_dtype}": df_proteomics_final,
}
for data_type, df in dataframes_dict.items():
    # Keep the index in the output: it holds the protein entry IDs / sample IDs.
    # (For tab-separated output instead, use a ".tsv" suffix and sep="\t".)
    df.to_csv(processed_data_dirpath / f"{data_type}.csv", index=True)
    print(f"Saved data for {data_type}")