# Prepare Proteomic Data - Intensities, Mouse G6PD variants
## Setup
### Import packages

In [83]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER, ensure_iterable

# Print the rbc-gem-utils version together with its dependency and build-tool
# versions so the environment used to run this notebook is recorded in the output.
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.2

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.2
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                              3.4.2
notebook                              7.4.2
openpyxl                              3.1.5
pandas                                2.2.3
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.3
scikit-learn                          1.6.1
scipy                                1.15.3
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
# Dataset identifiers; both are used as path components below
organism = "Mouse"
dataset_name = "G6PDvariants"

# Directory holding the raw input files for this organism/dataset
raw_data_dirpath = get_dirpath(use_temp="raw") / organism / dataset_name

# Directory receiving the processed outputs; created (with parents) when missing
processed_data_dirpath = get_dirpath(use_temp="processed") / organism / dataset_name
processed_data_dirpath.mkdir(parents=True, exist_ok=True)

## Set data value type and variables for column keys

In [8]:
# Column names used to identify donors and timepoints in the raw tables,
# and the name given to the combined sample index built from them
donor_key = "MOUSE ID"
time_key = "TIME"
sample_key = "SAMPLE ID"

# Optional text inserted before the timepoint value inside each sample ID
time_abbrev = ""

# Kind of protein values in this dataset; used to pick the input file name
protein_values_dtype = "Intensities"

## Load RBC Proteomics
### Load protein data

In [9]:
# Read the protein annotation table (columns come directly from UniProt if possible).
# Selecting the columns by label fixes their order and raises a KeyError when an
# expected column is absent; rows are then indexed and sorted by protein ID
# ("Entry") for consistency.
df_protein_data = (
    pd.read_csv(
        raw_data_dirpath / f"{dataset_name}_ProteinData.tsv",
        sep="\t",
        index_col=None,
    )
    .loc[
        :,
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein names",
            "Gene Names (primary)",
            "Length",
            "Mass",  # Should be in DA
        ],
    ]
    .set_index("Entry")
    .sort_index()
)
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein names,Gene Names (primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2AAY5,SPD2B_MOUSE,SPD2B,SH3 and PX domain-containing protein 2B (Facto...,Sh3pxd2b,908,101517
A2ADY9,DDI2_MOUSE,DDI2,Protein DDI1 homolog 2 (EC 3.4.23.-),Ddi2,399,44591
A2AGT5,CKAP5_MOUSE,CKAP5,Cytoskeleton-associated protein 5,Ckap5,2032,225635
A2AN08,UBR4_MOUSE,UBR4,E3 ubiquitin-protein ligase UBR4 (EC 2.3.2.27)...,Ubr4,5180,572290
A2AQ07,TBB1_MOUSE,TBB1,Tubulin beta-1 chain,Tubb1,451,50441


#### Load proteomics and map to UniProt if necessary

In [23]:
# Read the raw protein value table for the configured value type
df_proteomics = pd.read_csv(
    raw_data_dirpath / f"{dataset_name}_Protein{protein_values_dtype}.tsv",
    sep="\t",
    index_col=None,
)
# Identifier type used for the protein columns of the raw table
original_ids_type = "uniprot"

# Index every row by a sample ID of the form "<donor>_<time_abbrev><time>"
df_proteomics.index = pd.Index(
    df_proteomics[[donor_key, time_key]]
    .apply(lambda row: f"{row[donor_key]}_{time_abbrev}{row[time_key]}", axis=1)
    .values,
    name=sample_key,
)

# Rename protein columns to UniProt IDs when they were given as another ID type
if (
    original_ids_type != "uniprot"
    and df_proteomics.columns.isin(df_protein_data[original_ids_type]).any()
):
    mapping_dict = (
        df_protein_data.reset_index(drop=False)
        .set_index(original_ids_type)[df_protein_data.index.name]
        .to_dict()
    )
    df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Sort samples by ID and order the columns as: donor, time, then proteins in
# the same order as the annotation table
df_proteomics = df_proteomics.sort_index(axis=0).loc[
    :, [donor_key, time_key, *df_protein_data.index]
]

# Summarize sample coverage; a mismatch between expected and actual counts
# means some donor/timepoint combinations are absent
donor_ids = df_proteomics[donor_key].unique()
timepoints = df_proteomics[time_key].unique()
print(f"Number of donors: {len(donor_ids)}")
print(f"Number of timepoints: {len(timepoints)}")
print(f"Number of expected samples: {len(donor_ids) * len(timepoints)}")
print(f"Number of actual samples: {len(df_proteomics)}")
df_proteomics

Number of donors: 36
Number of timepoints: 3
Number of expected samples: 108
Number of actual samples: 107


Unnamed: 0_level_0,MOUSE ID,TIME,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,A10,Post,318.46050,161438.160,0.0,302582.25,37780.363,0.000,96832.305,0.00,...,0.0000,24017.834,90886.750,346674.88,830986.44,1003041.30,287848.50,0.000,65348.203,0.0
A10_Pre,A10,Pre,0.00000,51032.973,0.0,363544.70,97704.414,0.000,278689.800,43538.17,...,7095.1300,0.000,61173.242,367621.12,810137.06,1139476.00,431312.88,0.000,60206.770,0.0
A10_TD,A10,TD,0.00000,330797.120,0.0,945866.40,261775.860,15244.284,112416.836,43084.26,...,28166.4880,19541.887,152629.340,397354.60,786441.44,1208090.00,303334.97,13966.819,66419.260,54033.2
A11_Post,A11,Post,0.00000,121373.320,0.0,527708.70,139184.920,0.000,659869.700,46562.80,...,0.0000,0.000,117793.810,318111.47,871332.10,1193655.10,194997.90,0.000,65527.793,0.0
A11_Pre,A11,Pre,0.00000,50554.890,0.0,284532.72,0.000,0.000,0.000,0.00,...,0.0000,0.000,50005.133,377780.56,502533.56,966400.56,432990.10,0.000,38804.742,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MED8_Pre,MED8,Pre,0.00000,100151.140,0.0,322600.90,212596.700,0.000,1122667.900,74924.14,...,6502.6094,0.000,70870.730,282731.72,766091.60,1073668.90,250482.58,40331.793,54894.980,0.0
MED8_TD,MED8,TD,0.00000,132591.530,0.0,357020.56,1709988.000,0.000,92423.266,90750.09,...,0.0000,0.000,73874.960,272972.72,541198.20,771852.00,338136.06,47113.010,46276.266,0.0
MED9_Post,MED9,Post,0.00000,224659.750,0.0,420088.47,381507.940,0.000,20209.568,56303.03,...,0.0000,14465.219,59123.812,299184.10,546876.44,978658.30,209130.08,0.000,34989.305,0.0
MED9_Pre,MED9,Pre,0.00000,120813.750,0.0,289419.80,82233.790,0.000,222721.160,238749.81,...,7378.2730,0.000,76976.190,439084.10,720954.00,1209502.60,332436.47,0.000,75257.670,0.0


### Load metadata corresponding to samples (optional)
#### Genotype data

In [24]:
# Genotype metadata is optional: when the file does not exist, continue with
# an empty frame instead of failing.
genotypes_path = raw_data_dirpath / f"{dataset_name}_Genotypes.tsv"
try:
    df_genotypes = pd.read_csv(genotypes_path, sep="\t", index_col=[donor_key])
except FileNotFoundError:
    df_genotypes = pd.DataFrame([])
df_genotypes

#### Phenotype data

In [25]:
# Phenotype metadata is optional: when the file does not exist, continue with
# an empty frame instead of failing.
phenotypes_path = raw_data_dirpath / f"{dataset_name}_Phenotypes.tsv"
try:
    df_phenotypes = pd.read_csv(phenotypes_path, sep="\t", index_col=[donor_key])
except FileNotFoundError:
    df_phenotypes = pd.DataFrame([])
df_phenotypes

Unnamed: 0_level_0,G6PD_PHENOTYPE
MOUSE ID,Unnamed: 1_level_1
A1,A
A10,A
A11,A
A12,A
A2,A
A3,A
A4,A
A5,A
A6,A
A7,A


#### Combine into one DataFrame for MetaData

In [60]:
# Report how many donors each data source covers before combining them
print(f"Proteomics: {df_proteomics[donor_key].nunique()} donors")
print(
    f"  Genomics: {df_genotypes.index.nunique() if not df_genotypes.empty else 0} donors"
)
print(
    f"Phenotypes: {df_phenotypes.index.nunique() if not df_phenotypes.empty else 0} donors"
)

# Place genotype and phenotype columns side by side, aligned on the donor index
df_metadata = pd.concat((df_genotypes, df_phenotypes), axis=1)


if not df_metadata.empty:
    df_metadata = df_metadata.reset_index(drop=False)
    # Ensure only metadata corresponds to the available omics data
    df_metadata = df_metadata[df_metadata[donor_key].isin(df_proteomics[donor_key])]

    # If time was not included in metadata, repeat every donor row once per
    # timepoint so that the index can match the samples. The check must look at
    # the columns: after reset_index the row index only holds integer positions.
    if time_key and time_key not in df_metadata.columns:
        df_metadata = (
            pd.concat(
                (
                    df_metadata,
                    pd.Series(
                        [list(df_proteomics[time_key].unique())]
                        * len(df_metadata.index),
                        index=df_metadata.index,
                        name=time_key,
                    ),
                ),
                axis=1,
            )
            .explode(time_key)
            .reset_index(drop=True)
        )
    # Create sample IDs from donor and time points, then set as index
    df_metadata.index = pd.Index(
        df_metadata[[donor_key, time_key]]
        .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
        .values,
        name=sample_key,
    )
    # The donor x timepoint expansion can create IDs for samples that were never
    # measured; keep only the samples present in the proteomics table.
    df_metadata = df_metadata[df_metadata.index.isin(df_proteomics.index)]
    print(f"\nFinal data: {df_metadata[donor_key].nunique()} donors")
    df_metadata = df_metadata.drop([donor_key, time_key], axis=1)
else:
    print("\nFinal Meta: 0 donors")

df_metadata.head()

Proteomics: 36 donors
  Genomics: 0 donors
Phenotypes: 36 donors

Final data: 36 donors


Unnamed: 0_level_0,G6PD_PHENOTYPE
SAMPLE ID,Unnamed: 1_level_1
A1_Post,A
A1_Pre,A
A1_TD,A
A10_Post,A
A10_Pre,A


### Get MCH per sample

In [77]:
# Provide in picograms. Set as None to use metadata if provided
mch_sample_value = 13.9
if mch_sample_value is None:
    try:
        df_MCH_per_sample = pd.read_csv(
            raw_data_dirpath / f"{dataset_name}_Phenotypes.tsv",
            sep="\t",
            index_col=None,
        )

    except FileNotFoundError as err:
        # Chain the original error so the missing path stays visible in the traceback
        raise ValueError(
            "Cannot determine MCH. No phenotype data provided and a default value is not provided"
        ) from err

    # Ensure only metadata corresponds to the available omics data. Copy the
    # selection because a "CBC.MCH" column may be assigned to it below.
    df_MCH_per_sample = df_MCH_per_sample[
        df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key])
    ].copy()

    # Derive MCH from other CBC columns when it is not given directly
    if "CBC.MCH" not in df_MCH_per_sample.columns:
        if {"CBC.HGB", "CBC.RBC"}.issubset(df_MCH_per_sample.columns):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.HGB"] / df_MCH_per_sample["CBC.RBC"]
            ) * 10
        elif {"CBC.MCHC", "CBC.MCV"}.issubset(df_MCH_per_sample.columns):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.MCHC"] * df_MCH_per_sample["CBC.MCV"]
            ) / 100
        else:
            raise ValueError(
                "Cannot determine MCH, one of the following combinations is needed: (CBC.HGB and CBC.RBC) or (CBC.MCHC and CBC.MCV)"
            )
    df_MCH_per_sample = df_MCH_per_sample.set_index(donor_key)["CBC.MCH"]
    n_missing = int(df_MCH_per_sample.isna().sum())
    print(f"Missing values for {n_missing} samples.")
    print(f"Mean MCH in pg: {df_MCH_per_sample.mean():.2f}")
    # Missing values are replaced by the mean of the available values
    df_MCH_per_sample = df_MCH_per_sample.fillna(df_MCH_per_sample.mean())
else:
    print("Using default MCH value provided for all samples")
    # One entry per donor, all set to the default value
    df_MCH_per_sample = pd.Series(
        [mch_sample_value] * df_proteomics[donor_key].nunique(),
        index=pd.Index(df_proteomics[donor_key].unique(), name=donor_key),
        name="CBC.MCH",
    )
    print(f"Mean MCH in pg: {mch_sample_value:.2f}")

df_MCH_per_sample = df_MCH_per_sample.reset_index(drop=False)
# If time was not included, repeat every donor row once per timepoint so that
# the index can match the samples. The check must look at the columns: after
# reset_index the row index only holds integer positions.
if time_key and time_key not in df_MCH_per_sample.columns:
    df_MCH_per_sample = (
        pd.concat(
            (
                df_MCH_per_sample,
                pd.Series(
                    [list(df_proteomics[time_key].unique())]
                    * len(df_MCH_per_sample.index),
                    index=df_MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=True)
    )
# Create sample IDs from donor and time points, then set as index
df_MCH_per_sample.index = pd.Index(
    df_MCH_per_sample[[donor_key, time_key]]
    .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
    .values,
    name=sample_key,
)
# Keep exactly the measured samples, in the same order as the proteomics table
df_MCH_per_sample = df_MCH_per_sample.loc[df_proteomics.index]

df_MCH_per_sample.head()

Using default MCH value provided for all samples
Mean MCH in pg: 13.90


Unnamed: 0_level_0,MOUSE ID,CBC.MCH,TIME
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A10_Post,A10,13.9,Post
A10_Pre,A10,13.9,Pre
A10_TD,A10,13.9,TD
A11_Post,A11,13.9,Post
A11_Pre,A11,13.9,Pre


### Get data subsets using operations

In [78]:
# Aggregations to compute for each group of samples
operations = ["mean", "median"]

# Accumulators for the aggregated frames; later cells append to these lists
operation_dfs_proteomics, operation_dfs_MCH = [], []
fill_keys = set()


def group_data(df, operation, keys, columns, prefix_values=None, name_col=None):
    """Aggregate ``columns`` of ``df`` per group and label each aggregated row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the grouping ``keys`` and the value ``columns``.
    operation : str
        Name of the GroupBy method to call; it is lower-cased before lookup
        (e.g. ``"mean"`` or ``"Median"``).
    keys : str or iterable of str
        Column name(s) to group by. Passed through ``ensure_iterable``.
    columns : list of str
        Value columns to aggregate.
    prefix_values : dict or sequence of str, optional
        Text placed directly before each key's value inside the label. A dict
        maps key name to prefix; a sequence is matched to ``keys`` by position.
        Keys without a prefix use an empty string.
    name_col : hashable, optional
        Name of the column that receives the generated labels. Callers in this
        notebook always pass it; with the default ``None`` the column is
        literally named ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the key columns, the aggregated value
        columns, and the label ``"<Operation>_<prefix1><key1>_<prefix2><key2>..."``
        in ``name_col``.
    """
    keys = list(ensure_iterable(keys))

    # Normalize prefixes to one string per key, defaulting to "" so that a
    # short sequence or a partial dict cannot cause a lookup failure.
    if not prefix_values:
        prefix_values = {}
    elif not isinstance(prefix_values, dict):
        prefix_values = dict(zip(keys, prefix_values))
    prefixes = [prefix_values.get(key, "") for key in keys]

    grouped = df.groupby(keys, as_index=False, observed=False)[columns]
    result = getattr(grouped, operation.lower())()

    # Build labels column-wise rather than with a row-wise apply: this keeps
    # each key's own value formatting and also works when there are no groups
    # (a row-wise apply on an empty frame does not return one label per row).
    labels = [
        "_".join(f"{prefix}{value}" for prefix, value in zip(prefixes, row))
        for row in zip(*(result[key] for key in keys))
    ]
    result[name_col] = [f"{operation.capitalize()}_{label}" for label in labels]
    return result

#### Group by time and phenotype

In [79]:
# Group samples by timepoint and G6PD phenotype
keys = [time_key, "G6PD_PHENOTYPE"]
# Optional label prefixes per key (none are used for this grouping)
prefix_values = {}

# Attach the sample metadata once per table; the merged frames do not depend on
# the operation, so they are built outside the loops below.
df_proteomics_with_metadata = pd.merge(
    df_proteomics, df_metadata, left_index=True, right_index=True, how="left"
).reset_index(drop=False)
df_MCH_with_metadata = pd.merge(
    df_MCH_per_sample,
    df_metadata,
    left_index=True,
    right_index=True,
    how="left",
).reset_index(drop=False)

# NOTE: the results are appended to the accumulator lists, so re-running this
# cell without re-running the cell that creates them adds the same frames again.
operation_dfs_proteomics += [
    group_data(
        df_proteomics_with_metadata,
        operation,
        keys=keys,
        columns=list(df_protein_data.index),
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

operation_dfs_MCH += [
    group_data(
        df_MCH_with_metadata,
        operation,
        keys=keys,
        columns=["CBC.MCH"],
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

### Add to DataFrames

In [80]:
# Protein values: stack the aggregated rows beneath the per-sample rows.
try:
    df_proteomics_op = pd.concat(operation_dfs_proteomics, axis=0).drop_duplicates()
except (KeyError, ValueError):
    # No usable aggregated frames: keep only the per-sample protein columns
    df_proteomics_final = df_proteomics[df_protein_data.index].copy()
else:
    df_proteomics_final = pd.concat(
        (df_proteomics.reset_index(drop=False), df_proteomics_op),
        axis=0,
    ).set_index(sample_key)[df_protein_data.index]

# MCH values: same treatment, reduced to the single "CBC.MCH" column.
try:
    df_MCH_op = pd.concat(operation_dfs_MCH, axis=0).drop_duplicates()
except (KeyError, ValueError):
    # No usable aggregated frames: keep only the per-sample MCH values
    df_MCH_final = df_MCH_per_sample["CBC.MCH"].copy()
else:
    df_MCH_final = pd.concat(
        (df_MCH_per_sample.reset_index(drop=False), df_MCH_op),
        axis=0,
    ).set_index(sample_key)["CBC.MCH"]

# Label the series "MCH" instead of "CBC.MCH"
df_MCH_final = df_MCH_final.rename("MCH")
df_proteomics_final

Unnamed: 0_level_0,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,B2RQC6,C0HKE1,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,318.4605,161438.1600,0.0,302582.25,37780.363,0.000,96832.305,0.0000,0.0000,153414.3600,...,0.000,24017.834,90886.750,346674.88,830986.440,1003041.30,287848.500,0.000,65348.2030,0.000
A10_Pre,0.0000,51032.9730,0.0,363544.70,97704.414,0.000,278689.800,43538.1700,0.0000,87148.8750,...,7095.130,0.000,61173.242,367621.12,810137.060,1139476.00,431312.880,0.000,60206.7700,0.000
A10_TD,0.0000,330797.1200,0.0,945866.40,261775.860,15244.284,112416.836,43084.2600,138335.2500,7266.3867,...,28166.488,19541.887,152629.340,397354.60,786441.440,1208090.00,303334.970,13966.819,66419.2600,54033.200
A11_Post,0.0000,121373.3200,0.0,527708.70,139184.920,0.000,659869.700,46562.8000,0.0000,214803.8600,...,0.000,0.000,117793.810,318111.47,871332.100,1193655.10,194997.900,0.000,65527.7930,0.000
A11_Pre,0.0000,50554.8900,0.0,284532.72,0.000,0.000,0.000,0.0000,0.0000,14401.9330,...,0.000,0.000,50005.133,377780.56,502533.560,966400.56,432990.100,0.000,38804.7420,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Median_Pre_HumCan,0.0000,105603.7425,0.0,461056.09,151337.525,0.000,381460.135,35275.6265,0.0000,242985.1100,...,0.000,0.000,56020.795,350434.03,612192.650,1086103.40,340375.530,0.000,41955.6035,0.000
Median_Pre_MED,288.2130,117896.5800,0.0,410021.10,149833.515,0.000,881461.100,39635.0265,0.0000,155779.9000,...,0.000,0.000,61034.130,309039.01,712458.250,1099143.40,346932.185,0.000,49261.7630,0.000
Median_TD_A,0.0000,242014.0050,0.0,821800.27,310434.950,0.000,284526.915,138880.2730,139003.1950,70731.0390,...,6685.530,0.000,101523.708,439543.95,647543.925,998826.90,339279.000,0.000,56345.5015,31782.089
Median_TD_HumCan,0.0000,225756.8700,0.0,500839.92,297763.565,0.000,239037.340,43170.0465,0.0000,36792.9360,...,0.000,0.000,91657.173,408822.08,604273.225,814564.02,341810.800,0.000,54365.4160,0.000


### Transform data to copy numbers and expected format

In [81]:
# Molecular weight per protein: the "Mass" column (Da) scaled to kDa
df_uniprot_to_mw = df_protein_data["Mass"].div(1000)

# Express each value as a fraction of its sample's summed intensity, divide by
# the molecular weight and scale by 1e6 (pmol / mgDW sample, i.e. nmol / gDW sample)
df_concentrations = df_proteomics_final.copy()
df_concentrations = df_concentrations.apply(lambda row: row / row.sum(), axis=1)
df_concentrations = df_concentrations.div(df_uniprot_to_mw).mul(1e6)

# Conversion to copy numbers: scale concentrations by 1e-9 (nmol -> mol), multiply
# row-wise by each sample's MCH scaled by 1e-12 (pg -> g), then by Avogadro's number
df_copy_numbers = (
    df_concentrations.mul(1e-9)
    .mul(df_MCH_final.mul(1e-12), axis=0)
    .mul(AVOGADRO_NUMBER)
)
df_copy_numbers

Unnamed: 0_level_0,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,B2RQC6,C0HKE1,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,10.625597,12262.965610,0.0,1790.868980,2536.991205,0.000000,3133.864483,0.000000,0.000000,36762.642684,...,0.000000,1576.413568,6211.758494,42457.442242,101047.901207,128638.359637,18675.479277,0.000000,7366.143058,0.000000
A10_Pre,0.000000,3736.458334,0.0,2073.949353,6323.928958,0.000000,8693.626527,1044.357972,0.000000,20129.011988,...,216.522021,0.000000,4029.910945,43396.216809,94953.684834,140856.477436,26972.436274,0.000000,6541.416334,0.000000
A10_TD,0.000000,23342.184549,0.0,5200.446484,16329.500396,856.017012,3379.728138,996.020685,1789.486498,1617.520092,...,828.409208,1191.497924,9690.417358,45206.425227,88836.249967,143926.722171,18281.874932,960.999793,6954.901754,1671.896666
A11_Post,0.000000,8584.042575,0.0,2907.996907,8702.105407,0.000000,19883.711324,1078.891225,0.000000,47924.998925,...,0.000000,0.000000,7495.761812,36273.553442,98649.842071,142531.171933,11779.233742,0.000000,6877.195346,0.000000
A11_Pre,0.000000,3993.693360,0.0,1751.357541,0.000000,0.000000,0.000000,0.000000,0.000000,3589.084901,...,0.000000,0.000000,3554.273230,48116.418634,63550.747849,128893.520595,29215.141040,0.000000,4548.974843,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Median_Pre_HumCan,0.000000,8038.353223,0.0,2734.466419,10183.520169,0.000000,12371.078593,879.695668,0.000000,58347.041089,...,0.000000,0.000000,3836.734274,43006.707186,74596.755197,139579.396635,22129.142295,0.000000,4739.089792,0.000000
Median_Pre_MED,9.394813,8749.180219,0.0,2370.846390,9829.663419,0.000000,27870.190522,963.640955,0.000000,36469.429969,...,0.000000,0.000000,4075.337597,36976.142470,84638.824741,137715.517535,21990.202103,0.000000,5424.919450,0.000000
Median_TD_A,0.000000,17957.024347,0.0,4751.067571,20362.355095,0.000000,8994.724358,3376.015062,1890.751456,16555.995878,...,206.757920,0.000000,6777.757425,52582.147821,76914.311236,125125.685455,21501.528114,0.000000,6203.978866,1034.058900
Median_TD_HumCan,0.000000,18229.223306,0.0,3151.063695,21255.056773,0.000000,8223.630117,1142.035614,0.000000,9372.232074,...,0.000000,0.000000,6659.142757,53223.530032,78109.631754,111049.032609,23573.900424,0.000000,6514.290079,0.000000


### Export absolute quantitative data and metadata per sample

## Export absolute quantitative data per sample

In [82]:
# Tables to export, keyed by the suffix used in each output file name
dataframes_dict = {
    "ProteinData": df_protein_data,
    "ProteinIntensities": df_proteomics_final,
    "ProteinConcentrations": df_concentrations,
    "ProteinCopyNumbers": df_copy_numbers,
    "MCH": df_MCH_final,
    "Metadata": df_metadata,
}
# Write every table as a tab-separated file (index included) into the processed directory
for data_type, df in dataframes_dict.items():
    output_path = processed_data_dirpath / f"{dataset_name}_{data_type}.tsv"
    df.to_csv(output_path, sep="\t", index=True)
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinIntensities
Saved data for ProteinConcentrations
Saved data for ProteinCopyNumbers
Saved data for MCH
Saved data for Metadata
