# Prepare Proteomic Data - Intensities, G6PD variants
## Setup
### Import packages

In [1]:
import pandas as pd
from rbc_gem_utils import get_dirpath, show_versions
from rbc_gem_utils.util import AVOGADRO_NUMBER, ensure_iterable

# Show versions of notebook
show_versions()


Package Information
-------------------
rbc-gem-utils 0.0.3

Dependency Information
----------------------
beautifulsoup4                       4.13.4
bio                                   1.8.0
cobra                                0.29.1
depinfo                               2.2.0
gurobipy                             12.0.3
matplotlib                           3.10.3
matplotlib-venn                       1.1.2
memote                               0.17.0
networkx                                3.5
notebook                              7.4.4
openpyxl                              3.1.5
pandas                                2.3.1
pre-commit                            4.2.0
rbc-gem-utils[database,network,vis] missing
requests                             2.32.4
scikit-learn                          1.7.0
scipy                                1.16.0
seaborn                              0.13.2

Build Tools Information
-----------------------
pip          25.1
setuptools 78.1.1
wheel      0.45

## Set organism, dataset, and paths

In [2]:
organism = "Mouse"
dataset_name = "G6PDvariants"

# Raw inputs are read from <raw dir>/<organism>/<dataset>
raw_data_dirpath = get_dirpath(use_temp="raw").joinpath(organism, dataset_name)

# Processed outputs go to <processed dir>/<organism>/<dataset>; create it if missing
processed_data_dirpath = get_dirpath(use_temp="processed").joinpath(
    organism, dataset_name
)
processed_data_dirpath.mkdir(parents=True, exist_ok=True)

## Set data value type and variables for column keys

In [3]:
# Value type of the proteomic measurements; selects the input file "Protein<type>.csv"
protein_values_dtype = "Intensities"
# Name given to the sample-ID index that is built from donor and time point
sample_key = "SAMPLE ID"
# Column identifying the donor of a sample
donor_key = "MOUSE ID"
# Column identifying the time point of a sample
time_key = "TIME"

# Prefix placed in front of the time value when composing sample IDs (empty = no prefix)
time_abbrev = ""

## Load RBC Proteomics
### Load protein data

In [4]:
# Read the protein annotation table (taken directly from UniProt when possible).
# Selecting the expected columns by name both verifies that they are present
# (a missing one raises KeyError) and puts them in the listed order.
df_protein_data = (
    pd.read_csv(raw_data_dirpath / "ProteinData.csv", index_col=None)
    .loc[
        :,
        [
            "Entry",
            "Entry Name",
            "Protein",
            "Protein Names",
            "Gene Names (Primary)",
            "Length",
            "Mass",  # Should be in Da
        ],
    ]
    # Index by protein ID and sort alphabetically for consistency
    .set_index("Entry")
    .sort_index()
)
df_protein_data.head()

Unnamed: 0_level_0,Entry Name,Protein,Protein Names,Gene Names (Primary),Length,Mass
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2AAY5,SPD2B_MOUSE,SPD2B,SH3 and PX domain-containing protein 2B (Facto...,Sh3pxd2b,908,101517
A2ADY9,DDI2_MOUSE,DDI2,Protein DDI1 homolog 2 (EC 3.4.23.-),Ddi2,399,44591
A2AGT5,CKAP5_MOUSE,CKAP5,Cytoskeleton-associated protein 5,Ckap5,2032,225635
A2AN08,UBR4_MOUSE,UBR4,E3 ubiquitin-protein ligase UBR4 (EC 2.3.2.27)...,Ubr4,5180,572290
A2AQ07,TBB1_MOUSE,TBB1,Tubulin beta-1 chain,Tubb1,451,50441


#### Load proteomics and map to UniProt if necessary

In [5]:
# Read the per-sample protein values; the file name depends on the configured value type
df_proteomics = pd.read_csv(
    raw_data_dirpath / f"Protein{protein_values_dtype}.csv", index_col=None
)
original_ids_type = "uniprot"

# Index every row by a sample ID composed of its donor and (prefixed) time point
df_proteomics.index = pd.Index(
    df_proteomics[[donor_key, time_key]]
    .apply(lambda row: f"{row[donor_key]}_{time_abbrev}{row[time_key]}", axis=1)
    .values,
    name=sample_key,
)

# When the columns use another identifier type, translate them to UniProt IDs
if (
    original_ids_type != "uniprot"
    and df_proteomics.columns.isin(df_protein_data[original_ids_type]).any()
):
    mapping_dict = (
        df_protein_data.reset_index(drop=False)
        .set_index(original_ids_type)[df_protein_data.index.name]
        .to_dict()
    )
    df_proteomics = df_proteomics.rename(columns=mapping_dict)

# Sort samples and order columns as: donor, time, then proteins in annotation order
df_proteomics = df_proteomics.sort_index(axis=0)[
    [donor_key, time_key, *df_protein_data.index]
]
donor_ids = df_proteomics[donor_key].unique()
timepoints = df_proteomics[time_key].unique()
print(f"Number of donors: {len(donor_ids)}")
print(f"Number of timepoints: {len(timepoints)}")
print(f"Number of expected samples: {len(donor_ids) * len(timepoints)}")
print(f"Number of actual samples: {len(df_proteomics)}")
df_proteomics

Number of donors: 36
Number of timepoints: 3
Number of expected samples: 108
Number of actual samples: 107


Unnamed: 0_level_0,MOUSE ID,TIME,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,A10,Post,318.46050,161438.160,0.0,302582.25,37780.363,0.000,96832.305,0.00,...,0.0000,24017.834,90886.750,346674.88,830986.44,1003041.30,287848.50,0.000,65348.203,0.0
A10_Pre,A10,Pre,0.00000,51032.973,0.0,363544.70,97704.414,0.000,278689.800,43538.17,...,7095.1300,0.000,61173.242,367621.12,810137.06,1139476.00,431312.88,0.000,60206.770,0.0
A10_TD,A10,TD,0.00000,330797.120,0.0,945866.40,261775.860,15244.284,112416.836,43084.26,...,28166.4880,19541.887,152629.340,397354.60,786441.44,1208090.00,303334.97,13966.819,66419.260,54033.2
A11_Post,A11,Post,0.00000,121373.320,0.0,527708.70,139184.920,0.000,659869.700,46562.80,...,0.0000,0.000,117793.810,318111.47,871332.10,1193655.10,194997.90,0.000,65527.793,0.0
A11_Pre,A11,Pre,0.00000,50554.890,0.0,284532.72,0.000,0.000,0.000,0.00,...,0.0000,0.000,50005.133,377780.56,502533.56,966400.56,432990.10,0.000,38804.742,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MED8_Pre,MED8,Pre,0.00000,100151.140,0.0,322600.90,212596.700,0.000,1122667.900,74924.14,...,6502.6094,0.000,70870.730,282731.72,766091.60,1073668.90,250482.58,40331.793,54894.980,0.0
MED8_TD,MED8,TD,0.00000,132591.530,0.0,357020.56,1709988.000,0.000,92423.266,90750.09,...,0.0000,0.000,73874.960,272972.72,541198.20,771852.00,338136.06,47113.010,46276.266,0.0
MED9_Post,MED9,Post,0.00000,224659.750,0.0,420088.47,381507.940,0.000,20209.568,56303.03,...,0.0000,14465.219,59123.812,299184.10,546876.44,978658.30,209130.08,0.000,34989.305,0.0
MED9_Pre,MED9,Pre,0.00000,120813.750,0.0,289419.80,82233.790,0.000,222721.160,238749.81,...,7378.2730,0.000,76976.190,439084.10,720954.00,1209502.60,332436.47,0.000,75257.670,0.0


### Load metadata corresponding to samples (optional)
#### Genotype data

In [6]:
# Genotype data is optional: if the file does not exist, continue with an
# empty DataFrame instead of failing. Rows are indexed by donor.
try:
    df_genotypes = pd.read_csv(
        raw_data_dirpath / "Genotypes.csv",
        index_col=[donor_key],
    )
except FileNotFoundError:
    df_genotypes = pd.DataFrame([])
df_genotypes

#### Phenotype data

In [7]:
# Phenotype data is optional: if the file does not exist, continue with an
# empty DataFrame instead of failing. Rows are indexed by donor.
try:
    df_phenotypes = pd.read_csv(
        raw_data_dirpath / "Phenotypes.csv",
        index_col=[donor_key],
    )
except FileNotFoundError:
    df_phenotypes = pd.DataFrame([])
df_phenotypes

Unnamed: 0_level_0,G6PD_PHENOTYPE
MOUSE ID,Unnamed: 1_level_1
A1,A
A10,A
A11,A
A12,A
A2,A
A3,A
A4,A
A5,A
A6,A
A7,A


#### Combine into one DataFrame for MetaData

In [8]:
# Report how many donors each data source covers (an empty table counts as 0)
print(f"Proteomics: {df_proteomics[donor_key].nunique()} donors")
print(
    f"  Genomics: {df_genotypes.index.nunique() if not df_genotypes.empty else 0} donors"
)
print(
    f"Phenotypes: {df_phenotypes.index.nunique() if not df_phenotypes.empty else 0} donors"
)

# Place genotype and phenotype columns side by side, aligned on the donor index
df_metadata = pd.concat((df_genotypes, df_phenotypes), axis=1)


if not df_metadata.empty:
    df_metadata = df_metadata.reset_index(drop=False)
    # Ensure only metadata corresponds to the available omics data
    df_metadata = df_metadata[df_metadata[donor_key].isin(df_proteomics[donor_key])]

    # If time was not included in metadata, repeat every donor row once per
    # time point so that the rows can be matched to samples.
    # NOTE: the check must look at the columns; after reset_index the index
    # only holds integer positions and can never contain the time key.
    if time_key and time_key not in df_metadata.columns:
        df_metadata = (
            pd.concat(
                (
                    df_metadata,
                    pd.Series(
                        [list(df_proteomics[time_key].unique())]
                        * len(df_metadata.index),
                        index=df_metadata.index,
                        name=time_key,
                    ),
                ),
                axis=1,
            )
            .explode(time_key)
            .reset_index(drop=True)
        )
    # Create sample IDs from donor and time points, then set as index
    df_metadata.index = pd.Index(
        df_metadata[[donor_key, time_key]]
        .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
        .values,
        name=sample_key,
    )
    print(f"\nFinal data: {df_metadata[donor_key].nunique()} donors")
    # Donor and time are now encoded in the sample ID index
    df_metadata = df_metadata.drop([donor_key, time_key], axis=1)
else:
    print("\nFinal Meta: 0 donors")

df_metadata.head()

Proteomics: 36 donors
  Genomics: 0 donors
Phenotypes: 36 donors

Final data: 36 donors


Unnamed: 0_level_0,G6PD_PHENOTYPE
SAMPLE ID,Unnamed: 1_level_1
A1_Post,A
A1_Pre,A
A1_TD,A
A10_Post,A
A10_Pre,A


### Get MCH per sample

In [9]:
# Provide in picograms. Set as None to use metadata if provided
mch_sample_value = 13.9
if mch_sample_value is None:
    try:
        df_MCH_per_sample = pd.read_csv(
            raw_data_dirpath / "Phenotypes.csv",
            index_col=None,
        )
    except FileNotFoundError as err:
        # Keep the original error as the cause so the missing path stays visible
        raise ValueError(
            "Cannot determine MCH. No phenotype data provided and a default value is not provided"
        ) from err

    # Ensure only metadata corresponds to the available omics data
    df_MCH_per_sample = df_MCH_per_sample[
        df_MCH_per_sample[donor_key].isin(df_proteomics[donor_key])
    ]

    # Derive MCH from other blood count columns when it is not given directly
    if "CBC.MCH" not in df_MCH_per_sample.columns:
        if all(col in df_MCH_per_sample.columns for col in ("CBC.HGB", "CBC.RBC")):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.HGB"] / df_MCH_per_sample["CBC.RBC"]
            ) * 10
        elif all(col in df_MCH_per_sample.columns for col in ("CBC.MCHC", "CBC.MCV")):
            df_MCH_per_sample["CBC.MCH"] = (
                df_MCH_per_sample["CBC.MCHC"] * df_MCH_per_sample["CBC.MCV"]
            ) / 100
        else:
            raise ValueError(
                "Cannot determine MCH, one of the following combinations is needed: (CBC.HGB and CBC.RBC) or (CBC.MCHC and CBC.MCV)"
            )
    df_MCH_per_sample = df_MCH_per_sample.set_index(donor_key)["CBC.MCH"]
    n_missing = int(df_MCH_per_sample.isna().sum())
    print(f"Missing values for {n_missing} samples.")
    print(f"Mean MCH in pg: {df_MCH_per_sample.mean():.2f}")
    # Missing values are replaced by the mean of the available values
    df_MCH_per_sample = df_MCH_per_sample.fillna(df_MCH_per_sample.mean())
else:
    print("Using default MCH value provided for all samples")
    # One entry per donor, all set to the default value
    df_MCH_per_sample = pd.Series(
        [mch_sample_value] * df_proteomics[donor_key].nunique(),
        index=pd.Index(df_proteomics[donor_key].unique(), name=donor_key),
        name="CBC.MCH",
    )
    print(f"Mean MCH in pg: {mch_sample_value:.2f}")

df_MCH_per_sample = df_MCH_per_sample.reset_index(drop=False)
# If time is not already a column, repeat every donor row once per time point
# so that the rows can be matched to samples.
# NOTE: the check must look at the columns; after reset_index the index only
# holds integer positions and can never contain the time key.
if time_key and time_key not in df_MCH_per_sample.columns:
    df_MCH_per_sample = (
        pd.concat(
            (
                df_MCH_per_sample,
                pd.Series(
                    [list(df_proteomics[time_key].unique())]
                    * len(df_MCH_per_sample.index),
                    index=df_MCH_per_sample.index,
                    name=time_key,
                ),
            ),
            axis=1,
        )
        .explode(time_key)
        .reset_index(drop=True)
    )
# Create sample IDs from donor and time points, then set as index
df_MCH_per_sample.index = pd.Index(
    df_MCH_per_sample[[donor_key, time_key]]
    .apply(lambda x: f"{x[donor_key]}_{time_abbrev}{x[time_key]}", axis=1)
    .values,
    name=sample_key,
)
# Keep only the samples that were measured, in the same order as the proteomics
df_MCH_per_sample = df_MCH_per_sample.loc[df_proteomics.index]
sample_ids = list(df_proteomics.index)

df_MCH_per_sample.head()

Using default MCH value provided for all samples
Mean MCH in pg: 13.90


Unnamed: 0_level_0,MOUSE ID,CBC.MCH,TIME
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A10_Post,A10,13.9,Post
A10_Pre,A10,13.9,Pre
A10_TD,A10,13.9,TD
A11_Post,A11,13.9,Post
A11_Pre,A11,13.9,Pre


### Get data subsets using operations

In [10]:
# Aggregations to compute for each group of samples; each name is looked up
# as a method on the grouped data
operations = ["mean", "median"]
# Accumulators for the aggregated proteomics and MCH frames
operation_dfs_proteomics, operation_dfs_MCH = [], []
fill_keys = set()


def group_data(df, operation, keys, columns, prefix_values=None, name_col=None):
    """Aggregate ``columns`` of ``df`` per group and label each aggregated row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the grouping ``keys`` and the value ``columns``.
    operation : str
        Name of the groupby aggregation method to call (e.g. ``"mean"``). It is
        lower-cased for the method lookup and capitalized in the labels.
    keys
        Column name(s) to group by; normalized with ``ensure_iterable``.
    columns : list
        Value columns to aggregate.
    prefix_values : dict or sequence, optional
        Text placed in front of each key's value in the label, either keyed by
        column name or given positionally in the order of ``keys``. Keys
        without an entry get an empty prefix.
    name_col : hashable, optional
        Label of the column that receives the generated row labels. If left as
        ``None`` the column is literally labelled ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per group holding the key columns, the aggregated ``columns``
        and the label column with values ``"<Operation>_<prefix><value>_..."``.
    """
    # Materialize as a list so len(), zip() and frame[keys] behave for any iterable
    keys = list(ensure_iterable(keys))
    if isinstance(prefix_values, dict):
        prefixes = {key: prefix_values.get(key, "") for key in keys}
    else:
        # Start from empty prefixes so that a short (or absent) sequence cannot
        # leave a key without an entry and fail later during label creation
        prefixes = dict.fromkeys(keys, "")
        prefixes.update(zip(keys, prefix_values or ()))

    grouped = df.groupby(keys, as_index=False, observed=False)[columns]
    aggregated = getattr(grouped, operation.lower())()
    labels = aggregated[keys].apply(
        lambda row: "_".join([f"{prefixes[key]}{row[key]}" for key in keys]),
        axis=1,
    )
    aggregated[name_col] = [f"{operation.capitalize()}_{label}" for label in labels]
    return aggregated

#### Group by time and phenotype

In [11]:
keys = [time_key, "G6PD_PHENOTYPE"]
# Optional label prefixes per grouping key (empty = no prefixes). This value is
# passed to group_data below, so editing it takes effect.
prefix_values = {}

# Attach the metadata once per table rather than once per operation
df_proteomics_with_metadata = pd.merge(
    df_proteomics, df_metadata, left_index=True, right_index=True, how="left"
).reset_index(drop=False)
df_MCH_with_metadata = pd.merge(
    df_MCH_per_sample, df_metadata, left_index=True, right_index=True, how="left"
).reset_index(drop=False)

# One aggregated proteomics frame per operation
operation_dfs_proteomics += [
    group_data(
        df_proteomics_with_metadata,
        operation,
        keys=keys,
        columns=list(df_protein_data.index),
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

# One aggregated MCH frame per operation, labelled the same way
operation_dfs_MCH += [
    group_data(
        df_MCH_with_metadata,
        operation,
        keys=keys,
        columns=["CBC.MCH"],
        prefix_values=prefix_values,
        name_col=sample_key,
    )
    for operation in operations
]

### Add to DataFrames

In [12]:
# Stack the aggregated rows below the per-sample rows. If combining the
# aggregated frames fails with KeyError/ValueError (presumably because none
# were collected), only the per-sample rows are kept.
try:
    df_proteomics_op = pd.concat(operation_dfs_proteomics, axis=0).drop_duplicates()
except (KeyError, ValueError):
    df_proteomics_final = df_proteomics[df_protein_data.index].copy()
else:
    df_proteomics_final = pd.concat(
        [df_proteomics.reset_index(drop=False), df_proteomics_op], axis=0
    ).set_index(sample_key)[df_protein_data.index]

# Same treatment for the MCH values, reduced to a single Series
try:
    df_MCH_op = pd.concat(operation_dfs_MCH, axis=0).drop_duplicates()
except (KeyError, ValueError):
    df_MCH_final = df_MCH_per_sample["CBC.MCH"].copy()
else:
    df_MCH_final = pd.concat(
        [df_MCH_per_sample.reset_index(drop=False), df_MCH_op], axis=0
    ).set_index(sample_key)["CBC.MCH"]

df_MCH_final = df_MCH_final.rename("MCH")
df_proteomics_final

Unnamed: 0_level_0,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,B2RQC6,C0HKE1,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,318.4605,161438.1600,0.0,302582.25,37780.363,0.000,96832.305,0.0000,0.0000,153414.3600,...,0.000,24017.834,90886.750,346674.88,830986.440,1003041.30,287848.500,0.000,65348.2030,0.000
A10_Pre,0.0000,51032.9730,0.0,363544.70,97704.414,0.000,278689.800,43538.1700,0.0000,87148.8750,...,7095.130,0.000,61173.242,367621.12,810137.060,1139476.00,431312.880,0.000,60206.7700,0.000
A10_TD,0.0000,330797.1200,0.0,945866.40,261775.860,15244.284,112416.836,43084.2600,138335.2500,7266.3867,...,28166.488,19541.887,152629.340,397354.60,786441.440,1208090.00,303334.970,13966.819,66419.2600,54033.200
A11_Post,0.0000,121373.3200,0.0,527708.70,139184.920,0.000,659869.700,46562.8000,0.0000,214803.8600,...,0.000,0.000,117793.810,318111.47,871332.100,1193655.10,194997.900,0.000,65527.7930,0.000
A11_Pre,0.0000,50554.8900,0.0,284532.72,0.000,0.000,0.000,0.0000,0.0000,14401.9330,...,0.000,0.000,50005.133,377780.56,502533.560,966400.56,432990.100,0.000,38804.7420,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Median_Pre_HumCan,0.0000,105603.7425,0.0,461056.09,151337.525,0.000,381460.135,35275.6265,0.0000,242985.1100,...,0.000,0.000,56020.795,350434.03,612192.650,1086103.40,340375.530,0.000,41955.6035,0.000
Median_Pre_MED,288.2130,117896.5800,0.0,410021.10,149833.515,0.000,881461.100,39635.0265,0.0000,155779.9000,...,0.000,0.000,61034.130,309039.01,712458.250,1099143.40,346932.185,0.000,49261.7630,0.000
Median_TD_A,0.0000,242014.0050,0.0,821800.27,310434.950,0.000,284526.915,138880.2730,139003.1950,70731.0390,...,6685.530,0.000,101523.708,439543.95,647543.925,998826.90,339279.000,0.000,56345.5015,31782.089
Median_TD_HumCan,0.0000,225756.8700,0.0,500839.92,297763.565,0.000,239037.340,43170.0465,0.0000,36792.9360,...,0.000,0.000,91657.173,408822.08,604273.225,814564.02,341810.800,0.000,54365.4160,0.000


### Normalize data by hemoglobin mass
#### Set percent for hemoglobin and low abundance proteomes

In [13]:
# Fractions of the modeled proteome assigned to hemoglobin and to the
# low abundance proteome
HB_PERCENT = 0.95
LA_PERCENT = 0.05

#### Scale data

In [14]:
MODELED_PERCENT = HB_PERCENT + LA_PERCENT
assert MODELED_PERCENT <= 1
# Identify hemoglobin proteins: UniProt IDs keyed by gene name
# (hyphens in gene names are replaced by underscores)
HB_PROTEINS = {
    gene_name.replace("-", "_"): uniprot_id
    for gene_name, uniprot_id in {
        "Hba": "P01942",  # Hemoglobin subunit alpha
        "Hba-a1": "P01942",
        "Hbb-b1": "P02088",  # Hemoglobin subunit beta-1
        "Hbb-b2": "P02089",  # Hemoglobin subunit beta-2
        "Hbb-bh0": "P04443",  # Hemoglobin subunit beta-H0
        "Hbb-bh1": "P04444",  # Hemoglobin subunit beta-H1
        "Hbz": "P06467",  # Hemoglobin subunit zeta
        "Hba-x": "P06467",
        "Hbz1": "P06467",
        "Hbb-y": "P02104",  # Hemoglobin subunit epsilon-Y2
    }.items()
}
# Protein intensity / Total intensity --> Percent protein abundance / total protein
df_percent_abundance = df_proteomics_final.apply(lambda row: row / row.sum(), axis=1)
# Split the columns into hemoglobin proteins and everything else
df_percent_hb = df_percent_abundance.loc[
    :, df_percent_abundance.columns.isin(list(HB_PROTEINS.values()))
]
df_percent_la = df_percent_abundance.loc[
    :, ~df_percent_abundance.columns.isin(list(HB_PROTEINS.values()))
]

# Rescale each part so that, per sample, hemoglobin sums to HB_PERCENT and the
# low abundance proteome sums to LA_PERCENT
df_percent_hb_normalized = df_percent_hb.div(df_percent_hb.sum(axis=1), axis=0).mul(
    HB_PERCENT
)
df_percent_la_normalized = df_percent_la.div(df_percent_la.sum(axis=1), axis=0).mul(
    LA_PERCENT
)

# Combine dataframes back into one
df_percent_abundance_normalized = pd.concat(
    [df_percent_hb_normalized, df_percent_la_normalized], axis=1
)

# Summary table: mean fractions over the measured samples before scaling, and
# the target fractions after scaling, formatted as percentages
df_summary = pd.DataFrame.from_dict(
    {
        label.rjust(30): [f"{fraction * 100:.1f}%"]
        for label, fraction in {
            "Perfect total": 1.0,
            "Current total": df_percent_abundance.loc[sample_ids]
            .sum(axis=1)
            .mean()
            .item(),
            "Hemoglobin total": df_percent_hb.loc[sample_ids].sum(axis=1).mean().item(),
            "Low abundance total": df_percent_la.loc[sample_ids]
            .sum(axis=1)
            .mean()
            .item(),
            "Hemoglobin scaled": HB_PERCENT,
            "Low abundance scaled": LA_PERCENT,
            "Remaining scaled": 1 - (HB_PERCENT + LA_PERCENT),
        }.items()
    },
    orient="index",
    columns=["Percentage"],
)
print(df_summary)
df_percent_abundance_normalized.sum(axis=1)

                     Percentage
       Perfect total     100.0%
       Current total     100.0%
    Hemoglobin total      62.8%
 Low abundance total      37.2%
   Hemoglobin scaled      95.0%
Low abundance scaled       5.0%
    Remaining scaled       0.0%


SAMPLE ID
A10_Post             1.0
A10_Pre              1.0
A10_TD               1.0
A11_Post             1.0
A11_Pre              1.0
                    ... 
Median_Pre_HumCan    1.0
Median_Pre_MED       1.0
Median_TD_A          1.0
Median_TD_HumCan     1.0
Median_TD_MED        1.0
Length: 125, dtype: float64

### Transform data to copy numbers and expected format

In [15]:
# Molecular weight per UniProt entry (the "Mass" column, expected to be in Da)
df_uniprot_to_mw = df_protein_data["Mass"].astype(float)

# MCH (pg hemoglobin) --> g total protein:
#   x (1 / HB_PERCENT): pgDW total protein / pgDW HB
#   x (1 / 1e12):        gDW total protein / pgDW total protein
gDW_total_protein = df_MCH_final * (1 / HB_PERCENT) * (1 / 1e12)

# Percent protein abundance / total protein --> mol protein / gDW total protein
df_mol_per_gDW = df_percent_abundance_normalized.div(df_uniprot_to_mw, axis=1)
# Convert from mol / gDW protein --> nmol / gDW protein
df_nmol_per_gDW = df_mol_per_gDW.mul(1e9)

# Convert from mol / gDW protein --> copy numbers / cell
df_copy_numbers = df_mol_per_gDW.mul(gDW_total_protein, axis=0).mul(AVOGADRO_NUMBER)
df_copy_numbers

Unnamed: 0_level_0,A2AAY5,A2ADY9,A2AGT5,A2AN08,A2AQ07,A2AVZ9,A6X935,B2RPV6,B2RQC6,C0HKE1,...,Q9Z1Z0,Q9Z2K1,Q9Z2L7,Q9Z2M7,Q9Z2U0,Q9Z2U1,Q9Z2W0,Q9Z2X1,Q9Z2Y8,V9GXG1
SAMPLE ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10_Post,1.532839,1769.044429,0.0,258.349154,365.984078,0.000000,452.088482,0.000000,0.000000,5303.345886,...,0.000000,227.412008,896.102713,6124.872567,14577.079667,18557.254473,2694.107903,0.00000,1062.633196,0.000000
A10_Pre,0.000000,586.771902,0.0,325.692166,993.107240,0.000000,1365.243586,164.005553,0.000000,3161.051883,...,34.002530,0.000000,632.855581,6814.924296,14911.488174,22120.044116,4235.740458,0.00000,1027.261369,0.000000
A10_TD,0.000000,3035.850199,0.0,676.362423,2123.790809,111.332314,439.562472,129.540986,232.737983,210.372285,...,107.741684,154.964468,1260.321432,5879.481191,11553.912040,18718.897846,2377.713771,124.98622,904.544295,217.444422
A11_Post,0.000000,1141.377034,0.0,386.661745,1157.075256,0.000000,2643.837242,143.454748,0.000000,6372.346437,...,0.000000,0.000000,996.673806,4823.112242,13116.974099,18951.654166,1566.225558,0.00000,914.426129,0.000000
A11_Pre,0.000000,655.596695,0.0,287.499343,0.000000,0.000000,0.000000,0.000000,0.000000,589.176981,...,0.000000,0.000000,583.462367,7898.694813,10432.363352,21158.901919,4795.898979,0.00000,746.750590,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Median_Pre_HumCan,0.000000,1167.995961,0.0,397.325876,1479.694919,0.000000,1797.553482,127.822324,0.000000,8477.993742,...,0.000000,0.000000,557.488581,6248.998880,10839.124177,20281.289833,3215.428347,0.00000,688.603447,0.000000
Median_Pre_MED,1.319907,1229.199945,0.0,333.087693,1381.000440,0.000000,3915.571037,135.384959,0.000000,5123.705330,...,0.000000,0.000000,572.557042,5194.894968,11891.175645,19348.087748,3089.472904,0.00000,762.164057,0.000000
Median_TD_A,0.000000,2484.554987,0.0,657.363291,2817.359375,0.000000,1244.520630,467.109411,261.606593,2290.707042,...,28.607269,0.000000,937.778481,7275.327753,10641.954472,17312.536854,2974.976693,0.00000,858.389805,143.073604
Median_TD_HumCan,0.000000,2513.424228,0.0,434.465018,2930.622647,0.000000,1133.864610,157.462550,0.000000,1292.232520,...,0.000000,0.000000,918.154902,7338.398770,10769.665695,15311.312192,3250.342128,0.00000,898.182783,0.000000


## Export absolute quantitative data and metadata per sample

In [16]:
# Tables to export, keyed by the base name of the output file
dataframes_dict = dict(
    ProteinData=df_protein_data,
    ProteinIntensities=df_proteomics_final,
    ProteinConcentrations=df_nmol_per_gDW,
    ProteinCopyNumbers=df_copy_numbers,
    MCH=df_MCH_final,
    Metadata=df_metadata,
)
# Write every table as "<name>.csv" in the processed data directory, keeping the index
for data_type, df in dataframes_dict.items():
    df.to_csv(processed_data_dirpath / f"{data_type}.csv", index=True)
    print(f"Saved data for {data_type}")

Saved data for ProteinData
Saved data for ProteinIntensities
Saved data for ProteinConcentrations
Saved data for ProteinCopyNumbers
Saved data for MCH
Saved data for Metadata
