# In [1]:
import pickle
import pandas as pd

# Load the pickled train/test metadata. The rest of this script iterates each
# object as a dict of dicts of DataFrames.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted
# source.
# Context managers ensure the file handles are closed once loading finishes.
with open("../data/train/meta.pkl", "rb") as train_file:
    train_data = pickle.load(train_file)
with open("../data/test/meta.pkl", "rb") as test_file:
    test_data = pickle.load(test_file)

# Columns we will be measuring
score_columns = ["MSGFScore", "DeNovoScore", "SpecEValue", "EValue"]

# Build 'true_<score_column>' columns so that false rows can compare their
# scores to their associated true row for similarity testing.
# Each dataset is walked as {outer_key: {inner_key: DataFrame}} and the
# dataframes are modified in place.
for dataset in [train_data, test_data]:
    for inner_dict in dataset.values():
        for df in inner_dict.values():
            # Rows labelled as true (Label == 1); only the first one is used.
            true_rows = df[df["Label"] == 1]
            # .iloc[0] raises IndexError on an empty selection, so check for
            # emptiness first instead of comparing the result to None.
            true_row = None if true_rows.empty else true_rows.iloc[0]
            for score_column in score_columns:
                if true_row is not None:
                    # Scalar assignment broadcasts the true row's score to
                    # every row of this dataframe.
                    df["true_" + score_column] = true_row[score_column]
                else:
                    # No true row in this dataframe: leave the column empty.
                    df["true_" + score_column] = None

# Collapse each nested {outer_key: {inner_key: DataFrame}} dataset into a
# single dataframe. Every per-key dataframe is copied (assign returns a new
# frame, leaving the original untouched) and tagged with the two keys it was
# stored under before all pieces are stacked with a fresh 0..n-1 index.
full_dfs = [
    pd.concat(
        [
            frame.assign(outer_key=outer, inner_key=inner)
            for outer, by_inner in nested.items()
            for inner, frame in by_inner.items()
        ],
        ignore_index=True,
    )
    for nested in (train_data, test_data)
]

# First entry comes from the train data, second from the test data.
train_full_df, test_full_df = full_dfs

# Clean (might not be needed anymore) and create length column
for dataset in [train_full_df, test_full_df]:
    # Remove stray whitespace from column names and from the Peptide strings.
    dataset.columns = dataset.columns.str.strip()
    dataset["Peptide"] = dataset["Peptide"].str.strip()
    # Length of the stripped Peptide string, counting any '+', digit and '.'
    # characters it contains.
    dataset["peptide_length"] = dataset["Peptide"].str.len()

    # Build peptide_unmodified: the Peptide string with every '+', digit and
    # '.' removed. A single explicit-regex character class is used so the
    # result does not depend on the pandas-version-specific default of
    # `regex` ('+' and '.' are regex metacharacters outside a character class).
    stripped = dataset["Peptide"].str.replace(r"[+0-9.]", "", regex=True)
    # Re-insert a '.' after the first and before the last remaining character.
    dataset["peptide_unmodified"] = (
        stripped.str[0] + "." + stripped.str[1:-1] + "." + stripped.str[-1]
    )

# Write each flattened dataframe next to its source data as CSV, without the
# row index.
for frame, csv_path in (
    (train_full_df, "../data/train/meta_fulldf.csv"),
    (test_full_df, "../data/test/meta_fulldf.csv"),
):
    frame.to_csv(csv_path, index=False)
