### Imports 

In [162]:
import os 
import re 
import numpy as np 
import pandas as pd 
from collections import Counter

### Data prep

#### File import

In [163]:
def combine_files_into_df(directory_path="../data/", file_types=['.parquet', '.tsv', '.csv']):
    """
    Combines all files in a directory into one DataFrame.

    Files are read in sorted file-name order so that the row order of the
    combined DataFrame is reproducible across machines and runs.
    @param directory_path: str, path to the directory containing the files
    @param file_types: list, list of file extensions (including the dot) to be imported
    @return: df: DataFrame, the row-wise concatenation of all files read
             (an empty DataFrame if no file matched)
    """
    # Map file extensions to their respective pandas read functions and parameters
    read_funcs = {
        '.parquet': (pd.read_parquet, {'engine': 'fastparquet'}),
        '.tsv': (pd.read_csv, {'sep': '\t'}),
        '.csv': (pd.read_csv, {})
    }

    message_prefix = "Step 1/? complete."

    dfs = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # concatenation order (and therefore the resulting row order) deterministic.
    for file in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file)
        file_extension = os.path.splitext(file)[1]

        # Only files whose extension was requested are considered at all
        if file_extension not in file_types:
            continue

        read_func, params = read_funcs.get(file_extension, (None, None))
        if read_func is None:
            # Requested by the caller, but no reader is registered for it
            print(f"Skipping unsupported file type: {file_extension}")
            continue

        dfs.append(read_func(file_path, **params))

    # Combine all DataFrames in the list into a single DataFrame
    if dfs:
        df = pd.concat(dfs, ignore_index=True)
        print(f"{message_prefix} Combined {len(dfs)} files into one DataFrame.")
    else:
        df = pd.DataFrame()
        print("No files combined.")

    return df

In [164]:
def filter_dataframe_columns(df, columns_to_keep=['modified_sequence', 'precursor_charge', 'precursor_intensity']):
    """
    Restricts a DataFrame to a chosen subset of its columns.
    If at least one requested column is absent, the input DataFrame is returned unchanged.
    @param df: DataFrame, the DataFrame to filter
    @param columns_to_keep: list, the names of the columns to keep
    @return: df_filtered: DataFrame, a copy holding only the requested columns, or the
             original DataFrame object when a requested column is missing
    """
    any_missing = any(name not in df for name in columns_to_keep)
    if any_missing:
        df_filtered = df
    else:
        df_filtered = df[columns_to_keep].copy()

    removed_count = len(df.columns) - len(df_filtered.columns)
    print(f"Step 2/? complete. Removed {removed_count} columns from the DataFrame.")
    return df_filtered

In [165]:
def drop_na(df, column="precursor_intensity"):
    """
    Removes every row whose value in the given column is NaN.
    Default: rows with a missing precursor_intensity are removed.
    @param df: DataFrame
    @param column: str, name of the column that must not be NaN
    @return: df: DataFrame containing only the rows with a value in that column
    """
    has_value = df[column].notna()
    remaining = df[has_value]
    print("Step 3/? complete. Dropped rows with NaN for intensities.")
    return remaining

In [166]:
def keep_desired_charges(df, charge_list=[1, 2, 3, 4, 5, 6], min_count=None):
    """
    Restricts the DataFrame to rows whose precursor charge is in an allowed set.
    When min_count is given, charges from charge_list that occur fewer than
    min_count times in the "precursor_charge" column are dropped from the allowed set first.
    Default: keep charge states 1-6 with no minimum count filtering.

    @param df: DataFrame with a "precursor_charge" column
    @param charge_list: list of charge states to be kept
    @param min_count: minimum count of charge states to be retained (None disables the check)
    @return: df_filtered: DataFrame holding only rows with an allowed charge state
    """
    step_label = "Step 4/? complete."
    notes = []  # pieces of the summary message, joined before printing

    if min_count is None:
        notes.append(f"Focused on keeping charge states within {charge_list}. ")
    else:
        occurrences = df["precursor_charge"].value_counts()
        frequent_charges = []
        for charge in charge_list:
            if occurrences.get(charge, 0) >= min_count:
                frequent_charges.append(charge)

        dropped_charges = set(charge_list).difference(frequent_charges)
        if dropped_charges:
            notes.append(f"Removed charge states with less than {min_count} occurrences in the dataset. ")
            notes.append(f"Charges removed: {sorted(dropped_charges)}. ")
        charge_list = frequent_charges

    # Row filter based on the (possibly reduced) list of allowed charges
    allowed_rows = df["precursor_charge"].isin(charge_list)
    df_filtered = df[allowed_rows]

    # One summary line describing what was done
    summary = "".join(notes)
    print(f"{step_label} {summary}Resulting in {len(df_filtered)} entries.")

    return df_filtered

In [167]:
def aggregate_sequences(df):
    """
    Collapses the DataFrame to one row per unique "modified_sequence".
    The charge and intensity values of each sequence are gathered into lists.
    @param df: DataFrame with "modified_sequence", "precursor_charge" and "precursor_intensity" columns
    @return: df: DataFrame with one row per sequence and list-valued charge/intensity columns
    """
    value_columns = ["precursor_charge", "precursor_intensity"]
    per_sequence = df.groupby("modified_sequence")[value_columns]
    collected = per_sequence.agg(list)
    df = collected.reset_index()
    print("Step 5/? complete. Aggregated all sequences to unique sequences.")
    return df

In [168]:
def remove_rare_sequence_lengths(df, representation_threshold=100):
    """
    Remove sequences of specific length represented less than a certain number of times.

    The length of a sequence is the character length of its "modified_sequence" string.

    @param df: DataFrame containing a "modified_sequence" column
    @param representation_threshold: int, minimum number of sequences that must share a length
           for that length to be kept
    @return: tuple of (DataFrame, int), where DataFrame contains only sequences whose length is
             represented at least representation_threshold times, and int is the length of the
             longest sequence that remains after filtering (0 if nothing remains)
    """
    before_len = len(df)
    # Character length of every sequence string
    sequence_lengths = df["modified_sequence"].str.len()
    # Sequence lengths that meet the representation threshold
    length_counts = sequence_lengths.value_counts()
    valid_lengths = length_counts[length_counts >= representation_threshold].index

    # Filter the DataFrame based on valid sequence lengths
    keep_mask = sequence_lengths.isin(valid_lengths)
    df_filtered = df[keep_mask].copy()

    # The padding length must describe the data that is actually returned:
    # taking the maximum over the unfiltered lengths would let a rare, very
    # long sequence that was just removed still dictate the padding length.
    kept_lengths = sequence_lengths[keep_mask]
    padding_length = int(kept_lengths.max()) if len(kept_lengths) else 0

    after_len = len(df_filtered)
    print(
        f"Step 6/? complete. Removed {before_len - after_len} of {before_len} sequences because their sequence length "
        f"is represented less than {representation_threshold} times."
    )
    return df_filtered, padding_length

In [169]:
def complete_vocabulary(df):
    """
    Builds the token vocabulary: the base alphabet followed by every distinct
    UNIMOD-annotated residue found in the "modified_sequence" column
    (in order of first appearance).
    (The length of the vocabulary +1 is used later for the embedding layer)
    @param df: DataFrame with a "modified_sequence" column of strings
    @return: vocabulary: list, list of all amino acids and modifications
    @return: vocab_len: int, length of the vocabulary
    """
    base_alphabet = list("XACDEFGHIKLMNPQRSTVWY")
    all_sequences = " ".join(df["modified_sequence"])
    # One word character directly followed by its bracketed UNIMOD id
    modified_residues = re.findall(r"(\w\[UNIMOD:\d+])", all_sequences)
    # dict.fromkeys drops repeats while keeping first-seen order
    vocabulary = list(dict.fromkeys(base_alphabet + modified_residues))
    vocab_len = len(vocabulary)
    print(f"Step 7/? complete. Completed vocabulary with {vocab_len} entries.")
    return vocabulary, vocab_len

In [170]:
def encode_charge_states(df, charge_states=None):
    """
    Encode all occurring charge states per unique sequence in a binary vector.

    Adds a "charge_state_vector" column to df (df is modified in place and also returned).
    The vector has one slot for every integer charge from charge_states[0] up to and
    including charge_states[-1]; a slot is 1 if that charge occurs in the row's
    "precursor_charge" list and 0 otherwise.
    @param df: DataFrame containing a "precursor_charge" column of charge lists
    @param charge_states: list of charge states whose first and last entry delimit the encoded
           range; defaults to [1, 2, 3, 4, 5, 6] when None
    @return: df: DataFrame with the additional "charge_state_vector" column
    """
    # The signature advertises None as default, so it has to be usable:
    # fall back to the same 1-6 range the other encoders in this file use.
    if charge_states is None:
        charge_states = [1, 2, 3, 4, 5, 6]

    encoded_range = range(charge_states[0], charge_states[-1] + 1)

    def to_binary_vector(observed_charges):
        # A set gives constant-time membership tests for long charge lists
        present = set(observed_charges)
        return [1 if charge in present else 0 for charge in encoded_range]

    df["charge_state_vector"] = df["precursor_charge"].apply(to_binary_vector)
    print(
        f"Step ?/? complete. Encoded all occuring charge states per unique sequence in a binary vector."
    )
    return df

In [171]:
def select_most_abundant_by_count(df, charge_list=None):
    """
    Determines, per row, the charge state that occurs most often and one-hot encodes it.
    df is modified in place and also returned.

    @param df: DataFrame with a 'precursor_charge' column containing a list of charge states.
    @param charge_list: list of charge states defining the one-hot positions
           (defaults to [1, 2, 3, 4, 5, 6] when None)
    @return: DataFrame with an added 'most_abundant_charge_by_count' and
             'most_abundant_charge_by_count_vector' column.
    """
    one_hot_positions = [1, 2, 3, 4, 5, 6] if charge_list is None else charge_list

    def dominant_charge(observed):
        # most_common(1) yields a single (charge, count) pair
        ranked = Counter(observed).most_common(1)
        return ranked[0][0]

    def as_one_hot(winner):
        return [1 if winner == candidate else 0 for candidate in one_hot_positions]

    # Most frequent charge per row, then its one-hot representation
    df['most_abundant_charge_by_count'] = df['precursor_charge'].apply(dominant_charge)
    df["most_abundant_charge_by_count_vector"] = df["most_abundant_charge_by_count"].apply(as_one_hot)

    print(
        "Step ?/? complete. Selected most abundant charge state by count and generated one-hot encoding"
    )

    return df

In [172]:
def get_topK_charge_states_by_count(df, k=2):
    """
    Adds, per row, the k charge states that occur most often in 'precursor_charge'.
    Rows with fewer than k distinct charges get a shorter list.
    Default: k=2
    @param df: DataFrame with a 'precursor_charge' column containing a list of charge states.
    @param k: int, number of top charge states to be selected
    @return: DataFrame with an added 'top_{k}_charge_states_by_count' column
             (df is modified in place and also returned).
    """
    target_column = f"top_{k}_charge_states_by_count"

    # Counter.most_common(k) returns (charge, count) pairs, most frequent first;
    # only the charges are kept.
    df[target_column] = df['precursor_charge'].apply(
        lambda observed: [charge for charge, _ in Counter(observed).most_common(k)]
    )

    print(f"Step ?/? complete. Selected top {k} charge states per sequence based on count.")
    return df

In [173]:
# TODO
def one_hot_encode_charge(df, charge_list=None):
    """
    Turns the "most_abundant_charge" column into a one-hot vector column.
    The result is stored in "most_abundant_charge_vector"; df is modified in place.
    @param df: DataFrame with a "most_abundant_charge" column
    @param charge_list: list, list of charge states defining the vector positions
           (defaults to [1, 2, 3, 4, 5, 6] when None)
    @return: df: DataFrame
    """
    positions = [1, 2, 3, 4, 5, 6] if charge_list is None else charge_list

    def as_one_hot(charge):
        # A charge that is not in positions yields an all-zero vector
        return [1 if charge == candidate else 0 for candidate in positions]

    df["most_abundant_charge_vector"] = df["most_abundant_charge"].apply(as_one_hot)
    return df

In [174]:
# TODO 
def normalize_precursor_intensities(df_charge_list, df_intensity_list):
    """
    Builds a dictionary that maps each charge state to a single intensity value,
    from two parallel lists (position i of both lists belongs to the same observation).

    NOTE(review): as written, no normalization takes place. Each charge ends up mapped
    to the intensity at the LAST position where that charge occurs; see the inline notes.
    The intended normalization needs to be decided before this function is used.
    @param df_charge_list: list, list of charge states
    @param df_intensity_list: list, list of precursor intensities (indexed in parallel to df_charge_list)
    @return: charge_dict: dict, dictionary with charge states as keys and one intensity per charge as values
    """
    # Collect the intensities observed for every charge state
    charge_dict = dict()
    for index, i in enumerate(df_charge_list):
        # NOTE(review): the list is re-created on every iteration, so intensities
        # collected earlier for the same charge are discarded and every list
        # holds exactly one element after this loop.
        charge_dict[i] = []
        charge_dict[i].append(df_intensity_list[index])

    # Intended normalization step for charges with several intensities
    for key, value in charge_dict.items():
        # NOTE(review): never true given the loop above (all lists have length 1).
        if len(value) > 1:
            # NOTE(review): operator precedence makes this
            # sum(value) - (min(value) / (max(value) - min(value))); it divides by
            # zero when all values are equal, and it stores a float that the
            # value[0] lookup below cannot index.
            charge_dict[key] = sum(value) - min(value) / (max(value) - min(value))

    # convert list of one float to float values
    charge_dict = {key: value[0] for key, value in charge_dict.items()}
    return charge_dict

In [175]:
# TODO 
def get_most_abundant(df_charge_list, df_intensity_list, distributions=False):
    """
    Sums the precursor intensities per charge state and reports the result.
    Both lists are read in parallel: position i of each belongs to the same observation.
    @param df_charge_list: list, list of charge states
    @param df_intensity_list: list, list of precursor intensities
    @param distributions: bool, if True returns a dictionary with all charge states and their summed intensities
    @return: the charge state with the highest summed intensity, or (distributions=True)
             the dict of charge states to summed intensities
    """
    summed_intensity = {}
    for position, charge in enumerate(df_charge_list):
        intensity = df_intensity_list[position]
        if charge in summed_intensity:
            summed_intensity[charge] += intensity
        else:
            summed_intensity[charge] = intensity

    if not distributions:
        # Key with the largest summed intensity
        return max(summed_intensity, key=summed_intensity.get)
    return summed_intensity

In [176]:
# TODO
def select_most_abundant_by_normalized_intensity(df, charge_list=None):
    """
    Adds intensity-based label columns to df and one-hot encodes the selected charge.
    Columns written: "normalized" (result of normalize_precursor_intensities per row),
    "pre_normalization" (summed intensities per charge from get_most_abundant),
    "most_abundant_charge" (key with the largest value in "normalized") and the
    one-hot column produced by one_hot_encode_charge.
    @param df: DataFrame with list-valued "precursor_charge" and "precursor_intensity" columns
    @param charge_list: list, list of charge states (defaults to [1, 2, 3, 4, 5, 6] when None)
    @return: df: DataFrame
    """
    one_hot_positions = [1, 2, 3, 4, 5, 6] if charge_list is None else charge_list

    def normalized_for_row(row):
        return normalize_precursor_intensities(
            row["precursor_charge"], row["precursor_intensity"]
        )

    def summed_for_row(row):
        return get_most_abundant(
            row["precursor_charge"], row["precursor_intensity"], True
        )

    df["normalized"] = df.apply(normalized_for_row, axis=1)
    df["pre_normalization"] = df.apply(summed_for_row, axis=1)

    # The winning charge is the key carrying the largest value in the per-row dict
    df["most_abundant_charge"] = df["normalized"].apply(
        lambda scores: max(scores, key=scores.get)
    )
    df = one_hot_encode_charge(df, one_hot_positions)
    print(
        "Step ?/? complete. Applied normalization, selected most abundant charge state and one-hot encoded "
        "it."
    )
    return df

In [177]:
# TODO
def get_topK_charge_states(df, k=2):
    """
    Get the top-k charge states for each sequence according to the values stored in
    the "normalized" column (one dict of charge state -> value per row).
    Charges are ranked by descending value; charges with equal values keep the order
    in which they appear in the dict. Each charge appears at most once in the result.
    Default: k=2
    @param df: DataFrame with a "normalized" column of dicts
    @param k: int, number of top charge states to be selected
    @return: df: DataFrame with an added "top_{k}_charge_states" column
             (df is modified in place and also returned)
    """
    # A negative k must select nothing rather than slice from the end
    limit = max(k, 0)

    def get_topK(label_dict):
        # Rank the keys themselves. Matching keys back from sorted values (as a
        # nested loop would) re-adds every tied key once per tied value, producing
        # duplicates that crowd out lower-ranked charges. sorted() is stable, also
        # with reverse=True, so ties keep dict order.
        ranked_charges = sorted(label_dict, key=label_dict.get, reverse=True)
        return ranked_charges[:limit]

    df[f"top_{k}_charge_states"] = df["normalized"].apply(get_topK)
    print(f"Step ?/? complete. Selected top {k} charge states per sequence.")
    return df

## Data prep process

In [178]:
# Load every supported file from the function's default directory ("../data/") into one DataFrame
df = combine_files_into_df()

Step 1/? complete. Combined 13 files into one DataFrame.


In [179]:
# Keep only the function's default columns: modified_sequence, precursor_charge, precursor_intensity
df = filter_dataframe_columns(df)

Step 2/? complete. Removed 12 columns from the DataFrame.


In [180]:
# df.to_csv("dlomix/data/prelim_unprocessed_data.csv")

In [181]:
# Discard rows that have no precursor_intensity value
df = drop_na(df)

Step 3/? complete. Dropped rows with NaN for intensities.


In [182]:
# Consider charges 1-7, but keep only those that occur at least 10000 times in the dataset
df = keep_desired_charges(df, charge_list=[1,2,3,4,5,6,7], min_count=10000)

Step 4/? complete. Removed charge states with less than 10000 occurrences in the dataset. Charges removed: [7]. Resulting in 65160159 entries.


In [183]:
# One row per unique sequence; charges and intensities become lists
aggregated_df = aggregate_sequences(df)

Step 5/? complete. Aggregated all sequences to unique sequences.


In [184]:
# Drop sequences whose length occurs fewer than 200 times; the second return value is kept as max_seq_len
df, max_seq_len = remove_rare_sequence_lengths(aggregated_df, representation_threshold=200)

Step 6/? complete. Removed 1301 of 831677 sequences because their sequence length is represented less than 200 times.


In [185]:
# Bind the result to `vocabulary`, not `complete_vocabulary`: reusing the function's
# name would replace the function with a list, so re-running this cell would fail
# with "'list' object is not callable".
vocabulary, vocab_len = complete_vocabulary(df)

Step 7/? complete. Completed vocabulary with 23 entries.


In [186]:
# Add the most frequent charge per sequence and its one-hot vector (default charges 1-6)
df = select_most_abundant_by_count(df)

Step ?/? complete. Selected most abundant charge state by count and generated one-hot encoding


In [187]:
# Add a binary vector marking which of the charges 1-6 were observed for each sequence
df = encode_charge_states(df, charge_states=[1,2,3,4,5,6])

Step ?/? complete. Encoded all occuring charge states per unique sequence in a binary vector.


In [188]:
# Add the two most frequently observed charge states per sequence
df = get_topK_charge_states_by_count(df, k=2)

Step ?/? complete. Selected top 2 charge states per sequence based on count.


In [189]:
# Display the DataFrame with all label columns added so far
df

Unnamed: 0,modified_sequence,precursor_charge,precursor_intensity,most_abundant_charge_by_count,most_abundant_charge_by_count_vector,charge_state_vector,top_2_charge_states_by_count
0,AAAAAAAAAAAAAAAAGLGLGP,"[2, 3, 2, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, ...","[12746190.0, 2009914.0, 12746190.0, 791369.9, ...",2,"[0, 1, 0, 0, 0, 0]","[1, 1, 1, 0, 0, 0]","[2, 3]"
1,AAAAAAAAAAAAAAAASAGGK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]","[5944787.0, 5944787.0, 8502878.0, 6425294.0, 8...",2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
2,AAAAAAAAAAAAAAAGAGAGAK,"[2, 2, 2, 2]","[3067981.0, 3067981.0, 3067981.0, 3067981.0]",2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
3,AAAAAAAAAAAAAAASGFAYPGTSER,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[11913300.0, 11913300.0, 11913300.0, 11913300....",2,"[0, 1, 0, 0, 0, 0]","[0, 1, 1, 0, 0, 0]","[2, 3]"
4,AAAAAAAAAAAAAAG,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1573785.0, 4638296.0, 7934979.0, 2019978.0, 4...",1,"[1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]",[1]
...,...,...,...,...,...,...,...
831672,YYYVGFAYL,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5439354.0, 832455.9, 3048967.0, 1694847.0, 56...",1,"[1, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0]","[1, 2]"
831673,YYYVPADFVEYEK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[260337200.0, 501610000.0, 260337200.0, 223532...",2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
831674,YYYVQNVYTPVDEHVYPDHR,"[3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3]","[11725650.0, 14743500.0, 11725650.0, 14743500....",3,"[0, 0, 1, 0, 0, 0]","[0, 0, 1, 1, 0, 0]","[3, 4]"
831675,YYYWVINPADSSGITPK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[36403250.0, 77270170.0, 11621660.0, 679866900...",2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]


In [190]:
# Select the sequence and the label columns for export; the raw charge and intensity lists are left out
export_df = df[["modified_sequence", "most_abundant_charge_by_count",  "most_abundant_charge_by_count_vector", "charge_state_vector", "top_2_charge_states_by_count"]]
export_df.head()

Unnamed: 0,modified_sequence,most_abundant_charge_by_count,most_abundant_charge_by_count_vector,charge_state_vector,top_2_charge_states_by_count
0,AAAAAAAAAAAAAAAAGLGLGP,2,"[0, 1, 0, 0, 0, 0]","[1, 1, 1, 0, 0, 0]","[2, 3]"
1,AAAAAAAAAAAAAAAASAGGK,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
2,AAAAAAAAAAAAAAAGAGAGAK,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
3,AAAAAAAAAAAAAAASGFAYPGTSER,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 1, 0, 0, 0]","[2, 3]"
4,AAAAAAAAAAAAAAG,1,"[1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]",[1]


In [191]:
# Write the export columns to a parquet file (path is relative to the current working directory)
export_df.to_parquet("dlomix/data/preprocessed_data.parquet")

In [192]:
# Read the file back and show the first rows as a check of the export
import_test = pd.read_parquet("dlomix/data/preprocessed_data.parquet")
import_test.head()

Unnamed: 0,modified_sequence,most_abundant_charge_by_count,most_abundant_charge_by_count_vector,charge_state_vector,top_2_charge_states_by_count
0,AAAAAAAAAAAAAAAAGLGLGP,2,"[0, 1, 0, 0, 0, 0]","[1, 1, 1, 0, 0, 0]","[2, 3]"
1,AAAAAAAAAAAAAAAASAGGK,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
2,AAAAAAAAAAAAAAAGAGAGAK,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0]",[2]
3,AAAAAAAAAAAAAAASGFAYPGTSER,2,"[0, 1, 0, 0, 0, 0]","[0, 1, 1, 0, 0, 0]","[2, 3]"
4,AAAAAAAAAAAAAAG,1,"[1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]",[1]


In [193]:
# List the columns of the full (non-export) DataFrame
df.columns

Index(['modified_sequence', 'precursor_charge', 'precursor_intensity',
       'most_abundant_charge_by_count', 'most_abundant_charge_by_count_vector',
       'charge_state_vector', 'top_2_charge_states_by_count'],
      dtype='object')

In [None]:
# Save the full DataFrame (including the raw charge and intensity list columns) as CSV, without the index
df.to_csv("dlomix/data/prelim_preprocessed_data.csv", index=False)