### Imports 

In [1]:
import os 
import re 
import numpy as np 
import pandas as pd 
from collections import Counter

### Data prep

#### File import

In [2]:
def combine_files_into_df(directory_path="../data/", file_types=['.parquet', '.tsv', '.csv']):
    """
    Combines all files in a directory into one DataFrame.

    Files are read in sorted name order so that the row order of the combined
    frame is reproducible (os.listdir returns entries in arbitrary order).
    Directory entries that are not regular files are ignored, even if their
    name ends in one of the requested extensions.

    @param directory_path: str, path to the directory containing the files
    @param file_types: list, file extensions (including the leading dot) to be imported
    @return: df: DataFrame, row-wise concatenation of every file that was read
             (index reset); an empty DataFrame if nothing was read
    """
    # Map file extensions to their respective pandas read functions and parameters
    read_funcs = {
        '.parquet': (pd.read_parquet, {'engine': 'fastparquet'}),
        '.tsv': (pd.read_csv, {'sep': '\t'}),
        '.csv': (pd.read_csv, {})
    }

    message_prefix = "Step 1/? complete."

    dfs = []
    # Sorted iteration makes the concatenation order deterministic across runs and machines
    for file in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file)
        file_extension = os.path.splitext(file)[1]

        # Only regular files with a requested extension are candidates
        if file_extension not in file_types or not os.path.isfile(file_path):
            continue

        reader = read_funcs.get(file_extension)
        if reader is None:
            # Requested by the caller, but there is no reader registered for it
            print(f"Skipping unsupported file type: {file_extension}")
            continue

        read_func, params = reader
        dfs.append(read_func(file_path, **params))

    # Combine all DataFrames in the list into a single DataFrame
    if dfs:
        df = pd.concat(dfs, ignore_index=True)
        print(f"{message_prefix} Combined {len(dfs)} files into one DataFrame.")
    else:
        df = pd.DataFrame()
        print("No files combined.")

    return df

In [3]:
def filter_dataframe_columns(df, columns_to_keep=['modified_sequence', 'precursor_charge', 'precursor_intensity']):
    """
    Filters a DataFrame to keep only the specified columns.

    If any requested column is absent, the DataFrame is returned unfiltered
    (the same object, not a copy) and the missing column names are printed,
    so the fallback is visible instead of being reported as "Removed 0 columns" only.

    @param df: DataFrame, the DataFrame to filter
    @param columns_to_keep: list, the names of the columns to keep
    @return: df_filtered: DataFrame, a copy restricted to columns_to_keep, or df
             itself when at least one requested column is missing
    """
    missing_columns = [col for col in columns_to_keep if col not in df]
    if missing_columns:
        # Best-effort fallback kept from the original behavior, but made explicit
        print(f"Warning: columns {missing_columns} not found in the DataFrame; no columns were removed.")
        df_filtered = df
    else:
        df_filtered = df[columns_to_keep].copy()
    print(f"Step 2/? complete. Removed {len(df.columns) - len(df_filtered.columns)} columns from the DataFrame.")
    return df_filtered

In [4]:
def drop_na(df, column="precursor_intensity"):
    """
    Remove every row whose value in one column is missing.
    By default the precursor_intensity column is checked.
    @param df: DataFrame
    @param column: name of the column that must not be NaN
    @return: DataFrame containing only the rows with a value in that column
    """
    has_value = df[column].notna()
    remaining_rows = df[has_value]
    # The message text is fixed; it mentions intensities regardless of `column`
    print("Step 3/? complete. Dropped rows with NaN for intensities.")
    return remaining_rows

In [5]:
def keep_desired_charges(df, charge_list=[1, 2, 3, 4, 5, 6], min_count=None):
    """
    Restrict the rows to a set of precursor charge states.

    Without min_count, rows whose precursor_charge is in charge_list are kept.
    With min_count, charge states from charge_list that occur fewer than
    min_count times in df are dropped from the list first.

    @param df: DataFrame with a "precursor_charge" column
    @param charge_list: list of charge states to be kept
    @param min_count: minimum number of rows a charge state needs to be retained
    @return: DataFrame with only the rows of the retained charge states
    """
    notes = []

    if min_count is None:
        notes.append(f"Focused on keeping charge states within {charge_list}. ")
    else:
        occurrences = df["precursor_charge"].value_counts()
        frequent_enough = [state for state in charge_list if occurrences.get(state, 0) >= min_count]

        dropped = set(charge_list).difference(frequent_enough)
        if dropped:
            notes.append(f"Removed charge states with less than {min_count} occurrences in the dataset. ")
            notes.append(f"Charges removed: {sorted(dropped)}. ")
        charge_list = frequent_enough

    # Row filter uses the (possibly shortened) list of charge states
    row_mask = df["precursor_charge"].isin(charge_list)
    kept_rows = df[row_mask]

    summary = "".join(notes)
    print(f"Step 4/? complete. {summary}Resulting in {len(kept_rows)} entries.")

    return kept_rows

In [6]:
def aggregate_unique_sequences(df):
    """
    Collapse the frame to one row per distinct modified_sequence.
    The precursor_charge and precursor_intensity values of each sequence
    are gathered into lists.
    @param df: DataFrame with modified_sequence, precursor_charge and precursor_intensity columns
    @return: DataFrame with one row per sequence and list-valued charge/intensity columns
    """
    value_columns = ["precursor_charge", "precursor_intensity"]
    per_sequence = df.groupby("modified_sequence", as_index=False)
    collapsed = per_sequence[value_columns].agg(list)
    print("Step 5/? complete. Aggregated all sequences to unique sequences.")
    return collapsed

In [7]:
def remove_rare_sequence_lengths(df, representation_threshold=100):
    """
    Remove sequences of specific length represented less than a certain number of times.

    The length is the character length of the "modified_sequence" string.
    The returned padding length is taken from the sequences that remain after
    filtering, so a rare, very long sequence that was just removed no longer
    determines it.

    @param df: DataFrame containing a "modified_sequence" column
    @param representation_threshold: int, threshold for the number of times a sequence length must be represented
    @return: tuple of (DataFrame, int), where DataFrame contains only sequence lengths represented at least
             representation_threshold times, and int is the length of the longest retained sequence
             (0 if no sequence is retained)
    """
    before_len = len(df)
    sequence_lengths = df["modified_sequence"].str.len()
    # Identify sequence lengths that meet the representation threshold
    length_counts = sequence_lengths.value_counts()
    valid_lengths = length_counts[length_counts >= representation_threshold].index
    # Filter the DataFrame based on valid sequence lengths
    keep_mask = sequence_lengths.isin(valid_lengths)
    df_filtered = df[keep_mask].copy()
    # Longest length among the rows that survive; an empty selection would yield NaN, so report 0 instead
    retained_lengths = sequence_lengths[keep_mask]
    padding_length = int(retained_lengths.max()) if len(retained_lengths) else 0
    after_len = len(df_filtered)

    print(
        f"Step 6/? complete. Removed {before_len - after_len} of {before_len} sequences because their sequence length "
        f"is represented less than {representation_threshold} times."
    )
    return df_filtered, padding_length

In [8]:
def complete_vocabulary(df):
    """
    Build the token vocabulary: the fixed single-letter alphabet followed by
    every distinct UNIMOD-annotated residue found in the sequences, in order
    of first appearance.
    (The length of the vocabulary +1 is used later for the embedding layer)
    @param df: DataFrame with a "modified_sequence" column of strings
    @return: vocabulary: list, list of all amino acids and modifications
    @return: vocab_len: int, length of the vocabulary
    """
    base_tokens = list("XACDEFGHIKLMNPQRSTVWY")
    all_sequences = " ".join(df["modified_sequence"])
    # One residue character followed by its [UNIMOD:<id>] tag
    modified_residues = re.findall(r"(\w\[UNIMOD:\d+])", all_sequences)
    # dict.fromkeys keeps the first occurrence of each token and preserves order
    vocabulary = list(dict.fromkeys(base_tokens + modified_residues))
    vocab_len = len(vocabulary)

    print(f"Step 7/? complete. Completed vocabulary with {vocab_len} entries.")
    return vocabulary, vocab_len

In [9]:
# TODO obsolete as it is now handled by add_labels
def encode_charge_states(df, charge_states=None):
    """
    Encode all occurring charge states per unique sequence in a binary vector.

    Adds a "charge_state_vector" column to df (in place). Position j of the
    vector is 1 when charge charge_states[0] + j occurs in the row's
    "precursor_charge" list. The vector spans every integer from the first to
    the last entry of charge_states, so the list is expected to be ascending.

    @param df: DataFrame containing a list-valued "precursor_charge" column
    @param charge_states: list of charge states; only the first and last entry
                          are used as range bounds. Defaults to [1, 2, 3, 4, 5, 6].
    @return: df: DataFrame, the same object with the added column
    """
    # The previous default of None failed on charge_states[0]; fall back to the
    # 1-6 range that the other encoding helpers in this notebook default to.
    if charge_states is None:
        charge_states = [1, 2, 3, 4, 5, 6]
    first_charge, last_charge = charge_states[0], charge_states[-1]

    df["charge_state_vector"] = df["precursor_charge"].apply(
        lambda observed: [
            1 if i in observed else 0 for i in range(first_charge, last_charge + 1)
        ]
    )
    print(
        "Step ?/? complete. Encoded all occuring charge states per unique sequence in a binary vector."
    )
    return df

## Select most abundant charge states for TASK 1

### By count // OBSOLETE

In [10]:
def select_most_abundant_by_count(df, charge_list=None):
    """
    Pick, for every row, the charge state that occurs most often in its
    'precursor_charge' list and one-hot encode it. df is modified in place.

    @param df: DataFrame with a 'precursor_charge' column containing a list of charge states.
    @param charge_list: list of charge states that define the one-hot positions;
                        defaults to [1, 2, 3, 4, 5, 6].
    @return: DataFrame with the added columns 'most_abundant_charge_by_count'
             and 'most_abundant_charge_by_count_vector'.
    """
    positions = [1, 2, 3, 4, 5, 6] if charge_list is None else charge_list

    # Counter.most_common(1) yields the single most frequent charge of the row
    df['most_abundant_charge_by_count'] = df['precursor_charge'].apply(
        lambda observed: Counter(observed).most_common(1)[0][0]
    )

    def as_one_hot(charge):
        return [1 if charge == candidate else 0 for candidate in positions]

    df["most_abundant_charge_by_count_vector"] = df["most_abundant_charge_by_count"].apply(as_one_hot)
    print(
        "Step ?/? complete. Selected most abundant charge state by count and generated one-hot encoding"
    )

    return df

In [11]:
def get_topK_charge_states_by_count(df, k=2):
    """
    For each row, list the k charge states that occur most often in its
    'precursor_charge' list (most frequent first). df is modified in place.
    Default: k=2
    @param df: DataFrame with a 'precursor_charge' column containing a list of charge states.
    @param k: int, number of top charge states to be selected
    @return: DataFrame with an added 'top_{k}_charge_states_by_count' column.
    """
    column_name = f"top_{k}_charge_states_by_count"

    # most_common(k) returns (charge, count) pairs; only the charge is kept
    df[column_name] = df['precursor_charge'].apply(
        lambda observed: [state for state, _ in Counter(observed).most_common(k)]
    )

    print(f"Step ?/? complete. Selected top {k} charge states per sequence based on count.")
    return df

### By intensities

In [12]:
# TODO
def select_most_abundant_charge_by_intensity(df, aggregation='max'):
    """
    Select, per row, the charge state with the highest aggregated intensity.

    The intensities of each row are grouped by charge state and reduced with
    the chosen aggregation; the charge with the largest reduced value wins
    (on ties, the charge that appears first in the row). Two columns are
    written to df in place: 'charge_by_{aggregation}_intensity' and
    '{aggregation}_intensity'. Rows are processed positionally, so the result
    does not depend on the index being unique.

    @param df: DataFrame with list-valued 'precursor_charge' and 'precursor_intensity' columns
    @param aggregation: 'max' (highest single intensity per charge) or 'avg' (mean intensity per charge)
    @return: df: DataFrame, the same object with the two added columns
    @raise ValueError: if aggregation is neither 'max' nor 'avg'
    """
    # Validate up front; previously an unknown value surfaced as an unbound local inside the loop
    if aggregation == 'avg':
        def aggregate(values):
            return sum(values) / len(values)
    elif aggregation == 'max':
        aggregate = max
    else:
        raise ValueError(f"Unsupported aggregation {aggregation!r}; expected 'max' or 'avg'.")

    charge_col = f'charge_by_{aggregation}_intensity'
    intensity_col = f'{aggregation}_intensity'

    selected_charges = []
    selected_intensities = []
    for charges, intensities in zip(df['precursor_charge'], df['precursor_intensity']):
        # Group the row's intensities by charge state (insertion order = first appearance)
        charge_intensity_dict = {}
        for charge, intensity in zip(charges, intensities):
            charge_intensity_dict.setdefault(charge, []).append(intensity)

        aggregated = {charge: aggregate(values) for charge, values in charge_intensity_dict.items()}
        if aggregated:
            most_abundant_charge = max(aggregated, key=aggregated.get)
            selected_charges.append(most_abundant_charge)
            selected_intensities.append(aggregated[most_abundant_charge])
        else:
            # A row without any charge/intensity pair has no winner
            selected_charges.append(None)
            selected_intensities.append(None)

    # Object dtype matches the columns the previous None-initialised implementation produced
    df[charge_col] = pd.Series(selected_charges, index=df.index, dtype=object)
    df[intensity_col] = pd.Series(selected_intensities, index=df.index, dtype=object)

    return df

In [13]:
def top_k_abundant_charges_by_intensity(df, k=1, aggregation='max'):
    """
    Select, per row, the k charge states with the highest aggregated intensity.

    The intensities of each row are grouped by charge state, reduced with the
    chosen aggregation and ranked in descending order. Two list-valued columns
    are written to df in place: 'top_{k}_abundant_charges_by_{aggregation}'
    and 'top_{k}_{aggregation}_intensities'. A row with fewer than k distinct
    charges yields shorter lists. Rows are processed positionally, so the
    result does not depend on the index being unique.

    @param df: DataFrame with list-valued 'precursor_charge' and 'precursor_intensity' columns
    @param k: int, number of charge states to keep per row
    @param aggregation: 'max' (highest single intensity per charge) or 'avg' (mean intensity per charge)
    @return: df: DataFrame, the same object with the two added columns
    @raise ValueError: if aggregation is neither 'max' nor 'avg'
    """
    # Validate up front; previously an unknown value surfaced as an unbound local inside the loop
    if aggregation == 'avg':
        def aggregate(values):
            return sum(values) / len(values)
    elif aggregation == 'max':
        aggregate = max
    else:
        raise ValueError(f"Unsupported aggregation {aggregation!r}; expected 'max' or 'avg'.")

    charge_col = f'top_{k}_abundant_charges_by_{aggregation}'
    intensity_col = f'top_{k}_{aggregation}_intensities'

    top_charges_per_row = []
    top_intensities_per_row = []
    for charges, intensities in zip(df['precursor_charge'], df['precursor_intensity']):
        # Group the row's intensities by charge state
        charge_intensity_dict = {}
        for charge, intensity in zip(charges, intensities):
            charge_intensity_dict.setdefault(charge, []).append(intensity)

        charge_intensity_aggregated = {
            charge: aggregate(values) for charge, values in charge_intensity_dict.items()
        }

        # Rank by aggregated intensity, highest first, and keep the first k entries
        ranked = sorted(charge_intensity_aggregated.items(), key=lambda item: item[1], reverse=True)[:k]
        top_charges_per_row.append([charge for charge, _ in ranked])
        top_intensities_per_row.append([value for _, value in ranked])

    df[charge_col] = pd.Series(top_charges_per_row, index=df.index, dtype=object)
    df[intensity_col] = pd.Series(top_intensities_per_row, index=df.index, dtype=object)

    return df

In [14]:
def add_labels(df, aggregation='max'):
    """
    Add the three label columns to df (in place).

    - 'most_abundant_one_hot' (TASK 1): one-hot vector of the charge state with
      the highest aggregated intensity in the row.
    - 'charge_state_vector' (TASK 2 / 3): 1 where the charge state occurs in the row, else 0.
    - 'normalized_intensity_distribution' (TASK 3): summed intensity per charge
      state divided by the row's total intensity.

    All vectors cover charge states 1..M, where M is the largest charge state
    found anywhere in df, so every row gets vectors of the same length.
    A row whose intensities sum to zero gets an all-zero distribution instead
    of raising ZeroDivisionError. Rows are processed positionally, so the
    result does not depend on the index being unique.

    @param df: DataFrame with list-valued 'precursor_charge' and 'precursor_intensity' columns
    @param aggregation: 'avg' for mean intensity per charge; any other value uses the maximum
    @return: df: DataFrame, the same object with the three added columns
    """
    charge_lists = list(df['precursor_charge'])
    intensity_lists = list(df['precursor_intensity'])

    # Determine the maximum charge state across all sequences (0 for an empty frame / empty rows)
    max_charge_state = max((max(charges, default=0) for charges in charge_lists), default=0)
    charge_range = range(1, max_charge_state + 1)

    one_hot_labels = []
    charge_state_vectors = []
    normalized_distributions = []
    for charges, intensities in zip(charge_lists, intensity_lists):
        # Map intensities to their respective charge states
        charge_intensity_dict = {}
        for charge, intensity in zip(charges, intensities):
            charge_intensity_dict.setdefault(charge, []).append(intensity)

        # Determine the most abundant charge based on the specified aggregation
        if aggregation == 'avg':
            aggregated_intensity = {charge: sum(values) / len(values) for charge, values in charge_intensity_dict.items()}
        else:  # 'max'
            aggregated_intensity = {charge: max(values) for charge, values in charge_intensity_dict.items()}

        # A row without any charge has no winner; its one-hot vector stays all zero
        most_abundant_charge = max(aggregated_intensity, key=aggregated_intensity.get) if aggregated_intensity else None

        # Calculate the normalized intensity distribution
        total_intensity = sum(sum(values) for values in charge_intensity_dict.values())
        if total_intensity:
            normalized_distribution = [sum(charge_intensity_dict.get(i, [])) / total_intensity for i in charge_range]
        else:
            # Nothing to normalise by: report an all-zero distribution
            normalized_distribution = [0.0 for _ in charge_range]

        one_hot_labels.append([1 if i == most_abundant_charge else 0 for i in charge_range])
        charge_state_vectors.append([1 if i in charge_intensity_dict else 0 for i in charge_range])
        normalized_distributions.append(normalized_distribution)

    df['most_abundant_one_hot'] = pd.Series(one_hot_labels, index=df.index, dtype=object)  # TASK 1
    df['charge_state_vector'] = pd.Series(charge_state_vectors, index=df.index, dtype=object)  # TASK 2 / 3
    df['normalized_intensity_distribution'] = pd.Series(normalized_distributions, index=df.index, dtype=object)  # TASK 3

    return df


In [15]:
def generate_charge_state_encodings(df, aggregation='max'):
    '''
    Add two list-valued columns to df (in place): 'one_hot_most_abundant_charge',
    a one-hot vector of the charge state with the highest aggregated intensity,
    and 'charge_state_vector', which flags every charge state present in the row.
    Both vectors span charge states 1 up to the largest charge found in df.
    aggregation='avg' ranks charges by mean intensity; any other value ranks by maximum.
    '''
    for label_column in ('one_hot_most_abundant_charge', 'charge_state_vector'):
        df[label_column] = None

    # The largest charge in the whole frame fixes the vector length
    highest_charge = max(max(observed) for observed in df['precursor_charge'])
    charge_axis = range(1, highest_charge + 1)
    use_mean = aggregation == 'avg'

    for row_label, record in df.iterrows():
        observed = record['precursor_charge']
        signals = record['precursor_intensity']

        # Collect the intensities recorded for each charge state of this row
        grouped = {state: [] for state in observed}
        for state, signal in zip(observed, signals):
            grouped[state].append(signal)

        if use_mean:
            score = {state: sum(values) / len(values) for state, values in grouped.items()}
        else:
            score = {state: max(values) for state, values in grouped.items()}

        dominant = max(score, key=score.get)

        df.at[row_label, 'one_hot_most_abundant_charge'] = [
            1 if state == dominant else 0 for state in charge_axis
        ]
        df.at[row_label, 'charge_state_vector'] = [
            1 if state in observed else 0 for state in charge_axis
        ]

    return df

In [16]:
def calculate_most_abundant_charge(df, aggregation='max'):
    '''
    TASK 1: For every row, find the charge state whose aggregated precursor
    intensity is highest and store it in 'most_abundant_charge_by_intensity'
    (df is modified in place). aggregation='avg' compares mean intensities;
    any other value compares maximum intensities.
    '''
    def dominant_charge(observed, signals):
        # Collect the intensities recorded for each charge state of one row
        grouped = {state: [] for state in observed}
        for state, signal in zip(observed, signals):
            grouped[state].append(signal)

        if aggregation == 'avg':
            score = {state: sum(values) / len(values) for state, values in grouped.items()}
        else:  # max
            score = {state: max(values) for state, values in grouped.items()}

        return max(score, key=score.get)

    df['most_abundant_charge_by_intensity'] = [
        dominant_charge(observed, signals)
        for observed, signals in zip(df['precursor_charge'], df['precursor_intensity'])
    ]
    return df

In [17]:
def generate_charge_state_vector(df):
    '''
    TASK 2: Add a 'charge_state_vector' column (in place) holding, per row, a
    binary vector over charge states 1 up to the largest charge found in df;
    an entry is 1 when that charge occurs in the row's 'precursor_charge' list.
    '''
    observed_per_row = df['precursor_charge']
    highest_charge = max(max(observed) for observed in observed_per_row)
    charge_axis = range(1, highest_charge + 1)

    vectors = []
    for observed in observed_per_row:
        vectors.append([1 if state in observed else 0 for state in charge_axis])

    df['charge_state_vector'] = vectors
    return df

In [18]:
def compute_normalized_intensity_distribution(df):
    '''
    TASK 3: Compute the normalized intensity distribution for each sequence.

    Adds a 'normalized_intensity_distribution' column to df (in place). Each
    entry is a list over charge states 1..M holding the summed intensity of
    that charge divided by the row's total intensity. M is the largest charge
    state found anywhere in df, so every row gets a vector of the same length
    (matching add_labels and generate_charge_state_vector); previously the
    length followed each row's own maximum charge. A row whose intensities sum
    to zero gets an all-zero vector instead of raising ZeroDivisionError.
    '''
    charge_lists = list(df['precursor_charge'])
    # Shared vector length for all rows (0 for an empty frame / empty rows)
    max_charge_state = max((max(charges, default=0) for charges in charge_lists), default=0)
    charge_range = range(1, max_charge_state + 1)

    normalized_intensity_distribution = []
    for charges, intensities in zip(charge_lists, df['precursor_intensity']):
        total_intensity = sum(intensities)
        # Summed intensity per charge state of this row
        charge_intensity_dict = {}
        for charge, intensity in zip(charges, intensities):
            charge_intensity_dict[charge] = charge_intensity_dict.get(charge, 0) + intensity

        if total_intensity:
            distribution = [charge_intensity_dict.get(i, 0) / total_intensity for i in charge_range]
        else:
            # Nothing to normalise by: report an all-zero distribution
            distribution = [0.0 for _ in charge_range]
        normalized_intensity_distribution.append(distribution)

    df['normalized_intensity_distribution'] = normalized_intensity_distribution
    return df

In [19]:
# TODO obsolete as it is now handled by add_labels
def one_hot_encode_charge(df, charge_list=None):
    """
    Turn the value in the "most_abundant_charge" column into a one-hot list
    and store it in "most_abundant_charge_vector" (df is modified in place).
    @param df: DataFrame with a "most_abundant_charge" column
    @param charge_list: list, charge states defining the one-hot positions;
                        defaults to [1, 2, 3, 4, 5, 6]
    @return: df: DataFrame
    """
    positions = [1, 2, 3, 4, 5, 6] if charge_list is None else charge_list

    def as_one_hot(charge):
        return [1 if charge == candidate else 0 for candidate in positions]

    df["most_abundant_charge_vector"] = df["most_abundant_charge"].apply(as_one_hot)
    return df

## Data prep process

In [None]:
# Read every supported file from the default "../data/" directory into one frame.
df = combine_files_into_df()

In [None]:
# Keep the default columns: modified_sequence, precursor_charge, precursor_intensity.
df = filter_dataframe_columns(df)

In [None]:
# Drop rows without a precursor_intensity value (the function's default column).
df = drop_na(df)

In [None]:
# Keep charges 1-7, but drop any of them that occurs in fewer than 10000 rows.
df = keep_desired_charges(df, charge_list=[1,2,3,4,5,6,7], min_count=10000)

In [None]:
# One row per modified_sequence; charge and intensity become per-sequence lists.
aggregated_df = aggregate_unique_sequences(df)

In [None]:
# Drop sequences whose string length occurs fewer than 200 times.
# The second return value is stored as max_seq_len; `df` is rebound to the filtered aggregate.
df, max_seq_len = remove_rare_sequence_lengths(aggregated_df, representation_threshold=200)

In [None]:
# Build the token vocabulary (base alphabet plus UNIMOD-annotated residues found in df).
# NOTE(review): this rebinds the name `complete_vocabulary` from the function to the
# returned list, so running this cell a second time without re-running the definition
# cell fails ('list' object is not callable). Consider a different variable name after
# checking how later cells use it.
complete_vocabulary, vocab_len = complete_vocabulary(df)

In [None]:
# Each call writes two columns named after the aggregation into df:
# charge_by_max_intensity / max_intensity, then charge_by_avg_intensity / avg_intensity.
df = select_most_abundant_charge_by_intensity(df, aggregation='max')
df = select_most_abundant_charge_by_intensity(df, aggregation='avg')

In [None]:
# Adds most_abundant_one_hot, charge_state_vector and normalized_intensity_distribution.
df = add_labels(df, aggregation='max')

## TEST CASES

In [20]:
# Synthetic fixture: 42 un-aggregated rows spread over 11 sequence labels (A-J and X),
# each with one charge state and one intensity.
# The dict is no longer named `input`, which shadowed the builtin input() for the rest of the session.
test_records = {
    "modified_sequence": ["A"]*6 + ["B"]*5 + ["C"]*4 + ["D"]*3 + ["E"]*2 + ["F"]*3 + ["G"]*4 + ["H"]*3 + ["I"]*5 + ["J"] + ["X"]*6,
    "precursor_charge": [2, 2, 2, 1, 3, 2, 2, 2, 1, 1, 3, 1, 1, 2, 2, 3, 3, 2, 1, 1, 2, 2, 2, 1, 1, 2, 3, 3, 3, 2, 4, 4, 4, 4, 5, 2, 1, 1, 1, 1, 2, 2],
    "precursor_intensity": [10, 20, 30, 40, 15, 25, 50, 60, 70, 80, 90, 100, 200, 150, 50, 300, 500, 400, 10, 20, 30, 40, 50, 60, 70, 65, 60, 100, 200, 300, 400, 500, 600, 700, 800, 900, 550, 560, 570, 550, 200, 900]
}
test_data = pd.DataFrame(test_records)

In [21]:
# Run the synthetic fixture through aggregation and three of the labelling helpers.
aggregated_test_data = aggregate_unique_sequences(test_data)
# Adds charge_by_max_intensity and max_intensity.
aggregated_test_data = select_most_abundant_charge_by_intensity(aggregated_test_data, aggregation='max')
# Adds one_hot_most_abundant_charge and charge_state_vector (default aggregation='max').
aggregated_test_data = generate_charge_state_encodings(aggregated_test_data)
# Adds normalized_intensity_distribution.
aggregated_test_data = compute_normalized_intensity_distribution(aggregated_test_data)
# Bare last expression: the notebook renders the resulting frame as the cell output.
aggregated_test_data

Step 5/? complete. Aggregated all sequences to unique sequences.


## ---------------------------