In [None]:
import pandas as pd
from pyteomics import parser
import re

# Load the two input tables used by the rest of the notebook.

# Sequence file
SEQ_CSV = "data/ComplemEdge_seq_HPRR.csv"
qReps = pd.read_csv(SEQ_CSV)

# Method file
METHOD_CSV = "data/Vebios_ComplemEdge_method_20241108.csv"
vebios_method = pd.read_csv(METHOD_CSV)


In [None]:
# Strip every character outside A-Z from the sequences in qReps, store the
# cleaned values back, then collect the distinct sequences.
cleaned_seq = qReps['aa_seq'].str.replace(pat='[^A-Z]', repl='', regex=True)
qReps['aa_seq'] = cleaned_seq
all_seq = cleaned_seq.unique()

# Function to perform in silico digestion
def digest_df(seq, missed_cleavages=2):
    """Digest one sequence in silico with trypsin and join the peptides to qReps.

    Parameters
    ----------
    seq : str
        Sequence to cleave. It is also the key compared against
        ``qReps['aa_seq']`` to select the rows to annotate.
    missed_cleavages : int, default 2
        Maximum number of missed cleavages forwarded to ``parser.cleave``.

    Returns
    -------
    pandas.DataFrame
        The rows of the module-level ``qReps`` whose ``aa_seq`` equals ``seq``,
        left-merged on ``aa_seq`` with the digestion products, which are held
        in a ``peptide`` column.
    """
    # pyteomics documents parser.cleave as returning a set, whose iteration
    # order is not stable between interpreter runs; sorting the peptides keeps
    # the row order of the result reproducible.
    peptides = sorted(
        parser.cleave(
            seq,
            parser.expasy_rules['trypsin'],
            missed_cleavages=missed_cleavages,
        )
    )
    dig_seq_df = pd.DataFrame(peptides, columns=['peptide'])
    # Tag every peptide with its parent sequence so it can serve as merge key.
    dig_seq_df['aa_seq'] = seq
    Prot_df = qReps[qReps['aa_seq'] == seq].merge(dig_seq_df, on='aa_seq', how='left')
    return Prot_df

In [None]:
# In silico digestion of every target sequence from the spiked-in list.
# The per-sequence tables are stacked and only the first row seen for each
# distinct peptide is kept.
digested_frames = list(map(digest_df, all_seq))
all_dig = pd.concat(digested_frames).drop_duplicates(subset=['peptide'])
all_dig

In [None]:
# All three derived columns are computed from the Compound column.
compound = vebios_method['Compound']

# Unique_Seq: Compound with every character outside A-Z removed.
vebios_method['Unique_Seq'] = compound.str.replace(pat='[^A-Z]', repl='', regex=True)
# is_heavy: whether Compound contains 'heavy' (case-insensitive).
vebios_method['is_heavy'] = compound.str.contains('heavy', case=False)
# charge: the text inside the parenthesised group that ends Compound,
# stored as a string exactly as written there.
vebios_method['charge'] = compound.str.extract(r'\(([^()]*)\)$', expand=False)


vebios_method

In [None]:
# Distinct sequences present in the method
method_seq = vebios_method['Unique_Seq'].unique()

# Peptides produced by the in silico digestion that are absent from method_seq
digested_peptides = all_dig['peptide']
filtered_peptides = digested_peptides[~digested_peptides.isin(method_seq)].unique()
# NOTE: filtered_peptides is computed here but not displayed by this cell.


In [None]:
# Process vebios_method for deepmrm: keep the sequence, the precursor and
# product m/z columns and the heavy flag, with the m/z columns renamed.
vebios_deepmrm = vebios_method[
    ['Unique_Seq', 'Precursor (m/z)', 'Product (m/z)', 'is_heavy']
].rename(
    columns={
        'Precursor (m/z)': 'precursor_mz',
        'Product (m/z)': 'product_mz',
    }
)
vebios_deepmrm

In [None]:
# Count the rows of vebios_deepmrm for each (Unique_Seq, is_heavy) pair.
group_keys = ['Unique_Seq', 'is_heavy']
peptide_counts = vebios_deepmrm.groupby(group_keys).size()

# Disabled drafts, kept for reference: attempts at extracting odd counts from
# peptide_counts and at finding sequences whose False/True counts differ.
# odd_counts = peptide_counts[(peptide_counts['False'] %2 == 1)].sum(axis=1)
# odd_pept = peptide_counts[(peptide_counts[False] != peptide_counts[True])].index
# peptide_counts[peptide_counts % 2 == 1]

peptide_counts