In [22]:
import pandas as pd
from pyteomics import parser
import re

# Read input files

# Sequence file
qReps = pd.read_csv("data/ComplemEdge_seq_HPRR.csv")
# Method file 
vebios_method = pd.read_csv("data/Vebios_ComplemEdge_method_20241108.csv")


In [23]:
# Extract unique sequences from qReps
qReps['aa_seq'] = qReps['aa_seq'].str.replace('[^A-Z]', '', regex=True)
all_seq = qReps['aa_seq'].unique()

# Function to perform in silico digestion
def digest_df(seq):
    dig_seq = parser.cleave(seq, parser.expasy_rules['trypsin'], missed_cleavages=2)
    dig_seq_df = pd.DataFrame(dig_seq, columns=['peptide'])
    dig_seq_df['aa_seq'] = seq
    Prot_df = qReps[qReps['aa_seq'] == seq].merge(dig_seq_df, on='aa_seq', how='left')
    return Prot_df

In [24]:
# in silico digestion of all targets sequences from spikedin list
all_dig = pd.concat([digest_df(seq) for seq in all_seq]).drop_duplicates(subset=['peptide'])
all_dig

Unnamed: 0,HPRR,qreps_name,gene_name,uniprot_id,aa_seq,peptide
0,HPRR260200,QR0260200,PAPPA,Q13219,SCLDHNSESIILPMNVTVRDIPHWLNPTRVERVVCTAGLKWYPHPA...,GCEPFMGDNYCDAINNR
1,HPRR260200,QR0260200,PAPPA,Q13219,SCLDHNSESIILPMNVTVRDIPHWLNPTRVERVVCTAGLKWYPHPA...,SCLDHNSESIILPMNVTVRDIPHWLNPTRVER
2,HPRR260200,QR0260200,PAPPA,Q13219,SCLDHNSESIILPMNVTVRDIPHWLNPTRVERVVCTAGLKWYPHPA...,DIPHWLNPTRVER
3,HPRR260200,QR0260200,PAPPA,Q13219,SCLDHNSESIILPMNVTVRDIPHWLNPTRVERVVCTAGLKWYPHPA...,DPQAQEHSR
4,HPRR260200,QR0260200,PAPPA,Q13219,SCLDHNSESIILPMNVTVRDIPHWLNPTRVERVVCTAGLKWYPHPA...,DL
...,...,...,...,...,...,...
26,HPRR000110,QR7000110,COLEC10,Q9Y6Z7,EKGKAGTVCDCGRYRKFVGQLDISIARLKTSMKFVKNVIAGIRETE...,NVIAGIRETEEKFYYIVQEEK
30,HPRR000110,QR7000110,COLEC10,Q9Y6Z7,EKGKAGTVCDCGRYRKFVGQLDISIARLKTSMKFVKNVIAGIRETE...,GKAGTVCDCGRYR
31,HPRR000110,QR7000110,COLEC10,Q9Y6Z7,EKGKAGTVCDCGRYRKFVGQLDISIARLKTSMKFVKNVIAGIRETE...,FVKNVIAGIRETEEK
32,HPRR000110,QR7000110,COLEC10,Q9Y6Z7,EKGKAGTVCDCGRYRKFVGQLDISIARLKTSMKFVKNVIAGIRETE...,AGTVCDCGRYRK


In [None]:
# Add Unique seqyebce conlumn
vebios_method['Unique_Seq'] = vebios_method['Compound'].str.replace('[^A-Z]', '', regex=True)
# Add column is_heavy if detect 'heavy' in Compund
vebios_method['is_heavy'] = vebios_method['Compound'].str.contains('heavy', case=False)
# Add column "charge" which is the last number in the last parenthesis
vebios_method['charge'] = vebios_method['Compound'].str.extract(r'\(([^()]*)\)$')


vebios_method

Unnamed: 0,Compound,Retention Time (min),RT Window (min),Precursor (m/z),Product (m/z),Collision Energy (V),Min Dwell Time (ms),Unique_Seq,is_heavy,charge
0,TTC[+57.021464]WDGK(+2),2.41,2.0,434.186,333.139,18.6,2.978,TTCWDGK,False,+2
1,TTC[+57.021464]WDGK(+2),2.41,2.0,434.186,505.241,18.6,2.978,TTCWDGK,False,+2
2,TTC[+57.021464]WDGK(+2),2.41,2.0,434.186,664.240,18.6,2.978,TTCWDGK,False,+2
3,TTC[+57.021464]WDGK(+2),2.41,2.0,434.186,665.271,18.6,2.978,TTCWDGK,False,+2
4,TTC[+57.021464]WDGK (heavy)(+2),2.41,2.0,438.194,337.146,18.6,2.978,TTCWDGK,True,+2
...,...,...,...,...,...,...,...,...,...,...
1720,GFLAYYQAVDLDEC[+57.021464]ASRSK (heavy)(+3),30.40,2.0,737.688,318.181,34.8,36.202,GFLAYYQAVDLDECASRSK,True,+3
1721,GFLAYYQAVDLDEC[+57.021464]ASRSK (heavy)(+3),30.40,2.0,737.688,389.218,34.8,36.202,GFLAYYQAVDLDECASRSK,True,+3
1722,GFLAYYQAVDLDEC[+57.021464]ASRSK (heavy)(+3),30.40,2.0,737.688,830.392,34.8,36.202,GFLAYYQAVDLDECASRSK,True,+3
1723,GFLAYYQAVDLDEC[+57.021464]ASRSK (heavy)(+3),30.40,2.0,737.688,911.923,34.8,36.202,GFLAYYQAVDLDECASRSK,True,+3


In [None]:
# Extract unique sequences from method
method_seq = vebios_method['Unique_Seq'].unique()

# Filter peptides not in method_seq but can be found from insilico digestion
filtered_peptides = all_dig[~all_dig['peptide'].isin(method_seq)]['peptide'].unique()
# print unique sequence from filtered_peptides


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [40]:
# # Process vebios_method for deepmrm
vebios_deepmrm = vebios_method[['Unique_Seq', 'Precursor (m/z)', 'Product (m/z)', 'is_heavy']].copy()
vebios_deepmrm.columns = ['Unique_Seq', 'precursor_mz', 'product_mz', 'is_heavy']
vebios_deepmrm

Unnamed: 0,Unique_Seq,precursor_mz,product_mz,is_heavy
0,TTCWDGK,434.186,333.139,False
1,TTCWDGK,434.186,505.241,False
2,TTCWDGK,434.186,664.240,False
3,TTCWDGK,434.186,665.271,False
4,TTCWDGK,438.194,337.146,True
...,...,...,...,...
1720,GFLAYYQAVDLDECASRSK,737.688,318.181,True
1721,GFLAYYQAVDLDECASRSK,737.688,389.218,True
1722,GFLAYYQAVDLDECASRSK,737.688,830.392,True
1723,GFLAYYQAVDLDECASRSK,737.688,911.923,True


In [48]:
# count rows by peptide_id and is_heavy in vebios_deepmrm

peptide_counts = vebios_deepmrm.groupby(['Unique_Seq', 'is_heavy']).size()
# Extract odd counts from peptie_counts
# odd_counts = peptide_counts[(peptide_counts['False'] %2 == 1)].sum(axis=1)
# odd_pept = peptide_counts[(peptide_counts[False] != peptide_counts[True])].index

# peptide_counts[peptide_counts % 2 == 1]
peptide_counts

Unique_Seq            is_heavy
AAELIANSLATAGDGLIELR  False       4
                      True        4
AAIISAEGDSK           False       4
                      True        4
AASGTQNNVLR           False       4
                                 ..
YRPSQDQGLPASR         True        3
YVMLPVADQDQCIR        False       4
                      True        4
YVVISQGLDKPR          False       4
                      True        4
Length: 435, dtype: int64