## Parsing a pepXML file

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
# Load and parse the XML file
tree = ET.parse('47662_e10669_MB032_Pupae_Thr_tRNA_04.pepXML')
root = tree.getroot()

# Parallel lists: index i of each list describes the same retained search hit
peptides = []
proteins = []
probabilities = []

# NOTE: elements are looked up by their bare tag names, so this assumes the
# file does not declare a default XML namespace.
for search_hit in root.iter('search_hit'):
    # Hits without a modification_info child have no modified peptide string
    mod_info = search_hit.find('modification_info')
    if mod_info is None:
        continue

    # Keep only hits whose modified peptide string contains 'T['. A missing
    # attribute comes back as None and is skipped instead of raising TypeError.
    modified_peptide = mod_info.get('modified_peptide')
    if modified_peptide is None or 'T[' not in modified_peptide:
        continue
    peptides.append(modified_peptide)

    # Primary protein first, followed by every alternative protein,
    # stored as a single comma-separated string
    protein_list = [search_hit.get('protein')]
    protein_list.extend(
        alt_protein.get('protein')
        for alt_protein in search_hit.findall('alternative_protein')
    )
    proteins.append(', '.join(protein_list))

    # Always record exactly one probability per retained hit (None when the
    # hit has no peptideprophet_result) so the three lists stay the same
    # length and the DataFrame constructor below does not fail.
    peptideprophet_result = search_hit.find('.//peptideprophet_result')
    if peptideprophet_result is not None:
        probabilities.append(peptideprophet_result.get('probability'))
    else:
        probabilities.append(None)

# Create a dataframe with the collected data (probabilities are still text here)
df = pd.DataFrame({
    'peptide': peptides,
    'protein': proteins,
    'probability': probabilities
})

In [3]:
# Number of rows in the collected hits table
df.shape[0]

779

In [None]:
# The probability column is cast to float so it can be sorted numerically
df['probability'] = df['probability'].astype(float)

# Rank hits from highest to lowest probability ...
ranked = df.sort_values(by='probability', ascending=False)

# ... then keep only the first (top-ranked) row for each (peptide, protein) pair
probabilityok = ranked.drop_duplicates(subset=['peptide', 'protein'], keep='first')

# Drop every row whose protein string mentions 'DECOY' (case-insensitive);
# missing protein values are treated as non-decoy and kept
is_decoy = probabilityok['protein'].str.contains('DECOY', case=False, na=False)
df = probabilityok[~is_decoy]

In [21]:
# Write the filtered table as tab-separated text, without the index column
df.to_csv('filtered.tsv', index=False, sep='\t')