# Calculate detected peptides

## Notebook summary:
1. Import MaxQuant output (evidence.txt) file.
2. Clean dataset.
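    - Check for and remove contaminants, 'reverse' sequences flagged by MaxQuant, and entries with empty protein IDs.
    - Keep only peptides with exactly 1 missed cleavage.
    - Keep only peptides that map to a single protein (no ';'-separated protein groups).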
3. Reformat dataset columns for ML training and export reformatted peptide dataset as a TSV file.


In [None]:
# Mount Google Drive at /content/drive so the data files referenced below are reachable (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import libraries
import numpy as np
import pandas as pd
import re
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

### 1. Import MaxQuant output (evidence.txt) file

In [None]:
# Read the tab-separated MaxQuant evidence table into a DataFrame and preview the first rows
evidence_path = '/content/drive/MyDrive/Colab Notebooks/peptide ml detection/data/evidence.txt'
evidence = pd.read_csv(evidence_path, sep='\t')
evidence.head()

Unnamed: 0,Sequence,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Oxidation (M),Missed cleavages,Proteins,Leading proteins,...,Potential contaminant,id,Protein group IDs,Peptide ID,Mod. peptide ID,MS/MS IDs,Best MS/MS,Oxidation (M) site IDs,Taxonomy IDs,Mass deficit
0,AAAAAAAAAPAAAATAPTTAATTAATAAQ,29,Unmodified,_AAAAAAAAAPAAAATAPTTAATTAATAAQ_,,,0,0,P37108;H0YLW0,P37108,...,,0,3599,0,0,,,,9606.0,0.074103
1,AAAAAAALESWQAAAPR,17,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAALESWQAAAPR_,,,0,0,Q9UH36,Q9UH36,...,,1,6739,1,1,,,,9606.0,0.03597
2,AAAAAAALQAK,11,Unmodified,_AAAAAAALQAK_,,,0,0,P36578;H3BM89;H3BU31,P36578,...,,2,3592,2,2,,,,9606.0,0.065529
3,AAAAAAALQAK,11,Unmodified,_AAAAAAALQAK_,,,0,0,P36578;H3BM89;H3BU31,P36578,...,,3,3592,2,2,,,,9606.0,0.065529
4,AAAAAAALQAK,11,Unmodified,_AAAAAAALQAK_,,,0,0,P36578;H3BM89;H3BU31,P36578,...,,4,3592,2,2,,,,9606.0,0.065529


#### Explore dataset

In [None]:
# (rows, columns) of the raw evidence table
evidence.shape

(333653, 87)

In [None]:
# list the column labels available in the raw evidence table
evidence.columns

Index(['Sequence', 'Length', 'Modifications', 'Modified sequence',
       'Oxidation (M) Probabilities', 'Oxidation (M) Score Diffs',
       'Oxidation (M)', 'Missed cleavages', 'Proteins', 'Leading proteins',
       'Leading razor protein', 'Gene names', 'Protein names', 'Type',
       'Raw file', 'Experiment', 'MS/MS m/z', 'Charge', 'm/z', 'Mass',
       'Uncalibrated - Calibrated m/z [ppm]',
       'Uncalibrated - Calibrated m/z [Da]', 'Mass error [ppm]',
       'Mass error [Da]', 'Uncalibrated mass error [ppm]',
       'Uncalibrated mass error [Da]', 'Max intensity m/z 0', 'Retention time',
       'Retention length', 'Calibrated retention time',
       'Calibrated retention time start', 'Calibrated retention time finish',
       'Retention time calibration', 'Ion mobility index',
       'Ion mobility length', '1/K0', '1/K0 length', 'Calibrated 1/K0', 'CCS',
       'Calibrated CCS', 'CCS length', 'Match time difference',
       'Match m/z difference', 'Match K0 difference', 'Match q

In [None]:
# get summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
evidence.describe()

Unnamed: 0,Length,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Oxidation (M),Missed cleavages,MS/MS m/z,Charge,m/z,Mass,Uncalibrated - Calibrated m/z [ppm],...,Mass difference to range max,Sample 1/K0 - library 1/K0,Intensity,id,Peptide ID,Mod. peptide ID,MS/MS IDs,Best MS/MS,Taxonomy IDs,Mass deficit
count,333653.0,0.0,0.0,333653.0,333653.0,0.0,333653.0,333653.0,333653.0,209344.0,...,333653.0,333653.0,333653.0,333653.0,333653.0,333653.0,0.0,0.0,333105.0,333653.0
mean,12.036319,,,0.088493,0.225684,,2.204038,628.969162,1372.223718,0.0,...,13.094524,0.013044,67983.68,166826.0,25767.941322,28480.507279,,,9606.0,0.025571
std,3.313914,,,0.304061,0.468839,,0.441708,131.441102,343.815246,0.0,...,54.966697,0.011902,294079.9,96317.46902,14843.675882,16545.534128,,,0.0,0.054203
min,7.0,,,0.0,0.0,,1.0,399.202356,770.428653,0.0,...,-374.9933,-0.019324,0.0,0.0,0.0,0.0,,,9606.0,-0.234331
25%,10.0,,,0.0,0.0,,2.0,525.59519,1110.59209,0.0,...,6.144279,0.003925,0.0,83413.0,12942.0,14159.0,,,9606.0,-0.010486
50%,12.0,,,0.0,0.0,,2.0,611.806759,1333.62624,0.0,...,12.60901,0.011971,6568.3,166826.0,25789.0,28167.0,,,9606.0,0.026135
75%,14.0,,,0.0,0.0,,2.0,717.93196,1583.83478,0.0,...,18.747097,0.021688,43036.0,250239.0,38656.0,42946.0,,,9606.0,0.06219
max,44.0,,,4.0,2.0,,4.0,999.9848,3936.01635,0.0,...,425.79764,0.058391,21168000.0,333652.0,51455.0,57147.0,,,9606.0,0.277525


#### Verification: check the number of unique protein entries and peptide sequences

In [None]:
# summarise the 'Proteins' column: non-null count, number of unique values, most frequent value and its frequency
evidence['Proteins'].describe()

count              333058
unique              19504
top       P78527;P78527-2
freq                  989
Name: Proteins, dtype: object

In [None]:
# summarise the 'Sequence' column: non-null count, number of unique values, most frequent value and its frequency
evidence['Sequence'].describe()

count      333653
unique      51456
top       HLQLAIR
freq          170
Name: Sequence, dtype: object

In [None]:
# 'Sequence' has more non-null entries than 'Proteins', so count the rows whose 'Proteins' value is missing
n_missing_proteins = evidence['Proteins'].isna().sum()
print(n_missing_proteins)  # after a manual check these seem to be all reverse sequences

595


In [None]:
# check missed cleavages: summary statistics of the per-row missed-cleavage count
# (Series.describe ignores `include`/`exclude`, so no dtype filter is passed)
evidence['Missed cleavages'].describe()

count    333653.000000
mean          0.225684
std           0.468839
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           2.000000
Name: Missed cleavages, dtype: float64

### 2. Clean dataset

#### Remove contaminants, 'reverse' sequences and empty protein IDs

In [None]:
# count the rows flagged with '+' in the 'Potential contaminant' column
is_contaminant = evidence['Potential contaminant'] == '+'
print(is_contaminant.sum())

2719


In [None]:
# drop every row flagged as a potential contaminant, then report the remaining dimensions
not_contaminant = evidence['Potential contaminant'].ne('+')
evidence_cleaned = evidence.loc[not_contaminant]
evidence_cleaned.shape

(330934, 87)

In [None]:
# confirm no contaminant-flagged rows remain after filtering (expected: 0)
remaining_contaminants = evidence_cleaned['Potential contaminant'] == '+'
print(remaining_contaminants.sum())

0


In [None]:
# count 'reverse' rows (flagged '+') in the raw table, i.e. before contaminants were removed
is_reverse_raw = evidence['Reverse'] == '+'
print(is_reverse_raw.sum())

595


In [None]:
# count 'reverse' rows (flagged '+') that are still present after contaminants were removed
is_reverse = evidence_cleaned['Reverse'] == '+'
print(is_reverse.sum())

595


In [None]:
# drop every row flagged as a 'reverse' sequence, then report the remaining dimensions
not_reverse = evidence_cleaned['Reverse'].ne('+')
evidence_cleaned = evidence_cleaned.loc[not_reverse]
evidence_cleaned.shape

(330339, 87)

In [None]:
# confirm no 'reverse'-flagged rows remain after filtering (expected: 0)
remaining_reverse = evidence_cleaned['Reverse'] == '+'
print(remaining_reverse.sum())

0


In [None]:
# check the number of proteins: non-null count, number of unique values and most frequent
# entry of 'Leading razor protein' after contaminant/reverse removal
evidence_cleaned['Leading razor protein'].describe()

count     330339
unique      7043
top       Q14204
freq        1252
Name: Leading razor protein, dtype: object

In [None]:
# check the number of peptides: non-null count, number of unique values and most frequent
# entry of 'Sequence' after contaminant/reverse removal
evidence_cleaned['Sequence'].describe()

count      330339
unique      50935
top       HLQLAIR
freq          170
Name: Sequence, dtype: object

#### Keep only peptides with 1 missed cleavage.

In [None]:
# count rows with 0 missed cleavages
zero_missed = evidence_cleaned['Missed cleavages'] == 0
print(zero_missed.sum())

263385


In [None]:
# count rows with exactly 1 missed cleavage
one_missed = evidence_cleaned['Missed cleavages'] == 1
print(one_missed.sum())

59529


In [None]:
# count rows with exactly 2 missed cleavages
two_missed = evidence_cleaned['Missed cleavages'] == 2
print(two_missed.sum())

7425


In [None]:
# count rows with 3 missed cleavages (should be 0); this looks at the raw `evidence` table,
# not the filtered `evidence_cleaned`
three_missed = evidence['Missed cleavages'] == 3
print(three_missed.sum())

0


In [None]:
# keep only the peptides with exactly 1 missed cleavage
has_one_missed = evidence_cleaned['Missed cleavages'].eq(1)
evidence_cleaned = evidence_cleaned.loc[has_one_missed]

In [None]:
# confirm the filter worked: min and max of 'Missed cleavages' should both be 1
evidence_cleaned['Missed cleavages'].describe()

count    59529.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: Missed cleavages, dtype: float64

In [None]:
# count rows whose missed-cleavage count is anything other than 1 (expected: 0 after filtering)
not_one_missed = evidence_cleaned['Missed cleavages'] != 1
print(not_one_missed.sum())

0


In [None]:
# (rows, columns) remaining after the missed-cleavage filter
evidence_cleaned.shape

(59529, 87)

#### Verification: check total number of missing values for each feature to determine any further rows for removal

In [None]:
# Count the missing values in every column of the filtered table.
# No missing proteins or sequences, so no need to remove any further rows. Features that do have missing data are expected.
missing_per_column = evidence_cleaned.isna().sum()
print(missing_per_column)

Sequence                           0
Length                             0
Modifications                      0
Modified sequence                  0
Oxidation (M) Probabilities    59529
                               ...  
MS/MS IDs                      59529
Best MS/MS                     59529
Oxidation (M) site IDs         57003
Taxonomy IDs                       0
Mass deficit                       0
Length: 87, dtype: int64


#### Verification: check for any potential co-eluting contaminants and reverse sequences in the 'Proteins' and 'Leading razor protein' columns

In [None]:
# count rows whose 'Proteins' entry contains the "CON__" tag (potential co-eluting contaminants);
# missing values are treated as "no match"
contaminant_mask = evidence_cleaned['Proteins'].str.contains("CON__", na=False)
len(evidence_cleaned[contaminant_mask])

59

In [None]:
# drop rows whose 'Proteins' entry contains the "CON__" tag, then report the remaining dimensions
has_contaminant_tag = evidence_cleaned['Proteins'].str.contains("CON__", na=False)
evidence_cleaned = evidence_cleaned[~has_contaminant_tag]
evidence_cleaned.shape

(59470, 87)

In [None]:
# verify that all contaminants are removed from the 'Proteins' column (expected: 0)
# na=False treats a missing 'Proteins' value as "no match"; without it a NaN in the
# column would produce a non-boolean mask and the indexing would raise
leftover_contaminants = evidence_cleaned["Proteins"].str.contains("CON__", na=False)
len(evidence_cleaned[leftover_contaminants])

0

In [None]:
# count rows whose 'Leading razor protein' entry contains the "CON__" tag; missing values count as "no match"
razor_contaminant_mask = evidence_cleaned['Leading razor protein'].str.contains("CON__", na=False)
len(evidence_cleaned[razor_contaminant_mask])

0

In [None]:
# count rows whose 'Proteins' entry contains the "REV__" tag; missing values count as "no match"
reverse_mask = evidence_cleaned['Proteins'].str.contains("REV__", na=False)
len(evidence_cleaned[reverse_mask])

0

In [None]:
# count rows whose 'Leading razor protein' entry contains the "REV__" tag; missing values count as "no match"
razor_reverse_mask = evidence_cleaned['Leading razor protein'].str.contains("REV__", na=False)
len(evidence_cleaned[razor_reverse_mask])

0

In [None]:
# (rows, columns) remaining after the "CON__" / "REV__" checks
evidence_cleaned.shape

(59470, 87)

In [None]:
# check how many identified peptides map to more than one protein (ambiguous protein groups),
# i.e. rows whose 'Proteins' entry contains a ';' separator
# na=False treats a missing 'Proteins' value as "no match" so the mask is always boolean
multi_protein_mask = evidence_cleaned['Proteins'].str.contains(";", na=False)
len(evidence_cleaned[multi_protein_mask])

48455

In [None]:
# keep only rows whose "Proteins" entry has no ';' separator (a single protein), then report the dimensions
lists_several_proteins = evidence_cleaned['Proteins'].str.contains(";", na=False)
evidence_cleaned = evidence_cleaned[~lists_several_proteins]
evidence_cleaned.shape

(11015, 87)

In [None]:
# verify these protein groups were removed (expected: 0)
# na=False treats a missing 'Proteins' value as "no match"; without it a NaN in the
# column would produce a non-boolean mask and the indexing would raise
leftover_groups = evidence_cleaned['Proteins'].str.contains(";", na=False)
len(evidence_cleaned[leftover_groups])

0

In [None]:
# write the filtered evidence table to Drive as a tab-separated file, omitting the row index
cleaned_tsv_path = "/content/drive/MyDrive/Colab Notebooks/peptide ml detection/data/evidence_cleaned.tsv"
evidence_cleaned.to_csv(cleaned_tsv_path, sep='\t', index=False)

### 3. Reformat dataset columns for ML training

In [None]:
# extract the protein, peptide sequence and PEP columns from evidence_cleaned
# the 'Proteins' column is used as the protein column (not 'Leading razor protein')
detected_peptides = evidence_cleaned[['Proteins', 'Sequence', 'PEP']]
detected_peptides.head()

Unnamed: 0,Proteins,Sequence,PEP
66,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.24049
67,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.004857
68,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.006897
599,Q8N3K9,AADEQMALSKVR,0.00586
600,Q8N3K9,AADEQMALSKVR,0.78993


#### Verification: final check on dimensions after formatting, and that the expected number of proteins and peptides is present

In [None]:
# check dimensions: (rows, columns) of the three-column peptide table
detected_peptides.shape

(11015, 3)

In [None]:
# give the protein and peptide columns their singular names; 'PEP' is left unchanged
column_renames = {"Proteins": "Protein", "Sequence": "Peptide"}
detected_peptides = detected_peptides.rename(columns=column_renames)
detected_peptides.head()

Unnamed: 0,Protein,Peptide,PEP
66,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.24049
67,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.004857
68,Q9Y2U8,AAAAASAPQQLSDEELFSQLRR,0.006897
599,Q8N3K9,AADEQMALSKVR,0.00586
600,Q8N3K9,AADEQMALSKVR,0.78993


### 4. Export reformatted peptide dataset as TSV

In [None]:
# write the reformatted peptide table to Drive as a tab-separated file, omitting the row index
detected_tsv_path = "/content/drive/MyDrive/Colab Notebooks/peptide ml detection/data/detected_peptides.tsv"
detected_peptides.to_csv(detected_tsv_path, sep='\t', index=False)

In [None]:
# (rows, columns) of the exported peptide table
detected_peptides.shape

(11015, 3)