# Calculate undetected peptides and apply SpC-NSAF

In [None]:
# Mount Google Drive at /content/drive; the input and output TSV paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Notebook summary:
1. Import fasta peptides and detected peptides datasets.
2. Dataset cleaning.
    - Sort detected peptides by PEP score.
    - Remove duplicates in both datasets.
    - Only keep proteins in fasta peptides that are also present in detected peptides; these are the expected peptides.
3. Get undetected peptides.
    - Remove all rows in expected peptides that are also present in detected peptides.
    - Calculate SpC-NSAF for undetected peptides
4. Export a new data frame with undetected peptides as a TSV file.

In [None]:
# import libraries
import numpy as np
import pandas as pd
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# 1. Import fasta peptides and detected peptides

In [None]:
# import datasets
# Both input files sit in the same Drive folder; keeping that folder in a single
# constant means the notebook can be pointed elsewhere by editing one line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/peptide ml detection/data'

# read_table parses tab-separated files into DataFrames
fasta_peptides = pd.read_table(f'{DATA_DIR}/fasta_peptides.tsv')
detected_peptides = pd.read_table(f'{DATA_DIR}/detected_peptides_NSAF.tsv')

In [None]:
# (rows, columns) of the FASTA-derived peptide table
fasta_peptides.shape

(209525, 6)

In [None]:
# (rows, columns) of the detected-peptide table
detected_peptides.shape

(1917, 6)

# 2. Dataset cleaning

### get expected peptides before removing duplicates

In [None]:
# Number of detected_peptides rows whose Protein value never appears in
# fasta_peptides. Ideally 0; a non-zero count means some detected proteins
# have no counterpart in the FASTA-derived table.
int((~detected_peptides["Protein"].isin(fasta_peptides["Protein"])).sum())

573

In [None]:
# Number of detected_peptides rows (i.e. PSMs) whose Peptide sequence has no
# exact match in fasta_peptides (ideally 0).
# NB: these sequences are actually there; they appear to be the start of the
# protein sequences, i.e. in fasta_peptides they all begin with Methionine.
int((~detected_peptides["Peptide"].isin(fasta_peptides["Peptide"])).sum())

1819

In [None]:
# Number of fasta_peptides rows whose Protein is absent from detected_peptides
# (only the Protein column is compared here, not the peptide sequence).
int((~fasta_peptides["Protein"].isin(detected_peptides["Protein"])).sum())

169723

In [None]:
# Count the detected (Protein, Peptide) pairs that are also present in fasta_peptides.
# Index.isin returns one boolean per detected row, so len() of that array is
# always the row count of detected_peptides; summing the mask counts the
# rows that actually match.
int(
    detected_peptides.set_index(['Protein', 'Peptide']).index
    .isin(fasta_peptides.set_index(['Protein', 'Peptide']).index)
    .sum()
)

1917

In [None]:
# Total number of rows in fasta_peptides, measured on the Protein column
fasta_peptides['Protein'].size

209525

## Keep proteins in fasta peptides that are present in detected peptides (expected peptides)

In [None]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric Protein and Peptide columns.
detected_peptides.describe(include='all')

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
count,1917,1917,1917.0,1344.0,1917.0,1344.0
unique,1104,1917,,,,
top,Q09666,YYDSRPGGYGYGYGR,,,,
freq,40,1,,,,
mean,,,0.011226,877.968006,25.293166,0.047458
std,,,0.025251,1078.848877,44.13713,0.076706
min,,,0.004787,80.0,1.0,0.000393
25%,,,0.004788,314.0,5.0,0.012469
50%,,,0.004826,546.5,11.0,0.029112
75%,,,0.005565,940.75,25.0,0.057656


In [None]:
# Number of detected_peptides rows whose Protein also occurs in fasta_peptides.
# (Original note: should be 37,847 if run after removing duplicate peptides.)
int(detected_peptides["Protein"].isin(fasta_peptides["Protein"]).sum())

1344

In [None]:
# Number of fasta_peptides rows that belong to a protein seen in detected_peptides
int(fasta_peptides["Protein"].isin(detected_peptides["Protein"]).sum())

39802

In [None]:
# expected_peptides = the fasta_peptides rows whose protein was detected;
# rows for proteins absent from detected_peptides are dropped.
expected_peptides = fasta_peptides.loc[
    fasta_peptides["Protein"].isin(detected_peptides["Protein"])
]
expected_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length
count,39802,39802,39802.0,39802.0,39802.0,39802.0
unique,659,6182,,,,
top,Q09666,K,,,,
freq,909,3084,,,,
mean,,,0.170544,1.0,0.736235,892.267775
std,,,0.376115,0.0,0.118713,957.390632
min,,,0.0,1.0,0.5,80.0
25%,,,0.0,1.0,0.642,379.0
50%,,,0.0,1.0,0.779,612.0
75%,,,0.0,1.0,0.817,1042.0


# 3. Get undetected peptides

In [None]:
# First look at undetected peptides: left-join expected onto detected by
# peptide sequence. The added `_merge` column is 'left_only' for expected
# peptides with no detected counterpart.
undetected_peptides = expected_peptides.merge(
    detected_peptides,
    how='left',
    on=["Peptide"],
    indicator=True,
)
undetected_peptides

Unnamed: 0,Protein_x,Peptide,Cleavage,Prediction,Probability,Length,Protein_y,PEP,Protein_length,PSM_per_protein,Quantification,_merge
0,P18031,EFEQIDK,1,1,0.500,435,,,,,,left_only
1,Q96P70,AIITDSVK,1,1,0.500,1041,,,,,,left_only
2,Q96P70,AIITDSVK,1,1,0.500,1041,,,,,,left_only
3,P18031,EFEQIDK,1,1,0.500,435,,,,,,left_only
4,P33176,SAEIDSDDTGGSAAQK,1,1,0.500,963,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...
39797,Q9NZ45,IVHAFDMEDLGDK,0,1,0.981,108,,,,,,left_only
39798,Q92541,K,0,1,0.981,710,,,,,,left_only
39799,Q92541,QEEEQEK,0,1,0.981,710,,,,,,left_only
39800,P00558,FHVEEEGK,0,1,0.981,417,,,,,,left_only


In [None]:
# Number of merged rows that exist only on the expected (left) side,
# i.e. peptides/PSMs with no match in detected_peptides
int((undetected_peptides['_merge'] == "left_only").sum())

39223

In [None]:
# Cross-check the undetected count with isin instead of the merge indicator
int((~expected_peptides["Peptide"].isin(detected_peptides["Peptide"])).sum())

39223

In [None]:
# Keep only the expected peptides whose sequence was never detected
undetected_peptides = expected_peptides.loc[
    ~expected_peptides["Peptide"].isin(detected_peptides["Peptide"])
]
undetected_peptides.shape

(39223, 6)

In [None]:
# Summary statistics for all columns of the undetected peptides
undetected_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length
count,39223,39223,39223.0,39223.0,39223.0,39223.0
unique,659,6084,,,,
top,Q09666,K,,,,
freq,858,3084,,,,
mean,,,0.172093,1.0,0.736269,889.170232
std,,,0.377466,0.0,0.118645,945.419437
min,,,0.0,1.0,0.5,80.0
25%,,,0.0,1.0,0.642,379.0
50%,,,0.0,1.0,0.779,612.0
75%,,,0.0,1.0,0.817,1042.0


In [None]:
# Count peptides containing 'U' (selenocysteine). These are removed further
# down so that peptide amino acids can be mapped correctly when calculating
# AAIndex1 properties.
int(undetected_peptides['Peptide'].str.contains("U", na=False).sum())

0

In [None]:
# Show the rows whose peptide sequence contains 'U'
undetected_peptides[undetected_peptides['Peptide'].str.contains("U", na=False)]

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length


In [None]:
# (rows, columns) before dropping 'U'-containing peptides
undetected_peptides.shape

(39223, 6)

In [None]:
# Number of rows that will remain once 'U'-containing peptides are dropped
int((~undetected_peptides['Peptide'].str.contains("U", na=False)).sum())

39223

In [None]:
# Drop every peptide whose sequence contains 'U'
undetected_peptides = undetected_peptides.loc[
    ~undetected_peptides['Peptide'].str.contains("U", na=False)
]
undetected_peptides.shape

(39223, 6)

# Calculate SpC-NSAF for undetected peptides

In [None]:
# Sanity check: no peptide sequence may be both detected and undetected (expect 0)
print(len(set(detected_peptides["Peptide"]) & set(undetected_peptides["Peptide"])))

0


In [None]:
# create dictionary from detected_peptides, with protein keys and associated NSAF values.
# Quantification is missing for some detected rows, and dict(zip(...)) keeps the
# LAST value seen per protein, so rows without a value are dropped first to
# ensure a NaN can never overwrite a valid NSAF for the same protein.
detected_quantified = detected_peptides.dropna(subset=['Quantification'])
detected_peptides_nsaf_dict = dict(
    zip(detected_quantified['Protein'], detected_quantified['Quantification'])
)

In [None]:
# map protein NSAF quantitation values to each protein in undetected_peptides.
# undetected_peptides was produced by boolean filtering of another frame, so
# writing a column into it in place can raise SettingWithCopyWarning; assign()
# returns a new frame instead and makes this cell safe to re-run.
undetected_peptides = undetected_peptides.assign(
    Quantification=undetected_peptides['Protein'].map(detected_peptides_nsaf_dict)
)
undetected_peptides

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length,Quantification
0,P18031,EFEQIDK,1,1,0.500,435,0.022989
18,Q96P70,AIITDSVK,1,1,0.500,1041,0.025937
19,Q96P70,AIITDSVK,1,1,0.500,1041,0.025937
20,P18031,EFEQIDK,1,1,0.500,435,0.022989
24,P33176,SAEIDSDDTGGSAAQK,1,1,0.500,963,0.045691
...,...,...,...,...,...,...,...
209482,Q9NZ45,IVHAFDMEDLGDK,0,1,0.981,108,0.185185
209493,Q92541,K,0,1,0.981,710,0.008451
209494,Q92541,QEEEQEK,0,1,0.981,710,0.008451
209502,P00558,FHVEEEGK,0,1,0.981,417,0.019185


In [None]:
# Per-column count of missing values (every entry should be 0)
undetected_peptides.isna().sum()

Protein           0
Peptide           0
Cleavage          0
Prediction        0
Probability       0
Length            0
Quantification    0
dtype: int64

In [None]:
# Shapes of the undetected and detected tables, one per line
print(undetected_peptides.shape, detected_peptides.shape, sep='\n')

(39223, 7)
(1917, 6)


In [None]:
# Summary statistics again, now including the mapped Quantification column
undetected_peptides.describe(include='all')

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length,Quantification
count,39223,39223,39223.0,39223.0,39223.0,39223.0,39223.0
unique,659,6084,,,,,
top,Q09666,K,,,,,
freq,858,3084,,,,,
mean,,,0.172093,1.0,0.736269,889.170232,0.03322
std,,,0.377466,0.0,0.118645,945.419437,0.055352
min,,,0.0,1.0,0.5,80.0,0.000393
25%,,,0.0,1.0,0.642,379.0,0.007463
50%,,,0.0,1.0,0.779,612.0,0.018657
75%,,,0.0,1.0,0.817,1042.0,0.045671


In [None]:
# Discard peptide sequences shared between proteins: keep a row only when its
# Peptide is associated with exactly one distinct Protein.
undetected_peptides_clean = undetected_peptides[
    undetected_peptides.groupby('Peptide')['Protein'].transform('nunique') == 1
]
undetected_peptides_clean.shape

(34515, 7)

In [None]:
# Summary statistics after removing peptides that map to several proteins
undetected_peptides_clean.describe(include='all')

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length,Quantification
count,34515,34515,34515.0,34515.0,34515.0,34515.0,34515.0
unique,659,6052,,,,,
top,Q09666,APFDLFENK,,,,,
freq,731,153,,,,,
mean,,,0.188527,1.0,0.736493,890.411415,0.031413
std,,,0.391138,0.0,0.118288,934.541117,0.044673
min,,,0.0,1.0,0.5,80.0,0.000393
25%,,,0.0,1.0,0.644,382.0,0.007394
50%,,,0.0,1.0,0.779,624.0,0.018033
75%,,,0.0,1.0,0.816,1046.0,0.044855


In [None]:
# Collapse repeated (Peptide, Protein) rows to their first occurrence and
# renumber the index from 0.
undetected_peptides_clean = (
    undetected_peptides_clean
    .drop_duplicates(subset=['Peptide', 'Protein'], keep="first")
    .reset_index(drop=True)
)
undetected_peptides_clean.describe(include="all")

Unnamed: 0,Protein,Peptide,Cleavage,Prediction,Probability,Length,Quantification
count,6052,6052,6052.0,6052.0,6052.0,6052.0,6052.0
unique,659,6052,,,,,
top,Q92616,EFEQIDK,,,,,
freq,73,1,,,,,
mean,,,0.079808,1.0,0.635305,847.549736,0.030239
std,,,0.271019,0.0,0.116417,799.114566,0.042953
min,,,0.0,1.0,0.5,80.0,0.000393
25%,,,0.0,1.0,0.523,379.75,0.007676
50%,,,0.0,1.0,0.59,620.0,0.017167
75%,,,0.0,1.0,0.756,1014.0,0.040548


In [None]:
# Number of detected_peptides rows whose protein has no entry in the cleaned
# undetected table
int((~detected_peptides["Protein"].isin(undetected_peptides_clean["Protein"])).sum())

573

In [None]:
# Show the detected rows whose protein is absent from undetected_peptides_clean
detected_peptides[~detected_peptides["Protein"].isin(undetected_peptides_clean["Protein"])]

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
2,P62851,DKLNNLVLFDK,0.004787,,10,
13,P31949,NQKDPGVLDR,0.004787,,27,
17,Q8WXX5,QKEMDNFLAQMEAK,0.004787,,13,
21,P09669,NYDVMKDFEEMR,0.004787,,41,
22,Q5VTE6,NWEYICSHDKEK,0.004787,,17,
...,...,...,...,...,...,...
1907,P30414,VRPGSLFDEVR,0.195430,,4,
1909,C9JU14,YLEDRTVR,0.210640,,1,
1910,Q9Y490,DLQEVKAAAR,0.212410,,67,
1911,Q9NTZ6,DEATAAVIDLNDRPIGSR,0.227690,,12,


In [None]:
# Re-check: number of detected_peptides rows (PSMs) whose Protein occurs in fasta_peptides
int(detected_peptides["Protein"].isin(fasta_peptides["Protein"]).sum())

1344

In [None]:
# Final (rows, columns) of the table that gets exported
undetected_peptides_clean.shape

(6052, 7)

# 4. Export a new data frame (undetected peptides) as a TSV file

In [None]:
# Write the cleaned undetected peptides to Drive as a tab-separated file,
# leaving out the DataFrame index.
undetected_peptides_clean.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/peptide ml detection/data/undetected_peptides_NSAF.tsv",
    index=False,
    sep='\t',
)