In [1]:
# importing packages

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re # regular expressions

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (this is why cells below can show e.g. both a head() preview and a shape).
InteractiveShell.ast_node_interactivity = "all"

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [2]:
# Load the combined results workbook into a DataFrame.
df = pd.read_excel(io=r'2020.01.12_Combined_Ti_1.xlsx')

# Preview the first rows, then report the (rows, columns) dimensions.
df.head(n=3)
df.shape

Unnamed: 0,Row,User Flag,OK,Cmpd.,m/z meas.,Mr calc.,z,Δ m/z [ppm],RMS90 [ppm],Rt [min],...,#Cmpds.,Site,Rank,P,Sequence,Modifications,Type,Range,Accession,Protein
0,1,,True,9938,508.753911,1523.610258,3,1.134868,5.145389,26.586875,...,45,99.966283,1,0,K.TVDMESTEVFTK.K,Acetyl: 1; Oxidation: 4; Phospho: 6,CID,153 - 164,CASA2_BOVIN,Alpha-S2-casein OS=Bos taurus GN=CSN1S2 PE=1 SV=2
1,2,,True,17927,554.150198,1659.786917,3,0.068551,4.172535,37.91885,...,678,,1,0,K.VPQLEIVPNSAEER.L,Phospho: 10,CID,121 - 134,CASA1_BOVIN,Alpha-S1-casein OS=Bos taurus GN=CSN1S1 PE=1 SV=2
2,3,,True,17891,554.150278,1659.786917,3,0.212885,4.125017,37.86375,...,678,,1,0,K.VPQLEIVPNSAEER.L,Phospho: 10,CID,121 - 134,CASA1_BOVIN,Alpha-S1-casein OS=Bos taurus GN=CSN1S1 PE=1 SV=2


(2541, 21)

In [3]:
# Keep only the columns needed for the modification analysis.
PTMS = df[
    [
        "Site",
        "Sequence",
        "Modifications",
        "Accession",
    ]
]

PTMS.head(n=3)

Unnamed: 0,Site,Sequence,Modifications,Accession
0,99.966283,K.TVDMESTEVFTK.K,Acetyl: 1; Oxidation: 4; Phospho: 6,CASA2_BOVIN
1,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN
2,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN


A Phospho modification seems to be present every time.

Let's get the distribution of amino acids.

In [4]:
# Derive a cleaned sequence: keep only the text between the first and second
# '.' of each Sequence value (e.g. "K.TVDMESTEVFTK.K" -> "TVDMESTEVFTK").
PTMS = PTMS.assign(
    cleaned_sequence=lambda frame: frame.Sequence.apply(
        lambda raw_sequence: raw_sequence.split('.')[1]
    )
)
PTMS.head(n=3)

Unnamed: 0,Site,Sequence,Modifications,Accession,cleaned_sequence
0,99.966283,K.TVDMESTEVFTK.K,Acetyl: 1; Oxidation: 4; Phospho: 6,CASA2_BOVIN,TVDMESTEVFTK
1,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER
2,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER


In [5]:
def capture_phospho_positions(modification):
    """
    Extract the positions of the Phospho component(s) of a modification string.

    The input is a ';'-separated description such as
    "Acetyl: 1; Oxidation: 4; Phospho: 6". Only entries containing the text
    'Phospho' are considered; every other modification is ignored.

    The positions in the string are 1-based. The returned positions are
    converted to 0-based indices (value - 1) so they can be used directly to
    index into the sequence.

    Parameters
    ----------
    modification : str
        Modification description, e.g. "Acetyl: 1; Oxidation: 4; Phospho: 6".

    Returns
    -------
    list of int
        Zero-based Phospho positions, in the order they appear. An empty list
        is returned when the description contains no Phospho entry.

    Raises
    ------
    ValueError
        If a Phospho position is not an integer (e.g. "Phospho: x").

    Example:

    example = "Acetyl: 1; Oxidation: 4; Phospho: 6"
    capture_phospho_positions(example)
    >>> [5]

    example_2 = "Acetyl: 5; Oxidation: 2; Phospho: 6, 9"
    capture_phospho_positions(example_2)
    >>> [5, 8]

    capture_phospho_positions("Acetyl: 1")
    >>> []
    """
    positions_list = []

    for event in modification.split(';'):
        if 'Phospho' not in event:
            continue

        # drop the label: keep everything after the first ':'
        positions_str = event[event.find(':') + 1:]
        # remove whitespace, split the comma-separated positions and shift
        # them from 1-based to 0-based
        positions_list.extend(
            int(position) - 1
            for position in positions_str.replace(" ", "").split(',')
        )

    return positions_list

In [6]:
# 'Letras' holds, per row, the characters of cleaned_sequence found at the
# Phospho positions reported in the Modifications column.
PTMS['Letras'] = PTMS.apply(
    lambda row: np.array(list(row.cleaned_sequence))[
        capture_phospho_positions(row.Modifications)
    ],
    axis=1,
)

# Display (without storing) the frame with per-row counts: the total number of
# captured characters and how many of them are 'S', 'T' and 'Y'.
PTMS.assign(
    TotalLetras=PTMS.Letras.apply(len),
    S=PTMS.Letras.apply(lambda letras: sum(1 for letra in letras if letra == 'S')),
    T=PTMS.Letras.apply(lambda letras: sum(1 for letra in letras if letra == 'T')),
    Y=PTMS.Letras.apply(lambda letras: sum(1 for letra in letras if letra == 'Y')),
)

Unnamed: 0,Site,Sequence,Modifications,Accession,cleaned_sequence,Letras,TotalLetras,S,T,Y
0,99.966283,K.TVDMESTEVFTK.K,Acetyl: 1; Oxidation: 4; Phospho: 6,CASA2_BOVIN,TVDMESTEVFTK,[S],1,1,0,0
1,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER,[S],1,1,0,0
2,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER,[S],1,1,0,0
3,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER,[S],1,1,0,0
4,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER,[S],1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
2536,,K.YKVPQLEIVPNSAEER.L,Phospho: 12,CASA1_BOVIN,YKVPQLEIVPNSAEER,[S],1,1,0,0
2537,,K.YKVPQLEIVPNSAEER.L,Phospho: 12,CASA1_BOVIN,YKVPQLEIVPNSAEER,[S],1,1,0,0
2538,,K.VPQLEIVPNSAEER.L,Phospho: 10,CASA1_BOVIN,VPQLEIVPNSAEER,[S],1,1,0,0
2539,,K.YKVPQLEIVPNSAEER.L,Phospho: 12,CASA1_BOVIN,YKVPQLEIVPNSAEER,[S],1,1,0,0
