In [34]:
import numpy as np
import pandas as pd
import os

from scipy import stats

os.chdir('/home/galina/PycharmProjects/Peptide Signal')

# Helper function, finds weight of aminoacid

In [35]:
def weight(name, dictionary):
    w = 0
    for e in name:
        w = w + dictionary[e]
    return int(w)

# Aminoacid weights to dictionary

In [36]:
amino = pd.read_csv('Aminoacids.csv', header = 0)
amino.head()

Unnamed: 0,Aminoacid,1-Letter Code,"Molecular weight, kDa"
0,Alanine,A,89
1,Arginine,R,174
2,Asparagine,N,132
3,Aspartic,D,133
4,Cysteine,C,121


In [37]:
amino.drop('Aminoacid', axis = 1, inplace = True)
amino.columns = ['Code', 'Weight']
weights = amino.set_index('Code')['Weight'].to_dict()
weights

{nan: nan,
 'A': 89.0,
 'C': 121.0,
 'D': 133.0,
 'E': 146.0,
 'F': 165.0,
 'G': 75.0,
 'H': 155.0,
 'I': 131.0,
 'K': 146.0,
 'L': 131.0,
 'M': 149.0,
 'N': 132.0,
 'P': 115.0,
 'Q': 147.0,
 'R': 174.0,
 'S': 105.0,
 'T': 119.0,
 'V': 117.0,
 'W': 204.0,
 'Y': 181.0}

# Peptide data cleaning

In [38]:
data = pd.read_csv('Peptides.csv', header = 0)

data.drop(['Grid Index',
           'Peptide Signal Uniformity',
           'Streptavidin Signal Uniformity',
           
           'Streptavidin Association Signal (Saturated)',
           'Streptavidin Dissociation Signal (Saturated)',
           
           'Tag',
           'Alignment',
           'Empty',
           'Thrombin',
           'PSA',
           'StrepTag', 
           'Myc', 
           'FLAG',
           'NA', 
           'Epitope', 
           'Bad Washed',
           'Bad Strep',
           'aMyc Signal Uniformity'], axis = 1, inplace = True)

# remove data with 'Bad Washed' flag
# data = data[ data['Bad Washed'] == 0 ]
# data = data[ data['Bad Strep'] == 0 ]

# data.drop(['Bad Washed', 'Bad Strep'], axis = 1, inplace = True)

data.head()

Unnamed: 0,Peptide,Peptide Length,Peptide Signal,Streptavidin Signal,aMyc Association Signal,aMyc Dissociation Signal,aMyc Signal
0,WTHPQFAT,8,0.000642,0.00118,8.6e-05,6e-06,-1.5e-05
1,WTHPQFAT,8,0.00113,0.00121,-0.000195,-6e-05,-1.3e-05
2,WTHLQFAT,8,0.000244,2.2e-05,1.9e-05,-3.5e-05,5.8e-05
3,NGQFQVWIPLAQK,13,0.00249,-2.4e-05,0.000547,0.000392,0.000177
4,LQLHPQAGK,9,0.00311,0.000408,8.6e-05,9.5e-05,0.000104


In [39]:
# remove NaNs and Xs
data = data.dropna()
data = data[ data['Peptide'] != 'X' ]

data.sort('Peptide', inplace = True)
data = data.reset_index(drop = True)

data.head()

Unnamed: 0,Peptide,Peptide Length,Peptide Signal,Streptavidin Signal,aMyc Association Signal,aMyc Dissociation Signal,aMyc Signal
0,AGQFQVWIPGAQK,13,0.00253,1.1e-05,0.000125,6.8e-05,9.2e-05
1,AGQFQVWIPGAQK,13,0.00279,-6e-05,0.00013,2.6e-05,5.4e-05
2,AGQFQVWIPGAQK,13,0.00336,0.00104,-8.7e-05,8.4e-05,-1.9e-05
3,AGQFQVWIPGAQK,13,0.00255,-3.7e-05,-5.7e-05,3.2e-05,6.8e-05
4,AGQFQVWIPGAQK,13,0.00187,4e-06,0.000182,0.000182,4.6e-05


# Calculate mean and standard deviation by peptide

In [40]:
grouped = data.groupby(['Peptide',
                        'Peptide Length'])                       

f = {'Peptide Signal'           : [np.mean, np.std, stats.sem, len],
     'Streptavidin Signal'      : [np.mean, np.std, stats.sem],
     'aMyc Signal'              : [np.mean, np.std, stats.sem],
     'aMyc Association Signal'  : [np.mean, np.std, stats.sem],
     'aMyc Dissociation Signal' : [np.mean, np.std, stats.sem]}     

In [41]:
result = grouped.agg(f)

# result.columns = result.columns.droplevel()
result.reset_index(inplace = True)

result.columns = ['Peptide',
                  'Peptide Length',    
                  
                  'Streptavidin Signal Mean',
                  'Streptavidin Signal SD',
                  'Streptavidin Signal SE',

                  'aMyc Dissociation Signal Mean',
                  'aMyc Dissociation Signal SD',
                  'aMyc Dissociation Signal SE',

                  'aMyc Association Signal Mean',
                  'aMyc Association Signal SD',
                  'aMyc Association Signal SE', 
                    
                  'aMyc Signal Mean',
                  'aMyc Signal SD',
                  'aMyc Signal SE',
                   
                  'Peptide Signal Mean',
                  'Peptide Signal SD',
                  'Peptide Signal SE',
                  
                  'Number of Spots']


result['Streptavidin Signal Mean NN'] = abs( result['Streptavidin Signal Mean']
                                          * (result['Streptavidin Signal Mean'] > 0) )
result['Peptide Signal Mean NN'] = abs( result['Peptide Signal Mean']
                                     * (result['Peptide Signal Mean'] > 0) )
result['aMyc Signal Mean NN'] = abs( result['aMyc Signal Mean']
                                  * (result['aMyc Signal Mean'] > 0) )
result['aMyc Association Signal Mean NN'] = abs( result['aMyc Association Signal Mean']
                                              * (result['aMyc Association Signal Mean'] > 0) )
result['aMyc Dissociation Signal Mean NN'] = abs( result['aMyc Dissociation Signal Mean']
                                               * (result['aMyc Dissociation Signal Mean'] > 0) )

result.head()

Unnamed: 0,Peptide,Peptide Length,Streptavidin Signal Mean,Streptavidin Signal SD,Streptavidin Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal SD,...,aMyc Signal SE,Peptide Signal Mean,Peptide Signal SD,Peptide Signal SE,Number of Spots,Streptavidin Signal Mean NN,Peptide Signal Mean NN,aMyc Signal Mean NN,aMyc Association Signal Mean NN,aMyc Dissociation Signal Mean NN
0,AGQFQVWIPGAQK,13,0.000125,0.000307,6.9e-05,0.000129,7.8e-05,1.7e-05,0.000141,0.000125,...,1.3e-05,0.002625,0.000342,7.7e-05,20,0.000125,0.002625,7.8025e-05,0.000141,0.000129
1,AISPPPK,7,8.3e-05,0.000252,2.5e-05,0.000209,0.000517,5.2e-05,0.000252,0.000673,...,4.3e-05,0.002837,0.000502,5e-05,100,8.3e-05,0.002837,0.0001593498,0.000252,0.000209
2,AKFPIPLGKQSG,12,0.0004,0.000516,0.000258,0.000134,0.00012,6e-05,0.000143,0.000133,...,4.4e-05,0.00291,0.000384,0.000192,4,0.0004,0.00291,7.076637e-05,0.000143,0.000134
3,AQWHPQAGK,9,0.000591,0.000707,0.000353,4.1e-05,0.000129,6.4e-05,7e-05,0.000144,...,4e-05,0.002933,0.00031,0.000155,4,0.000591,0.002933,1e-07,7e-05,4.1e-05
4,ATHPQFAT,8,0.000264,0.000489,0.000244,0.000889,0.001586,0.000793,0.00108,0.001879,...,0.000773,0.001607,0.000362,0.000181,4,0.000264,0.001607,0.00081464,0.00108,0.000889


# Add weight, and sort

In [42]:
result['Peptide Weight'] = result['Peptide'].map(lambda x: weight(x, weights))
result.sort(['Peptide Length', 'Peptide'], inplace = True)
result = result.reset_index(drop = True)

result.head()

Unnamed: 0,Peptide,Peptide Length,Streptavidin Signal Mean,Streptavidin Signal SD,Streptavidin Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal SD,...,Peptide Signal Mean,Peptide Signal SD,Peptide Signal SE,Number of Spots,Streptavidin Signal Mean NN,Peptide Signal Mean NN,aMyc Signal Mean NN,aMyc Association Signal Mean NN,aMyc Dissociation Signal Mean NN,Peptide Weight
0,AISPPPK,7,8.3e-05,0.000252,2.5e-05,0.000209,0.000517,5.2e-05,0.000252,0.000673,...,0.002837,0.000502,5e-05,100,8.3e-05,0.002837,0.000159,0.000252,0.000209,816
1,THPQFAT,7,-1.7e-05,3.4e-05,2e-05,0.000111,9.8e-05,5.7e-05,0.000113,0.000117,...,0.001647,0.000257,0.000148,3,0.0,0.001647,0.000121,0.000113,0.000111,909
2,WHPQFAT,7,0.000526,0.000605,0.000302,7.4e-05,0.000121,6e-05,0.00011,0.000149,...,0.000924,0.000134,6.7e-05,4,0.000526,0.000924,2.9e-05,0.00011,7.4e-05,994
3,WTHPFAT,7,0.000311,0.000588,0.000294,5.3e-05,7.7e-05,3.9e-05,4.7e-05,0.000207,...,0.001179,0.000203,0.000101,4,0.000311,0.001179,0.000104,4.7e-05,5.3e-05,966
4,WTHPQAT,7,4.9e-05,5.4e-05,3.1e-05,0.000118,0.000188,0.000108,0.000221,0.000179,...,0.0018,0.000416,0.00024,3,4.9e-05,0.0018,0.000108,0.000221,0.000118,948


In [43]:
# rearrange order of columns
result = result.reindex_axis(['Peptide',
                              'Number of Spots',
                              'Peptide Length',
                              'Peptide Weight',
                              
                              'Peptide Signal Mean',
                              'Peptide Signal Mean NN',
                              'Peptide Signal SD',
                              'Peptide Signal SE',
                              
                              'Streptavidin Signal Mean',
                              'Streptavidin Signal Mean NN',
                              'Streptavidin Signal SD',
                              'Streptavidin Signal SE',
                              
                              'aMyc Signal Mean',
                              'aMyc Signal Mean NN',
                              'aMyc Signal SD',
                              'aMyc Signal SE',              
                              
                              'aMyc Dissociation Signal Mean',
                              'aMyc Dissociation Signal Mean NN',
                              'aMyc Dissociation Signal SD',
                              'aMyc Dissociation Signal SE',

                              'aMyc Association Signal Mean',
                              'aMyc Association Signal Mean NN',
                              'aMyc Association Signal SD',
                              'aMyc Association Signal SE'], axis=1)
result.head()

Unnamed: 0,Peptide,Number of Spots,Peptide Length,Peptide Weight,Peptide Signal Mean,Peptide Signal Mean NN,Peptide Signal SD,Peptide Signal SE,Streptavidin Signal Mean,Streptavidin Signal Mean NN,...,aMyc Signal SD,aMyc Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal Mean NN,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal Mean NN,aMyc Association Signal SD,aMyc Association Signal SE
0,AISPPPK,100,7,816,0.002837,0.002837,0.000502,5e-05,8.3e-05,8.3e-05,...,0.00043,4.3e-05,0.000209,0.000209,0.000517,5.2e-05,0.000252,0.000252,0.000673,6.7e-05
1,THPQFAT,3,7,909,0.001647,0.001647,0.000257,0.000148,-1.7e-05,0.0,...,5.9e-05,3.4e-05,0.000111,0.000111,9.8e-05,5.7e-05,0.000113,0.000113,0.000117,6.8e-05
2,WHPQFAT,4,7,994,0.000924,0.000924,0.000134,6.7e-05,0.000526,0.000526,...,3.6e-05,1.8e-05,7.4e-05,7.4e-05,0.000121,6e-05,0.00011,0.00011,0.000149,7.4e-05
3,WTHPFAT,4,7,966,0.001179,0.001179,0.000203,0.000101,0.000311,0.000311,...,4.7e-05,2.3e-05,5.3e-05,5.3e-05,7.7e-05,3.9e-05,4.7e-05,4.7e-05,0.000207,0.000103
4,WTHPQAT,3,7,948,0.0018,0.0018,0.000416,0.00024,4.9e-05,4.9e-05,...,3.7e-05,2.1e-05,0.000118,0.000118,0.000188,0.000108,0.000221,0.000221,0.000179,0.000103


In [44]:
result.to_csv('result.csv', index = False)