In [1]:
import numpy as np
import pandas as pd
import os

from scipy import stats

import sys
sys.path.append('/home/galina/PythonProjects/common_functions')
import peptides as pep

# Peptide data cleaning

In [3]:
data = pd.read_csv('EP signals from dIm EP and dR RT.csv', header = 0)

drop_pep = ['X']

drop_col = ['Column',
            'Row',
            'Peptide SD',
            'Peptide Median',
            'Peptide Max',
            'Peptide Min',
            'Streptavidin SD',
            'Streptavidin Median',
            'Streptavidin Max',
            'Streptavidin Min',
            'Baseline Mean',
            'Dissociation SNR']
data.rename(columns = {'Index' : 'Grid Index',
                       'Label' : 'Peptide'}, inplace = True)

# Add 'Bad' columns
bad_indices = [int(x) for x in np.genfromtxt("bad spots.txt", delimiter="\n")]
binary_bad = [False] * len(data)
for i in bad_indices:
    binary_bad[i] = True

data['Bad EP']         = binary_bad
data['Bad RT']         = (data['Association SNR'].isnull())
data['Peptide Length'] = data['Peptide'].map(pep.weight)
data['Peptide Weight'] = data['Peptide'].map(lambda x : len(str(x)))

data.drop(drop_col, axis = 1, inplace = True)
data.head()

Unnamed: 0,Grid Index,Peptide,Peptide Readout,Peptide Mean,Peptide MAD,Streptavidin Readout,Streptavidin Mean,Streptavidin MAD,Baseline Noise,Association Mean,Dissociation Mean,Association SNR,Bad EP,Bad RT,Peptide Length,Peptide Weight
0,0,WTHPQFAT,0.000387,0.000899,0.000241,0.001759,0.001738,0.000143,0.000211,0.001505,0.001104,7.134546,False,False,987.083,8
1,1,WTHPQFAT,0.00036,0.000606,0.000338,0.001839,0.001741,0.000107,0.000263,0.000903,0.000765,3.429684,False,False,987.083,8
2,2,WTHLQFAT,0.000223,0.000304,0.000148,-1.8e-05,5e-06,9.5e-05,0.000237,-6.6e-05,7.2e-05,-0.280433,False,False,1003.126,8
3,3,NGQFQVWIPLAQK,0.002042,0.001883,0.000265,-0.000196,-6.1e-05,0.000107,0.000237,-0.00012,-0.000145,-0.504192,False,False,1528.777,13
4,4,LQLHPQAGK,0.003436,0.003312,0.000187,0.000476,0.00058,0.000101,0.000237,0.000639,0.000319,2.693665,False,False,991.16,9


In [6]:
# remove NaNs, Xs and bad spots
data = data.dropna()
data = data[ data['Peptide'] != 'X' ]
data = data[ data['Bad'] == 0 ]

data.sort('Peptide', inplace = True)
data = data.reset_index(drop = True)

data.head()

Unnamed: 0,Peptide,Peptide Length,Peptide Signal,Streptavidin Signal,aMyc Signal,aMyc Association Signal,aMyc Dissociation Signal,Bad
0,AGQFQVWIPGAQK,13,0.00253,1.1e-05,9.2e-05,0.000125,6.8e-05,0
1,AGQFQVWIPGAQK,13,0.00279,-6e-05,5.4e-05,0.00013,2.6e-05,0
2,AGQFQVWIPGAQK,13,0.00336,-1.7e-05,0.000147,0.000193,0.000205,0
3,AGQFQVWIPGAQK,13,0.00255,-4.4e-05,0.000129,5.2e-05,9e-05,0
4,AGQFQVWIPGAQK,13,0.00187,2.8e-05,5.3e-05,0.000144,0.000139,0


# Calculate mean and standard deviation by peptide

In [7]:
grouped = data.groupby(['Peptide',
                        'Peptide Length'])                       

f = {'Peptide Signal'           : [np.mean, np.std, stats.sem, len],
     'Streptavidin Signal'      : [np.mean, np.std, stats.sem],
     'aMyc Signal'              : [np.mean, np.std, stats.sem],
     'aMyc Association Signal'  : [np.mean, np.std, stats.sem],
     'aMyc Dissociation Signal' : [np.mean, np.std, stats.sem]}     

In [8]:
result = grouped.agg(f)

# result.columns = result.columns.droplevel()
result.reset_index(inplace = True)

result.columns = ['Peptide',
                  'Peptide Length',    
                  
                  'Streptavidin Signal Mean',
                  'Streptavidin Signal SD',
                  'Streptavidin Signal SE',

                  'aMyc Dissociation Signal Mean',
                  'aMyc Dissociation Signal SD',
                  'aMyc Dissociation Signal SE',

                  'aMyc Association Signal Mean',
                  'aMyc Association Signal SD',
                  'aMyc Association Signal SE', 
                    
                  'aMyc Signal Mean',
                  'aMyc Signal SD',
                  'aMyc Signal SE',
                   
                  'Peptide Signal Mean',
                  'Peptide Signal SD',
                  'Peptide Signal SE',
                  
                  'Number of Spots']


result['Streptavidin Signal Mean NN'] = abs( result['Streptavidin Signal Mean']
                                          * (result['Streptavidin Signal Mean'] > 0) )
result['Peptide Signal Mean NN'] = abs( result['Peptide Signal Mean']
                                     * (result['Peptide Signal Mean'] > 0) )
result['aMyc Signal Mean NN'] = abs( result['aMyc Signal Mean']
                                  * (result['aMyc Signal Mean'] > 0) )
result['aMyc Association Signal Mean NN'] = abs( result['aMyc Association Signal Mean']
                                              * (result['aMyc Association Signal Mean'] > 0) )
result['aMyc Dissociation Signal Mean NN'] = abs( result['aMyc Dissociation Signal Mean']
                                               * (result['aMyc Dissociation Signal Mean'] > 0) )

result.head()

Unnamed: 0,Peptide,Peptide Length,Streptavidin Signal Mean,Streptavidin Signal SD,Streptavidin Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal SD,...,aMyc Signal SE,Peptide Signal Mean,Peptide Signal SD,Peptide Signal SE,Number of Spots,Streptavidin Signal Mean NN,Peptide Signal Mean NN,aMyc Signal Mean NN,aMyc Association Signal Mean NN,aMyc Dissociation Signal Mean NN
0,AGQFQVWIPGAQK,13,-2e-06,4.3e-05,1e-05,0.000165,8.1e-05,1.8e-05,0.000205,9.1e-05,...,1.1e-05,0.002625,0.000342,7.7e-05,20,0.0,0.002625,0.000129,0.000205,0.000165
1,AISPPPK,7,-1.8e-05,5.1e-05,5e-06,0.000135,0.000118,1.2e-05,0.00011,0.000132,...,4e-06,0.002833,0.000502,5.1e-05,98,0.0,0.002833,9.7e-05,0.00011,0.000135
2,AKFPIPLGKQSG,12,-3.1e-05,3e-05,1.5e-05,0.000207,3.7e-05,1.8e-05,0.000207,9.1e-05,...,2e-05,0.00291,0.000384,0.000192,4,0.0,0.00291,0.000105,0.000207,0.000207
3,AQWHPQAGK,9,0.000783,8.7e-05,4.3e-05,0.000101,7.7e-05,3.8e-05,6e-05,7.7e-05,...,7e-06,0.002933,0.00031,0.000155,4,0.000783,0.002933,4.7e-05,6e-05,0.000101
4,ATHPQFAT,8,0.000987,0.000136,6.8e-05,-5.3e-05,6.6e-05,3.3e-05,-0.000132,0.000143,...,1.7e-05,0.001607,0.000362,0.000181,4,0.000987,0.001607,0.0,0.0,0.0


# Add weight, and sort

In [9]:
result['Peptide Weight'] = result['Peptide'].map(lambda x: weight(x, weights))
result.sort(['Peptide Length', 'Peptide'], inplace = True)
result = result.reset_index(drop = True)

result.head()

Unnamed: 0,Peptide,Peptide Length,Streptavidin Signal Mean,Streptavidin Signal SD,Streptavidin Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal SD,...,Peptide Signal Mean,Peptide Signal SD,Peptide Signal SE,Number of Spots,Streptavidin Signal Mean NN,Peptide Signal Mean NN,aMyc Signal Mean NN,aMyc Association Signal Mean NN,aMyc Dissociation Signal Mean NN,Peptide Weight
0,AISPPPK,7,-1.8e-05,5.1e-05,5e-06,0.000135,0.000118,1.2e-05,0.00011,0.000132,...,0.002833,0.000502,5.1e-05,98,0.0,0.002833,9.7e-05,0.00011,0.000135,708
1,THPQFAT,7,0.000742,0.000101,5.8e-05,8e-06,5.5e-05,3.2e-05,2.4e-05,6.3e-05,...,0.001647,0.000257,0.000148,3,0.000742,0.001647,0.0,2.4e-05,8e-06,801
2,WHPQFAT,7,0.00061,5.1e-05,2.5e-05,-1.6e-05,9.4e-05,4.7e-05,-4e-06,6.8e-05,...,0.000924,0.000134,6.7e-05,4,0.00061,0.000924,0.0,0.0,0.0,886
3,WTHPFAT,7,-2e-06,5.7e-05,2.9e-05,-4.7e-05,6.3e-05,3.2e-05,-0.000158,7.3e-05,...,0.001179,0.000203,0.000101,4,0.0,0.001179,1.3e-05,0.0,0.0,858
4,WTHPQAT,7,0.000214,4.5e-05,2.6e-05,1e-06,0.000168,9.7e-05,-5.3e-05,0.000216,...,0.0018,0.000416,0.00024,3,0.000214,0.0018,0.0,0.0,1e-06,840


In [10]:
# rearrange order of columns
result = result.reindex_axis(['Peptide',
                              'Number of Spots',
                              'Peptide Length',
                              'Peptide Weight',
                              
                              'Peptide Signal Mean',
                              'Peptide Signal Mean NN',
                              'Peptide Signal SD',
                              'Peptide Signal SE',
                              
                              'Streptavidin Signal Mean',
                              'Streptavidin Signal Mean NN',
                              'Streptavidin Signal SD',
                              'Streptavidin Signal SE',
                              
                              'aMyc Signal Mean',
                              'aMyc Signal Mean NN',
                              'aMyc Signal SD',
                              'aMyc Signal SE',              
                              
                              'aMyc Dissociation Signal Mean',
                              'aMyc Dissociation Signal Mean NN',
                              'aMyc Dissociation Signal SD',
                              'aMyc Dissociation Signal SE',

                              'aMyc Association Signal Mean',
                              'aMyc Association Signal Mean NN',
                              'aMyc Association Signal SD',
                              'aMyc Association Signal SE'], axis=1)
result.head()

Unnamed: 0,Peptide,Number of Spots,Peptide Length,Peptide Weight,Peptide Signal Mean,Peptide Signal Mean NN,Peptide Signal SD,Peptide Signal SE,Streptavidin Signal Mean,Streptavidin Signal Mean NN,...,aMyc Signal SD,aMyc Signal SE,aMyc Dissociation Signal Mean,aMyc Dissociation Signal Mean NN,aMyc Dissociation Signal SD,aMyc Dissociation Signal SE,aMyc Association Signal Mean,aMyc Association Signal Mean NN,aMyc Association Signal SD,aMyc Association Signal SE
0,AISPPPK,98,7,708,0.002833,0.002833,0.000502,5.1e-05,-1.8e-05,0.0,...,4.2e-05,4e-06,0.000135,0.000135,0.000118,1.2e-05,0.00011,0.00011,0.000132,1.3e-05
1,THPQFAT,3,7,801,0.001647,0.001647,0.000257,0.000148,0.000742,0.000742,...,7e-06,4e-06,8e-06,8e-06,5.5e-05,3.2e-05,2.4e-05,2.4e-05,6.3e-05,3.7e-05
2,WHPQFAT,4,7,886,0.000924,0.000924,0.000134,6.7e-05,0.00061,0.00061,...,2.4e-05,1.2e-05,-1.6e-05,0.0,9.4e-05,4.7e-05,-4e-06,0.0,6.8e-05,3.4e-05
3,WTHPFAT,4,7,858,0.001179,0.001179,0.000203,0.000101,-2e-06,0.0,...,3.2e-05,1.6e-05,-4.7e-05,0.0,6.3e-05,3.2e-05,-0.000158,0.0,7.3e-05,3.6e-05
4,WTHPQAT,3,7,840,0.0018,0.0018,0.000416,0.00024,0.000214,0.000214,...,1.2e-05,7e-06,1e-06,1e-06,0.000168,9.7e-05,-5.3e-05,0.0,0.000216,0.000125


In [11]:
result.to_csv('07 EP dIm Signals.csv', index = False)