In [22]:
import numpy as np
import pandas as pd
import os

from scipy import stats

import sys
sys.path.append('/home/galina/PythonProjects/common_functions')
import peptides as pep

# Peptide data cleaning

In [23]:
# Load the raw per-spot table; the first CSV row holds the column names.
data = pd.read_csv('EP signals from dIm EP and dR RT.csv', header=0)

# Columns that are not needed downstream. The per-spot medians
# ('Peptide Median', 'Streptavidin Median') are deliberately NOT listed here:
# the per-peptide statistics later in the notebook are computed from them.
drop_col = ['Column',
            'Row',
            'Peptide SD',
            'Peptide Max',
            'Peptide Min',
            'Streptavidin SD',
            'Streptavidin Max',
            'Streptavidin Min',
            'Baseline Mean',
            'Dissociation SNR']

data = data.rename(columns={'Index': 'Grid Index',
                            'Label': 'Peptide'})

# Add 'Bad' columns.
# "bad spots.txt" is read as one number per line. np.atleast_1d guards the
# single-entry case, where genfromtxt returns a 0-d array that cannot be
# iterated.
bad_indices = [int(x) for x in
               np.atleast_1d(np.genfromtxt("bad spots.txt", delimiter="\n"))]

# The indices are row positions in `data` (not 'Grid Index' values). As with
# plain list indexing, negative values count from the end and out-of-range
# values raise IndexError.
binary_bad = np.zeros(len(data), dtype=bool)
binary_bad[bad_indices] = True

data['Bad EP'] = binary_bad
# A spot with no 'Association SNR' value is flagged as bad for RT.
data['Bad RT'] = data['Association SNR'].isnull()
data['Peptide Weight'] = data['Peptide'].map(pep.weight)
# str() turns a missing label (NaN) into 'nan', i.e. length 3; rows without a
# peptide label are removed in the next cell.
data['Peptide Length'] = data['Peptide'].map(lambda x: len(str(x)))

data = data.drop(drop_col, axis=1)
data.head()

Unnamed: 0,Grid Index,Peptide,Peptide Readout,Peptide Mean,Peptide Median,Peptide MAD,Streptavidin Readout,Streptavidin Mean,Streptavidin Median,Streptavidin MAD,Baseline Noise,Association Mean,Dissociation Mean,Association SNR,Bad EP,Bad RT,Peptide Weight,Peptide Length
0,0,WTHPQFAT,0.000387,0.000899,0.000924,0.000241,0.001759,0.001738,0.001743,0.000143,0.000211,0.001505,0.001104,7.134546,False,False,987.083,8
1,1,WTHPQFAT,0.00036,0.000606,0.000659,0.000338,0.001839,0.001741,0.001741,0.000107,0.000263,0.000903,0.000765,3.429684,False,False,987.083,8
2,2,WTHLQFAT,0.000223,0.000304,0.000311,0.000148,-1.8e-05,5e-06,2.4e-05,9.5e-05,0.000237,-6.6e-05,7.2e-05,-0.280433,False,False,1003.126,8
3,3,NGQFQVWIPLAQK,0.002042,0.001883,0.00197,0.000265,-0.000196,-6.1e-05,-3.5e-05,0.000107,0.000237,-0.00012,-0.000145,-0.504192,False,False,1528.777,13
4,4,LQLHPQAGK,0.003436,0.003312,0.003363,0.000187,0.000476,0.00058,0.000593,0.000101,0.000237,0.000639,0.000319,2.693665,False,False,991.16,9


In [24]:
# Remove NaNs, Xs and bad spots, and keep only the peptide lengths of interest.
lengths = [8, 9, 12, 13]

keep = (data['Peptide'].notnull()                # spot has a peptide label
        & (data['Peptide'] != 'X')               # label is not the 'X' placeholder
        & ~data['Bad EP'].astype(bool)           # spot is not flagged as bad for EP
        & data['Peptide Length'].isin(lengths))

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Rows with equal keys keep their original relative order.
data = (data[keep]
        .sort_values(['Peptide Length', 'Peptide'])
        .reset_index(drop=True))

data.head()

Unnamed: 0,Grid Index,Peptide,Peptide Readout,Peptide Mean,Peptide Median,Peptide MAD,Streptavidin Readout,Streptavidin Mean,Streptavidin Median,Streptavidin MAD,Baseline Noise,Association Mean,Dissociation Mean,Association SNR,Bad EP,Bad RT,Peptide Weight,Peptide Length
0,527,ATHPQFAT,0.00112,0.001038,0.001142,0.00019,0.001197,0.001186,0.001182,0.000122,0.000266,0.000825,0.000553,3.099982,False,False,871.949,8
1,734,ATHPQFAT,0.001381,0.001484,0.001501,0.000182,0.001503,0.001451,0.001487,0.000144,0.000185,0.001086,0.000779,5.865785,False,False,871.949,8
2,1033,ATHPQFAT,0.001809,0.00144,0.001495,0.000217,0.001348,0.00164,0.001667,0.000143,0.000112,0.001556,0.001509,13.933146,False,False,871.949,8
3,1668,ATHPQFAT,0.001379,0.001355,0.001397,0.000195,0.001651,0.001417,0.001444,0.000115,0.000159,0.00083,0.000804,5.234234,False,False,871.949,8
4,793,CTHPQFAT,0.000929,0.001066,0.001053,0.000129,0.001015,0.001186,0.001179,0.000193,0.00016,0.001053,0.000747,6.570487,False,False,904.009,8


# Calculate mean and standard deviation by peptide

In [25]:
# Replicate spots of the same peptide form one group. Length and weight are
# both derived from the sequence, so they should not split the groups any
# further; listing them as keys carries them through to the summary table.
group_keys = ['Peptide', 'Peptide Length', 'Peptide Weight']
grouped = data.groupby(group_keys)

# Statistics computed per group from the per-spot medians: mean and standard
# error for both signals, plus the number of rows in the group (len), taken
# from the peptide column.
f = dict([
    ('Peptide Median',      [np.mean, stats.sem, len]),
    ('Streptavidin Median', [np.mean, stats.sem]),
])

In [26]:
# Aggregate each group, then turn the group keys back into ordinary columns.
result = grouped.agg(f).reset_index()

# Give the columns flat, readable names. The assignment is positional: the
# three group keys come first, followed by the statistics in the order they
# are listed in `f` (peptide mean / SE / count, then streptavidin mean / SE).
key_cols          = ['Peptide', 'Peptide Length', 'Peptide Weight']
peptide_cols      = ['Peptide Mean', 'Peptide SE', 'Number of Spots']
streptavidin_cols = ['Streptavidin Mean', 'Streptavidin SE']
result.columns = key_cols + peptide_cols + streptavidin_cols

# 'NN' variants: the same means with negative values replaced by 0
# (presumably "non-negative").
for source_col in ('Streptavidin Mean', 'Peptide Mean'):
    result[source_col + ' NN'] = result[source_col].clip(lower=0)

result.head()

Unnamed: 0,Peptide,Peptide Length,Peptide Weight,Peptide Mean,Peptide SE,Number of Spots,Streptavidin Mean,Streptavidin SE,Streptavidin Mean NN,Peptide Mean NN
0,AGQFQVWIPGAQK,13,1429.644,0.002517,0.000109,20,-1.5e-05,1.3e-05,0.0,0.002517
1,AKFPIPLGKQSG,12,1242.486,0.003058,0.000181,4,-5.9e-05,1.6e-05,0.0,0.003058
2,AQWHPQAGK,9,1022.132,0.003245,0.000227,4,0.001123,5.3e-05,0.001123,0.003245
3,ATHPQFAT,8,871.949,0.001384,8.4e-05,4,0.001445,0.0001,0.001445,0.001384
4,CGQFQVWIPGAQK,13,1461.704,0.002316,0.00011,19,-1.9e-05,2.2e-05,0.0,0.002316


# Sort by peptide length and sequence (weight is already included as a grouping key)

In [27]:
# Order the summary by peptide length, then by sequence, and renumber the rows.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
result = (result
          .sort_values(['Peptide Length', 'Peptide'])
          .reset_index(drop=True))

result.head()

Unnamed: 0,Peptide,Peptide Length,Peptide Weight,Peptide Mean,Peptide SE,Number of Spots,Streptavidin Mean,Streptavidin SE,Streptavidin Mean NN,Peptide Mean NN
0,ATHPQFAT,8,871.949,0.001384,8.4e-05,4,0.001445,0.0001,0.001445,0.001384
1,CTHPQFAT,8,904.009,0.001125,3.5e-05,4,0.001317,0.000129,0.001317,0.001125
2,DTHPQFAT,8,915.959,0.001391,6.2e-05,4,0.001412,7.3e-05,0.001412,0.001391
3,DYKDDDDK,8,1012.984,0.00581,8.7e-05,99,-8.6e-05,9e-06,0.0,0.00581
4,ETHPQFAT,8,929.986,0.00174,0.000131,4,0.001277,6.4e-05,0.001277,0.00174


In [28]:
# Rearrange order of columns: identifying information first, then the
# peptide statistics, then the streptavidin statistics.
column_order = ['Peptide',
                'Number of Spots',
                'Peptide Length',
                'Peptide Weight',

                'Peptide Mean',
                'Peptide Mean NN',
                'Peptide SE',

                'Streptavidin Mean',
                'Streptavidin Mean NN',
                'Streptavidin SE']

# DataFrame.reindex_axis() was removed from pandas; reindex(columns=...) does
# the same thing. A name in the list that is missing from the frame becomes an
# all-NaN column instead of raising.
result = result.reindex(columns=column_order)
result.head()

Unnamed: 0,Peptide,Number of Spots,Peptide Length,Peptide Weight,Peptide Mean,Peptide Mean NN,Peptide SE,Streptavidin Mean,Streptavidin Mean NN,Streptavidin SE
0,ATHPQFAT,4,8,871.949,0.001384,0.001384,8.4e-05,0.001445,0.001445,0.0001
1,CTHPQFAT,4,8,904.009,0.001125,0.001125,3.5e-05,0.001317,0.001317,0.000129
2,DTHPQFAT,4,8,915.959,0.001391,0.001391,6.2e-05,0.001412,0.001412,7.3e-05
3,DYKDDDDK,99,8,1012.984,0.00581,0.00581,8.7e-05,-8.6e-05,0.0,9e-06
4,ETHPQFAT,4,8,929.986,0.00174,0.00174,0.000131,0.001277,0.001277,6.4e-05


In [29]:
# Write the per-peptide summary table to disk. The row index is just a running
# number after reset_index(drop=True), so it is left out of the file.
output_path = '12 EP dIm Signals.csv'
result.to_csv(output_path, index=False)

In [33]:
# result[result['Peptide'] == 'NCQFQVWIPGAQK'] 

In [34]:
# data[data['Peptide'] == 'NCQFQVWIPGAQK'] 