In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

from scipy import stats

import sys
# Make the shared helper modules (which provide `peptides`) importable.
# The location can be overridden with the COMMON_FUNCTIONS_DIR environment
# variable so the notebook also runs on other machines; the original
# hard-coded path remains the default. `os` is imported earlier in this cell.
sys.path.append(os.environ.get('COMMON_FUNCTIONS_DIR',
                               '/home/galina/PythonProjects/common_functions'))
import peptides as pep

# Peptide data cleaning

In [3]:
# Load the per-spot signal table and give the identifier columns the names
# used throughout the rest of the notebook.
raw_signals_csv = 'EP signals from dIm EP, dR EP and dR RT.csv'
data = pd.read_csv(raw_signals_csv, header=0).rename(
    columns={'Index': 'Grid Index', 'Label': 'Peptide'})

def _flag_rows(bad_positions, n_rows):
    """Return a list of `n_rows` booleans that is True at each listed position."""
    flags = [False] * n_rows
    for position in bad_positions:
        flags[position] = True
    return flags


def _peptide_length(peptide):
    """Length of the peptide's string form (a missing value is stringified first)."""
    return len(str(peptide))


# 'Index' column of the tab-separated lists of spots marked as bad.
bad_dIm_EP = pd.read_csv('bad dIm spots.txt', sep='\t')['Index']
bad_dR_EP = pd.read_csv('bad dR spots.txt', sep='\t')['Index']

# NOTE(review): the flags are set by row *position* in `data`, not by matching
# the 'Grid Index' column -- this presumably relies on the rows being stored
# in index order starting at 0; confirm against the input file.
binary_bad_dIm = _flag_rows(bad_dIm_EP, len(data))
binary_bad_dR = _flag_rows(bad_dR_EP, len(data))

data['dIm EP Bad'] = binary_bad_dIm
data['dR EP Bad'] = binary_bad_dR
# A real-time spot is flagged as bad when its association SNR is missing.
data['dR RT Bad'] = data['dR RT Streptavidin Association SNR'].isnull()

# Per-peptide descriptors derived from the sequence string.
data['Peptide Weight'] = data['Peptide'].map(pep.weight)
data['Peptide Length'] = data['Peptide'].map(_peptide_length)

# Columns to keep, listed group by group in the order they should appear.
peptide_columns = ['Grid Index',
                   'Peptide',
                   'Peptide Length',
                   'Peptide Weight']

dIm_EP_columns = ['dIm EP Peptide Readout',
                  'dIm EP Peptide Median',
                  'dIm EP Peptide MAD',
                  'dIm EP Streptavidin Readout',
                  'dIm EP Streptavidin Median',
                  'dIm EP Streptavidin MAD',
                  'dIm EP Bad']

dR_EP_columns = ['dR EP Peptide Readout',
                 'dR EP Peptide Median',
                 'dR EP Peptide MAD',
                 'dR EP Streptavidin Readout',
                 'dR EP Streptavidin Median',
                 'dR EP Streptavidin MAD',
                 'dR EP Bad']

dR_RT_columns = ['dR RT Streptavidin Baseline Noise',
                 'dR RT Streptavidin Association Mean',
                 'dR RT Streptavidin Dissociation Mean',
                 'dR RT Streptavidin Association SNR',
                 'dR RT Bad']

col = peptide_columns + dIm_EP_columns + dR_EP_columns + dR_RT_columns

# Drop every other column and apply the ordering above.
data = data[col]

# remove NaNs and Xs
# Assign the result instead of using inplace=True: `data` was produced by a
# column selection, and mutating such a frame in place can trigger pandas'
# SettingWithCopyWarning.
data = data.dropna(subset=['Peptide'])
data = data[data['Peptide'] != 'X']

# Keep only peptides of the lengths analysed in this notebook.
lengths = [8, 9, 12, 13]
data = data[data['Peptide Length'].isin(lengths)]

# DataFrame.sort is deprecated (and removed in later pandas releases);
# sort_values is the drop-in replacement.
data = data.sort_values(['Peptide Length', 'Peptide']).reset_index(drop=True)

data.head()



Unnamed: 0,Grid Index,Peptide,Peptide Length,Peptide Weight,dIm EP Peptide Readout,dIm EP Peptide Median,dIm EP Peptide MAD,dIm EP Streptavidin Readout,dIm EP Streptavidin Median,dIm EP Streptavidin MAD,...,dR EP Peptide MAD,dR EP Streptavidin Readout,dR EP Streptavidin Median,dR EP Streptavidin MAD,dR EP Bad,dR RT Streptavidin Baseline Noise,dR RT Streptavidin Association Mean,dR RT Streptavidin Dissociation Mean,dR RT Streptavidin Association SNR,dR RT Bad
0,527,ATHPQFAT,8,871.949,0.00112,0.001142,0.00019,0.001197,0.001182,0.000122,...,0.000224,0.000355,0.000346,0.000204,False,0.000266,0.000825,0.000553,3.099982,False
1,734,ATHPQFAT,8,871.949,0.001381,0.001501,0.000182,0.001503,0.001487,0.000144,...,0.000206,0.000569,0.000698,0.000235,False,0.000185,0.001086,0.000779,5.865785,False
2,1033,ATHPQFAT,8,871.949,0.001809,0.001495,0.000217,0.001348,0.001667,0.000143,...,0.000231,0.000339,0.000896,0.000176,False,0.000112,0.001556,0.001509,13.933146,False
3,1668,ATHPQFAT,8,871.949,0.001379,0.001397,0.000195,0.001651,0.001444,0.000115,...,0.000171,0.000916,0.000569,0.000212,False,0.000159,0.00083,0.000804,5.234234,False
4,793,CTHPQFAT,8,904.009,0.000929,0.001053,0.000129,0.001015,0.001179,0.000193,...,0.000188,0.000636,0.000409,0.000239,False,0.00016,0.001053,0.000747,6.570487,False


In [4]:
# Write the cleaned per-spot table to disk without the row index.
cleaned_signals_csv = '18 EP dIm and dR Signals.csv'
data.to_csv(cleaned_signals_csv, index=False)

In [5]:
# Count how many spots (replicates) each 13-residue peptide has.
# After agg(len) the 'Peptide Length' column holds the number of rows in each
# group, not the sequence length.
tmp = data[data['Peptide Length'] == 13][['Peptide', 'Peptide Length']].groupby('Peptide').agg(len)
# DataFrame.sort is deprecated (and removed in later pandas releases);
# sort_values is the drop-in replacement. The second reset_index keeps the
# pre-sort position as an 'index' column.
tmp = tmp.reset_index().sort_values(['Peptide Length', 'Peptide']).reset_index()
tmp
#tmp[~tmp['Peptide'].str.contains('QSG')]

  from ipykernel import kernelapp as app


Unnamed: 0,index,Peptide,Peptide Length
0,9,LGQFQVWIPGAQK,4
1,10,MGQFQVWIPGAQK,4
2,11,NAQFQVWIPGAQK,4
3,12,NCQFQVWIPGAQK,4
4,13,NDQFQVWIPGAQK,4
5,14,NEQFQVWIPGAQK,4
6,15,NFQFQVWIPGAQK,4
7,16,NGAFQVWIPGAQK,4
8,17,NGCFQVWIPGAQK,4
9,18,NGDFQVWIPGAQK,4


# Calculate mean and standard deviation by peptide

In [6]:
def _summarize_grouped(grouped, prefix):
    """Aggregate per-spot medians into per-peptide statistics.

    Parameters
    ----------
    grouped : pandas GroupBy
        Spots grouped by 'Peptide'.
    prefix : str
        Measurement prefix of the column names, e.g. 'dIm EP' or 'dR EP'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Peptide'. For both the 'Peptide' and the 'Streptavidin'
        signal it holds '<prefix> <signal> Mean', '<prefix> <signal> Mean NN'
        (the mean clipped at zero) and '<prefix> <signal> SE'
        (scipy.stats.sem), plus 'Number of <prefix> Spots'.

    Each statistic is computed with its own call because the nested-dict
    renaming form of ``groupby.agg`` is deprecated and raises
    SpecificationError in later pandas releases.
    """
    summary = {}
    for signal in ('Peptide', 'Streptavidin'):
        medians = grouped['{} {} Median'.format(prefix, signal)]
        mean_name = '{} {} Mean'.format(prefix, signal)
        means = medians.mean()
        summary[mean_name] = means
        # "NN" = non-negative: negative means are clipped to zero.
        summary[mean_name + ' NN'] = means.clip(lower=0)
        summary['{} {} SE'.format(prefix, signal)] = medians.agg(stats.sem)

    # Group size; cast to float so the exported values keep the same form
    # (e.g. 4.0) as in the previously recorded output.
    spot_counts = grouped['{} Peptide Median'.format(prefix)].size()
    summary['Number of {} Spots'.format(prefix)] = spot_counts.astype(float)

    summary = pd.DataFrame(summary)
    summary.index.name = 'Peptide'
    return summary


# Only spots that are not flagged as bad contribute to the statistics.
grouped_dIm = data[data['dIm EP Bad'] == 0].groupby('Peptide')
grouped_dR = data[data['dR EP Bad'] == 0].groupby('Peptide')

# NOTE(review): length and weight are taken from the dIm spots only, so a
# peptide that has good dR spots but no good dIm spots ends up with NaN in
# both columns after the outer merge below.
peptide_info = grouped_dIm[['Peptide Length', 'Peptide Weight']].first()

result_dIm = peptide_info.join(_summarize_grouped(grouped_dIm, 'dIm EP')).reset_index()
result_dR = _summarize_grouped(grouped_dR, 'dR EP').reset_index()

result = pd.merge(result_dIm, result_dR, how='outer', on='Peptide')
# DataFrame.sort is deprecated (and removed in later pandas releases);
# sort_values is the drop-in replacement.
result = result.sort_values(['Peptide Length', 'Peptide']).reset_index(drop=True)

# reindex(columns=...) replaces the deprecated reindex_axis(..., axis=1) and
# fixes the final column order.
result = result.reindex(columns=['Peptide',
                                 'Peptide Length',
                                 'Peptide Weight',

                                 'Number of dIm EP Spots',
                                 'dIm EP Peptide Mean',
                                 'dIm EP Peptide Mean NN',
                                 'dIm EP Peptide SE',
                                 'dIm EP Streptavidin Mean',
                                 'dIm EP Streptavidin Mean NN',
                                 'dIm EP Streptavidin SE',

                                 'Number of dR EP Spots',
                                 'dR EP Peptide Mean',
                                 'dR EP Peptide Mean NN',
                                 'dR EP Peptide SE',
                                 'dR EP Streptavidin Mean',
                                 'dR EP Streptavidin Mean NN',
                                 'dR EP Streptavidin SE'])

result.head()



Unnamed: 0,Peptide,Peptide Length,Peptide Weight,Number of dIm EP Spots,dIm EP Peptide Mean,dIm EP Peptide Mean NN,dIm EP Peptide SE,dIm EP Streptavidin Mean,dIm EP Streptavidin Mean NN,dIm EP Streptavidin SE,Number of dR EP Spots,dR EP Peptide Mean,dR EP Peptide Mean NN,dR EP Peptide SE,dR EP Streptavidin Mean,dR EP Streptavidin Mean NN,dR EP Streptavidin SE
0,ATHPQFAT,8,871.949,4.0,0.001384,0.001384,8.4e-05,0.001445,0.001445,0.0001,4.0,0.001181,0.001181,1.8e-05,0.000627,0.000627,0.000115
1,CTHPQFAT,8,904.009,4.0,0.001125,0.001125,3.5e-05,0.001317,0.001317,0.000129,4.0,0.001023,0.001023,6.5e-05,0.000608,0.000608,0.0001
2,DTHPQFAT,8,915.959,4.0,0.001391,0.001391,6.2e-05,0.001412,0.001412,7.3e-05,4.0,0.001054,0.001054,5.4e-05,0.00064,0.00064,6.5e-05
3,DYKDDDDK,8,1012.984,99.0,0.00581,0.00581,8.7e-05,-8.6e-05,0.0,9e-06,99.0,0.002899,0.002899,5.5e-05,8.9e-05,8.9e-05,1.4e-05
4,ETHPQFAT,8,929.986,4.0,0.00174,0.00174,0.000131,0.001277,0.001277,6.4e-05,4.0,0.000894,0.000894,0.000111,0.000618,0.000618,4.7e-05


In [7]:
# Write the per-peptide summary table to disk without the row index.
aggregated_signals_csv = '18 EP dIm and dR Signals aggregated.csv'
result.to_csv(aggregated_signals_csv, index=False)

In [8]:
# result[result['Peptide'] == 'NCQFQVWIPGAQK'] 

In [9]:
# data[data['Peptide'] == 'NCQFQVWIPGAQK'] 

In [10]:
# data[data['dR EP Peptide MAD'] > 0.005]

In [11]:
# Show the aggregated row(s) for one peptide of interest.
result.loc[result['Peptide'] == 'LQWHPQAGA']

Unnamed: 0,Peptide,Peptide Length,Peptide Weight,Number of dIm EP Spots,dIm EP Peptide Mean,dIm EP Peptide Mean NN,dIm EP Peptide SE,dIm EP Streptavidin Mean,dIm EP Streptavidin Mean NN,dIm EP Streptavidin SE,Number of dR EP Spots,dR EP Peptide Mean,dR EP Peptide Mean NN,dR EP Peptide SE,dR EP Streptavidin Mean,dR EP Streptavidin Mean NN,dR EP Streptavidin SE
243,LQWHPQAGA,9,1007.118,4.0,0.002072,0.002072,0.000143,0.001501,0.001501,3.4e-05,4.0,0.001288,0.001288,4.6e-05,0.0006,0.0006,2.2e-05
