In [59]:
import numpy as np
import pandas as pd
import os

from scipy import stats

os.chdir('/home/galina/PycharmProjects/Peptide Signal')

# Helper function, finds weight of aminoacid

In [60]:
def weight(name, dictionary):
    w = 0
    for e in name:
        w = w + dictionary[e]
    return int(w)

# Aminoacid weights to dictionary

In [61]:
amino = pd.read_csv('Aminoacids.csv', header = 0)
amino.head()

Unnamed: 0,Aminoacid,1-Letter Code,"Molecular weight, kDa"
0,Alanine,A,89
1,Arginine,R,174
2,Asparagine,N,132
3,Aspartic,D,133
4,Cysteine,C,121


In [62]:
amino.drop('Aminoacid', axis = 1, inplace = True)
amino.columns = ['Code', 'Weight']
weights = amino.set_index('Code')['Weight'].to_dict()
weights

{nan: nan,
 'A': 89.0,
 'C': 121.0,
 'D': 133.0,
 'E': 146.0,
 'F': 165.0,
 'G': 75.0,
 'H': 155.0,
 'I': 131.0,
 'K': 146.0,
 'L': 131.0,
 'M': 149.0,
 'N': 132.0,
 'P': 115.0,
 'Q': 147.0,
 'R': 174.0,
 'S': 105.0,
 'T': 119.0,
 'V': 117.0,
 'W': 204.0,
 'Y': 181.0}

# Peptide data cleaning

In [63]:
data = pd.read_csv('Peptides.csv', header = 0)
data.head()

Unnamed: 0,Grid Index,Peptide,Spot Signal,Spot SD
0,0,WTHPQFAT,0.000642,0.000224
1,1,WTHPQFAT,0.00113,0.00011
2,2,WTHLQFAT,0.000244,0.00011
3,3,NGQFQVWIPLAQK,0.00249,0.000289
4,4,LQLHPQAGK,0.00311,0.000157


In [64]:
data['Peptide'] = data['Peptide'].map(lambda x: x.strip())
data.drop(['Grid Index', 'Spot SD'], axis = 1, inplace = True)

data = data[(data['Peptide'] != 'X') & (data['Peptide'] != 'NA')]
data.sort(['Peptide'], inplace = True)
data = data.reset_index(drop = True)

data.head()

Unnamed: 0,Peptide,Spot Signal
0,AGQFQVWIPGAQK,0.00253
1,AGQFQVWIPGAQK,0.00279
2,AGQFQVWIPGAQK,0.00336
3,AGQFQVWIPGAQK,0.00255
4,AGQFQVWIPGAQK,0.00187


# Calculate mean and standard deviation by peptide

In [65]:
grouped = data.groupby('Peptide')
result = grouped.agg([len, np.mean, np.std, stats.sem])

result.columns = result.columns.droplevel()
result.reset_index(inplace = True)

result.columns = ['Peptide', 'Number of Spots', 'Mean Spot Signal', 'Standard Deviation', 'Standard Error']
result['Non Negative Spot Signal'] = result['Mean Spot Signal'] * (result['Mean Spot Signal'] > 0)

result.head()

Unnamed: 0,Peptide,Number of Spots,Mean Spot Signal,Standard Deviation,Standard Error,Non Negative Spot Signal
0,AGQFQVWIPGAQK,20,0.002625,0.000342,7.7e-05,0.002625
1,AISPPPK,100,0.002837,0.000502,5e-05,0.002837
2,AKFPIPLGKQSG,4,0.00291,0.000384,0.000192,0.00291
3,AQWHPQAGK,4,0.002933,0.00031,0.000155,0.002933
4,ATHPQFAT,4,0.001607,0.000362,0.000181,0.001607


# Add length, weight, and sort

In [67]:
result['Peptide Length'] = result['Peptide'].map(lambda x: len(x))
result['Peptide Weight (kDa)'] = result['Peptide'].map(lambda x: weight(x, weights))

result.sort(['Peptide Length', 'Peptide'], inplace = True)
result = result.reset_index(drop = True)

result.head()

Unnamed: 0,Peptide,Number of Spots,Mean Spot Signal,Standard Deviation,Standard Error,Non Negative Spot Signal,Peptide Length,Peptide Weight (kDa)
0,AISPPPK,100,0.002837,0.000502,5e-05,0.002837,7,816
1,THPQFAT,3,0.001647,0.000257,0.000148,0.001647,7,909
2,WHPQFAT,4,0.000924,0.000134,6.7e-05,0.000924,7,994
3,WTHPFAT,4,0.001179,0.000203,0.000101,0.001179,7,966
4,WTHPQAT,3,0.0018,0.000416,0.00024,0.0018,7,948


In [69]:
# rearrange order of columns
result = result.reindex_axis(['Peptide',
                              'Peptide Length',
                              'Number of Spots',
                              'Peptide Weight (kDa)',
                              'Mean Spot Signal',
                              'Non Negative Spot Signal',
                              'Standard Deviation',
                              'Standard Error'], axis=1)
result.head()

Unnamed: 0,Peptide,Peptide Length,Number of Spots,Peptide Weight (kDa),Mean Spot Signal,Non Negative Spot Signal,Standard Deviation,Standard Error
0,AISPPPK,7,100,816,0.002837,0.002837,0.000502,5e-05
1,THPQFAT,7,3,909,0.001647,0.001647,0.000257,0.000148
2,WHPQFAT,7,4,994,0.000924,0.000924,0.000134,6.7e-05
3,WTHPFAT,7,4,966,0.001179,0.001179,0.000203,0.000101
4,WTHPQAT,7,3,948,0.0018,0.0018,0.000416,0.00024


In [70]:
result.to_csv('result.csv', index = False)