In [1]:
# Mount Google Drive (Colab only) so files under /content/drive/ can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!pip install tensorflow keras numpy matplotlib pandas sklearn biopython 

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.8MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.78


In [3]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile
from collections import Counter

In [4]:
def CKSAAP(fastas, gap=4, **kw):
  """Encode sequences as the composition of k-spaced amino-acid pairs.

  For every gap size g in 0..gap, counts the ordered residue pairs
  (sequence[i], sequence[i + g + 1]) whose two residues both belong to the
  alphabet, then divides each count by the number of such pairs found.

  Args:
    fastas: iterable of sequences (indexable strings).
    gap: largest gap to consider; must be >= 0.
    **kw: may contain 'order', the residue alphabet. When 'order' is missing
      or None the default 'ACDEFGHIKLMNPQRSTVWXY' is used.

  Returns:
    A list with one feature vector per sequence. Each vector has
    (gap + 1) * len(alphabet) ** 2 entries, grouped by gap and, inside a
    gap, ordered by pair with the first residue varying slowest.
    When gap is negative an error message is printed and 0 is returned.
  """
  if gap < 0:
    print('Error: the gap should be equal or greater than zero' + '\n\n')
    return 0

  order = kw.get('order')
  AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWXY'
  aaPairs = [aa1 + aa2 for aa1 in AA for aa2 in AA]

  encodings = []
  for sequence in fastas:
    code = []
    for g in range(gap + 1):
      counts = dict.fromkeys(aaPairs, 0)
      total = 0
      # The range is empty when the sequence is too short for this gap.
      for index1 in range(len(sequence) - g - 1):
        pair = sequence[index1] + sequence[index1 + g + 1]
        # Only pairs made of two alphabet residues are keys of `counts`.
        if pair in counts:
          counts[pair] += 1
          total += 1
      if total == 0:
        # No valid pair for this gap: emit zeros instead of dividing by zero.
        code.extend(0.0 for _ in aaPairs)
      else:
        code.extend(counts[p] / total for p in aaPairs)
    encodings.append(code)
  return encodings

In [5]:
def BINARY(fastas, **kw):
  """One-hot encode every residue of every sequence.

  Args:
    fastas: iterable of sequences (iterables of single-character residues).
    **kw: may contain 'order', the residue alphabet. When 'order' is missing
      or None the default 'ACDEFGHIKLMNPQRSTVWXY' is used.

  Returns:
    A list with one vector per sequence. Each vector has
    len(sequence) * len(alphabet) entries: for each residue, 1.0 at the
    position of the matching alphabet letter and 0.0 elsewhere. A residue
    that is not in the alphabet yields an all-zero group.
  """
  order = kw.get('order')
  AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWXY'

  encodings = []
  for sequence in fastas:
    code = []
    for aa in sequence:
      code.extend(1.0 if aa == aa1 else 0.0 for aa1 in AA)
    encodings.append(code)
  return encodings

In [6]:
def AAC(fastas, **kw):
  """Compute the amino-acid composition of every sequence.

  Args:
    fastas: iterable of sequences (iterables of single-character residues).
    **kw: may contain 'order', the residue alphabet. When 'order' is missing
      or None the default 'ACDEFGHIKLMNPQRSTVWXY' is used.

  Returns:
    A list with one vector per sequence holding, for each alphabet letter in
    order, the fraction of the sequence made of that letter (0.0 for letters
    that do not occur, and for an empty sequence).
  """
  order = kw.get('order')
  AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWXY'

  encodings = []
  for sequence in fastas:
    length = len(sequence)
    count = Counter(sequence)
    # Counter returns 0 for absent letters; guard the empty-sequence case.
    encodings.append(
      [float(count[aa] / length) if length else 0.0 for aa in AA]
    )
  return encodings

In [7]:
def AAINDEX(fastas, **kw):
  """Encode every residue with physicochemical indices read from AAindex.txt.

  The index table is read from a fixed path on the mounted Drive. The first
  line of the file is skipped and only the following five lines are used;
  blank lines among them are ignored. Each used line is split on whitespace
  into a name followed by one value per residue column.

  Args:
    fastas: iterable of sequences (iterables of single-character residues).
    **kw: may contain 'order', the residue alphabet. When 'order' is missing
      or None the default 'ACDEFGHIKLMNPQRSTVWXY' is used.

  Returns:
    A list with one vector per sequence. For each residue, in sequence
    order, the vector holds that residue's value from every index row used.

  Raises:
    KeyError: if a sequence contains a residue that is not in the alphabet.
    OSError: if the AAindex file cannot be opened.
  """
  order = kw.get('order')
  AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWXY'
  fileAAindex = r'/content/drive/My Drive/Propionylation/AAindex.txt'
  with open(fileAAindex) as f:
    records = f.readlines()[1:6]

  # Keep only the value columns of non-blank rows; values stay as text and
  # are converted to float when they are actually used.
  AAindex = []
  for record in records:
    fields = record.split()
    if fields:
      AAindex.append(fields[1:])

  # NOTE(review): a residue's column is its position in AA, so this assumes
  # the file's value columns follow the same order as AA - confirm against
  # the AAindex.txt actually used.
  index = {aa: position for position, aa in enumerate(AA)}

  encodings = []
  for sequence in fastas:
    code = []
    for aa in sequence:
      column = index[aa]
      code.extend(float(row[column]) for row in AAindex)
    encodings.append(code)
  return encodings

In [9]:
if __name__ == '__main__':
  # Build the combined feature table: every peptide is encoded with AAINDEX,
  # AAC, BINARY and CKSAAP, the four vectors are concatenated, and the result
  # is written next to the binary propionylation label as a CSV.

  # Alternative residue orderings kept for reference; only kw['order'] is used.
  myAAorder = {
    'alphabetically': 'ACDEFGHIKLMNPQRSTVWXY',
    'polarity': 'DENKRQHSGTAPYVMCWIFL',
    'sideChainVolume': 'GASDPCTNEVHQILMKRFYW',
  }

  kw = {'order': 'ACDEFGHIKLMNPQRSTVWXY'}

  df = pd.read_csv('/content/drive/My Drive/Propionylation/Propio_final/Final_Dataset.csv')
  peptide = list(df['Peptide'])

  # Show the number of missing values per column before encoding.
  print(df.isnull().sum())

  fastas = np.array(peptide)
  gap = 4
  En1 = AAINDEX(fastas, **kw)
  En2 = AAC(fastas, **kw)
  En3 = BINARY(fastas, **kw)
  # Pass the `gap` variable instead of repeating the literal.
  En4 = CKSAAP(fastas, gap=gap, **kw)

  # One concatenated feature vector per peptide, in encoder order.
  encodings = [
    list(v1) + list(v2) + list(v3) + list(v4)
    for v1, v2, v3, v4 in zip(En1, En2, En3, En4)
  ]

  # 'Yes' maps to '1', anything else to '0' (kept as strings, as before).
  target = list(df['Propionylation?'])
  Propionylation = ['1' if label == 'Yes' else '0' for label in target]

  raw_data = {
    'Image_Array': encodings,
    'Propionylation': Propionylation,
  }
  Frame = pd.DataFrame(raw_data, columns=['Image_Array', 'Propionylation'])
  d1 = pd.DataFrame(Frame['Propionylation'])
  # Expand the list-valued column into one column per feature: Pixel1, Pixel2, ...
  d2 = (
    pd.DataFrame(Frame['Image_Array'].values.tolist(), index=Frame.index)
    .rename(columns=lambda x: 'Pixel{}'.format(x + 1))
  )
  df = d1.join(d2)
  df.to_csv('/content/drive/My Drive/Propionylation/Propio_final/combined_Encoding.csv', index=False)
  print(encodings[1])
  print(len(encodings[1]))
  print(len(encodings))
  print("ok")



Uniprot_Accession    0
Site                 0
Peptide              0
Propionylation?      0
dtype: int64
[4.44, 1.95, 0.76, 0.76, 0.06, 4.5, 0.05, 0.97, 0.81, 0.35, 4.7, 2.65, 0.77, 1.08, 0.73, 4.44, 1.95, 0.76, 0.76, 0.06, 4.17, 1.53, 3.23, 2.93, 1.0, 4.5, 0.05, 0.97, 0.81, 0.35, 4.5, 0.05, 0.97, 0.81, 0.35, 4.5, 0.05, 0.97, 0.81, 0.35, 3.95, 1.32, 1.08, 1.14, 0.82, 4.44, 1.95, 0.76, 0.76, 0.06, 4.5, 0.05, 0.97, 0.81, 0.35, 4.37, 0.0, 0.72, 0.51, 0.44, 4.36, 1.15, 0.06, 0.15, 0.6, 4.35, 0.05, 0.84, 0.91, 0.44, 4.6, 1.88, 0.39, 0.68, 0.44, 4.37, 0.0, 0.72, 0.51, 0.44, 3.97, 0.07, 0.49, 0.62, 0.35, 4.5, 0.05, 0.97, 0.81, 0.35, 4.6, 1.88, 0.39, 0.68, 0.44, 3.97, 0.07, 0.49, 0.62, 0.35, 4.66, 2.02, 1.96, 2.03, 0.6, 4.38, 0.6, 0.2, 0.45, 0.52, 4.17, 1.53, 3.23, 2.93, 1.0, 3.97, 0.07, 0.49, 0.62, 0.35, 4.66, 2.02, 1.96, 2.03, 0.6, 0.0, 0.0, 0.0, 0.0, 0.08, 0.12, 0.0, 0.0, 0.04, 0.08, 0.0, 0.0, 0.12, 0.08, 0.04, 0.24, 0.04, 0.04, 0.04, 0.0, 0.08, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 