## Import related libraries

In [1]:
import numpy as np

# Functions to perform BLOSUM62 encoding

In [2]:
def read_protein_sequences(file):
    """Return the non-empty lines of a text file as a list of strings.

    The file is read in one go and split on ``'\\n'``; lines that are
    completely empty are dropped.  Lines are not stripped, so a line that
    contains only whitespace is kept as-is.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per non-empty line, in file order.
    """
    with open(file) as handle:
        raw_text = handle.read()

    return [line for line in raw_text.split('\n') if line]


def BLOSUM62(sequences):
    """Encode sequences as flat lists of BLOSUM62 substitution scores.

    Every character of a sequence is replaced by its 20-value row of the
    table below, and the rows are concatenated in sequence order.  A
    sequence of length ``n`` therefore becomes a list of ``20 * n`` ints.

    Args:
        sequences: Iterable of strings made up of the characters that are
            keys of the table below (the 20 standard amino-acid letters
            plus ``'*'``, ``'_'`` and ``'X'``).

    Returns:
        A list with one flat list of ints per input sequence, in input
        order.  An empty input gives ``[]``; an empty sequence gives ``[]``
        as its encoding.

    Raises:
        KeyError: If a sequence contains a character that is not a key of
            the table (lowercase letters included).
    """
    # Column order of every row: A R N D C Q E G H I L K M F P S T W Y V
    blosum62 = {
        'A': [4,  -1, -2, -2, 0,  -1, -1, 0, -2,  -1, -1, -1, -1, -2, -1, 1,  0,  -3, -2, 0],  # A
        'R': [-1, 5,  0,  -2, -3, 1,  0,  -2, 0,  -3, -2, 2,  -1, -3, -2, -1, -1, -3, -2, -3], # R
        'N': [-2, 0,  6,  1,  -3, 0,  0,  0,  1,  -3, -3, 0,  -2, -3, -2, 1,  0,  -4, -2, -3], # N
        'D': [-2, -2, 1,  6,  -3, 0,  2,  -1, -1, -3, -4, -1, -3, -3, -1, 0,  -1, -4, -3, -3], # D
        'C': [0,  -3, -3, -3, 9,  -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1], # C
        'Q': [-1, 1,  0,  0,  -3, 5,  2,  -2, 0,  -3, -2, 1,  0,  -3, -1, 0,  -1, -2, -1, -2], # Q
        'E': [-1, 0,  0,  2,  -4, 2,  5,  -2, 0,  -3, -3, 1,  -2, -3, -1, 0,  -1, -3, -2, -2], # E
        'G': [0,  -2, 0,  -1, -3, -2, -2, 6,  -2, -4, -4, -2, -3, -3, -2, 0,  -2, -2, -3, -3], # G
        'H': [-2, 0,  1,  -1, -3, 0,  0,  -2, 8,  -3, -3, -1, -2, -1, -2, -1, -2, -2, 2,  -3], # H
        'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4,  2,  -3, 1,  0,  -3, -2, -1, -3, -1, 3],  # I
        'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2,  4,  -2, 2,  0,  -3, -2, -1, -2, -1, 1],  # L
        'K': [-1, 2,  0,  -1, -3, 1,  1,  -2, -1, -3, -2, 5,  -1, -3, -1, 0,  -1, -3, -2, -2], # K
        'M': [-1, -1, -2, -3, -1, 0,  -2, -3, -2, 1,  2,  -1, 5,  0,  -2, -1, -1, -1, -1, 1],  # M
        'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0,  0,  -3, 0,  6,  -4, -2, -2, 1,  3,  -1], # F
        'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7,  -1, -1, -4, -3, -2], # P
        'S': [1,  -1, 1,  0,  -1, 0,  0,  0,  -1, -2, -2, 0,  -1, -2, -1, 4,  1,  -3, -2, -2], # S
        'T': [0,  -1, 0,  -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1,  5,  -2, -2, 0],  # T
        'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1,  -4, -3, -2, 11, 2,  -3], # W
        'Y': [-2, -2, -2, -3, -2, -1, -2, -3, 2,  -1, -1, -2, -1, 3,  -3, -2, -2, 2,  7,  -1], # Y
        'V': [0,  -3, -3, -3, -1, -2, -2, -3, -3, 3,  1,  -2, 1,  -1, -2, -2, 0,  -3, -1, 4],  # V
        # The three symbols below encode to all zeros.
        # NOTE(review): presumably padding / unknown-residue markers -- confirm.
        '*': [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],  # *
        '_': [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],  # _
        'X': [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],  # X
    }

    encodings = []
    for sequence in sequences:
        code = []
        for residue in sequence:
            # extend() appends in place; `code = code + row` re-copied the
            # whole accumulated list for every residue.
            code.extend(blosum62[residue])
        encodings.append(code)
    return encodings


def write_to_csv(encodings, file):
    """Write encoded rows to a text file as comma-separated values.

    Each row becomes one line: its values converted with ``str`` and joined
    by ``','``, followed by ``'\\n'``.  No header line is written.  An
    existing file at ``file`` is overwritten.

    Args:
        encodings: Iterable of rows, each an iterable of values
            (e.g. the lists of ints returned by ``BLOSUM62``).
        file: Path of the output file.
    """
    with open(file, 'w') as f:
        # An empty row is written as an empty line, so output line k still
        # corresponds to input row k.  Indexing row[0] used to raise
        # IndexError on such a row and leave a truncated file behind.
        f.writelines(','.join(map(str, row)) + '\n' for row in encodings)

# Read sequence data and save features

### Kinase groups

In [3]:
# Kinase group names; each one is used below to build the input and output
# file names for that group.
groups = [
    'TK',
    'CMGC',
    'AGC',
    'STE',
    'CK1',
    'CAMK',
    'Other',
    'TKL',
    'Atypical',
    'PKL',
]

In [None]:
# For every kinase group, BLOSUM62-encode its positive and negative site
# sequences and save each encoding to its own feature file.
for group in groups:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Group positive/", '_positive.txt',
         "./features/group_positive_features/"),
        # negative sites
        ("./data/Group negative/", '_negative.txt',
         "./features/group_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + group + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + group + ".txt")

### Kinase families

In [None]:
# Load the name lists of kinase families with more than 15 S/T/Y, S/T, and
# Y sites, respectively.
# NOTE(review): allow_pickle=True makes np.load unpickle objects stored in the
# file, which can execute arbitrary code -- only load trusted .npy files.
families, families_st, families_y = (
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("family 15.npy", "family_st 15.npy", "family_y 15.npy")
)

In [None]:
# S/T/Y sites: encode the positive and negative sequences of every family
# listed in `families`.
for family in families:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Family positive/", '_STY_positive.txt',
         "./features/family_all_positive_features/"),
        # negative sites
        ("./data/Family negative/", '_STY_negative.txt',
         "./features/family_all_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + family + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + family + ".txt")

# S/T sites: encode the positive and negative sequences of every family
# listed in `families_st`.
for family in families_st:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Family positive/", '_ST_positive.txt',
         "./features/family_st_positive_features/"),
        # negative sites
        ("./data/Family negative/", '_ST_negative.txt',
         "./features/family_st_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + family + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + family + ".txt")

# Y sites: encode the positive and negative sequences of every family
# listed in `families_y`.
for family in families_y:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Family positive/", '_Y_positive.txt',
         "./features/family_y_positive_features/"),
        # negative sites
        ("./data/Family negative/", '_Y_negative.txt',
         "./features/family_y_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + family + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + family + ".txt")

### Kinases

In [None]:
# Load the name lists of kinases with more than 15 S/T/Y, S/T, and Y sites,
# respectively.
# NOTE(review): allow_pickle=True makes np.load unpickle objects stored in the
# file, which can execute arbitrary code -- only load trusted .npy files.
kinases, kinases_st, kinases_y = (
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("kinase 15.npy", "kinase_st 15.npy", "kinase_y 15.npy")
)

In [None]:
# S/T/Y sites: encode the positive and negative sequences of every kinase
# listed in `kinases`.
for kinase in kinases:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Kinase positive/", '_STY_positive.txt',
         "./features/kinase_all_positive_features/"),
        # negative sites
        ("./data/Kinase negative/", '_STY_negative.txt',
         "./features/kinase_all_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + kinase + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + kinase + ".txt")

# S/T sites: encode the positive and negative sequences of every kinase
# listed in `kinases_st`.
for kinase in kinases_st:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Kinase positive/", '_ST_positive.txt',
         "./features/kinase_st_positive_features/"),
        # negative sites
        ("./data/Kinase negative/", '_ST_negative.txt',
         "./features/kinase_st_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + kinase + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + kinase + ".txt")

# Y sites: encode the positive and negative sequences of every kinase
# listed in `kinases_y`.
for kinase in kinases_y:
    for source_dir, suffix, target_dir in (
        # positive sites
        ("./data/Kinase positive/", '_Y_positive.txt',
         "./features/kinase_y_positive_features/"),
        # negative sites
        ("./data/Kinase negative/", '_Y_negative.txt',
         "./features/kinase_y_negative_features/"),
    ):
        site_sequences = read_protein_sequences(source_dir + kinase + suffix)
        write_to_csv(BLOSUM62(site_sequences), target_dir + kinase + ".txt")