In [1]:
import pandas as pd
import urllib
import numpy as np

from urllib.request import urlopen

# Functions to extract negative sites (i.e., S/T/Y residues with ±7 flanking residues) in substrate proteins

In [2]:
def protein_seq(url, codes, flank=7):
    """Fetch a FASTA record and return the windows centred on selected residues.

    Parameters
    ----------
    url : str
        Location of a FASTA file (callers in this notebook pass a UniProt
        ``.fasta`` link). The first line is treated as the header and skipped;
        every following line is concatenated into a single sequence string.
    codes : container of str
        One-letter residue codes to centre windows on, e.g. ``["S", "T", "Y"]``,
        ``["S", "T"]`` or ``["Y"]``.
    flank : int, optional
        Number of residues kept on each side of the central residue.
        Defaults to 7, i.e. 15-character windows.

    Returns
    -------
    list of str
        One window of length ``2 * flank + 1`` per matching residue, in
        sequence order. Window positions that fall before the first or after
        the last residue are filled with ``"*"``.

    Raises
    ------
    ValueError
        If ``flank`` is negative.
    Exception
        Whatever ``urlopen`` raises when the URL cannot be opened, or
        ``UnicodeDecodeError`` if a sequence line is not valid UTF-8.
    """
    if flank < 0:
        raise ValueError("flank must be non-negative")

    # Read the record; the context manager closes the response even on error.
    with urlopen(url) as response:
        chunks = [
            raw.decode('utf-8').strip()  # strip() also drops a trailing "\r"
            for n, raw in enumerate(response)
            if n != 0  # line 0 is the FASTA header
        ]
    sites = ''.join(chunks)

    # Padding both ends once handles every boundary case with a plain slice,
    # including proteins shorter than one window, where a residue is close to
    # BOTH termini (the previous branch-per-terminus code raised IndexError).
    padded = "*" * flank + sites + "*" * flank
    width = 2 * flank + 1

    # padded[i] corresponds to sites[i - flank], so the window centred on
    # sites[i] starts at padded[i].
    return [
        padded[i:i + width]
        for i, residue in enumerate(sites)
        if residue in codes
    ]

In [3]:
def negative_seq(positive, sub_acc, codes):
    """Build the list of negative windows for a set of substrate proteins.

    For every accession in ``sub_acc`` the UniProt FASTA record is fetched and
    all windows centred on a residue in ``codes`` are collected via
    ``protein_seq``. Duplicates and every window listed in ``positive`` are
    then removed.

    Parameters
    ----------
    positive : iterable of hashable
        Known positive windows to exclude from the result.
    sub_acc : iterable of str
        Accessions inserted into the UniProt ``.fasta`` URL.
    codes : container of str
        Residue codes forwarded to ``protein_seq``.

    Returns
    -------
    list of str
        Unique windows that are not in ``positive``, in first-seen order
        (accession order, then position within the protein), so repeated runs
        over the same input yield the same list.

    Notes
    -----
    Fetching is best effort: an accession whose record cannot be built or
    retrieved is skipped, and a warning naming it is emitted instead of the
    failure being dropped silently.
    """
    import warnings  # local import keeps this cell self-contained

    sequences = []
    for ids in sub_acc:
        try:
            url = 'https://www.uniprot.org/uniprot/' + ids + '.fasta'
            # protein_seq opens the URL itself; a separate probe request would
            # only double the network traffic and leave a response unclosed.
            sequences += protein_seq(url, codes)
        except Exception as err:
            # Covers download errors as well as non-string accessions (e.g. NaN).
            warnings.warn("skipping substrate %r: %s" % (ids, err))
            continue

    known_positive = set(positive)

    # dict.fromkeys removes duplicates while keeping first-seen order; filtering
    # against the positive set (instead of a set difference) preserves it.
    return [
        window for window in dict.fromkeys(sequences)
        if window not in known_positive
    ]

In [4]:
def write_neg(ads, sequences):
    """Write the sequences to the text file ``ads``, one per line.

    Entries containing the letter X, U or Z are left out. The file is opened
    in write mode, so any existing content is replaced.
    """
    excluded = ("X", "U", "Z")

    # Lazy filter: it is consumed while the file is open, in input order.
    kept = (
        entry for entry in sequences
        if not any(letter in entry for letter in excluded)
    )

    with open(ads, 'w') as handle:
        handle.writelines('%s\n' % entry for entry in kept)

# Import data and export positive and negative sites

### Kinase group level

In [None]:
# Kinase groups to process; each one has a "<group>_Group.csv" site table.
groups = ['TK', 'CMGC', 'AGC', 'STE', 'CK1', 'CAMK', 'Other', 'TKL', 'Atypical', 'PKL']

# At group level, windows centred on S, T and Y are all extracted together.
codes = ["S", "T", "Y"]

for group in groups:
    # Site table for this group.
    df = pd.read_csv(f"./data/Group K-S sites/{group!s}_Group.csv")

    # Positive set: the PEPTIDE column, one entry per line, no header or index.
    df["PEPTIDE"].to_csv(f'./data/Group positive/{group!s}_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    # Each distinct SUB_ACC value is used as a UniProt accession by negative_seq.
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: windows from those proteins that are not listed as positive.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Group negative/{group!s}_negative.txt'
    write_neg(ads, sequences)


### Kinase family level

In [None]:
# Name lists of the kinase families with more than 15 S/T/Y, S/T and Y sites,
# respectively (loaded in that order).
# NOTE(review): allow_pickle=True makes np.load unpickle stored objects, which can
# execute arbitrary code -- only use .npy files from a trusted source.
families, families_st, families_y = [
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("family 15.npy", "family_st 15.npy", "family_y 15.npy")
]

In [None]:
# Family level, S/T/Y: every row of the family table is used (no CODE filter).
codes = ["S", "T", "Y"]

for group in families:
    # `group` is a family name here; it selects "<name>_Family.csv".
    df = pd.read_csv(f"./data/Family K-S sites/{group!s}_Family.csv")

    # Positive set: the PEPTIDE column, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Family positive/{group!s}_STY_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: S/T/Y windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Family negative/{group!s}_STY_negative.txt'
    write_neg(ads, sequences)

In [None]:
# Family level, S/T only.
codes = ["S", "T"]

for group in families_st:
    # `group` is a family name here; it selects "<name>_Family.csv".
    df = pd.read_csv(f"./data/Family K-S sites/{group!s}_Family.csv")

    # Keep only the rows whose CODE is S or T (the same values as `codes`).
    df = df.loc[df["CODE"].isin(codes)]

    # Positive set: the PEPTIDE column of the filtered rows, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Family positive/{group!s}_ST_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: S/T windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Family negative/{group!s}_ST_negative.txt'
    write_neg(ads, sequences)

In [None]:
# Family level, Y only.
codes = ["Y"]

for group in families_y:
    # `group` is a family name here; it selects "<name>_Family.csv".
    df = pd.read_csv(f"./data/Family K-S sites/{group!s}_Family.csv")

    # Keep only the rows whose CODE is Y (the same value as `codes`).
    df = df.loc[df["CODE"].isin(codes)]

    # Positive set: the PEPTIDE column of the filtered rows, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Family positive/{group!s}_Y_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: Y windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Family negative/{group!s}_Y_negative.txt'
    write_neg(ads, sequences)

### Individual kinase level

In [None]:
# Name lists of the individual kinases with more than 15 S/T/Y, S/T and Y sites,
# respectively (loaded in that order).
# NOTE(review): allow_pickle=True makes np.load unpickle stored objects, which can
# execute arbitrary code -- only use .npy files from a trusted source.
kinases, kinases_st, kinases_y = [
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("kinase 15.npy", "kinase_st 15.npy", "kinase_y 15.npy")
]

In [None]:
# Kinase level, S/T/Y: every row of the kinase table is used (no CODE filter).
codes = ["S", "T", "Y"]

for group in kinases:
    # `group` is a kinase name here; it selects "<name>.csv".
    df = pd.read_csv(f"./data/Kinase K-S sites/{group!s}.csv")

    # Positive set: the PEPTIDE column, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Kinase positive/{group!s}_STY_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: S/T/Y windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Kinase negative/{group!s}_STY_negative.txt'
    write_neg(ads, sequences)

In [None]:
# Kinase level, S/T only.
codes = ["S", "T"]

for group in kinases_st:
    # `group` is a kinase name here; it selects "<name>.csv".
    df = pd.read_csv(f"./data/Kinase K-S sites/{group!s}.csv")

    # Keep only the rows whose CODE is S or T (the same values as `codes`).
    df = df.loc[df["CODE"].isin(codes)]

    # Positive set: the PEPTIDE column of the filtered rows, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Kinase positive/{group!s}_ST_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: S/T windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Kinase negative/{group!s}_ST_negative.txt'
    write_neg(ads, sequences)

In [None]:
# Kinase level, Y only.
codes = ["Y"]

for group in kinases_y:
    # `group` is a kinase name here; it selects "<name>.csv".
    df = pd.read_csv(f"./data/Kinase K-S sites/{group!s}.csv")

    # Keep only the rows whose CODE is Y (the same value as `codes`).
    df = df.loc[df["CODE"].isin(codes)]

    # Positive set: the PEPTIDE column of the filtered rows, one entry per line.
    df["PEPTIDE"].to_csv(f'./data/Kinase positive/{group!s}_Y_positive.txt', index=False, header=False)

    positive = df["PEPTIDE"].tolist()
    sub_acc = df["SUB_ACC"].unique()

    # Negative set: Y windows from the listed proteins, minus the positives.
    sequences = negative_seq(positive, sub_acc, codes)

    ads = f'./data/Kinase negative/{group!s}_Y_negative.txt'
    write_neg(ads, sequences)