In [1]:
import numpy as np
import scipy as scp
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
from Bio import SeqIO
import pandas as pd
import tqdm
import re
from os import listdir
from os.path import isfile, join

### Method for finding entry indices

In [2]:
# Method to find subseq in seq

def compute_pref_suff(pattern, len_pat, pref_suff):
    '''Fill the K-M-P prefix-suffix (failure) table in place.

    pattern   : sequence whose failure table is computed
    len_pat   : number of leading items of pattern to process
    pref_suff : pre-allocated list of at least len_pat items; entries
                1 .. len_pat-1 are overwritten, entry 0 is left untouched
    Returns nothing (void func).'''

    border = 0  # length of the longest proper prefix that is also a suffix

    for pos in range(1, len_pat):
        # on mismatch fall back to shorter borders until one can be extended
        while border != 0 and pattern[pos] != pattern[border]:
            border = pref_suff[border - 1]

        if pattern[pos] == pattern[border]:
            border += 1

        pref_suff[pos] = border

def knutt_morris_list(txt, pattern):
    '''
    Knuth-Morris-Pratt search for every (possibly overlapping) occurrence
    of a subsequence in a sequence.
    Example: knutt_morris_list(txt, pattern),
        txt = sequence to search in,
        pattern = subsequence to find
    return:
        list of all start indices (ascending) at which pattern occurs in txt.
        An empty pattern, or a pattern longer than txt, yields [].
    '''
    len_pat = len(pattern)
    len_txt = len(txt)

    # Without this guard an empty pattern would index pattern[0] and raise
    # IndexError; a pattern longer than the text can never match.
    if len_pat == 0 or len_pat > len_txt:
        return []

    pref_suff = [0] * len_pat
    compute_pref_suff(pattern, len_pat, pref_suff)

    result = []
    j = 0  # number of pattern characters currently matched

    for i in range(len_txt):
        # mismatch: reuse the longest border instead of rescanning the text
        while j != 0 and pattern[j] != txt[i]:
            j = pref_suff[j - 1]

        if pattern[j] == txt[i]:
            j += 1

        if j == len_pat:
            result.append(i - len_pat + 1)
            # keep the border so overlapping matches are also reported
            j = pref_suff[j - 1]

    return result

def knutt_morris_count(txt, pattern):
    ''' Knuth-Morris-Pratt search counting the occurrences of a subsequence
    in a sequence (overlapping occurrences are counted separately).
    Example: knutt_morris_count(txt, pattern),
        txt = sequence to search in,
        pattern = subsequence to find
    return:
        count (int); 0 for an empty pattern or a pattern longer than txt'''
    len_pat = len(pattern)
    len_txt = len(txt)

    # An empty pattern would index pattern[0]; a longer pattern cannot match.
    if len_pat == 0 or len_pat > len_txt:
        return 0

    pref_suff = [0] * len_pat
    compute_pref_suff(pattern, len_pat, pref_suff)

    count_ = 0
    j = 0  # number of pattern characters currently matched

    for i in range(len_txt):
        # mismatch: fall back along the failure table
        while j != 0 and pattern[j] != txt[i]:
            j = pref_suff[j - 1]

        if pattern[j] == txt[i]:
            j += 1

        if j == len_pat:
            count_ += 1
            # continue from the longest border so overlaps are counted too
            j = pref_suff[j - 1]

    return count_

In [3]:
# Sanity check: str.count() only counts non-overlapping occurrences, so the two
# overlapping 'klaskl' matches (starting at indices 0 and 4) are reported as 1.
string = 'klasklaskl'
string.count('klaskl')

1

### Data

In [4]:
# For example we will try to find all needed vectors for 4-mers

import itertools


# Amino acid data: three-letter code -> one-letter code
# for the 20 standard amino acids.
amino_dict = {
    'ALA': 'A',
    'ARG': 'R',
    'ASN': 'N',
    'ASP': 'D',
    'CYS': 'C',
    'GLN': 'Q',
    'GLU': 'E',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LEU': 'L',
    'LYS': 'K',
    'MET': 'M',
    'PHE': 'F',
    'PRO': 'P',
    'SER': 'S',
    'THR': 'T',
    'TRP': 'W',
    'TYR': 'Y',
    'VAL': 'V',
}

# One-letter alphabet, in the insertion order of amino_dict.
amino_string = ''.join(amino_dict.values())

amino_string

'ARNDCQEGHILKMFPSTWYV'

In [5]:
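# Make all possible k-mers (combinations with repetition of amino-acid letters)
# k is a parameter; k = 4 is the value used in the example below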

def making_subseq(k, alphabet=None):
    '''Making all subsequences (k-mers) of length k over an alphabet.

    k        : length of every generated subsequence
    alphabet : iterable of single-letter strings; when omitted, the
               module-level amino_string (amino-acid letters) is used
    return:
        list of len(alphabet)**k strings in itertools.product order
        (k = 0 gives [''])'''
    if alphabet is None:
        # resolved at call time so the default follows the notebook's alphabet
        alphabet = amino_string

    # join each tuple as it is produced instead of first materializing a
    # list of tuples and then overwriting it element by element
    return [''.join(letters)
            for letters in itertools.product(alphabet, repeat=k)]


In [13]:
# All 4-mers over the 20-letter amino-acid alphabet (20**4 = 160,000 strings).
subseq_list = making_subseq(4)
#subseq_list

In [14]:
def read_fasta_file(path):
    '''
    Reading fasta file

    path : path to a FASTA file, e.g. '../data/fasta/human_proteome.fasta'
    return:
        record_list - list with every record yielded by SeqIO.parse
        org_name    - file name without directory and extension
                      ('human_proteome' for the example above)
    '''
    from os.path import basename, splitext

    record_list = list(SeqIO.parse(path, 'fasta'))

    # Derive the name from the file name itself rather than stripping a
    # hard-coded '../data/fasta/' prefix: with any other directory the old
    # result kept path separators, which later breaks the CSV output path.
    org_name = splitext(basename(path))[0]

    return record_list, org_name

### Test string analysis

In [15]:
def seqio_data(SeqRecord):
    '''Unpack a single record.

    Reads the record's .name and .seq attributes.
    return (protein name, sequence converted to str)'''
    return SeqRecord.name, str(SeqRecord.seq)

#seqio_data(SeqRecord)

In [16]:
# Functions to make index vectors
# May be helpful in future

def vectorizing_single_subseq(string, subseq, n, k_mer_num,
                              knutt_morris:callable):
    '''
    Build the entry-index indicator vector of one subseq for some string.

    string       : sequence to search in
    subseq       : subsequence to look for
    n            : length of string (supplied by the caller)
    k_mer_num    : length of the subsequences (supplied by the caller)
    knutt_morris : callable(string, subseq) -> iterable of start indices
    return:
        numpy array with n - k_mer_num + 1 entries: 1 at every index
        reported by knutt_morris, 0 elsewhere
    '''
    window_count = n - k_mer_num + 1
    indicator = [0] * window_count

    for position in knutt_morris(string, subseq):
        indicator[position] = 1

    return np.array(indicator)

def vectorizing_all_subseq(string, subseq_list,
                           vectorizing_single_subseq:callable,
                           knutt_morris:callable=None):
    '''
    Build the entry-index vector for every subseq from subseq_list.

    string                    : sequence to search in
    subseq_list               : subsequences to look for; the length of the
                                first one is passed on as k_mer_num
    vectorizing_single_subseq : callable(string, subseq, n, k_mer_num,
                                knutt_morris) -> numpy array
    knutt_morris              : matcher handed to vectorizing_single_subseq;
                                defaults to the notebook's knutt_morris_list
    !!! return dict: subseq -> column vector (array reshaped to (-1, 1));
        an empty subseq_list gives {}
    '''
    if len(subseq_list) == 0:
        return {}

    if knutt_morris is None:
        # The original body referenced an undefined name `knutt_morris`;
        # the list-returning K-M-P search is what the vectorizer needs.
        knutt_morris = knutt_morris_list

    n = len(string)
    k_mer_num = len(subseq_list[0])

    vectors = {}
    for subseq in subseq_list:
        vector = vectorizing_single_subseq(string, subseq, n, k_mer_num,
                                           knutt_morris)
        # store the column vector directly; the previous `del vector` ran
        # before the append and made the function fail on first iteration
        vectors[subseq] = vector.reshape((-1, 1))

    return vectors

In [17]:
def occurrences_count(string, sub):
    '''
    Count how many times sub occurs in string; overlapping
    occurrences are counted separately.
    '''
    total = 0
    position = string.find(sub)
    while position != -1:
        total += 1
        # resume one character after the last hit so overlaps are found
        position = string.find(sub, position + 1)
    return total

In [18]:
def occurrences_re(text, sub):
    '''
    Count (overlapping) entries of sub in text with a zero-width
    lookahead; sub is inserted unescaped, i.e. treated as a regex.
    '''
    lookahead = re.compile('(?={0})'.format(sub))
    return len(lookahead.findall(text))

In [19]:
def occurrences_re_1(text, sub):
    '''
    Count (overlapping) entries of sub in text with a zero-width
    lookahead; sub is escaped first, so it is matched literally.
    '''
    literal_lookahead = '(?={0})'.format(re.escape(sub))
    return sum(1 for _ in re.finditer(literal_lookahead, text))

In [26]:
# Freq

def finding_freq_single_protein(seq, subseq_list):

    '''Finding frequencies for subsequences in single protein
    and scaling them with SKlearn StandardScaler()

    seq         : protein sequence (str)
    subseq_list : subsequences whose occurrences are counted
    return:
        list with one length-1 numpy array per subsequence (the rows of the
        (len(subseq_list), 1) scaled matrix); flatten before using the
        values as scalars. An empty subseq_list gives [].'''
    if len(subseq_list) == 0:
        return []

    n = len(seq)

    counts = np.array([occurrences_count(seq, x) for x in subseq_list],
                      dtype=float)

    # An empty sequence used to raise ZeroDivisionError; every count is 0
    # in that case, so the counts already are the frequencies.
    vector_freq = counts / n if n else counts

    # StandardScaler expects a 2-D (samples, features) array
    vector_freq = vector_freq.reshape((-1, 1))
    scaler = preprocessing.StandardScaler()
    vector_freq_scaled = scaler.fit_transform(vector_freq)

    return list(vector_freq_scaled)

### Main

In [27]:
def main_analyzes(path, k_mer_num):
    '''Lego construction with all functions above.

    Reads the FASTA file at path, computes the scaled k-mer frequency
    vector of every protein and writes one table to
    'csv_data/<organism name>.csv' with the columns
    Organism, Protein, <one column per k-mer>.

    path      : path to the FASTA file
    k_mer_num : length of the subsequences (k-mers)
    return:
        0 (the result is the CSV file written as a side effect)'''

    # initializing subseqs
    subseq_list = making_subseq(k_mer_num)
    table_columns = ['Organism', 'Protein'] + subseq_list

    # reading
    prot_records, organism_name = read_fasta_file(path)

    # Collect plain rows and build the frame once: growing a DataFrame with
    # .loc[index] = row copies the data on every insertion.
    rows = []
    for record in tqdm.tqdm_notebook(prot_records):
        prot_name, seq = seqio_data(record)
        freq_vector = finding_freq_single_protein(seq, subseq_list)
        # freq_vector holds length-1 arrays; flatten to floats so the CSV
        # cells contain numbers instead of '[value]' strings
        rows.append([organism_name, prot_name] + np.ravel(freq_vector).tolist())

    Proteins_data = pd.DataFrame(rows, columns=table_columns)

    writing_path = 'csv_data/' + organism_name + '.csv'
    Proteins_data.to_csv(writing_path)

    return 0

### Human

In [35]:
# 2-mers over the amino-acid alphabet and the matching CSV column layout
subseq_list = making_subseq(2)
table_columns = ['Organism', 'Protein'] + subseq_list

prot_records, organism_name = read_fasta_file('../data/fasta/human_proteome.fasta')

# Fill a 1-D object array element by element. np.array(list_of_records) lets
# NumPy treat each sequence-like record as a nested sequence, which raises
# ValueError for ragged input on NumPy >= 1.24.
record_list = prot_records
prot_records = np.empty(len(record_list), dtype=object)
for record_index, record in enumerate(record_list):
    prot_records[record_index] = record

# 100 nearly equal chunks so every chunk can be processed and saved separately
prot_records_split = np.array_split(prot_records, 100)


In [36]:
# Plain list holding the chunks of prot_records_split, in order.
human_list = list(prot_records_split)

In [41]:
# Process chunks 0..49 of human_list; each chunk is written to its own CSV.
# NOTE(review): only the first 50 chunks are handled here - confirm the
# remaining chunks are processed elsewhere.
for j in tqdm.tqdm_notebook(range(0, 50)):
    # Collect plain rows and build the frame once per chunk: growing a
    # DataFrame with .loc[index] = row copies the data on every insertion.
    rows = []
    for record in human_list[j]:
        prot_name, seq = seqio_data(record)
        freq_vector = finding_freq_single_protein(seq, subseq_list)
        # freq_vector holds length-1 arrays; flatten to floats so the CSV
        # cells contain numbers instead of '[value]' strings
        rows.append([organism_name, prot_name] + np.ravel(freq_vector).tolist())

    Proteins_data = pd.DataFrame(rows, columns=table_columns)
    writing_path = 'csv_data/' + organism_name + '_' + str(j) + '.csv'
    Proteins_data.to_csv(writing_path)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))