In [3]:
import numpy as np
import scipy as scp
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
from Bio import SeqIO

### Method for finding entry indices

In [4]:
# Method to find subseq in seq

def knutt_morris(txt, pattern):
    '''
    Knuth-Morris-Pratt algorithm for finding a subsequence in a sequence.
    Example: knutt_morris(txt, pattern),
        txt = sequence to search in,
        pattern = subsequence to find
    return:
        list of all 0-based start indices (ascending, overlapping matches
        included) where pattern occurs in txt.
        An empty pattern, or a pattern longer than txt, gives an empty list.
    '''
    len_pat = len(pattern)
    len_txt = len(txt)

    # Guard: the scan below reads pattern[0] unconditionally, so an empty
    # pattern would raise IndexError. A pattern longer than txt can never
    # match, so both cases short-circuit to "no occurrences".
    if len_pat == 0 or len_pat > len_txt:
        return []

    result = []

    # pref_suff[q] = length of the longest proper prefix of pattern[:q+1]
    # that is also its suffix; filled in place by the helper.
    pref_suff = [0]*len_pat
    compute_pref_suff(pattern, len_pat, pref_suff)

    i = 0  # position in txt
    j = 0  # number of pattern characters currently matched
    while i < len_txt:
        if pattern[j] == txt[i]:
            i += 1
            j += 1

        if j == len_pat:
            # Full match ending at i-1; fall back so overlapping matches
            # are still found.
            result.append(i-j)
            j = pref_suff[j-1]

        elif i < len_txt and pattern[j] != txt[i]:
            if j != 0:
                # Reuse the already matched prefix instead of rescanning txt.
                j = pref_suff[j-1]
            else:
                i += 1

    return result
  
def compute_pref_suff(pattern, len_pat, pref_suff):
    '''
    Prefix-suffix table builder for the K-M-P algorithm.

    Fills pref_suff in place (nothing is returned): after the call,
    pref_suff[q] is the length of the longest proper prefix of
    pattern[:q+1] that is also a suffix of it. pref_suff[0] is left as
    the caller initialised it.
    '''
    border = 0  # length of the border currently being extended
    pos = 1

    # Entries are produced for pos = 1 .. len_pat-1.
    while pos < len_pat:
        if pattern[pos] != pattern[border]:
            if border == 0:
                # No shorter border left to try.
                pref_suff[pos] = 0
                pos += 1
            else:
                # Retry with the next shorter border; pos stays put.
                border = pref_suff[border-1]
            continue

        border += 1
        pref_suff[pos] = border
        pos += 1

### Data

In [13]:
# Parse the FASTA file (path is relative to the notebook's working directory).
# NOTE(review): SeqIO.parse is expected to return a one-shot iterator of
# records, so re-run this cell before consuming it again - confirm against
# the Biopython docs.
fasta_test_file = SeqIO.parse('../data/Campylobacter_jejuni.fasta', 'fasta')

In [14]:
# Only the first record is used, so take it straight from the parser instead
# of loading every record of the file into a list first.
# Note: an empty FASTA file raises StopIteration here.
record = next(iter(fasta_test_file))


def read_seqIO(SeqRecord):
    '''
    Placeholder for reading a sequence record; not implemented yet.

    The argument is currently ignored and None is returned.
    '''
    return None

### Test string analysis

In [None]:
# Test sequence for the experiments below (345 uppercase letters; presumably
# one-letter amino-acid codes). The adjacent literals are joined by the parser
# into one string with no separators.
string = (
    'MAWRSGLCETDSRTLKQFLQEECMWKLVGKSRKHREYRAVACRSTIFSPEDDSSCILCQL'
    'LLLYRDGEWIICFCCNGRYQGHYGVNHVHRRRRRICHLPTLYQLSFGGPLGPASIDFLPS'
    'FSQVTSSMTCDGITPDVIYEVCMLVPQDEAKRILVKGHGAMDLTCQKAVTLGGAGAWLLP'
    'RPEGYTLFFYILCYDLFTSCGNRCDIPSMTRLMAAATACGQAGCSFCTDHEGHVDPTGNY'
    'VGCTPDMGRCLCYVPCGPMTQSLIHNEEPATFFCESDDAKYLCAVGSKTAAQVTLGDGLD'
    'YHIGVKDSEGRWLPVKTDVWDLVKVEEPVSRMIVCSCPVLKNLVH'
)

In [None]:
# First attempt using classes; it was judged not good enough but is kept for later.
# You can skip this cell.

class K_mer_str:
    '''
    Class for K-mer analysis of a single string.

    Attributes set by the constructor:
        string - the sequence to analyse
        k      - the k-mer length
        vector - list, empty on creation
    '''

    def __init__(self, string, k):
        '''
        Store the sequence and the k-mer length.

        string: sequence to analyse
        k: k-mer length
        '''
        self.string = string
        self.k = k
        self.vector = []

    def entry_index_vector(self, subseq):
        '''
        Placeholder for building the entry index vector of subseq.

        Not implemented yet: the argument is ignored and None is returned.
        '''
        return None
    

In [None]:
# For example we will try to find all needed vectors for 4-mers

import itertools


# Amino acid data

# Three-letter -> one-letter codes for the 20 amino acids listed here.
amino_dict = {
    'ALA': 'A',
    'ARG': 'R',
    'ASN': 'N',
    'ASP': 'D',
    'CYS': 'C',
    'GLN': 'Q',
    'GLU': 'E',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LEU': 'L',
    'LYS': 'K',
    'MET': 'M',
    'PHE': 'F',
    'PRO': 'P',
    'SER': 'S',
    'THR': 'T',
    'TRP': 'W',
    'TYR': 'Y',
    'VAL': 'V',
}

# One-letter codes in dictionary order, as a list and as a single string
# (the string serves as the alphabet for building k-mers).
list_amino = [*amino_dict.values()]
amino_string = ''.join(list_amino)

amino_string

In [None]:
# Make all possible combinations
# K-mer = 4

# Cartesian product of the alphabet with itself, 4 times: every 4-letter
# combination, initially as tuples of single characters.
subseq_iter = itertools.product(amino_string, repeat=4)
subseq_list = list(subseq_iter)
print(subseq_list[0]) #1

# Collapse each tuple of letters into a plain string.
subseq_list = [''.join(letters) for letters in subseq_list]

print(subseq_list[0]) #2
print(len(subseq_list)) #3

# ok, we are ready

In [None]:
# Making functions

def vectorizing_single_subseq(string, subseq, n, k_mer_num,
                              knutt_morris:callable):
    '''
    Build the entry index vector of one subseq for a given string.

    string: sequence to search in
    subseq: subsequence to look for
    n: length of string (supplied by the caller)
    k_mer_num: k-mer length (supplied by the caller)
    knutt_morris: search function called as knutt_morris(string, subseq),
        returning the start indices of the matches

    return: numpy array with n-k_mer_num+1 entries, 1 at every index
        reported by the search function and 0 elsewhere
    '''
    hits = knutt_morris(string, subseq)

    # One slot per possible start position of a k-mer inside string.
    indicator = [0] * (n - k_mer_num + 1)
    for start in hits:
        indicator[start] = 1

    return np.array(indicator)

def vectorizing_all_subseq(string, subseq_list,
                           vectorizing_single_subseq:callable):
    '''
    Build the scaled entry index vector for every subseq in subseq_list.

    string: sequence to search in
    subseq_list: subsequences to look for; the k-mer length is taken from
        the first element
    vectorizing_single_subseq: function called as
        vectorizing_single_subseq(string, subseq, n, k_mer_num, knutt_morris)
        and returning a numpy array; the module-level knutt_morris is
        passed through as the search function

    !!! return dict: subseq -> vector
        Each vector is reshaped to a single column and standardised with
        sklearn's StandardScaler. An empty subseq_list gives an empty dict.
    '''
    # Guard: subseq_list[0] below would raise IndexError on an empty list.
    if len(subseq_list) == 0:
        return {}

    n = len(string)
    k_mer_num = len(subseq_list[0])

    scaled_by_subseq = {}

    for subseq in subseq_list:
        vector = vectorizing_single_subseq(string, subseq, n, k_mer_num, knutt_morris)
        # StandardScaler expects 2-D input: one column, one row per position.
        column = vector.reshape((-1, 1))

        scaler = preprocessing.StandardScaler() #  SKlearn
        scaled_by_subseq[subseq] = scaler.fit_transform(column)

    return scaled_by_subseq