In [1]:
import bisect

class Index (object):
    """Sorted k-mer index: every length-k substring of a text with its offset.

    Attributes:
        k: length of the indexed substrings.
        index: list of ``[kmer, position]`` pairs in ascending order, so that
            all entries sharing a k-mer are adjacent and can be located with
            a binary search.
    """

    def __init__(self, t, k):
        """Build the index of all length-``k`` substrings of the text ``t``."""
        self.k = k
        self.index = self.get_kmers(t)

    def get_kmers(self, t):
        """Return the sorted list of ``[kmer, position]`` pairs of ``t``.

        One pair is produced per start position ``0 .. len(t) - k``; a text
        shorter than ``k`` yields an empty list.
        """
        kmers = [[t[i:i + self.k], i] for i in range(len(t) - self.k + 1)]
        kmers.sort()
        return kmers

    def query_index(self, p, verbose=True):
        """Return the indexed positions whose k-mer equals the first k chars of ``p``.

        Args:
            p: the word being searched; only ``p[:k]`` is used as the key.
            verbose: when true (the default, kept for backward compatibility)
                the key is printed as a trace line.

        Returns:
            Positions in ascending order. Empty when the key is not indexed,
            which is always the case when ``p`` is shorter than ``k``.
        """
        kmer = p[:self.k]
        if verbose:
            print('kmer:' , kmer)

        # [kmer, -1] sorts before every real entry of that k-mer (positions
        # are >= 0), so bisect_left lands on the first one, if any.
        i = bisect.bisect_left(self.index, [kmer, -1])

        # Equal k-mers are contiguous in the sorted index: stop at the first
        # different one instead of walking to the end of the list.
        hits = []
        while i < len(self.index) and self.index[i][0] == kmer:
            hits.append(self.index[i][1])
            i += 1

        return hits

    def busca_palavra (self, t, p, verbose=True):
        """Return every occurrence of the word ``p`` in the text ``t``.

        The first ``k`` characters of ``p`` are looked up in this index and
        each candidate position is confirmed by comparing the rest of ``p``
        against ``t``. ``t`` is expected to be the text this index was built
        from; the positions are only meaningful for that text.

        Args:
            t: the text to verify candidates against.
            p: the word to search for.
            verbose: when true (the default, kept for backward compatibility)
                trace lines describing the search are printed.

        Returns:
            A list of ``[start, end]`` pairs such that ``t[start:end] == p``.
        """
        if len(p) < self.k:
            # The index only holds full length-k substrings, so it cannot
            # answer for a shorter word; scan the text directly instead of
            # reporting a false "not found".
            return self._busca_direta(t, p)

        # Query this instance, not whatever object a global happens to name.
        hits = self.query_index(p, verbose)
        resto = p[self.k:]
        offset = len(resto)
        if verbose:
            print('hits', hits, ' count:', len(hits))
            print('k ',self.k)
            print('offset ',offset)

        matches = []
        for hit in hits:
            # The k-mer already matched; only the remainder needs checking.
            pos_resto = hit + self.k
            trecho = t[pos_resto:pos_resto + offset]
            if verbose:
                print(resto, trecho)
            if resto == trecho:
                matches.append([hit, pos_resto + offset])
        return matches

    @staticmethod
    def _busca_direta(t, p):
        """Find all (possibly overlapping) occurrences of ``p`` in ``t`` by direct scan."""
        matches = []
        if not p:
            return matches
        pos = t.find(p)
        while pos != -1:
            matches.append([pos, pos + len(p)])
            pos = t.find(p, pos + 1)
        return matches


In [2]:
def leitura_fasta(nome_arquivo):
    """Read a FASTA file and return the sequence of its last record.

    Every record is parsed (header lines start with ``>``; the record id is
    the header text up to the first space), but only the sequence of the
    last record in the file is returned, as a single string with the line
    breaks removed.

    Args:
        nome_arquivo: path of the FASTA file.

    Returns:
        The concatenated sequence lines of the last record.

    Raises:
        OSError: if the file cannot be opened (a message is printed first).
        ValueError: if sequence data appears before any header, or the file
            contains no record at all.
    """
    sequencias = {}
    seq_id = None

    try:
        arquivo_fasta = open(nome_arquivo)
    except OSError:
        # Report, then re-raise: there is nothing to parse without the file.
        print('Arquivo nao encontrado!')
        raise

    # The context manager guarantees the handle is closed even on errors.
    with arquivo_fasta:
        for linha in arquivo_fasta:
            linha = linha.rstrip()
            if not linha:
                continue
            if linha[0] == '>':
                # split() keeps the whole id when the header has no space,
                # where slicing up to find(' ') == -1 would drop a character.
                seq_id = linha[1:].split(' ', 1)[0]
                sequencias[seq_id] = []
            else:
                if seq_id is None:
                    raise ValueError(
                        'sequence data found before any FASTA header in %r'
                        % (nome_arquivo,))
                # Collect the pieces and join once at the end (linear time).
                sequencias[seq_id].append(linha)

    if seq_id is None:
        raise ValueError('no FASTA record found in %r' % (nome_arquivo,))
    return ''.join(sequencias[seq_id])

In [17]:
# Small worked example: the text to index and the word to look for in it.
texto='AATCGGCCAGGCCCATTA'
palavra = 'GGCC'

# Build an index holding every substring (k-mer) of length 2 of the text,
# each paired with its position. Later cells refer to it by the name `index`.
index =  Index(texto, 2)

In [18]:
# Search for the word in the text, then show each [start, end] pair next to
# the slice of the text it delimits.
matches = index.busca_palavra(texto, palavra)
for match in matches:
    print('posicao:', match, ', trecho: ', texto[slice(*match)])

kmer: GG
hits [4, 9]  count: 2
k  2
offset  2
CC CC
CC CC
posicao: [4, 8] , trecho:  GGCC
posicao: [9, 13] , trecho:  GGCC


In [19]:
# Build an index of k-mers of length 8 from the sequence read from a FASTA
# file. This rebinds `index`, replacing the small example index from above.
t=leitura_fasta('chr1.GRCh38.excerpt.fasta')
index =  Index(t, 8)
# Number of index entries: one per start position, i.e. len(t) - 8 + 1.
print(len(index.index))

799993


In [20]:
# Word to search for; its first 8 characters are the key looked up in the
# index, and the remaining characters are compared against the text.
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
matches  = index.busca_palavra(t, p)
# Each match is a [start, end] pair of positions in t.
print(matches)

kmer: GGCGCGGT
hits [56922, 57056, 83720, 84641, 147558, 160729, 191452, 262042, 364263, 657496, 681737, 717706, 725061]  count: 13
k  8
offset  16
GGCTCACGCCTGTAAT GGCTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCAGGCGCCTGTAGT
GGCTCACGCCTGTAAT GATTCATGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCATGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCATGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCACACCTGTAAT
GGCTCACGCCTGTAAT GGTTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCATGCCTGTAAT
GGCTCACGCCTGTAAT GGCTCACGCCTGTAAT
GGCTCACGCCTGTAAT GGCAGGCGCCTGTAGT
[[56922, 56946], [262042, 262066], [364263, 364287], [657496, 657520], [717706, 717730]]


In [21]:
# Show every confirmed [start, end] pair together with the slice of the
# sequence it covers.
for match in matches:
    print('posicao:', match, ', trecho: ', t[slice(*match)])

posicao: [56922, 56946] , trecho:  GGCGCGGTGGCTCACGCCTGTAAT
posicao: [262042, 262066] , trecho:  GGCGCGGTGGCTCACGCCTGTAAT
posicao: [364263, 364287] , trecho:  GGCGCGGTGGCTCACGCCTGTAAT
posicao: [657496, 657520] , trecho:  GGCGCGGTGGCTCACGCCTGTAAT
posicao: [717706, 717730] , trecho:  GGCGCGGTGGCTCACGCCTGTAAT
