In [1]:
import itertools 
import collections

In [2]:
def leitura_fasta_q(fasta_q):
    """Read a FASTQ-style file and return the sequences found in it.

    The file is consumed four lines at a time. Within each group only the
    second line (the sequence) is kept; the first and third lines are
    skipped and the fourth (quality) is read only to detect the end of the
    data. Reading stops at the first group whose sequence or quality line
    is empty once trailing whitespace is stripped.

    Args:
        fasta_q: Path of the file to open.

    Returns:
        A list with the sequence strings, in file order.
    """
    sequences = []
    with open(fasta_q) as handle:
        while True:
            handle.readline()  # first line of the record, skipped
            bases = handle.readline().rstrip()
            handle.readline()  # third line of the record, skipped
            quality = handle.readline().rstrip()

            # An empty sequence or quality line marks the end of usable data.
            if not bases or not quality:
                return sequences

            sequences.append(bases)

In [3]:
def obter_qtd_bases(sequencia):
    """Print how often each symbol occurs across all the given sequences.

    Args:
        sequencia: Iterable of sequences; each one is fed to
            ``collections.Counter.update``.

    Returns:
        None. The accumulated counter is written to standard output.
    """
    totals = collections.Counter()
    for fragment in sequencia:
        totals.update(fragment)
    print(totals)

In [4]:
def get_kmers(read, n):
    """Split ``read`` into every contiguous substring of length ``n``.

    Args:
        read: The text to split.
        n: Length of each substring.

    Returns:
        A list of ``[substring, offset]`` pairs, ordered by offset. The list
        is empty when ``read`` is shorter than ``n``.
    """
    last_start = len(read) - n
    return [[read[pos:pos + n], pos] for pos in range(last_start + 1)]

In [5]:
def find_overlap(sufix,prefix,n):
    '''encontra overlaps de tamanho n entre o sufixo e o prefixo 
       Ex -  overlap de tamanho 4:
           CGCGAAGT (sufixo)
               AAGTCCCCCAAA (prefixo)
    '''
    start = 0
    while True:
        start =  sufix.find(prefix[:n], start)
        
        if start == -1: # nao encontrou overlap de tamanho n
            return 0
        
        if prefix.startswith(sufix[start:]):
            return len(sufix) - start
        
        start += 1

In [6]:
def obter_max_overlap(reads, tamanho):
    """Find the ordered pair of reads with the largest overlap.

    Every ordered pair of distinct positions in ``reads`` is scored with
    ``find_overlap(a, b, tamanho)``. Ties keep the pair seen first.

    Args:
        reads: Sequence of read strings.
        tamanho: Passed to ``find_overlap`` as its ``n`` argument.

    Returns:
        A tuple ``(read_a, read_b, overlap)``; ``(None, None, 0)`` when no
        pair has a positive overlap.
    """
    best = (None, None, 0)
    for left, right in itertools.permutations(reads, 2):
        size = find_overlap(left, right, tamanho)
        if size > best[2]:
            best = (left, right, size)
    return best
    

In [7]:
def scs_greedy(reads, tamanho):
    """Greedily merge overlapping reads into a single string.

    Repeatedly asks ``obter_max_overlap`` for the pair with the largest
    overlap, replaces both reads with their merge (the first read followed
    by the non-overlapping tail of the second) and stops once no pair
    overlaps. Whatever strings remain are concatenated in list order.

    The merging is done on a private copy, so the list passed by the caller
    is not modified.

    Args:
        reads: Iterable of read strings.
        tamanho: Passed to ``obter_max_overlap`` as its ``tamanho`` argument.

    Returns:
        The concatenation of the reads left after all merges.
    """
    # Copy first: remove/append below would otherwise destroy the caller's list.
    pending = list(reads)
    read_a, read_b, maior_overlap = obter_max_overlap(pending, tamanho)
    while maior_overlap > 0:
        pending.remove(read_a)
        pending.remove(read_b)
        # Keep read_a whole and add only the part of read_b past the overlap.
        pending.append(read_a + read_b[maior_overlap:])
        read_a, read_b, maior_overlap = obter_max_overlap(pending, tamanho)
    return ''.join(pending)

In [34]:
# Load the reads. `reads_copy` is bound to the very same list object as
# `reads` (an alias, not an independent copy), so in-place changes made
# through either name are visible through the other.
reads = reads_copy = leitura_fasta_q('ads1_week4_reads.fq')

In [35]:
# Build an index from each k-mer of length n to the set of reads containing it.
n = 30
kmer_reads = {}

# Associate every read with each of its k-mers.
for read in reads:
    kmers = get_kmers(read, n)
    for kmer in kmers:
        # kmer is a [substring, offset] pair; only the substring is the key.
        # setdefault creates the set the first time a k-mer is seen.
        kmer_reads.setdefault(kmer[0], set()).add(read)
print('Tamanho kmer_reads: ', len(kmer_reads))

Tamanho kmer_reads:  15865


In [40]:
# For each read, take its last n characters, look up in kmer_reads every read
# that contains that k-mer, and keep the one with the largest overlap.
# When a partner is found, the pair is recorded in max_overlap_pairs, both
# reads are removed from reads_copy (if present) and their merge is appended.
#
# NOTE(review): the loop iterates over `reads` while `reads_copy` is edited in
# place. If `reads_copy` is still the same list object as `reads` (cell In [34]
# assigns it without copying), the list being iterated changes during the
# loop: removals shift later elements and appended merges are visited too.
# Confirm this is intended before changing either cell.
'''para cada read, pega o sufixo e busca no dicionario kmer_reads as outras reads que tem esse kmer e retorna a que 
tem o maior overlap com cada read'''
max_overlap_pairs = {}
for read in reads:
    # Last n characters of the read.
    # NOTE(review): assumes len(read) >= n and that this suffix is a key of
    # kmer_reads; a missing key raises KeyError on the lookup below.
    start = len(read) - n
    sufixo = read[start:]
    reads_com_sufixo = kmer_reads[sufixo]
    # [read, best partner, overlap length]; the sentinel means "none found".
    max_read_overlap = ['', '', 0]
    for r in reads_com_sufixo:
        if r != read:
            overlap = find_overlap(read, r, n)
            # Strict comparison: the first candidate with the largest overlap is kept.
            if overlap > max_read_overlap[2] :
                max_read_overlap = [read, r, overlap]
    if max_read_overlap != ['', '', 0]:
        max_overlap_pairs[(max_read_overlap[0], max_read_overlap[1])] = max_read_overlap[2]
        # Replace the two reads with their merge: the first read followed by
        # the part of the partner that lies past the overlap.
        if max_read_overlap[0] in reads_copy:
            reads_copy.remove(max_read_overlap[0])
        if max_read_overlap[1] in reads_copy:
            reads_copy.remove(max_read_overlap[1])
        reads_copy.append(max_read_overlap[0]+max_read_overlap[1][max_read_overlap[2]:])

In [43]:
# Run the greedy merge on what is left in reads_copy and report the length
# of the resulting string.
scs = scs_greedy(reads=reads_copy, tamanho=n)
print(len(scs))

15894
