# BA1I Find the Most Frequent Words with Mismatches in a String

In [11]:
from itertools import product
from collections import defaultdict

def hammingDist(a, b):
    """Count the positions at which ``a`` and ``b`` hold different items.

    Items are paired with ``zip``, so the comparison stops at the end of
    the shorter input; any surplus tail is ignored.
    """
    mismatches = 0
    for left, right in zip(a, b):
        if left != right:
            mismatches += 1
    return mismatches

def allKmers(k):
    """Return every length-``k`` string over the alphabet ``ACGT`` as a list.

    The strings come out in the order produced by ``itertools.product``.
    """
    letter_tuples = product('ACGT', repeat=k)
    return list(map(''.join, letter_tuples))

def BA1I(filename):
    """Brute-force search for the most frequent k-mers with mismatches.

    The file must hold the DNA string on its first line and the two
    integers ``k`` and ``d`` (whitespace separated) on its second line.
    Every k-mer over ``ACGT`` is scored by the number of length-``k``
    windows of the DNA string that lie within Hamming distance ``d`` of it.

    Returns the top-scoring k-mers joined by single spaces, in the order
    ``allKmers`` generates them, or an empty string when no k-mer matches
    any window (for instance when the DNA string is shorter than ``k``).
    """
    with open(filename) as f:
        dna = f.readline().rstrip()
        k, d = map(int, f.readline().split())

    # Slice the windows once instead of once per candidate k-mer.
    windows = [dna[i:i + k] for i in range(len(dna) - k + 1)]

    counts = {}
    for kmer in allKmers(k):
        hits = sum(1 for window in windows if hammingDist(window, kmer) <= d)
        if hits:
            counts[kmer] = hits

    # max() raises ValueError on an empty collection, so bail out first.
    if not counts:
        return ''
    best = max(counts.values())
    return ' '.join(kmer for kmer, hits in counts.items() if hits == best)

In [14]:
def _neighborhood(pattern, d):
    """Return all strings within Hamming distance ``d`` of ``pattern``.

    Substitutions are drawn from the DNA alphabet ``ACGT`` only.
    """
    seen = {pattern}
    frontier = {pattern}
    # Each round substitutes one more position in the newest strings, so
    # after ``d`` rounds ``seen`` holds everything reachable with at most
    # ``d`` substitutions.
    for _ in range(d):
        frontier = {
            word[:idx] + base + word[idx + 1:]
            for word in frontier
            for idx in range(len(word))
            for base in 'ACGT'
        } - seen
        if not frontier:
            break
        seen |= frontier
    return seen


def mismatchKmers(seq, k, d=1):
    """Collect every k-mer within ``d`` mismatches of some window of ``seq``.

    With the default ``d=1`` this is each length-``k`` window of ``seq``
    plus all of its single-substitution variants over ``ACGT``.
    """
    kmers = set()
    for i in range(len(seq) - k + 1):
        kmers |= _neighborhood(seq[i:i + k], d)
    return kmers


def fastBA1I(filename):
    """Find the most frequent k-mers with up to ``d`` mismatches.

    The file must hold the DNA string on its first line and the two
    integers ``k`` and ``d`` (whitespace separated) on its second line.

    Instead of scoring all 4**k possible k-mers, each window of the DNA
    string votes for every k-mer in its own d-neighbourhood; a k-mer's
    vote total equals the number of windows within distance ``d`` of it.

    Returns the top-scoring k-mers sorted and joined by single spaces, or
    an empty string when the DNA string has no window of length ``k``.
    """
    with open(filename) as f:
        dna = f.readline().rstrip()
        k, d = map(int, f.readline().split())

    counts = defaultdict(int)
    for i in range(len(dna) - k + 1):
        # Candidates must cover the full d-neighbourhood: a 1-mismatch
        # candidate set misses valid answers whenever d >= 2.
        for kmer in _neighborhood(dna[i:i + k], d):
            counts[kmer] += 1

    # max() raises ValueError on an empty collection, so bail out first.
    if not counts:
        return ''
    best = max(counts.values())
    # Sorted so the result does not depend on set iteration order.
    return ' '.join(sorted(kmer for kmer, hits in counts.items() if hits == best))

## Test

In [16]:
# Run fastBA1I on 'BA1I-test.txt' (DNA string on line 1, "k d" on line 2).
fastBA1I('BA1I-test.txt')

'ATGT GATG ATGC'

In [15]:
# Run fastBA1I on 'BA1I-test2.txt' (DNA string on line 1, "k d" on line 2).
fastBA1I('BA1I-test2.txt')

'GCACACAGAC GCGCACACAC'

## Quiz

In [17]:
# Run fastBA1I on 'rosalind_ba1i.txt' (DNA string on line 1, "k d" on line 2).
fastBA1I('rosalind_ba1i.txt')

'AAGAGA'