# BA2E Implement GreedyMotifSearch with Pseudocounts

In [5]:
import numpy as np

def formProfile(seqs):
    """Build a 4 x k profile matrix from motifs using Laplace pseudocounts.

    Rows are ordered A, C, G, T; column j describes position j of the motifs.
    Each entry is ``(count + 1) / (len(seqs) + 4)``: one pseudocount is added
    to every raw base count before normalising, so no entry is ever zero and
    each column sums to 1 when the motifs contain only A/C/G/T.

    Args:
        seqs: Non-empty sequence of strings. The width of the profile is
            ``len(seqs[0])``; every other string must be at least that long.

    Returns:
        numpy.ndarray of shape ``(4, len(seqs[0]))`` with float entries.

    Raises:
        ValueError: If ``seqs`` is empty.
        IndexError: If a string is shorter than ``seqs[0]``.
    """
    if not seqs:
        raise ValueError("formProfile needs at least one sequence")
    width = len(seqs[0])
    # Every count starts at 1: the pseudocount belongs to the raw count, not
    # to the frequency, otherwise its weight would grow with len(seqs).
    counts = np.ones([4, width])
    for row, base in enumerate("ACGT"):
        for col in range(width):
            counts[row, col] += sum(s[col] == base for s in seqs)
    # Four pseudocounts were added per column, hence the "+ 4".
    return counts / (len(seqs) + 4)


def scoreMotif(seqs):
    """Score a set of motifs by how well they agree with their consensus.

    For every column, the number of motifs carrying the most common base
    (among A, C, G, T) is counted; the score is the sum over all columns.
    Higher means more conserved. The counts are taken directly from the
    strings as exact integers, so motif sets of equal quality always compare
    equal (no float rounding involved).

    Args:
        seqs: Sequence of strings; the number of columns is ``len(seqs[0])``.

    Returns:
        int: Total number of positions that match the per-column consensus.
        ``0`` when ``seqs`` is empty.
    """
    if not seqs:
        return 0
    score = 0
    for col in range(len(seqs[0])):
        column = [s[col] for s in seqs]
        score += max(column.count(base) for base in "ACGT")
    return score

def kmerfromProfile(seq, profile):
    """Return the k-mer of ``seq`` with the highest probability under ``profile``.

    ``profile`` is indexed as ``profile[row, position]`` with rows in the
    order A, C, G, T; its second dimension gives the k-mer length. The
    probability of a window is the product of the profile entries selected
    by its bases. When several windows share the maximum, the leftmost one
    is returned.
    """
    width = profile.shape[1]
    row_of = {base: row for row, base in enumerate("ACGT")}

    # Every window of length `width`, left to right.
    windows = [seq[start:start + width] for start in range(len(seq) - width + 1)]
    likelihoods = [
        np.prod([profile[row_of[base], pos] for pos, base in enumerate(window)])
        for window in windows
    ]

    # list.index picks the first occurrence, i.e. the leftmost best window.
    best = likelihoods.index(max(likelihoods))
    return windows[best]

def GreedyMotifSearch(dna, k, t):
    """Greedily pick one k-mer from each of the first ``t`` strands of ``dna``.

    Every k-mer of the first strand is tried as a seed. For each seed, the
    remaining strands are visited in order: a profile is built from the
    motifs chosen so far (``formProfile``) and the k-mer selected by
    ``kmerfromProfile`` is appended. The collection with the highest
    ``scoreMotif`` value wins; a candidate replaces the current best only
    when its score is strictly higher, so the earliest seed wins ties.

    Args:
        dna: List of DNA strings.
        k: Motif length.
        t: Number of strands to use; must satisfy ``0 <= t <= len(dna)``.

    Returns:
        List of ``t`` k-mers, one per strand, in strand order. The initial
        best is the first k characters of each strand, so that is what is
        returned when no seed improves on it.

    Raises:
        ValueError: If ``t`` is negative or exceeds ``len(dna)``.
    """
    if not 0 <= t <= len(dna):
        raise ValueError("t must be between 0 and the number of DNA strings")
    # Only the first t strands take part; the starting collection must be
    # built from the same strands as the candidates so scores are comparable.
    strands = dna[:t]
    if not strands:
        return []

    BestMotifs = [strand[:k] for strand in strands]
    scoreBestMotifs = scoreMotif(BestMotifs)
    first = strands[0]
    for start in range(len(first) - k + 1):
        motif = [first[start:start + k]]
        for strand in strands[1:]:
            motif.append(kmerfromProfile(strand, formProfile(motif)))
        # Score once and reuse the value for both the test and the update.
        candidate_score = scoreMotif(motif)
        if candidate_score > scoreBestMotifs:
            BestMotifs = motif
            scoreBestMotifs = candidate_score
    return BestMotifs

def BA2E(filename):
    """Run GreedyMotifSearch on a dataset file and print the resulting motifs.

    The first line of the file holds two whitespace-separated integers,
    ``k`` and ``t``. Every following non-blank line is one DNA string.
    The motifs returned by ``GreedyMotifSearch`` are printed one per line.

    Args:
        filename: Path of the dataset file to read.
    """
    with open(filename) as f:
        k, t = map(int, f.readline().split())
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise be passed on as empty DNA strings.
        stripped = (line.strip() for line in f)
        dna = [line for line in stripped if line]
    # The file is fully read at this point; search and print after closing it.
    for motif in GreedyMotifSearch(dna, k, t):
        print(motif)

## Test

In [6]:
BA2E('BA2E-test.txt')

TTC
ATC
TTC
ATC
TTC


In [7]:
BA2E('BA2E-test2.txt')

CATCGCTTAACT
CCTCACTGAACT
CGTCACTACACT
CTTCTCTCGACT
CTTCACTCCACT
CCTCGCTAAACT
CTTCACTCCACT
CTTCGCTAGACT
CTTCACTGAACT
CGTCCCTGGACT
CCTCGCTGAACT
CTTCACTTAACT
CGTCACTTAACT
CATCTCTTTACT
CGTCGCTGGACT
CTTCTCTGCACT
CCTCTCTGCACT
CGTCTCTAGACT
CATCACTTCACT
CATCGCTCAACT
CATCACTAGACT
CATCACTCGACT
CGTCCCTACACT
CTTCGCTTGACT
CTTCCCTGAACT


## Quiz

In [6]:
BA2E('rosalind_ba2e.txt')

CGTCATCGCCGC
CGTCATCACTTC
CGTCCTCGCGGC
CGTCCTCCCGGC
CGTCATCACTTC
CGTCATCACGTC
CGTCTTCCCCCC
CGTCATCCCTCC
CGTCTTCGCCAC
CGTCCTCTCTCC
CGTCGTCTCCCC
CGTCATCTCAAC
CGTCCTCCCTGC
CGTCCTCCCGGC
CGTCGTCCCACC
CGTCCTCCCGCC
CGTCGTCTCCCC
CGTCGTCACTCC
CGTCATCCCTTC
CGTCATCGCATC
CGTCTTCTCAAC
CGTCTTCCCCAC
CGTCGTCACAGC
CGTCGTCACGCC
CGTCATCACACC
