# BA2D Implement GreedyMotifSearch

In [18]:
import numpy as np

def formProfile(seqs):
    """Build the base-frequency profile of a list of equal-length sequences.

    Returns a 4 x len(seqs[0]) numpy array whose rows correspond to the
    bases A, C, G, T (in that order).  Entry [r, c] is the fraction of
    sequences in ``seqs`` that carry base r at position c.
    """
    width = len(seqs[0])
    total = len(seqs)
    profile = np.zeros((4, width))
    for row, base in enumerate("ACGT"):
        for col in range(width):
            # Number of sequences showing `base` in this column.
            matches = sum(1 for s in seqs if s[col] == base)
            profile[row, col] = float(matches) / total
    return profile


def scoreMotif(seqs):
    """Score a motif collection by how conserved its columns are.

    For every column (positions 0 .. len(seqs[0]) - 1) the number of
    sequences carrying the most frequent base among A, C, G, T is
    counted; the score is the sum of these counts.  A higher score means
    a more conserved motif set.

    The counts are taken directly as integers.  Deriving them from float
    frequencies (count / n * n) is not exact -- e.g. 7 / 25 * 25 evaluates
    to 7.000000000000001 -- which can make two equally good motif sets
    compare as unequal.

    Returns an int.
    """
    width = len(seqs[0])
    score = 0
    for col in range(width):
        column = [s[col] for s in seqs]
        # Occurrences of the consensus base in this column.
        score += max(column.count(base) for base in "ACGT")
    return score

def kmerfromProfile(seq, profile):
    """Return the k-mer of ``seq`` that is most probable under ``profile``.

    ``profile`` is a 4 x k array with rows ordered A, C, G, T; k is taken
    from its second dimension.  Every window of length k in ``seq`` is
    assigned the product of the profile entries selected by its bases.
    The window with the largest product is returned; on ties the leftmost
    window wins.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    width = profile.shape[1]
    windows = [seq[start:start + width]
               for start in range(len(seq) - width + 1)]
    likelihoods = [
        np.prod([profile[row_of[base], col] for col, base in enumerate(window)])
        for window in windows
    ]
    # list.index returns the first position of the maximum (leftmost tie).
    best = max(likelihoods)
    return windows[likelihoods.index(best)]

def GreedyMotifSearch(dna, k, t):
    """Greedy search for a collection of k-mers, one per DNA string.

    Parameters:
        dna: list of DNA strings; the first ``t`` of them are used.
        k:   motif length.
        t:   number of strings to draw a motif from.

    The search starts from the first k-mer of each string.  Then, for
    every k-mer of ``dna[0]``, a motif list is grown string by string:
    the profile of the motifs chosen so far selects the most probable
    k-mer of the next string.  A candidate list replaces the current best
    only when its ``scoreMotif`` value is strictly greater, so the
    earliest candidate wins ties.

    Returns the best list of ``t`` k-mers found.
    """
    # Seed from the same t strings the candidates are built from, so the
    # initial score is comparable with the candidate scores.
    BestMotifs = [seq[:k] for seq in dna[:t]]
    scoreBestMotifs = scoreMotif(BestMotifs)
    first = dna[0]
    for start in range(len(first) - k + 1):
        motifs = [first[start:start + k]]
        for j in range(1, t):
            profile = formProfile(motifs)
            motifs.append(kmerfromProfile(dna[j], profile))
        # Score each candidate once and reuse the value.
        candidate_score = scoreMotif(motifs)
        if candidate_score > scoreBestMotifs:
            BestMotifs = motifs
            scoreBestMotifs = candidate_score
    return BestMotifs

def BA2D(filename):
    """Read a motif-search input file, run the search and print the motifs.

    The first line of the file holds two whitespace-separated integers,
    ``k`` and ``t``.  Each following non-blank line is one DNA string.
    The motifs returned by ``GreedyMotifSearch`` are printed one per line.
    """
    with open(filename) as f:
        k, t = map(int, f.readline().split())
        dna = []
        for line in f:
            sequence = line.rstrip()
            # Skip blank lines (e.g. a trailing newline at end of file);
            # an empty string is not a DNA sequence.
            if sequence:
                dna.append(sequence)
    # The file is closed before the search and the printing begin.
    for motif in GreedyMotifSearch(dna, k, t):
        print(motif)

## Test

In [19]:
BA2D('BA2D-test.txt')

CAG
CAG
CAA
CAA
CAA


In [20]:
BA2D('BA2D-test2.txt')

AGTGGGTATCTC
TAAAAAGGTATA
AACCACGAGTAC
TGTCATGTGCGG
AACCTAAACCCT
AGTCGTTATCCC
AGTAATATGTAC
AGTGGTTATCAC
AGTGGTTATCCC
AGTGGCTATCGC
AGTGGATATCCC
AGTGAGAAGCAA
AGTGACTAGACA
TAAGACTAGTTA
TATGAAGGGTGA
AGTCGGGATAAC
AGTGGGTATCTC
AGCGGTTAGTCA
AGTGAAATTCCT
TGTGGATGGCTT
TGTAGGTATCAC
TGCAGATATCCA
TGTGGTTATCAC
TGTCATTATTCA
TGCGTAGATCAA


## Quiz

In [21]:
BA2D('rosalind_ba2d.txt')

CACCCCTCCGTC
CGGGACAGCGCA
TATTTAGAATGG
TCTGATACCTGC
ACTCCTCGTCGC
TCTCCTGACTCA
TCTTCAGGTTGC
CCTTAAAGTTCA
TCTCATGCCTTG
CCTTCTACACTG
TCTTTTGCTTGC
TCCCATGGCTCC
TCCTAAGCACCA
CATGAAAGCCCC
TCTCCCTCCCGC
TCTCCTAGTTTC
TCCCTTGCCTTC
TCTCCTGACTTC
TCTCATGGCTCA
TCCCTTGCCTTG
CCGTCTGCCCCA
CATGACGCCCGC
TGTCCAACCTTA
TATCATACTGGG
TCTCCAACAGGA
