# BA2F Implement RandomizedMotifSearch

In [10]:
import numpy as np
import random

def formProfile(seqs):
    """Build a 4 x k profile matrix with Laplace pseudocounts.

    Rows are ordered A, C, G, T; column j describes position j of the
    motifs.  Each entry is ``(count + 1) / (t + 4)``, where ``count`` is
    the number of motifs carrying that base at that position and ``t`` is
    the number of motifs.  Adding the pseudocount to the *count* (rather
    than to the frequency) keeps every entry strictly positive while each
    column still sums to 1 when the motifs contain only A/C/G/T.

    Parameters
    ----------
    seqs : sequence of str
        Motifs; every string must be at least as long as ``seqs[0]``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(4, len(seqs[0]))``.

    Raises
    ------
    IndexError
        If ``seqs`` is empty or a later string is shorter than ``seqs[0]``.
    """
    bases = 'ACGT'
    width = len(seqs[0])
    n_seqs = len(seqs)
    # Start every cell at 1: the Laplace pseudocount, applied to raw counts.
    counts = np.ones([4, width])
    for row, base in enumerate(bases):
        for col in range(width):
            counts[row, col] += sum(seq[col] == base for seq in seqs)
    # Four pseudo-observations (one per base) were added to each column.
    return counts / (n_seqs + 4)


def scoreMotif(seqs):
    """Return a conservation score for a collection of motifs.

    For each motif position, the largest entry in that column of the
    profile returned by ``formProfile(seqs)`` is multiplied by the number
    of motifs and reduced by one; the per-column values are added up.
    ``RandomizedMotifSearch`` treats a larger score as a better motif set.

    Parameters
    ----------
    seqs : sequence of str
        Motifs, all at least as long as ``seqs[0]``.

    Returns
    -------
    The summed score (``0`` when the motifs have length zero).
    """
    column_profile = formProfile(seqs)
    n_seqs = len(seqs)
    width = len(seqs[0])
    return sum(max(column_profile[:, col]) * n_seqs - 1 for col in range(width))

def kmerfromProfile(seq, profile):
    """Return the window of ``seq`` with the highest product of profile entries.

    The window length is the number of profile columns.  Each window is
    scored by multiplying, position by position, the profile entry found in
    the row of the window's base (rows ordered A, C, G, T).  When several
    windows share the top score, the leftmost one is returned.

    Parameters
    ----------
    seq : str
        Text to scan; may only contain the characters A, C, G and T.
    profile : numpy.ndarray
        Array with four rows and one column per motif position.

    Raises
    ------
    KeyError
        If a window contains a character other than A, C, G or T.
    ValueError
        If ``seq`` is shorter than the number of profile columns.
    """
    width = profile.shape[1]
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    windows = [seq[start:start + width] for start in range(len(seq) - width + 1)]

    def likelihood(window):
        # Product of the profile entries selected by this window's bases.
        return np.prod([profile[row_of[base], col] for col, base in enumerate(window)])

    scores = [likelihood(window) for window in windows]
    # max() keeps the first maximal index, i.e. the leftmost best window.
    best = max(range(len(windows)), key=scores.__getitem__)
    return windows[best]

def RandomizedMotifSearch(dna, k, t):
    """Run one randomized motif search from a random starting point.

    A random k-mer is picked from each of the first ``t`` strings.  The
    motifs are then refined repeatedly: a profile is built from the current
    best motifs, the best-scoring k-mer under that profile is taken from
    every string, and the new set replaces the old one while its score is
    strictly higher.  The search stops at the first round that brings no
    improvement.

    Parameters
    ----------
    dna : sequence of str
        DNA strings; at least ``t`` of them, each at least ``k`` long.
    k : int
        Motif length.
    t : int
        Number of strings from ``dna`` to use.

    Returns
    -------
    tuple
        ``(bestMotifs, scoreBestMotifs)``: the list of ``t`` motifs and the
        value ``scoreMotif`` assigns to them.

    Raises
    ------
    IndexError
        If ``dna`` holds fewer than ``t`` strings.
    ValueError
        If one of the used strings is shorter than ``k``.
    """
    # Draw each start from that string's own length, so strings of unequal
    # length can never yield a truncated k-mer.
    starts = [random.randrange(len(dna[i]) - k + 1) for i in range(t)]
    bestMotifs = [dna[i][start:start + k] for i, start in enumerate(starts)]
    scoreBestMotifs = scoreMotif(bestMotifs)
    profile = formProfile(bestMotifs)
    while True:
        motifs = [kmerfromProfile(dna[i], profile) for i in range(t)]
        # Score the candidate set once and reuse the value below.
        score = scoreMotif(motifs)
        if score > scoreBestMotifs:
            bestMotifs = motifs
            scoreBestMotifs = score
            profile = formProfile(bestMotifs)
        else:
            return (bestMotifs, scoreBestMotifs)
        
def BA2F(filename, iterations=1000):
    """Solve a BA2F input file by repeated randomized motif search.

    The first line of the file holds the integers ``k`` and ``t``; each
    following non-blank line is one DNA string.  ``RandomizedMotifSearch``
    is run ``iterations`` times and the highest-scoring motif set is kept
    (the earliest one on ties).  The winning motifs are printed one per
    line and also returned.

    Parameters
    ----------
    filename : str
        Path of the input file.
    iterations : int, optional
        Number of independent restarts; defaults to 1000.

    Returns
    -------
    list of str
        The best motif set found.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    # Read everything up front so the file is closed before the search runs.
    with open(filename, 'r') as f:
        k, t = map(int, f.readline().split())
        stripped = (line.rstrip() for line in f)
        # Skip blank lines (e.g. a trailing newline) rather than storing ''.
        dna = [text for text in stripped if text]

    bestMotifs = None
    bestScore = None
    for _ in range(iterations):
        motifs, score = RandomizedMotifSearch(dna, k, t)
        # Strict comparison keeps the earliest run among equal scores.
        if bestScore is None or score > bestScore:
            bestMotifs, bestScore = motifs, score

    for motif in bestMotifs:
        print(motif)
    return bestMotifs

## Test

In [8]:
# Run the search on 'BA2F-test.txt'. The printed motifs and the returned list
# are shown below; starting positions are random, so a rerun may differ.
BA2F('BA2F-test.txt')

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']

In [9]:
# Run the search on 'BA2F-test2.txt'. The printed motifs and the returned list
# are shown below; starting positions are random, so a rerun may differ.
BA2F('BA2F-test2.txt')

CATGGGGAAAACTGA
CCTCTCGATCACCGA
CCTATAGATCACCGA
CCGATTGATCACCGA
CCTTGTGCAGACCGA
CCTTGCCTTCACCGA
CCTTGTTGCCACCGA
ACTTGTGATCACCTT
CCTTGTGATCAATTA
CCTTGTGATCTGTGA
CCTTGTGATCACTCC
AACTGTGATCACCGA
CCTTAGTATCACCGA
CCTTGTGAAATCCGA
CCTTGTCGCCACCGA
TGTTGTGATCACCGC
CACCGTGATCACCGA
CCTTGGTTTCACCGA
CCTTTGCATCACCGA
CCTTGTGATTTACGA


['CATGGGGAAAACTGA',
 'CCTCTCGATCACCGA',
 'CCTATAGATCACCGA',
 'CCGATTGATCACCGA',
 'CCTTGTGCAGACCGA',
 'CCTTGCCTTCACCGA',
 'CCTTGTTGCCACCGA',
 'ACTTGTGATCACCTT',
 'CCTTGTGATCAATTA',
 'CCTTGTGATCTGTGA',
 'CCTTGTGATCACTCC',
 'AACTGTGATCACCGA',
 'CCTTAGTATCACCGA',
 'CCTTGTGAAATCCGA',
 'CCTTGTCGCCACCGA',
 'TGTTGTGATCACCGC',
 'CACCGTGATCACCGA',
 'CCTTGGTTTCACCGA',
 'CCTTTGCATCACCGA',
 'CCTTGTGATTTACGA']

In [11]:
# Run the search on 'BA2F-test3.txt'. The printed motifs and the returned list
# are shown below; starting positions are random, so a rerun may differ.
BA2F('BA2F-test3.txt')

CGATAA
GGTTAA
GGTATA
GGTTAA
GGTTAC
GGTTAA
GGCCAA
GGTTAA


['CGATAA',
 'GGTTAA',
 'GGTATA',
 'GGTTAA',
 'GGTTAC',
 'GGTTAA',
 'GGCCAA',
 'GGTTAA']

## Quiz

In [12]:
# Run the search on 'rosalind_ba2f.txt'. The printed motifs and the returned
# list are shown below; starting positions are random, so a rerun may differ.
BA2F('rosalind_ba2f.txt')

AAGGATGCCGTAAGT
TAGGCCGGCATAAGT
TTCAGAGGCATAAGT
TAGTGCCACATAAGT
TAGTCGAGCATAAGT
TAGTGACCGATAAGT
GAGTGAGGCATAAAC
ACATGAGGCATAAGT
TAGTGAGGCATAGCG
TAGTGAGGCCAGAGT
TAACAAGGCATAAGT
TAGTGAGATGTAAGT
TAGTGAGGACGAAGT
TAGCCTGGCATAAGT
TAGTGCTCCATAAGT
TAGTGACTTATAAGT
TAGTGAGGCAGTGGT
ACGTGAGGCATAAGG
TAGTCTAGCATAAGT
TAGTGAGGCATCTAT


['AAGGATGCCGTAAGT',
 'TAGGCCGGCATAAGT',
 'TTCAGAGGCATAAGT',
 'TAGTGCCACATAAGT',
 'TAGTCGAGCATAAGT',
 'TAGTGACCGATAAGT',
 'GAGTGAGGCATAAAC',
 'ACATGAGGCATAAGT',
 'TAGTGAGGCATAGCG',
 'TAGTGAGGCCAGAGT',
 'TAACAAGGCATAAGT',
 'TAGTGAGATGTAAGT',
 'TAGTGAGGACGAAGT',
 'TAGCCTGGCATAAGT',
 'TAGTGCTCCATAAGT',
 'TAGTGACTTATAAGT',
 'TAGTGAGGCAGTGGT',
 'ACGTGAGGCATAAGG',
 'TAGTCTAGCATAAGT',
 'TAGTGAGGCATCTAT']