# Search Motifs by scanning 


## Scanning-and-Scoring a Motif
Given a possible motif, we find its best match in each sequence.

In [1]:
def ScanAndScoreMotif(DNA, motif):
    """Find the best match of a motif in every sequence and total the mismatches.

    Each sequence is scanned with a sliding window of len(motif). The window
    with the fewest mismatching positions (Hamming distance) is kept; ties go
    to the leftmost window.

    Args:
        DNA: list of nucleotide strings.
        motif: candidate motif string.

    Returns:
        A tuple (bestAlignment, totalDist): the 0-based start of the best
        window in each sequence, and the sum of the per-sequence minimum
        Hamming distances.

    Raises:
        ValueError: if a sequence is shorter than the motif, so there is no
            window to score.
    """
    totalDist = 0
    bestAlignment = []
    k = len(motif)
    for seq in DNA:
        numWindows = len(seq) - k + 1
        if numWindows <= 0:
            # Without this check the scan loop never runs, which used to leave
            # bestS unbound (or stale from the previous sequence) and silently
            # add k+1 to the total.
            raise ValueError(
                "sequence of length %d is shorter than the motif (length %d)"
                % (len(seq), k))
        # k+1 is larger than any possible distance, so the first window wins.
        minHammingDist = k + 1
        bestS = 0
        for s in range(numWindows):
            HammingDist = sum(1 for m, b in zip(motif, seq[s:s + k]) if m != b)
            # strict '<' keeps the leftmost of equally good windows
            if HammingDist < minHammingDist:
                bestS = s
                minHammingDist = HammingDist
        bestAlignment.append(bestS)
        totalDist += minHammingDist
    return bestAlignment, totalDist

In [2]:
# Test data: ten lowercase DNA strings shared by the experiments in this notebook.
seqApprox = [
    'tagtggtcttttgagtgtagatctgaagggaaagtatttccaccagttcggggtcacccagcagggcagggtgacttaat',
    'cgcgactcggcgctcacagttatcgcacgtttagaccaaaacggagttggatccgaaactggagtttaatcggagtcctt',
    'gttacttgtgagcctggttagacccgaaatataattgttggctgcatagcggagctgacatacgagtaggggaaatgcgt',
    'aacatcaggctttgattaaacaatttaagcacgtaaatccgaattgacctgatgacaatacggaacatgccggctccggg',
    'accaccggataggctgcttattaggtccaaaaggtagtatcgtaataatggctcagccatgtcaatgtgcggcattccac',
    'tagattcgaatcgatcgtgtttctccctctgtgggttaacgaggggtccgaccttgctcgcatgtgccgaacttgtaccc',
    'gaaatggttcggtgcgatatcaggccgttctcttaacttggcggtgcagatccgaacgtctctggaggggtcgtgcgcta',
    'atgtatactagacattctaacgctcgcttattggcggagaccatttgctccactacaagaggctactgtgtagatccgta',
    'ttcttacacccttctttagatccaaacctgttggcgccatcttcttttcgagtccttgtacctccatttgctctgatgac',
    'ctacctatgtaaaacaacatctactaacgtagtccggtctttcctgatctgccctaacctacaggtcgatccgaaattcg']

# Time one scan of the 10-mer "tagatccgaa" against every sequence.
# %time is an IPython line magic, so this cell only runs under IPython/Jupyter.
%time (bestAlignment, totalDist) = ScanAndScoreMotif(seqApprox, "tagatccgaa")
print(bestAlignment, totalDist)
k = len("tagatccgaa")
# Show the best-matching window found in each sequence.
for i in range(len(bestAlignment)):
        print(seqApprox[i][bestAlignment[i]:bestAlignment[i]+k])

Wall time: 4 ms
[17, 47, 18, 33, 21, 0, 46, 70, 16, 65] 11
tagatctgaa
tggatccgaa
tagacccgaa
taaatccgaa
taggtccaaa
tagattcgaa
cagatccgaa
tagatccgta
tagatccaaa
tcgatccgaa
Wall time: 2 ms
[17, 47, 18, 33, 21, 0, 46, 70, 16, 65] 11
tagatctgaa
tggatccgaa
tagacccgaa
taaatccgaa
taggtccaaa
tagattcgaa
cagatccgaa
tagatccgta
tagatccaaa
tcgatccgaa


## Scan all possible k-mer motifs to find the best one
How many motifs are to be scanned?

In [3]:
import itertools

def MedianStringMotifSearch(DNA, k):
    """Exhaustively score every k-mer over 'acgt' and keep the closest one.

    All 4**k candidate motifs are generated and scored with
    ScanAndScoreMotif. A candidate replaces the current best only when its
    total Hamming distance is strictly smaller, so the first candidate
    reaching the minimum is the one returned.

    Returns:
        (alignment, distance, motif) for the best candidate, or
        ([], k*len(DNA), '') if no candidate scores below k*len(DNA).
    """
    best = ([], k * len(DNA), '')
    for letters in itertools.product('acgt', repeat=k):
        candidate = ''.join(letters)
        offsets, score = ScanAndScoreMotif(DNA, candidate)
        if score >= best[1]:
            continue
        best = (list(offsets), score, candidate)
    return best

# Timed exhaustive search over all 4**8 candidate 8-mers (%time is an IPython line magic).
%time MedianStringMotifSearch(seqApprox,8)

Wall time: 1min 55s


([19, 49, 20, 35, 23, 2, 48, 72, 18, 67], 8, 'gatccgaa')

## Let's consider only Motifs seen in the DNA
Are we guaranteed to get the best motif?

In [5]:
def ContainedMotifSearch(DNA, k):
    """Search for the best motif among the k-mers that occur in the input.

    Every distinct length-k substring of the sequences is a candidate. The
    number of candidates is printed, each one is scored with
    ScanAndScoreMotif, and the candidate with the strictly smallest total
    Hamming distance is kept.

    Returns:
        (alignment, distance, motif) for the best candidate, or
        ([], k*len(DNA), '') if no candidate scores below k*len(DNA).
    """
    candidates = {
        seq[start:start + k]
        for seq in DNA
        for start in range(len(seq) - k + 1)
    }
    print("%d Motifs in our set" % len(candidates))

    winner = ([], k * len(DNA), '')
    for candidate in candidates:
        offsets, score = ScanAndScoreMotif(DNA, candidate)
        if score < winner[1]:
            winner = (list(offsets), score, candidate)
    return winner

k = 8
# Timed search restricted to the 8-mers that occur in seqApprox
# (%time is an IPython line magic).
%time bestAlignment, minDist, kmer = ContainedMotifSearch(seqApprox, k)
print(bestAlignment, minDist, kmer)
# Print the matched window of each sequence under the best alignment.
for i in range(len(bestAlignment)):
        print(seqApprox[i][bestAlignment[i]:bestAlignment[i]+k])

718 Motifs in our set
Wall time: 1.34 s
[19, 49, 20, 35, 23, 2, 48, 72, 18, 67] 8 gatccgaa
gatctgaa
gatccgaa
gacccgaa
aatccgaa
ggtccaaa
gattcgaa
gatccgaa
gatccgta
gatccaaa
gatccgaa


## Contained Consensus Motif Search
Note that if we only consider k-mers that occur in the DNA, the best k-mer isn't necessarily the consensus of its alignment.

The following functions compute the consensus motif and the profile of a given alignment, as well as the likelihood of a k-mer given a profile.

In [9]:
import numpy as np

def Consensus(s, DNA, k):
    """Compute the consensus k-mer of an alignment.

    Args:
        s: list of 0-based start offsets; s[j] is where the aligned window
            begins in DNA[j]. Every offset is used, and 0 is a valid start
            (it is not an "ignore" marker).
        DNA: list of nucleotide strings over the alphabet 'acgt'.
        k: target motif length.

    Returns:
        A string of length k holding, for each window position, the most
        frequent base across the aligned windows. Ties are resolved in favour
        of the base that comes first in 'acgt'.

    Raises:
        KeyError: if an aligned base is not one of 'a', 'c', 'g', 't'.
        IndexError: if a window extends past the end of its sequence.
    """
    columns = []
    for i in range(k):
        # loop over window positions
        cnt = dict.fromkeys("acgt", 0)
        for j, sval in enumerate(s):
            # loop over DNA strands
            cnt[DNA[j][sval + i]] += 1
        # max() returns the first maximal key in 'acgt' order
        columns.append(max(cnt, key=cnt.get))
    return ''.join(columns)

def Profile(s, DNA, k):
    """Compute the base-frequency profile of an alignment.

    Args:
        s: list of 0-based start offsets; s[j] is where the aligned window
            begins in DNA[j]. Every offset is used, and 0 is a valid start
            (it is not an "ignore" marker).
        DNA: list of nucleotide strings over the alphabet 'acgt'.
        k: target motif length.

    Returns:
        A 4 x k float array. Rows are in the order a, c, g, t; entry
        [r, i] is the fraction of aligned windows that have base r at
        window position i.

    Raises:
        KeyError: if an aligned base is not one of 'a', 'c', 'g', 't'.
        IndexError: if a window extends past the end of its sequence.
    """
    # Explicit row index per base, so the row order does not depend on
    # dict iteration order.
    row = {base: r for r, base in enumerate("acgt")}
    counts = np.zeros((4, k))
    for i in range(k):
        # loop over window positions
        for j, sval in enumerate(s):
            # loop over DNA strands
            counts[row[DNA[j][sval + i]], i] += 1
    # turn counts into per-position frequencies
    return counts / float(len(s))

def LikelyhoodKmerProfile(kmer, profile):
    """Return the probability of a k-mer string under a profile.

    The result is the product, over the positions of `kmer`, of the profile
    entry for that position's base. Profile rows are indexed in the order
    a, c, g, t and columns by position.
    """
    # the profile needs exactly one column per k-mer position
    assert len(kmer) == profile.shape[1]
    rowOf = {'a': 0, 'c': 1, 'g': 2, 't': 3}
    likelihood = 1.0
    for col, base in enumerate(kmer):
        likelihood = likelihood * profile[rowOf[base], col]
    return likelihood

def ScanAndScoreProfile(DNA, profile):
    """Find, in every sequence, the window that is most probable under a profile.

    Each sequence is scanned with a sliding window whose length equals the
    number of profile columns; every window is scored with
    LikelyhoodKmerProfile. Ties go to the leftmost window.

    Args:
        DNA: list of nucleotide strings.
        profile: array with one column per motif position, as accepted by
            LikelyhoodKmerProfile.

    Returns:
        List with the 0-based start of the best-scoring window of each
        sequence.

    Raises:
        ValueError: if a sequence is shorter than the profile, so there is no
            window to score.
    """
    bestAlignment = []
    k = profile.shape[1]
    for seq in DNA:
        numWindows = len(seq) - k + 1
        if numWindows <= 0:
            raise ValueError(
                "sequence of length %d is shorter than the profile (length %d)"
                % (len(seq), k))
        # Start below any probability so the leftmost window is chosen even
        # when every window scores 0 (easy to hit with a profile that contains
        # zero entries). Starting at 0 left bestS unbound, or stale from the
        # previous sequence, in that case.
        bestS = 0
        maxProbProfile = -1.0
        for s in range(numWindows):
            prob = LikelyhoodKmerProfile(seq[s:s + k], profile)
            # strict '>' keeps the leftmost of equally probable windows
            if prob > maxProbProfile:
                bestS = s
                maxProbProfile = prob
        bestAlignment.append(bestS)
    return bestAlignment

def ContainedConsensusMotifSearch(DNA, k):
    """Refine the best contained k-mer into the consensus of its alignment.

    Runs ContainedMotifSearch, takes the consensus of the alignment it
    returns, and rescans the sequences with that consensus motif.

    Returns:
        (alignment, distance, motif) for the consensus motif.
    """
    seedAlignment, _, _ = ContainedMotifSearch(DNA, k)
    consensusMotif = Consensus(seedAlignment, DNA, k)
    alignment, distance = ScanAndScoreMotif(DNA, consensusMotif)
    return alignment, distance, consensusMotif

# Timed run of the contained-consensus search for 10-mers
# (%time is an IPython line magic).
%time newAlignment, HammingDist, motif = ContainedConsensusMotifSearch(seqApprox,10)
print(motif)
print(newAlignment, HammingDist)
# Build the base-frequency profile of the resulting alignment and report the
# probability of the consensus motif under it.
profile = Profile(newAlignment, seqApprox, 10)
print(profile)
print(LikelyhoodKmerProfile(motif, profile))
# Re-align every sequence against the profile instead of the motif string.
bestProfileAlignment = ScanAndScoreProfile(seqApprox, profile)
print(bestProfileAlignment)

709 Motifs in our set
Wall time: 1.59 s
tagatccgaa
[17, 47, 18, 33, 21, 0, 46, 70, 16, 65] 11
[[0.  0.8 0.1 0.9 0.  0.  0.  0.2 0.9 1. ]
 [0.1 0.1 0.  0.  0.1 0.9 0.9 0.  0.  0. ]
 [0.  0.1 0.9 0.1 0.  0.  0.  0.8 0.  0. ]
 [0.9 0.  0.  0.  0.9 0.1 0.1 0.  0.1 0. ]]
0.30611001600000015
[17, 47, 18, 33, 21, 0, 46, 70, 16, 65]


## Randomized Motif Search
Searches for a k-length motif that appears in all given DNA sequences. It begins with a random set of candidate consensus motifs derived from the data. It refines the motif until a true consensus emerges.

In [12]:
import random

def RandomizedMotifSearch(DNA, k, numSeeds=500):
    """Search for a k-length motif starting from random alignments.

    The candidate pool is seeded with the consensus motifs of `numSeeds`
    random alignments. Each round scores the untested candidates with
    ScanAndScoreMotif and adds the consensus of each resulting alignment to
    the pool if it is new. The search stops when a round produces no new
    candidates. The size of the pool is printed at the start of each round.

    Args:
        DNA: list of nucleotide strings, each at least k long.
        k: target motif length.
        numSeeds: number of random alignments used to seed the pool
            (default 500, the value that used to be hard-coded).

    Returns:
        (alignment, distance, motif) for the candidate with the smallest
        total Hamming distance, or ([], k*len(DNA), '') if no candidate
        scores below k*len(DNA).
    """
    # Seed with consensus motifs of random alignments
    motifSet = set()
    for _ in range(numSeeds):
        randomAlignment = [random.randint(0, len(seq) - k) for seq in DNA]
        motifSet.add(Consensus(randomAlignment, DNA, k))

    bestAlignment = []
    minHammingDist = k * len(DNA)
    kmer = ''
    testSet = motifSet.copy()
    while testSet:
        print(len(motifSet), end=', ')
        nextSet = set()
        for motif in testSet:
            align, dist = ScanAndScoreMotif(DNA, motif)
            # the consensus of this alignment becomes a candidate if unseen
            newMotif = Consensus(align, DNA, k)
            if newMotif not in motifSet:
                nextSet.add(newMotif)
            if dist < minHammingDist:
                bestAlignment = list(align)
                minHammingDist = dist
                kmer = motif
        # only the newly discovered motifs need scoring in the next round
        testSet = nextSet
        motifSet = motifSet | nextSet
    return bestAlignment, minHammingDist, kmer

## Let's try it

In [15]:
# Timed randomized search for a 10-mer (%time is an IPython line magic).
%time bestAlignment, minDist, kmer = RandomizedMotifSearch(seqApprox,10)
# Profile and consensus of the alignment that the search returned.
print(Profile(bestAlignment, seqApprox, 10))
print(Consensus(bestAlignment, seqApprox, 10))

499, 740, 819, 830, 831, Wall time: 779 ms
[[0.  0.8 0.1 0.9 0.  0.  0.  0.2 0.9 1. ]
 [0.1 0.1 0.  0.  0.1 0.9 0.9 0.  0.  0. ]
 [0.  0.1 0.9 0.1 0.  0.  0.  0.8 0.  0. ]
 [0.9 0.  0.  0.  0.9 0.1 0.1 0.  0.1 0. ]]
tagatccgaa


In [58]:
# Run the randomized search ten times and print each result, to compare
# outcomes across different random seedings.
for i in range(10):
    print(RandomizedMotifSearch(seqApprox,10))

499, 774, 861, 876, 878, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 768, 843, 863, 869, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 743, 823, 843, 845, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 756, 832, 844, 845, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 745, 826, 844, 850, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 776, 852, 870, 873, 874, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 762, 857, 878, 880, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 753, 822, 839, 844, 845, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 764, 845, 865, 867, 868, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
500, 749, 825, 839, 841, ([17, 47, 18, 33, 21, 0, 46, 70, 16, 65], 11, 'tagatccgaa')
