In [None]:
# Naive exact match

def naive(p, t):
    """Return every offset in text t at which pattern p occurs exactly.

    Each possible alignment of p against t is tried in left-to-right order
    and the pattern is compared character by character, stopping at the
    first mismatch.

    Returns:
        List of 0-based offsets into t, in increasing order.
    """
    pattern_len = len(p)
    hits = []
    for offset in range(len(t) - pattern_len + 1):  # every alignment of p in t
        # all() short-circuits on the first mismatching character
        if all(t[offset + k] == p[k] for k in range(pattern_len)):
            hits.append(offset)
    return hits


In [4]:
def naive_with_counts(p, t):
    """Naive exact matching that also reports how much work was done.

    Returns:
        Tuple (occurrences, num_alignments, num_character_comparisons):
        the offsets in t where p matches, the number of alignments tried,
        and the total number of character comparisons performed.
    """
    hits = []
    alignments_tried = 0
    chars_compared = 0
    pattern_len = len(p)
    for start in range(len(t) - pattern_len + 1):  # every alignment of p in t
        alignments_tried += 1
        for k in range(pattern_len):
            chars_compared += 1
            if t[start + k] != p[k]:
                break  # mismatch: abandon this alignment
        else:
            # loop finished without a break, so every character matched
            hits.append(start)
    return hits, alignments_tried, chars_compared


In [5]:
# Example 1
###########
# Run the counting version of naive matching on a short English sentence and
# print the match offsets, alignments tried and character comparisons made.
p = 'word'
t = 'there would have been a time for such a word'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 41 46


In [6]:
# Example 2
###########
# Same check with a text containing two exact matches plus partial matches
# ('need', 'noodle') that cost extra character comparisons.
p = 'needle'
t = 'needle need noodle needle'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[0, 19] 20 35


Programming HW2 

In [1]:
#Read Genome from FASTA file#
#############################

def readGenome(filename):
    """Read a FASTA file and return its sequence as one string.

    Header lines (those starting with '>') are skipped; every other line has
    its trailing whitespace stripped and is appended to the result, so
    sequences from multiple records are concatenated.
    """
    pieces = []
    with open(filename, 'r') as handle:
        for row in handle:
            if row[0] == '>':
                continue  # FASTA header line, not sequence data
            pieces.append(row.rstrip())  # drop the newline / trailing spaces
    return ''.join(pieces)



In [2]:
# Load the genome excerpt used by the questions below and peek at its first
# ten characters as a sanity check.
genome = readGenome('chr1.GRCh38.excerpt.fasta')
genome[:10]

'TTGAATGCTG'

In [7]:
# 1 How many alignments does the naive exact matching algorithm try?
# The pattern p defined here is reused by later cells.
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, genome)
print(num_alignments)

799954


In [8]:
# 2 How many character comparisons does the naive exact matching algorithm perform?

# Define the pattern explicitly as p (it was previously assigned to an unused
# variable t), so this cell no longer depends on p leaking from another cell.
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, genome)
print(num_character_comparisons)

984143


In [8]:
# Boyer Moore with counts function
##################################
def boyer_moore_with_counts(p, p_bm, t):
    """Boyer-Moore exact matching that also counts the work performed.

    Args:
        p: pattern to search for.
        p_bm: preprocessing object for p; must provide
            bad_character_rule(j, c), good_suffix_rule(j) and match_skip().
        t: text to search in.

    Returns:
        Tuple (occurrences, num_alignments, num_character_comparisons).
    """
    hits = []
    alignments_tried = 0
    chars_compared = 0
    pattern_len = len(p)
    pos = 0
    while pos < len(t) - pattern_len + 1:
        alignments_tried += 1
        step = 1  # always advance by at least one position
        # Compare right-to-left, as Boyer-Moore requires
        for k in reversed(range(pattern_len)):
            chars_compared += 1
            if p[k] != t[pos + k]:
                bad_char_skip = p_bm.bad_character_rule(k, t[pos + k])
                good_suffix_skip = p_bm.good_suffix_rule(k)
                step = max(step, bad_char_skip, good_suffix_skip)
                break
        else:
            # No mismatch anywhere: record the occurrence and ask the
            # preprocessing object how far a full match lets us skip.
            hits.append(pos)
            step = max(step, p_bm.match_skip())
        pos += step
    return hits, alignments_tried, chars_compared


In [9]:
# Import preprocessing module for Boyer-Moore
import bm_preproc
from bm_preproc import BoyerMoore

In [22]:
# 3 How many alignments does Boyer-Moore try?
# Preprocess the pattern over the DNA alphabet, then run the counting
# Boyer-Moore search against the genome.
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
p_bm = BoyerMoore(p, alphabet = 'ACGT')
occurrences, num_alignments, num_character_comparisons = boyer_moore_with_counts(p, p_bm, genome)
print(occurrences, num_alignments, num_character_comparisons)
print('number of alignments tried:', num_alignments)

[56922] 127974 165191
number of alignments tried: 127974


In [10]:
# Example 1
###########
# Same sentence as the naive Example 1, now searched with Boyer-Moore.
# The alphabet is lowercase letters plus the space character.
p = 'word'
t = 'there would have been a time for such a word'
lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz '
p_bm = BoyerMoore(p, lowercase_alphabet)
occurrences, num_alignments, num_character_comparisons = boyer_moore_with_counts(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 12 15


In [None]:
Index-assisted approximate matching

In [9]:
# import bisect module
import bisect

In [18]:
class Index(object):
    """Sorted k-mer index over a text, queried by binary search."""

    def __init__(self, t, k):
        """Index every length-k substring of t.

        self.index holds (k-mer, offset) pairs sorted by k-mer and then
        by offset.
        """
        self.k = k  # k-mer length
        self.index = sorted(
            (t[offset:offset + k], offset)
            for offset in range(len(t) - k + 1)
        )

    def query(self, p):
        """Return the text offsets whose k-mer equals the first k-mer of p."""
        kmer = p[:self.k]
        # Offsets are never negative, so (kmer, -1) sorts just before the
        # first entry for this k-mer, if there is one.
        pos = bisect.bisect_left(self.index, (kmer, -1))
        hits = []
        # Equal k-mers are adjacent in the sorted list; walk until it changes
        while pos < len(self.index) and self.index[pos][0] == kmer:
            hits.append(self.index[pos][1])
            pos += 1
        return hits

In [19]:
# Use the Index object to match a full pattern in a string t

def queryIndex(p, t, index):
    """Find exact occurrences of p in t using a k-mer index.

    index.query(p) returns the offsets where the first k characters of p
    occur; each candidate is then verified by comparing the rest of p
    against the text at that offset.

    Example: t="TAGACTAC", p="ACTA", k=3. The index reports offset 3, and
    p[3:] == 'A' is compared with t[6:7] == 'A', so 3 is a match.

    Returns:
        List of verified offsets, without duplicates, in hit order.
    """
    k = index.k
    pattern_tail = p[k:]  # the part of p the index lookup did not check
    offsets = []
    for hit in index.query(p):
        if pattern_tail != t[hit + k:hit + len(p)]:
            continue  # index hit, but the remainder of p does not match
        if hit in offsets:
            continue  # already recorded
        offsets.append(hit)
    return offsets
                                            

In [20]:
# Build an 8-mer index over the genome string
index = Index(genome, 8)

In [23]:
def approximate_match(p, t, n, kmer_index=None):
    """Find occurrences of p in t with at most n mismatches, using a k-mer index.

    Pigeonhole principle: split p into n+1 disjoint segments. If p occurs
    with at most n mismatches, at least one segment must occur exactly.
    The first k characters of each segment are looked up in the index, and
    every hit is verified by counting mismatches over the whole pattern.

    Args:
        p: pattern.
        t: text.
        n: maximum number of mismatches allowed (>= 0).
        kmer_index: object with a `k` attribute and a `query(pattern)`
            method returning offsets in t of the pattern's first k-mer.
            It must have been built from t. When omitted, the
            notebook-level `index` is used, as before.

    Returns:
        Sorted list of offsets in t where p matches with <= n mismatches.

    Raises:
        ValueError: if n is negative, or if p is too short to be split into
            n+1 segments of at least k characters (the pigeonhole guarantee
            would not hold).
    """
    if n < 0:
        raise ValueError('n must be non-negative')
    if kmer_index is None:
        kmer_index = index  # notebook-level index; assumed to be built from t
    k = kmer_index.k
    # Floor division (rather than rounding) guarantees all n+1 segments fit
    # inside p, so every queried k-mer lies wholly within its own segment.
    segment_length = len(p) // (n + 1)
    if segment_length < k:
        raise ValueError(
            'pattern of length %d cannot be split into %d segments of at '
            'least %d characters' % (len(p), n + 1, k))
    all_matches = set()  # offsets of verified matches, without duplicates
    for i in range(n + 1):
        start = i * segment_length  # where this segment begins within p
        for hit in kmer_index.query(p[start:]):
            candidate = hit - start  # implied offset of the whole pattern
            if candidate < 0 or candidate + len(p) > len(t):
                continue  # pattern would run off either end of the text
            if candidate in all_matches:
                continue  # already verified via another segment
            mismatches = 0
            for j in range(len(p)):
                if p[j] != t[candidate + j]:
                    mismatches += 1
                    if mismatches > n:
                        break  # too many mismatches; stop early
            if mismatches <= n:
                all_matches.add(candidate)
    return sorted(all_matches)
    

In [88]:
# Subsequence Index that handles subsequences that take every Nth character

import bisect
   
class SubseqIndex(object):
    """Sorted index of spaced subsequences of a text T."""

    def __init__(self, t, k, ival):
        """Index every subsequence of k characters spaced ival apart.

        E.g. SubseqIndex("ATAT", 2, 2) stores ("AA", 0) and ("TT", 1).
        """
        self.k = k        # number of characters in each subsequence
        self.ival = ival  # spacing: 1 = adjacent, 2 = every other, ...
        # Number of text positions one subsequence stretches across
        self.span = 1 + ival * (k - 1)
        self.index = sorted(
            (t[offset:offset + self.span:ival], offset)
            for offset in range(len(t) - self.span + 1)
        )

    def query(self, p):
        """Return the text offsets matching the first subsequence of p."""
        subseq = p[:self.span:self.ival]
        # (subseq, -1) sorts just before any real entry for this subsequence
        pos = bisect.bisect_left(self.index, (subseq, -1))
        hits = []
        # Equal subsequences are adjacent in the sorted list
        while pos < len(self.index) and self.index[pos][0] == subseq:
            hits.append(self.index[pos][1])
            pos += 1
        return hits


In [115]:
# Index the genome using subsequences of 8 characters taken every 3rd position
t = genome
subseq_ind = SubseqIndex(t, 8, 3)

In [103]:
def query_subseq(p, t, subseq_ind):
    k = subseq_ind.k
    offsets = []
    num_index_hits = 0
    for i in subseq_ind.query(p): 
        num_index_hits += 1
        if p[k:] == t[i+k:i+len(p)]:
            offsets.append(i)
    return offsets, num_index_hits        

In [65]:
# Small demonstration: subsequences of 3 characters, every 2nd position
ind = SubseqIndex('ATATAT', 3, 2)
print(ind.index)

[('AAA', 0), ('TTT', 1)]


In [66]:
# Query with the pattern as-is: its first subsequence is 'TAA', which is not
# in the demo index.
p = 'TTATAT'
print(ind.query(p[0:]))

[]


In [68]:
# Query starting one character into p (p is set in an earlier cell)
print(ind.query(p[1:]))

[1]


In [116]:
def approximate_match(p, t, n, subseq_index=None):
    """Find occurrences of p in t with at most n mismatches, using a subsequence index.

    Pigeonhole principle with interleaved subsequences: the subsequences
    of p starting at pattern positions 0, 1, ..., n (each taking every
    ival-th character) are disjoint as long as n + 1 <= ival. If p occurs
    with at most n mismatches, at least one of them must occur exactly.
    Each is looked up in the index and every hit is verified by counting
    mismatches over the whole pattern.

    Note: this definition replaces the earlier k-mer-index based
    approximate_match defined in this notebook.

    Args:
        p: pattern.
        t: text.
        n: maximum number of mismatches allowed (>= 0).
        subseq_index: object with `ival` and `span` attributes and a
            `query(pattern)` method returning offsets in t. It must have
            been built from t. When omitted, the notebook-level
            `subseq_ind` is used.

    Returns:
        Sorted list of offsets in t where p matches with <= n mismatches.

    Raises:
        ValueError: if n is negative, if n + 1 exceeds the index interval,
            or if p is too short to contain n + 1 full subsequences.
    """
    if n < 0:
        raise ValueError('n must be non-negative')
    if subseq_index is None:
        subseq_index = subseq_ind  # notebook-level index; assumed to be built from t
    if n + 1 > subseq_index.ival:
        raise ValueError(
            'index interval %d provides fewer than %d disjoint subsequences'
            % (subseq_index.ival, n + 1))
    if len(p) < subseq_index.span + n:
        raise ValueError(
            'pattern of length %d is too short for %d subsequences spanning '
            '%d characters' % (len(p), n + 1, subseq_index.span))
    all_matches = set()  # offsets of verified matches, without duplicates
    for start in range(n + 1):
        # query() uses the first subsequence of whatever it is given, so
        # dropping `start` leading characters selects the start-th
        # interleaved subsequence of p.
        for hit in subseq_index.query(p[start:]):
            candidate = hit - start  # implied offset of the whole pattern
            if candidate < 0 or candidate + len(p) > len(t):
                continue  # pattern would run off either end of the text
            if candidate in all_matches:
                continue  # already verified via another subsequence
            mismatches = 0
            for j in range(len(p)):
                if p[j] != t[candidate + j]:
                    mismatches += 1
                    if mismatches > n:
                        break  # too many mismatches; stop early
            if mismatches <= n:
                all_matches.add(candidate)
    return sorted(all_matches)
    