In [21]:
#Boyer Moore Function
        
def boyer_moore(p, p_bm, t):
    '''Do Boyer Moore Matching'''
    i = 0  # keep track of the index where we are in the text
    occurrences = []
    while i < len(t) - len(p) + 1: # Loop through all the positions in the text where p could start without running past end of t
        shift = 1  # The amount that we move along after this comparison( bad char and good suffix - shift can be more than 1 each time)
        mismatched = False # Create variable mismatch which we update if we find a match as we go along
        for j in range(len(p)-1, -1, -1 ): #Loop through pattern p from end to beginning, third argument -1 is because we are going backwards
                                           # second argument is so we stop just before j is -1 ie at 0
                if not p[j] == t[i+j]:  # If there is a mismatch
                    #Check bad character rule to see howw mauch we can skip
                    skip_bc = p_bm.bad_character_rule(j, t[i+j]) # pass in the index of the mismatch, and the character at that index in text that mismatched
                    skip_gs = p_bm.good_suffix_rule(j) # pass in index of the mismatch
                    # We shift by the max of the above two rules
                    shift = max(shift, skip_bc, skip_gs)
                    mismatched = True # set mismatch as we have encountered a mismatch
                    break # since we found mismatch we dont have to compare rest of the string 
                    
        if not mismatched:
            occurrences.append(i) # If there was no mismatch we add the index to list of occurences
            # We now check the match_skip function (incase when the entire pattern matched with the text) 
            skip_gs = p_bm.match_skip()
            # Now find max of the shift
            shift = max(shift, skip_gs)
        i += shift # update position by shift calculated
    return occurrences  

In [14]:
# Import preprocessig module for Boyer Moore
import bm_preproc
from bm_preproc import BoyerMoore

In [29]:
 #Boyer Moore Function
    
    
def boyer_moore(p, p_bm, t):
    i = 0  # keep track of the index where we are in the text
    occurrences = []
    while i < len(t) - len(p) + 1: # Loop through all the positions in the text where p could start without running past end of t
        shift = 1  # The amount that we move along after this comparison( bad char and good suffix - shift can be more than 1 each time)
        mismatched = False # Create variable mismatch which we update if we find a match as we go along
        for j in range(len(p)-1, -1, -1 ): #Loop through pattern p from end to beginning, third argument -1 is because we are going backwards
                                           # second argument is so we stop just before j is -1 ie at 0
                if not p[j] == t[i+j]:  # If there is a mismatch
                    #Check bad character rule to see howw mauch we can skip
                    skip_bc = p_bm.bad_character_rule(j, t[i+j]) # pass in the index of the mismatch, and the character at that index in text that mismatched
                    skip_gs = p_bm.good_suffix_rule(j) # pass in index of the mismatch
                    # We shift by the max of the above two rules
                    shift = max(shift, skip_bc, skip_gs)
                    mismatched = True # set mismatch as we have encountered a mismatch
                    break # since we found mismatch we dont have to compare rest of the string 
                    
        if not mismatched:
            occurrences.append(i) # If there was no mismatch we add the index to list of occurences
            # We now check the match_skip function (incase when the entire pattern matched with the text) 
            skip_gs = p_bm.match_skip()
            # Now find max of the shift
            shift = max(shift, skip_gs)
        i += shift # update position by shift calculated
    return occurrences  

In [30]:
p_bm = BoyerMoore(p, alphabet = 'ACGT')
boyer_moore(p, p_bm, t)

[56922, 262042, 364263, 657496, 717706]

In [31]:
def approximate_match(p, t, n):   #Takes as arguments pattern, text and the max number of mismatches n
    segment_length = int(round(len(p) /(n+1))) # Set the length of each partition of p, convert to int so that indices are int or it will raise error
    all_matches = set() # create a set to hold all the indices wehere we find a match, wihtout duplicates
    for i in range(n+1): # For each segment of P, for each iteration move along by the lenght of 1 segment
        #Set bounds of P for the segment we are searching for 
        start = i * segment_length # so if i = 0 and seg_l = 2, start = 0, i = 1, seg_l= 2, start = 1 * 2
        end = min((i+1) * segment_length, len(p)) #to make sure we dont run over end of p, since p might not be a perfect multiple of n+1
        p_bm = BoyerMoore(p[start:end], alphabet = 'ACGT') #We pass in the substring we just calculated and the alphabet to the preprocessing object, for making our tables for good suffix and bad character rule
        #Find matches where the substring matched our text
        matches = boyer_moore(p[start:end], p_bm, t) # use the boyer moore function and pass in the substring, p_bm obj, t
        
        # Verification to see that the rest of p matches t with no more than n mismatches
        for m in matches:
            #Make sure our location does not let p run off the begining or the end of t
            if m < start or m-start+len(p) > len(t): # if any of this is true then we will run past the beginning or end of t
                continue # if any of the abov is true skip the rest of the loop
                
            mismatches = 0 # To count the mismatches between the rest of p and t
            # Compare part of p before the start(from 0 to start against corresponding position in t)
            for j in range(0, start):
                if not p[j] == t[m-start + j]: # if corresponding positions dont match
                    mismatches += 1 # increment mismatch by 1
                    if mismatches > n: # If the number of mismatches is more than n
                        break  # break out of this inner loop
            # Compare the part of p after the end
            for j in range(end, len(p)): 
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
                        
            if mismatches <= n: # If we have verified on both sides of p and mismatches are less than n
                all_matches.add(m-start) # we add the m - start to get the begining of p for the match to the set all_matches
                
                
                    
    return matches, list(all_matches)         
    

In [32]:
matches, all_matches = approximate_match(p, t, 2)

In [34]:
print(len(all_matches))

19


In [35]:
print(len(matches))

60


In [1]:
p = 'AACTTG'
t = 'CACTTAATTTG'
print(approximate_match(p, t, 2))

In [None]:
print(t[5:])

In [1]:
#Read Genome from FASTA file#
#############################

def readGenome(filename):
    """ This function reads a FASTA file"""
    genome = '' # initialize genome to empty string
    with open(filename,'r') as f:               # Open file as f
        for line in f:                          # Loop therough and read each line of file f
            if not line[0] == '>':              # If line does not start with ">"
                genome += line.rstrip()         # Add line to the string genome, rstrip removes trailing whitespace from ends of string        
        
    return genome                               # After reading and adding all lines return the string genome



In [2]:
genome = readGenome('chr1.GRCh38.excerpt.fasta')
genome[:10]

'TTGAATGCTG'

In [19]:
t = genome
p = 'GGCGCGGTGGCTCACGCCTGTAAT'

In [24]:
hits, all_matches = approximate_match(p,t,2)

In [25]:
print(hits)

[18749, 19182, 22413, 22548, 23019, 23154, 43143, 56938, 67379, 83736, 83879, 84657, 84791, 108126, 129010, 147574, 160745, 175326, 186012, 187671, 191468, 205397, 251106, 251240, 262058, 273685, 282020, 322751, 364279, 364412, 421237, 429315, 454348, 465663, 471982, 480517, 480658, 523101, 551150, 551843, 572212, 588494, 595557, 613475, 621507, 632321, 635947, 646504, 651539, 657512, 674072, 681753, 707167, 717722, 719434, 724943, 746636, 747375, 747511, 760505]


In [26]:
print(len(hits))

60


In [1]:
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
p[:-1]

'GGCGCGGTGGCTCACGCCTGTAA'

In [2]:
a = 'happy'
a[:-1]

'happ'