In [111]:
from Bio import SeqIO
import pandas as pd
import time

df = pd.DataFrame()

for seq_record in SeqIO.parse("sequence.fasta", "fasta"):
    print(repr(seq_record.seq))
    print("Length of sequence is", len(seq_record))
    seqSeries = pd.Series(list(str(seq_record.seq)))


Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN', SingleLetterAlphabet())
Length of sequence is 120883175


In [112]:
searchedPattern = "CTCAGAA" 
searchedSeries = pd.Series([x for x in searchedPattern])
print("Length of pattern is", len(searchedSeries))

Length of pattern is 7


In [113]:
seqSeries = seqSeries[10000000:10050000]
seqSeries

10000000    C
10000001    T
10000002    C
10000003    T
10000004    T
           ..
10049995    A
10049996    A
10049997    T
10049998    T
10049999    T
Length: 50000, dtype: object

In [114]:
def nativePatternSearch(sequence, pattern):
    #Simple linear Pattern Searching
    indexList = list() # list for storing matched pattern starting index 
    patternLen = len(pattern)
    for keyParent, valueParent in sequence.items(): # iter over every char in sequence (split key/value)
        for keyChild, valueChild in pattern.items(): # iter over every char in searched pattern (split key/value)
            try:
                # check if following individual chars matches all chars in searched pattern break internal for loop if a single char does not match. 
                # if current index char and following char fulfill searched pattern append to indexlist
                if sequence[keyParent + keyChild] != valueChild: break 
                elif (patternLen == (keyChild + 1)): indexList.append(keyParent) 
            except: break
    return set(indexList)

start_time = time.time()
print(nativePatternSearch(seqSeries, searchedSeries))
print("--- %s seconds ---" % (time.time() - start_time))

{10045603, 10031430, 10030605, 10007151, 10001235, 10035415, 10047831}
--- 1.0524308681488037 seconds ---


In [115]:
def badBoyerMoorePatternSearch(sequence,pattern):
    start_time = time.time()

    patternLen = len(pattern)
    sequenceLen = len(sequence)

    preprocessingDic = dict()
    for key, value in pattern.items():
        preprocessingDic[value] = key 
    for key, value in preprocessingDic.items():
        if(patternLen - value - 1 != 0): preprocessingDic[key] = patternLen - value - 1

    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))

    start_time = time.time()
    
    indexlist = list()    
    keyParent = sequence.index[0] + (patternLen - 1)  # set KeyParent as the index value of first backward searchable index 
    
    while keyParent < sequence.index[0] + sequenceLen  :  # loop all char elements from frist backward searchable index
        valueParent = sequence[keyParent] #set valueParent value of the squence element in question
        
        keyChild = patternLen - 1 # set KeyChild as the last index
        while keyChild >= 0 : #check backward 
            backward = patternLen - 1 - keyChild 
            # if sequence element in question is not same as element as in pattern 
            # shift forward to align common char in pattern (number of shift in preprocessDic)
            # break useless checking
            if sequence[keyParent - backward] != pattern[keyChild] : 
                keyParent += preprocessingDic.get(valueParent,patternLen)
                break
            keyChild -= 1 # decrement if char matches
        if keyChild < 0: 
            indexlist.append(keyParent -(patternLen - 1)) #append index of first char of matched pattern
            keyParent += 1

    print("--- Processing %s seconds ---" % (time.time() - start_time))

    return set(indexlist)

print(badBoyerMoorePatternSearch(seqSeries, searchedSeries))

--- Pre-Processing 4.887580871582031e-05 seconds ---
--- Processing 0.44123196601867676 seconds ---
{10031430, 10030605, 10007151, 10001235, 10035415, 10047831}


In [116]:
def buildMap(sequence, chunk):
    #expect m/c time complexity; m - sequence length, c - chunk size
    m = len(sequence)
    ptr = 0
    Map = {}
    while ptr < m :
        string = "".join(sequence[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += chunk
    return Map


In [117]:
def buildCombination(pattern, chunk):
    #expect n-(c-1) time complexity; n - pattern length, c - chunk size
    m = len(pattern)
    ptr = 0
    Map = {}
    while ptr + chunk <= m :
        string = "".join(pattern[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += 1
    return Map

In [118]:
def checkPattern(sequence, pattern, sequenceLoc, patternLoc, chunk):
    #check front of chunk
    for x in range(0,patternLoc):
        if (pattern[x] != sequence[sequenceLoc + (x - patternLoc) + 10000000]) : return False
    #check back of chunk
    for x in range(patternLoc + chunk,len(pattern)):
        if (pattern[x] != sequence[sequenceLoc + (x - patternLoc) + 10000000]) : return False
    return True


In [120]:
def chunkingAlgo(sequence, pattern):
    start_time = time.time()
        #chunk const has to be less then half of the length of sequence to ensure pattern will definitely hit in the chucking of sequence.
    chunk = int(len(pattern) / 2)
    sequenceSet = buildMap(sequence,chunk) 
    patternSet = buildCombination(pattern,chunk)

    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))


    start_time = time.time()  

    indexlist = list()  
    for keyPattern, valuePattern in patternSet.items():
        if keyPattern in sequenceSet:
            for sequenceElement in sequenceSet[keyPattern]:
                for patternElement in valuePattern:
                    if checkPattern(sequence,pattern,sequenceElement,patternElement,chunk):
                        indexlist.append(sequenceElement - patternElement + 10000000) 

    print("--- Processing %s seconds ---" % (time.time() - start_time))

    return set(indexlist)

print(chunkingAlgo(seqSeries, searchedSeries))


--- Pre-Processing 1.1363089084625244 seconds ---
--- Processing 0.049941062927246094 seconds ---
{10045603, 10031430, 10030605, 10007151, 10001235, 10035415, 10047831}


In [None]:
#naive algo
--- 1.0524308681488037 seconds ---
{10045603, 10031430, 10030605, 10007151, 10001235, 10035415, 10047831}

#chunking algo
--- Pre-Processing 1.1363089084625244 seconds ---
--- Processing 0.049941062927246094 seconds ---
{10045603, 10031430, 10030605, 10007151, 10001235, 10035415, 10047831}
