# Pattern search: Z algorithm and Boyer-Moore (bad-character rule)

In [13]:
def badBoyerMoorePatternSearch(sequence, pattern):
    """Find every occurrence of ``pattern`` in ``sequence`` with the bad-character rule.

    This is the Boyer-Moore-Horspool variant: each window is compared right to
    left, and the window is then shifted according to the sequence element
    aligned with the pattern's last position.

    Args:
        sequence: Elements to search in. A pandas Series of single characters
            (as used in this notebook), or any other sized iterable such as a
            ``str`` or ``list``.
        pattern: Elements to look for, in the same form as ``sequence``.

    Returns:
        Start positions of all matches (overlapping ones included), ascending.
        For a pandas Series the positions are offset by ``sequence.index[0]``,
        so with a contiguous integer index they are index labels; for other
        inputs they are zero-based. An empty pattern, an empty sequence, or a
        pattern longer than the sequence yields ``[]``.
    """
    # Plain lists: position-based access regardless of any Series index labels.
    patternChars = list(pattern)
    sequenceChars = list(sequence)
    patternLen = len(patternChars)
    sequenceLen = len(sequenceChars)

    if patternLen == 0 or patternLen > sequenceLen:
        return []

    # A Series exposes its labels as a non-callable ``index`` attribute, whereas
    # str/list expose an ``index`` *method*; only the former carries an offset.
    labels = getattr(sequence, "index", None)
    offset = 0 if labels is None or callable(labels) else labels[0]

    # Shift table: distance from the rightmost occurrence of each element to the
    # last pattern position. The final pattern element is deliberately left out;
    # including it would give a shift of 0 for that element (or, as before, an
    # over-long shift that skips real matches). Later positions overwrite
    # earlier ones, so the rightmost occurrence wins.
    lastPos = patternLen - 1
    shiftTable = {char: lastPos - pos for pos, char in enumerate(patternChars[:-1])}

    indexlist = []
    windowEnd = lastPos  # sequence position aligned with the pattern's last element
    while windowEnd < sequenceLen:
        windowStart = windowEnd - lastPos

        # Compare backward from the end of the window.
        keyChild = lastPos
        while keyChild >= 0 and sequenceChars[windowStart + keyChild] == patternChars[keyChild]:
            keyChild -= 1
        if keyChild < 0:
            indexlist.append(offset + windowStart)

        # The shift is safe after a match as well as after a mismatch: any
        # smaller shift would need this element to occur further right in the
        # pattern than the table says. Elements absent from the table allow a
        # jump of the full pattern length.
        windowEnd += shiftTable.get(sequenceChars[windowEnd], patternLen)
    return indexlist



In [1]:
# construct Z array
# search pattern in text
def calZarr(myStr, Z):
    """Fill ``Z`` in place with the Z-values of ``myStr`` and return it.

    For every position ``i >= 1``, ``Z[i]`` becomes the length of the longest
    run starting at ``i`` that equals a prefix of ``myStr``. ``Z[0]`` is never
    written, so it keeps whatever the caller put there.

    Args:
        myStr: Indexable sequence to analyse.
        Z: Pre-allocated list with at least ``len(myStr)`` entries.

    Returns:
        The same ``Z`` list that was passed in.
    """
    total = len(myStr)
    # [boxStart, boxEnd] is the prefix-matching window reaching furthest right.
    boxStart = boxEnd = 0
    for pos in range(1, total):
        if pos <= boxEnd:
            # pos lies inside the window, so it mirrors position pos - boxStart.
            mirrored = Z[pos - boxStart]
            if mirrored < boxEnd - pos + 1:
                # The mirrored match ends inside the window: reuse it as is.
                Z[pos] = mirrored
                continue
            # The match reaches the window edge: keep boxEnd and extend from it.
            boxStart = pos
        else:
            # Outside any window: start a fresh one at pos.
            boxStart = boxEnd = pos

        # Extend the window by direct comparison against the prefix.
        while boxEnd < total and myStr[boxEnd] == myStr[boxEnd - boxStart]:
            boxEnd += 1
        Z[pos] = boxEnd - boxStart
        # boxEnd stopped one past the last matching position; step back onto it.
        boxEnd -= 1
    return Z
def search(text, pattern):
    """Return the start index of every occurrence of ``pattern`` in ``text``.

    The Z array of ``pattern + "$" + text`` is built with ``calZarr``; a text
    position matches when its Z-value covers the whole pattern.

    Args:
        text: Sequence to search in; must support ``pattern + "$" + text``.
        pattern: String to look for.

    Returns:
        Zero-based start indices into ``text``, ascending, overlapping matches
        included. An empty pattern yields ``[]``.
    """
    patLen = len(pattern)
    if patLen == 0:
        # Previously this produced a spurious -1 (from the prefix position).
        return []

    # create string for finding Z array
    concatStr = pattern + "$" + text
    myLen = len(concatStr)
    myZarray = calZarr(concatStr, [0] * myLen)

    # Only positions after the pattern and the separator belong to the text.
    # Scanning from there, and accepting Z-values that exceed the pattern
    # length, keeps the result correct even when "$" occurs in the inputs.
    textStart = patLen + 1
    return [
        i - textStart
        for i in range(textStart, myLen)
        if myZarray[i] >= patLen
    ]


In [8]:
from Bio import SeqIO
import pandas as pd
df = pd.DataFrame()
# Walk through every record of the FASTA file, echoing each sequence and its
# length. seqSeries is rebound on each pass, so after the loop it holds the
# sequence of the final record.
for seq_record in SeqIO.parse(r"ls_orchid.fasta.txt", "fasta"):
    seqSeries = seq_record.seq
    print(repr(seqSeries))
    print("Length of sequence is", len(seq_record))

Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
Length of sequence is 740
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
Length of sequence is 753
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
Length of sequence is 748
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
Length of sequence is 744
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
Length of sequence is 733
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
Length of sequence is 718
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', SingleLetterAlphabet())
Length of sequence is 730
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA', SingleLetterAlphabet())
Length of sequence is 704
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAG

In [9]:
# Display the last sequence: the parsing loop rebinds seqSeries on every record,
# so it now refers to the final record's sequence.
seqSeries

Seq('CATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCTGTTTACT...GCC', SingleLetterAlphabet())

In [10]:
# Pattern to look for, kept both as a plain string and as a Series holding one
# character per element.
searchedPattern = "GGGCC"
searchedSeries = pd.Series(list(searchedPattern))
print("Length of pattern is", len(searchedSeries))

Length of pattern is 5


In [11]:
# Z algorithm check: look for the pattern string in the last parsed sequence
idxList = search(text=seqSeries, pattern=searchedPattern)
print(idxList)

[587]


In [15]:
# Boyer-Moore check: same sequence, converted to a Series of single characters
print(
    badBoyerMoorePatternSearch(
        sequence=pd.Series(list(str(seqSeries))),
        pattern=searchedSeries,
    )
)

[587]
