# Z Algorithm

In [1]:
import time
import pandas as pd
from collections import defaultdict
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
# defaultdict(list) lets positions be appended under a Z-value key without first creating the key (avoids KeyError)

## Parameter Input


In [6]:
# User-tunable inputs for the search; later cells read these names directly.
# FASTA file to load (opened with a plain open(), so the path is relative to the working directory).
file_name = "bee.fna"
# Number of leading characters of the concatenated sequence that are searched.
subset_size = 50000
# Pattern to look for in the sequence.
searched_pattern = "CCTACAA"

- file_name: path to the FASTA file (relative to the notebook's working directory)
- subset_size: number of leading characters of the concatenated sequence to search, i.e. positions 0 up to (but not including) subset_size
- searched_pattern: pattern string to search for in the sequence

In [8]:
def calZArr(myStr, patLen, Z, lookupDict):
    """Compute the Z-array of ``myStr`` and bucket text positions by Z-value.

    ``Z[i]`` is set to the length of the longest substring starting at ``i``
    that is also a prefix of ``myStr``. ``myStr`` is expected to have the form
    ``pattern + separator + text`` with a one-character separator, so the text
    starts at index ``patLen + 1``.

    Parameters
    ----------
    myStr : str
        The concatenated string to analyse.
    patLen : int
        Length of the pattern prefix of ``myStr``.
    Z : list of int
        Pre-allocated list with at least ``len(myStr)`` entries; filled in
        place for indices ``1 .. len(myStr) - 1`` (``Z[0]`` is left untouched).
    lookupDict : mapping of int -> list
        Must create missing keys on access (e.g. ``defaultdict(list)``).
        For every text position, its 0-based index in the text is appended to
        the list stored under that position's Z-value.

    Returns
    -------
    The same ``lookupDict`` object, after it has been filled.

    Notes
    -----
    Positions inside the pattern/separator prefix (``i <= patLen``) are not
    recorded: they would map to negative text indices, which are not valid
    positions in the sequence and could be mistaken for matches when the
    pattern itself contains the separator character.
    """
    strLen = len(myStr)
    # [L, R] is the right-most window found so far with
    # myStr[L..R] == myStr[0..R-L] (the "Z-box").
    L, R = 0, 0
    # Index in myStr of the first character that belongs to the text.
    textStart = patLen + 1
    for i in range(1, strLen):
        if i > R:
            # i lies outside the current window: restart the window at i and
            # compare myStr[i...] against the prefix myStr[0...] directly.
            L, R = i, i
            while (R < strLen) and (myStr[R] == myStr[R - L]):
                R += 1
            Z[i] = R - L
            # R stopped one past the last matching character; step back so
            # that R is again the inclusive right end of the window.
            R -= 1
        else:
            # i lies inside the window, so myStr[i...R] mirrors myStr[k...].
            k = i - L
            if Z[k] < R - i + 1:
                # The mirrored match ends strictly inside the window, so it
                # carries over unchanged and [L, R] stays the same.
                Z[i] = Z[k]
            else:
                # The mirrored match reaches the window edge: keep extending
                # from R onward with a new window that starts at i.
                L = i
                while (R < strLen) and (myStr[R - L] == myStr[R]):
                    R += 1
                Z[i] = R - L
                R -= 1
        # Key = Z-value at this position, value = 0-based index in the text.
        # Only positions that actually lie in the text are recorded.
        if i >= textStart:
            lookupDict[Z[i]].append(i - textStart)
    return lookupDict

In [9]:
def ZAlgo(text, pattern):
    """Return the 0-based start positions in ``text`` at which ``pattern`` occurs.

    The search builds ``pattern + "$" + text``, computes its Z-array with
    ``calZArr`` and reports every text position whose Z-value covers the whole
    pattern. Two timing lines (Z-array construction and match retrieval) are
    printed as a side effect.

    Parameters
    ----------
    text : str
        The sequence to search in.
    pattern : str
        The string to search for.

    Returns
    -------
    list of int
        Match start positions in ``text``, in ascending order.
    """
    # create string for finding Z array
    concatStr = pattern + "$" + text
    myLen = len(concatStr)
    patLen = len(pattern)

    # construct Z array
    Z = [0] * myLen
    lookupDict = defaultdict(list)
    # perf_counter() is a high-resolution clock meant for measuring short
    # intervals; time.time() can be too coarse and report 0.0 seconds.
    start_time = time.perf_counter()
    zDict = calZArr(concatStr, patLen, Z, lookupDict)
    print("--- Pre-Processing %s seconds ---" % (time.perf_counter() - start_time))

    start_time = time.perf_counter()
    # A position matches when its Z-value is at least the pattern length.
    # Using ">=" instead of "==" matters when text contains the "$" separator:
    # the common prefix can then run past the pattern and exceed patLen.
    # Negative positions belong to the pattern/separator prefix, not to text.
    matches = sorted(
        idx
        for zValue, positions in zDict.items()
        if zValue >= patLen
        for idx in positions
        if idx >= 0
    )
    print("--- Processing %s seconds ---" % (time.perf_counter() - start_time))
    return matches

In [10]:
# Load the FASTA file: each parsed record is unpacked as (name, seq) and only seq is kept.
with open(file_name) as handle:
    list1 = [seq for (name,seq) in SimpleFastaParser(handle)]
# Join all records into one string. NOTE: no separator is inserted between
# records, so a reported match can span the boundary between two records.
stringSequence = "".join(list1)
# Keep only the first subset_size characters to bound the search time.
stringSequence = stringSequence[0:subset_size] # comment out to search whole FASTA file
#seqSeries = pd.Series(list(stringSequence))
print("Total length of genome sequence: ", len(stringSequence))

Total length of genome sequence:  50000


In [11]:
# Run the search; idxList holds the positions returned by ZAlgo for
# searched_pattern within stringSequence (ZAlgo also prints its timings).
idxList = ZAlgo(stringSequence, searched_pattern)
print(idxList) 

--- Pre-Processing 0.03390908241271973 seconds ---
--- Processing 0.0 seconds ---
[261, 510]
