# Z algorithms

In [80]:
import time
# construct Z array
# search pattern in text
   
def calZarr(myStr, Z):
    strLen = len(myStr)
    L, R = 0, 0
    for i in range(1,strLen):
        if i > R:
            # reset left and R, prefix should start from i
            L, R = i, i
            # compare S[0...] and S[i...]
            # S[R-L] == S[0], increment R will traverse array until index R
            while (R < strLen) and (myStr[R] == myStr[R-L]):
                R += 1
            Z[i] = R - L 
            # deduct 1 as incremented R to compare next (S[i], S[R-L]) but not equal
            R -= 1
        else:
            k = i - L
                                # S[i] == S[k]
                                # Z[i] = min(Z[k], R - i +1)
                                # compute {L, R}
            if Z[k] < R - i + 1:
                # [L, R] stays same
                Z[i] = Z[k]
            else:
                # reset
                L = i
                while(R < strLen) and (myStr[R-L] == myStr[R]):
                    R += 1
                Z[i] = R - L
                R -= 1
    return Z
def search(text, pattern):
    
    # create string for finding Z array
    concatStr = pattern + "$" + text
    myLen = len(concatStr)

    # construct Z array
    Z = [0]*myLen
    start_time = time.time()  
    myZarray = calZarr(concatStr, Z)
    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))
    patLen = len(pattern)
    idxList = []
    
    start_time = time.time()  
    for i in range(myLen):
        if myZarray[i] == patLen:
            patIdx = i - patLen - 1
            idxList.append(patIdx)
    print("--- Processing %s seconds ---" % (time.time() - start_time))
    return idxList

def calZarr2(myStr, patLen, Z):
    strLen = len(myStr)
    L,R = 0,0
    lookupDict = {}

    for i in range(1, strLen):
        k = Z[i-L] if (Z[i-L]  < R-i) else R-i
        Z[i] = (abs(k)+k)//2 # if negative, convert to 0
        while (i + Z[i] < strLen) and (myStr[Z[i]] == myStr[i + Z[i]]):
            Z[i] += 1
        if (i + Z[i] > R): 
            L = i
            R = i + Z[i]
        try:
            lookupDict[Z[i]].append(i - patLen - 1)
        except KeyError:
            lookupDict[Z[i]] = [i - patLen - 1]
    return lookupDict

def search2(text, pattern):
    # create string for finding Z array
    concatStr = pattern + "$" + text
    myLen = len(concatStr)
    patLen = len(pattern)

    # construct Z array
    Z = [0]*myLen
    start_time = time.time()  
    zDict = calZarr2(concatStr, patLen, Z)
    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))
    start_time = time.time()  
    if patLen in zDict:
        x = zDict[patLen]
    else:
        x = []
    print("--- Processing %s seconds ---" % (time.time() - start_time))
    return x

In [109]:
from Bio import SeqIO
import pandas as pd
df = pd.DataFrame()
for seq_record in SeqIO.parse(r"sequence.fna", "fasta"):
    print(repr(seq_record.seq))
    print("Length of sequence is", len(seq_record))
    seqSeries = seq_record.seq

Seq('aatataagggaatggagaagaattgtgtaggaaatatcagaaagggagacagaa...TCG', SingleLetterAlphabet())
Length of sequence is 123313939
Seq('GAGCGGTACAACGAGAGTAGCCGCGCGGCGGCCGCGAGTAGAAGCTAGCGAGGG...AGT', SingleLetterAlphabet())
Length of sequence is 86187811
Seq('Ctacaacaaagggtgtgtgtgtggtgtggggggggggtatgtgctgccttaaac...ggt', SingleLetterAlphabet())
Length of sequence is 92870237
Seq('aaccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaccc...agg', SingleLetterAlphabet())
Length of sequence is 89007665
Seq('ttttcgattcgattcgattcttttcgattcgattcgattcgattcgattcgatt...ata', SingleLetterAlphabet())
Length of sequence is 89573405
Seq('ttcctgctataaacatcggggtgcaggtgacccggcatttcattgcatctgaat...ACG', SingleLetterAlphabet())
Length of sequence is 78268176
Seq('AAAAGAATCAAAATCGAAAGAAtcgaaataatcgaatcgaaaagaatcgaaaag...cca', SingleLetterAlphabet())
Length of sequence is 81039452
Seq('AGTGCTTTTGTTGTAGAGTTACCAACAGGTCAGAAGATCAGAGTGAGTCATGGC...TTA', SingleLetterAlphabet())
Length of sequence is 75260524
Seq('TACCCTAAAA

Seq('CAACTCAAAGTGgattatacacatacatgtaaaaCATTAAGTTATacaatttct...GCA', SingleLetterAlphabet())
Length of sequence is 86956
Seq('AGCGCAGGTTCCAAGGACCTGCTGGAAAGGGCAAGATGGGCCAGAGAGAGGAGA...ggg', SingleLetterAlphabet())
Length of sequence is 46521
Seq('ggtgacgggcactgggggttattctgtatgttagtaaattgaacaccaataaaa...aag', SingleLetterAlphabet())
Length of sequence is 117440
Seq('GTAAATCTGTCTTAAATGAAGCATCTGGGACCTGAAAGATTAAAGCTGGCTCCC...TTC', SingleLetterAlphabet())
Length of sequence is 118337
Seq('ACCTGCCTCCAAAATTCATTAGAGTTCCTAGAAGAAAGATAGGAGCCAAGTATG...cta', SingleLetterAlphabet())
Length of sequence is 51224
Seq('GGTGGCGGGGAGCATAGGGTGAGAGGACGGGCCACAGGCACTAAGCAGGTGTCA...TCG', SingleLetterAlphabet())
Length of sequence is 56787
Seq('GCTGAAGTGGAAAAACCAGGTGCTGCCGACAGCGTCCCCAAACACCAATTCCCT...TTA', SingleLetterAlphabet())
Length of sequence is 30484
Seq('ccaggtcagctctgctgtcctcagaaccgtctctcaggctgaaacctcaggaca...ttt', SingleLetterAlphabet())
Length of sequence is 15959
Seq('Gacggaaggaagaaggaagaaggaaagaaagaa

Seq('gcaggctccctgcagggagagagcccaatacaggactcagtcccaggaccccgg...TGG', SingleLetterAlphabet())
Length of sequence is 98160
Seq('attatttctctgaaattgtTATGTAAAATATTGcatctgctttgtttgtttttg...TAG', SingleLetterAlphabet())
Length of sequence is 887828
Seq('CCTGGCCTGCTCCTGTACCGTCATCCCAAGGAGGTCTTAGCAAGCTTCAGGACA...AAA', SingleLetterAlphabet())
Length of sequence is 319569
Seq('TAGCATCCAGTCCCCAAACTGAAGCACTCAAAGCAAAAGCTCCACCTTCACcga...ATT', SingleLetterAlphabet())
Length of sequence is 706478
Seq('CCCCATCCTCGTAATTTCTTTCCTTGGGCAACATACTTGGTTCAGCCGTTTTAA...ATT', SingleLetterAlphabet())
Length of sequence is 470734
Seq('ATTCTgaggaaaaaatgtaatttgaaatgATTGATATGAACTCAAAGGGGTGTG...AGT', SingleLetterAlphabet())
Length of sequence is 256977
Seq('aggacctctagtttggattgcatctcattcaattgatttttaatttctccttga...CTT', SingleLetterAlphabet())
Length of sequence is 261000
Seq('aaaataatctggtgttttttttggggggggcagttgttattgtttttcttgttt...cga', SingleLetterAlphabet())
Length of sequence is 198152
Seq('ccctaaccctaaccctaaccctaaccct

In [67]:
# the last sequence
seqSeries

Seq('CATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCTGTTTACT...GCC', SingleLetterAlphabet())

In [106]:
searchedPattern = "CGTAA" 
searchedSeries = pd.Series([x for x in searchedPattern])
print("Length of pattern is", len(searchedSeries))

Length of pattern is 5


In [107]:
# Z algo test
idxList = search(seqSeries, searchedPattern)
print(idxList) 


idxList2 = search2(seqSeries, searchedPattern)
print(idxList2)

--- Pre-Processing 0.000997304916381836 seconds ---
--- Processing 0.0 seconds ---
[]
--- Pre-Processing 0.000997304916381836 seconds ---
--- Processing 0.0 seconds ---
[]


In [103]:
# BoyerMoore test
print(badBoyerMoorePatternSearch(pd.Series(list(str(seqSeries))), searchedSeries))

NameError: name 'badBoyerMoorePatternSearch' is not defined