In [140]:
from Bio import SeqIO
import pandas as pd
import time

In [141]:
records = []
for seq_record in SeqIO.parse("bee.fna", "fasta"):
    print(repr(seq_record.seq))
    print("Length of sequence is", len(seq_record))
    records.append(str(seq_record.seq))
    
stringSequence = "".join(records)

stringSequence = stringSequence[0:50000]    
seqSeries = pd.Series(list(stringSequence))

Seq('GAGAGAATTAACTACCTTAACCTGAACCTAAACCTACCGATAACCTAACTCTAA...gtt', SingleLetterAlphabet())
Length of sequence is 27754200
Seq('CCCCCCCccaggggggggggggaaaaaagaaaaaggaattgctctttgacctcg...ggt', SingleLetterAlphabet())
Length of sequence is 16089512
Seq('gctctcatttttttagtaggttgaccaataaagagctttttaagcacaacttta...aat', SingleLetterAlphabet())
Length of sequence is 3988
Seq('CCCCAGGAGAAGAtccataatttgtaatataacaaataaatatttaataattaa...agg', SingleLetterAlphabet())
Length of sequence is 13619445
Seq('AGAGCTGAGAGGTTTTACGAGGTATAGTCTTGTCGGCATAACGTTTATTGaact...tag', SingleLetterAlphabet())
Length of sequence is 13404451
Seq('GGTGCGAAGTTTggttggataacgtctctctcTGAACCCGTTGTTcatgcagaa...agt', SingleLetterAlphabet())
Length of sequence is 13896941
Seq('TTCAAATTagtatttccaattttaatacatattattatcaccatttttctatta...CTA', SingleLetterAlphabet())
Length of sequence is 24043
Seq('taatataaatattttagtcaTTTACATATTACAGAAGActgataagaaaattat...ggt', SingleLetterAlphabet())
Length of sequence is 17789102
Seq('AGAAAAAAGacacaaatt

In [142]:
searchedPattern = "CCTACAA" #CCTACAA
searchedSeries = pd.Series([x for x in searchedPattern])
print("Length of pattern is", len(searchedSeries))

Length of pattern is 7


In [143]:
def buildMap(sequence, chunk):
    #expect m/c time complexity; m - sequence length, c - chunk size
    m = len(sequence)
    ptr = 0
    Map = {}
    while ptr < m :
        string = "".join(sequence[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += chunk
    return Map


In [144]:
def buildCombination(pattern, chunk):
    #expect n-(c-1) time complexity; n - pattern length, c - chunk size
    m = len(pattern)
    ptr = 0
    Map = {}
    while ptr + chunk <= m :
        string = "".join(pattern[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += 1
    return Map

In [145]:
def checkPattern(sequence, pattern, sequenceLoc, patternLoc, chunk):
    # expect n - c time complexity 
    sequenceLength = len(sequence)
    #check front of chunk
    for x in range(0,patternLoc):
        sequenceIndex = sequenceLoc + (x - patternLoc) + sequence.index[0]
        OutofBound = ( False if 0 > sequenceIndex or sequenceIndex > sequenceLength else True)
        if (OutofBound and pattern[x] != sequence[sequenceIndex]) : return False
    #check back of chunk
    for x in range(patternLoc + chunk,len(pattern)):
        sequenceIndex = sequenceLoc + (x - patternLoc) + sequence.index[0]
        OutofBound = ( False if 0 > sequenceIndex or sequenceIndex > sequenceLength else True)
        if (OutofBound and pattern[x] != sequence[sequenceIndex]) : return False
    return True

In [146]:
def chunkingAlgo(sequence, pattern):
    start_time = time.time()
        #chunk const has to be less then half of the length of sequence to ensure pattern will definitely hit in the chucking of sequence.
    chunk = int(len(pattern) / 2)
    sequenceSet = buildMap(sequence,chunk) 
    patternSet = buildCombination(pattern,chunk)

    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))


    start_time = time.time()  
    
    indexlist = list()  
    for keyPattern, valuePattern in patternSet.items():
        if keyPattern in sequenceSet:
            for sequenceElement in sequenceSet[keyPattern]:
                for patternElement in valuePattern:
                    if checkPattern(sequence,pattern,sequenceElement,patternElement,chunk):
                        indexlist.append(sequenceElement - patternElement + sequence.index[0]) 

    print("--- Processing %s seconds ---" % (time.time() - start_time))

    return set(indexlist)

In [147]:
print(seqSeries, searchedSeries)
print(chunkingAlgo(seqSeries, searchedSeries))

0        G
1        A
2        G
3        A
4        G
        ..
49995    t
49996    t
49997    t
49998    c
49999    t
Length: 50000, dtype: object 0    C
1    C
2    T
3    A
4    C
5    A
6    A
dtype: object
--- Pre-Processing 1.0986928939819336 seconds ---
--- Processing 0.01772022247314453 seconds ---
{261, 510}
