# Chunking Algorithm

In [2]:
from Bio import SeqIO
import pandas as pd
import time

## Parameter Input

In [3]:
file_name = "bee.fna"
subset_size = 50000
searched_pattern = "TGATA"

searchedSeries = pd.Series([x for x in searched_pattern])
print("Length of pattern is", len(searchedSeries))

Length of pattern is 5


- file_name: name of the FASTA file located in the same directory
- subset_size: size of subset of the FASTA file taken from 0 to subset_size
- searched_pattern: string to be searched inside FASTA file

In [6]:
records = []
for seq_record in SeqIO.parse(file_name, "fasta"):
    print(repr(seq_record.seq))
    print("Length of sequence is", len(seq_record))
    records.append(str(seq_record.seq))
    
stringSequence = "".join(records)

stringSequence = stringSequence[0:subset_size]    
seqSeries = pd.Series(list(stringSequence))

Seq('GAGAGAATTAACTACCTTAACCTGAACCTAAACCTACCGATAACCTAACTCTAA...gtt')
Length of sequence is 27754200
Seq('CCCCCCCccaggggggggggggaaaaaagaaaaaggaattgctctttgacctcg...ggt')
Length of sequence is 16089512
Seq('gctctcatttttttagtaggttgaccaataaagagctttttaagcacaacttta...aat')
Length of sequence is 3988
Seq('CCCCAGGAGAAGAtccataatttgtaatataacaaataaatatttaataattaa...agg')
Length of sequence is 13619445
Seq('AGAGCTGAGAGGTTTTACGAGGTATAGTCTTGTCGGCATAACGTTTATTGaact...tag')
Length of sequence is 13404451
Seq('GGTGCGAAGTTTggttggataacgtctctctcTGAACCCGTTGTTcatgcagaa...agt')
Length of sequence is 13896941
Seq('TTCAAATTagtatttccaattttaatacatattattatcaccatttttctatta...CTA')
Length of sequence is 24043
Seq('taatataaatattttagtcaTTTACATATTACAGAAGActgataagaaaattat...ggt')
Length of sequence is 17789102
Seq('AGAAAAAAGacacaaattattacaaagaCGCATTTAGAAACCGAAATATGGATC...ata')
Length of sequence is 23881
Seq('GGGAAGAAGATTTGTCTGGCCGATTTGCATGCCGGCCCATCATCATCCAGATCC...agg')
Length of sequence is 14198698
Seq('tgcacagaagtaaca

Length of sequence is 35193
Seq('GATTAGAATATGGCATTAGAACTGTAAGATAGcttgatcaaaaaatatatcaat...TAC')
Length of sequence is 35070
Seq('TAaagtattatttgaaaaatcaattattgaatattatcaaataaatttatatag...ATT')
Length of sequence is 34588
Seq('aaaaaagtgaaaatttttaataaaataaataaatataatttttttattaagtat...ata')
Length of sequence is 33219
Seq('GATTGCCCCTTGCACACATCTTTCATCTGTTTCTGCATGACAAGCGGTACATAC...aat')
Length of sequence is 30850
Seq('TTATTCGttagttttttctttgaaataaaattattatcaaaaacaaaatttatg...ACA')
Length of sequence is 30345
Seq('AACCACACAAGGAGAGAGGAATTAAGgaactaaattaatattatcaaaattaaa...ACC')
Length of sequence is 30149
Seq('taaatacgatttccgcagctcgttgcaagcgggaaagattgctctttgacctcg...ata')
Length of sequence is 29749
Seq('aaattcttataaaattcctTAAATAAAACACGAATCAGGAAACAGAAATGCTCT...TGT')
Length of sequence is 29493
Seq('ctatgtaatataaataataatctataaataactattatttatattattgaataa...CAG')
Length of sequence is 28697
Seq('cattgttttgaattttttgaattttaaaaatcttctttgaGATAGTATACAAAG...ATG')
Length of sequence is 25587
Seq('GAGAGAG

In [7]:
def buildMap(sequence, chunk):
    #expect m/c time complexity; m - sequence length, c - chunk size
    m = len(sequence)
    ptr = 0
    Map = {}
    while ptr < m :
        string = "".join(sequence[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += chunk
    return Map


In [8]:
def buildCombination(pattern, chunk):
    #expect n-(c-1) time complexity; n - pattern length, c - chunk size
    m = len(pattern)
    ptr = 0
    Map = {}
    while ptr + chunk <= m :
        string = "".join(pattern[ptr:(ptr + chunk)].to_numpy())
        if string in Map:
            Map[string].append(ptr)
        else:
            Map[string]=[ptr]
        ptr += 1
    return Map

In [9]:
def checkPattern(sequence, pattern, sequenceLoc, patternLoc, chunk):
    # expect n - c time complexity 
    sequenceLength = len(sequence)
    #check front of chunk
    for x in range(0,patternLoc):
        sequenceIndex = sequenceLoc + (x - patternLoc) + sequence.index[0]
        OutofBound = ( False if 0 > sequenceIndex or sequenceIndex > sequenceLength else True)
        if (OutofBound and pattern[x] != sequence[sequenceIndex]) : return False
    #check back of chunk
    for x in range(patternLoc + chunk,len(pattern)):
        sequenceIndex = sequenceLoc + (x - patternLoc) + sequence.index[0]
        OutofBound = ( False if 0 > sequenceIndex or sequenceIndex > sequenceLength else True)
        if (OutofBound and pattern[x] != sequence[sequenceIndex]) : return False
    return True

In [10]:
def chunkingAlgo(sequence, pattern):
    start_time = time.time()
    
    #chunk const has to be less then half of the length of sequence to ensure pattern will definitely hit in the chucking of sequence.
    chunk = int(len(pattern) / 2)
    sequenceSet = buildMap(sequence,chunk) 

    print("--- Pre-Processing %s seconds ---" % (time.time() - start_time))

    start_time = time.time()
    
    patternSet = buildCombination(pattern,chunk)

    start_time = time.time()  
    
    indexlist = list()  
    for keyPattern, valuePattern in patternSet.items():
        if keyPattern in sequenceSet:
            for sequenceElement in sequenceSet[keyPattern]:
                for patternElement in valuePattern:
                    if checkPattern(sequence,pattern,sequenceElement,patternElement,chunk):
                        indexlist.append(sequenceElement - patternElement + sequence.index[0]) 

    print("--- Processing %s seconds ---" % (time.time() - start_time))

    return set(indexlist)

In [11]:
print(seqSeries, searchedSeries)
print(chunkingAlgo(seqSeries, searchedSeries))

0        G
1        A
2        G
3        A
4        G
        ..
49995    t
49996    t
49997    t
49998    c
49999    t
Length: 50000, dtype: object 0    T
1    G
2    A
3    T
4    A
dtype: object
--- Pre-Processing 1.6964149475097656 seconds ---
--- Processing 0.08476543426513672 seconds ---
{8324, 7558, 14982, 16392, 24456, 44810, 44555, 15119, 4369, 8214, 14103, 8731, 12189, 5791, 7199, 41503, 15909, 44972, 13102, 17972, 38324, 25403, 30524, 5441, 15681, 25795, 11592, 42441, 15693, 25550, 8791, 20439, 28761, 44640, 13282, 11774, 17001, 22634, 36714, 45930, 47210, 5867, 31725, 14193, 19314, 4467, 23795, 29946, 9212, 25598, 4351}
