In [1]:
import time

import numpy as np
from Bio import SeqIO

In [3]:
# Judging by how much process memory grows, the loaded sequence takes roughly 700 MB.
# Only the first record of the FASTA file is parsed (presumably chromosome 1 — confirm
# against the file's record order).
# time.perf_counter() is used because time.clock() was removed in Python 3.8.
gen = SeqIO.parse('GRCh38_latest_genomic.fna', 'fasta')
start = time.perf_counter()
chr1_seq = next(gen).seq
end = time.perf_counter()
print(f'It took {end - start} seconds to create the Seq object')

It took 1.39224 seconds to create the SeqRecord object


In [4]:
# Convert the sequence object to a plain Python str so built-in string
# operations can be timed against it.
chr1_str = str(chr1_seq)

In [5]:
# Baseline: how long a bare Python-level pass over every character takes.
# time.perf_counter() is used because time.clock() was removed in Python 3.8.
start = time.perf_counter()
for s in chr1_str:
    pass
end = time.perf_counter()
print(f'It takes {end - start} seconds to iterate through the string')

It takes 4.213208 seconds to iterate through the string


In [7]:
# The actual string is roughly 200 MB, as expected.
# Appending '.' and slicing it back off builds a new str object with the same
# contents as chr1_str instead of a second reference to the same object —
# presumably so the memory cost of one full copy can be observed.
chr1_copy = (chr1_str+'.')[:-1]

Something I noticed last week, and want to test here, is that `Seq.find` is amazingly fast: it finds a length-1000 segment in about 0.5 seconds, whereas simply iterating through the entire string takes 4.213 seconds. Maybe it relies on extra indexing structures built on top of the sequence. The object does seem to use an extra 500 MB (the data itself is only 200 MB), but I don't think that is enough to explain the speed. It also managed to construct the object (and, I presume, any indexing structures) in about 1.4 seconds. Even with optimized routines, this seems far too fast.

In [8]:
import pandas as pd

In [20]:
TRIALS_PER_SEQ = 1

def test_for_seq(seq, df):
    for (i, length) in enumerate(PATTERN_LENGTHS):
        for _ in range(TRIALS_PER_SEQ):
            df.loc[i]['Samples'] += 1
            
            start = np.random.randint(0, len(seq) - length + 1)
            sub_seq = seq[start : start + length]
            
            start_time = time.clock()
            df.loc[i]['Average Sparsity'] += seq.count(sub_seq)
            end_time = time.clock()
            df.loc[i]['Average Count Time'] += end_time - start_time
            
            start_time = time.clock()
            seq.find(sub_seq)
            end_time = time.clock()
            df.loc[i]['Average Find Time'] += end_time - start_time

In [31]:
# Substring lengths to benchmark; one results row per length.
PATTERN_LENGTHS = [50, 100, 500, 1000, 5000, 10000, 50000, 100000]

# Build the results table in one step instead of appending rows to an empty
# frame, so every column gets a definite numeric dtype. The accumulator
# columns start as floats because fractional timings are added to them.
df = pd.DataFrame(
    {
        'Pattern Length': PATTERN_LENGTHS,
        'Samples': 0,
        'Average Find Time': 0.0,
        'Average Count Time': 0.0,
        'Average Sparsity': 0.0,
    },
    columns=['Pattern Length', 'Samples', 'Average Find Time', 'Average Count Time', 'Average Sparsity'],
)

In [None]:
for i in range(25):
    