In [2]:
from collections import Counter

from Bio import SeqIO

In [3]:
# Open the reference genome FASTA (path is relative to the working directory).
# `records` is consumed as an iterator below.
records = SeqIO.parse('GRCh38_latest_genomic.fna', 'fasta')

# Take only the first record from the iterator; per the printed summary this
# is NC_000001.11 (Homo sapiens chromosome 1).
seq_record = next(records)

In [4]:
# Show the record's text summary: ID, name, description, feature count and a
# truncated view of the sequence.
print(seq_record)

ID: NC_000001.11
Name: NC_000001.11
Description: NC_000001.11 Homo sapiens chromosome 1, GRCh38.p12 Primary Assembly
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN', SingleLetterAlphabet())


In [5]:
# Keep a direct handle on the record's sequence object; the following cells
# measure, slice, count and search it.
seq = seq_record.seq

In [6]:
# Total number of letters in the sequence.
len(seq)

248956422

In [7]:
# Peek at a stretch of the sequence away from the leading N's shown in the
# record summary.
# NOTE(review): this slice spans 900,500 letters (100000 up to 1000500) --
# confirm the end index is intended and not a typo for 100500.
seq[100000:1000500]

Seq('ACTAAGCACACAGAGAATAATGTCTAGAATCTGAGTGCCATGTTATCAAATTGT...AGG', SingleLetterAlphabet())

In [8]:
# Number of 'N' letters in the sequence.
seq.count('N')

18475408

In [9]:
import numpy as np

In [10]:
# Benchmark setup: draw TRIALS random substrings of PATTERN_LENGTH letters
# from `seq`; they are used as search patterns by the timing cell.
PATTERN_LENGTH = 1000
TRIALS = 50
SEED = 0  # fixed so that Restart & Run All draws the same patterns every time

# A private RandomState makes the draw reproducible without touching NumPy's
# global random state.
_rng = np.random.RandomState(SEED)

# Start positions are drawn from [0, len(seq) - PATTERN_LENGTH], so every
# slice below is exactly PATTERN_LENGTH letters long.
rand_indexes = _rng.randint(len(seq) - PATTERN_LENGTH + 1, size=TRIALS)

# NOTE(review): a start position that lands inside a long run of N's
# presumably yields an all-N pattern, which would explain the recorded runs
# that report "Found match at index 0" in microseconds. Those trials pull the
# average down; consider excluding them -- confirm the intended methodology.
rand_seqs = [seq[index : index + PATTERN_LENGTH] for index in rand_indexes]

In [11]:
import time

In [12]:
# https://github.com/python/cpython/blob/master/Objects/stringlib/fastsearch.h#L5
# (link kept from the original cell; presumably the substring-search routine
# that ends up serving seq.find -- confirm)

# Time seq.find() for every sampled pattern, printing each trial and the mean.
#
# time.perf_counter() replaces time.clock(), which was deprecated in
# Python 3.3 and removed in Python 3.8. perf_counter() is a monotonic,
# high-resolution wall-clock timer; on Unix the old clock() reported
# processor time, so absolute numbers may differ slightly from earlier runs.
average_time = 0  # running total of elapsed seconds; divided by TRIALS only when printed
for (pattern, index) in zip(rand_seqs, rand_indexes):
    start = time.perf_counter()
    found_index = seq.find(pattern)
    end = time.perf_counter()
    time_elapsed = end - start
    average_time += time_elapsed
    # `index` is where the pattern was cut from; `found_index` is the first
    # occurrence reported by find(), which can be earlier for repeated content.
    print("Pattern starting at index {}. Found match at index {}. Took {}".format(index, found_index, time_elapsed))
print("Average time: {}".format(average_time / TRIALS))

Pattern starting at index 99702246. Found match at index 99702246. Took 0.3440430000000001
Pattern starting at index 248257787. Found match at index 248257787. Took 0.7676189999999998
Pattern starting at index 106981401. Found match at index 106981401. Took 0.22026699999999977
Pattern starting at index 83230124. Found match at index 83230124. Took 0.29790700000000037
Pattern starting at index 207616487. Found match at index 207616487. Took 0.7086350000000001
Pattern starting at index 143083843. Found match at index 0. Took 1.5999999999571912e-05
Pattern starting at index 27889613. Found match at index 27889613. Took 0.06140900000000027
Pattern starting at index 316556. Found match at index 0. Took 1.4999999999432134e-05
Pattern starting at index 215194961. Found match at index 215194961. Took 0.5609480000000007
Pattern starting at index 69591945. Found match at index 69591945. Took 0.20468200000000003
Pattern starting at index 74235958. Found match at index 74235958. Took 0.29985000000

In [18]:
# Convert the sequence object to a plain Python string for the
# character-level tally in the counting cell.
dna_string = str(seq)
# Length check on the converted string.
len(dna_string)

248956422

In [17]:
# Tally how often each distinct character occurs in the chromosome string.
# Upper- and lower-case letters are counted separately.
#
# Counter replaces the hand-written per-character loop, avoiding a
# Python-level iteration over every character. Converting back to dict keeps
# `counts` a plain dict with keys in first-occurrence order, so the printed
# form is the same as before.
counts = dict(Counter(dna_string))
print(counts)

{'N': 18475408, 't': 25922550, 'a': 25810086, 'c': 16788841, 'g': 16855483, 'C': 31266202, 'G': 31256045, 'T': 41321614, 'A': 41260191, 'M': 1, 'R': 1}
