In [14]:
from enum import IntEnum
from typing import Tuple, List
from random import choice

In [29]:
# Base letters, in the order that determines each member's integer value.
nuc_tuple = ('A','C','G','T')
# Functional IntEnum API: members are numbered from 1 in the order given.
Nucleotide = IntEnum('Nucleotide', nuc_tuple)
# Short module-level aliases; iterating an Enum yields members in definition order.
A, C, G, T = Nucleotide
Nuc_list = list(Nucleotide)
# A codon is a triple of nucleotides; a gene is a list of codons.
Codon = Tuple[Nucleotide, Nucleotide, Nucleotide]
Gene = List[Codon]

In [16]:
# Display the Gene type alias to see how it expands.
Gene

typing.List[typing.Tuple[__main__.Nucleotide, __main__.Nucleotide, __main__.Nucleotide]]

In [21]:
N = 100
# Snap N to the nearest multiple of 3 so the string splits into whole codons.
N = 3*round(N/3)
# Build the random sequence with a single join instead of repeated `+=`
# on an immutable str; the order of choice() calls is unchanged.
gene_str = ''.join(choice(nuc_tuple) for _ in range(N))
print('Gene string ({}): {}'.format(N,gene_str))

Gene string (99): CCCTCGAGGTGCAAGGCTAGGTACAACTTTTACTGCTACGGGGTCCCCACCCCCACGAACCGCCACTCGTATCCGCTCTAGATCCTCAGACTGAGTCGG


In [22]:
def string2gene(s: str) -> Gene:
    """Convert a string of nucleotide letters into a list of codons.

    The string is consumed three characters at a time and each character is
    looked up by name in ``Nucleotide``. Trailing characters that do not
    fill a complete codon are dropped.

    Args:
        s: Sequence of nucleotide letters.

    Returns:
        A new list of 3-tuples of ``Nucleotide`` members, in string order.

    Raises:
        KeyError: If a character is not the name of a ``Nucleotide`` member.
    """
    # Stopping at len(s) - 2 guarantees every start index has two successors.
    return [
        (Nucleotide[s[start]], Nucleotide[s[start + 1]], Nucleotide[s[start + 2]])
        for start in range(0, len(s) - 2, 3)
    ]

In [57]:
# Parse the string into codons, then order them in place (tuple comparison).
gene = string2gene(gene_str)
gene.sort()

In [189]:
def linear_contains(gene: Gene, key_codon: Codon) -> bool:
    """Report whether ``key_codon`` occurs in ``gene`` using a front-to-back scan.

    Args:
        gene: Codons to search; no ordering is required.
        key_codon: Codon to look for.

    Returns:
        True as soon as an equal codon is met, False if the scan is exhausted.
    """
    # any() short-circuits on the first equal element, like an early return.
    return any(candidate == key_codon for candidate in gene)

In [205]:
# Draw ten random codons, each made of three independent nucleotide picks.
sample_codons = [tuple(choice(Nuc_list) for _ in range(3)) for _ in range(10)]

# Print every sample that the linear scan finds in the gene.
for codon in filter(lambda c: linear_contains(gene, c), sample_codons):
    print(codon)

(<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.G: 3>)
(<Nucleotide.C: 2>, <Nucleotide.T: 4>, <Nucleotide.C: 2>)
(<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.G: 3>)
(<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.T: 4>)


In [153]:
def binary_contains(gene: Gene, key_codon: Codon) -> bool:
    """Report whether ``key_codon`` occurs in ``gene`` using binary search.

    Args:
        gene: Codons to search; must already be sorted in ascending order.
        key_codon: Codon to look for.

    Returns:
        True if an element neither less than nor greater than ``key_codon``
        is found, False once the search window is empty.
    """
    lo, hi = 0, len(gene) - 1
    # [lo, hi] is the inclusive window that may still hold the key.
    while lo <= hi:
        middle = (lo + hi) // 2
        candidate = gene[middle]
        if candidate < key_codon:
            # Key can only be to the right of the midpoint.
            lo = middle + 1
            continue
        if candidate > key_codon:
            # Key can only be to the left of the midpoint.
            hi = middle - 1
            continue
        return True
    return False

In [207]:
# Same report as the linear check, this time through binary search.
for codon in filter(lambda c: binary_contains(gene, c), sample_codons):
    print(codon)

(<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.G: 3>)
(<Nucleotide.C: 2>, <Nucleotide.T: 4>, <Nucleotide.C: 2>)
(<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.G: 3>)
(<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.T: 4>)


In [136]:
# Show the current contents of `gene`.
gene

[(<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.C: 2>),
 (<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.C: 2>),
 (<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.G: 3>),
 (<Nucleotide.A: 1>, <Nucleotide.C: 2>, <Nucleotide.C: 2>),
 (<Nucleotide.A: 1>, <Nucleotide.C: 2>, <Nucleotide.G: 3>),
 (<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.A: 1>),
 (<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.G: 3>),
 (<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.G: 3>),
 (<Nucleotide.A: 1>, <Nucleotide.G: 3>, <Nucleotide.T: 4>),
 (<Nucleotide.A: 1>, <Nucleotide.T: 4>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.A: 1>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.C: 2>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.C: 2>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.C: 2>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.C: 2>, <Nucleotide.G: 3>),
 (<Nucleotide.C: 2>, <Nucleotide.G: 3>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.G: 3>, 

In [293]:
N = 100000
# Snap N to the nearest multiple of 3 so the string splits into whole codons.
N = 3*round(N/3)
# One join instead of ~100k `+=` concatenations on an immutable str;
# the order of choice() calls is unchanged.
gene_str = ''.join(choice(nuc_tuple) for _ in range(N))
print('Gene string ({:,}): {}...'.format(N,gene_str[:100]))
# Sort the codons up front: binary_contains needs ascending input.
gene = sorted(string2gene(gene_str))
# 10,000 random query codons, each made of three independent nucleotide picks.
sample_codons = [tuple(choice(Nuc_list) for _ in range(3)) for _ in range(10000)]

Gene string (99,999): GGCTGCAACGTTGCTATTGGCCGTATCCCTAGTAGTTGATCCCGACTCACTTTCTTAAATCGAGATCTGAGCTAGGTCGGCCTGCCATCAGAAATGAGCG...


In [294]:
%%time

# Count the sample codons that the linear scan finds in the gene.
s = sum(1 for codon in sample_codons if linear_contains(gene,codon))
print('Total founds: '+str(s))

Total founds: 10000
CPU times: user 12.3 s, sys: 0 ns, total: 12.3 s
Wall time: 12.3 s


In [295]:
%%time

# Count the sample codons that binary search finds in the gene.
s = sum(1 for codon in sample_codons if binary_contains(gene,codon))
print('Total founds: '+str(s))

Total founds: 10000
CPU times: user 31.1 ms, sys: 0 ns, total: 31.1 ms
Wall time: 30.9 ms


In [296]:
# Number of distinct codons present in the gene.
len(set(gene))

64

- Obs. 1: generating the gene at random is not the right way to obtain one, but it is useful for testing the code.
- Obs. 2: we must consider the first matching codon. Generating the codon samples at random is not conceptually right.
- Obs. 3: if you leave the gene unsorted, the linear search finishes very quickly (much faster, in less than 100 ms).
