### DNA Search

Genes are commonly represented in computer software as a sequence of the characters A, C, G, and T. Each letter represents a nucleotide, and the combination of
three nucleotides is called a codon. This is illustrated in figure 2.1. A codon codes
for a specific amino acid that together with other amino acids can form a protein.
A classic task in bioinformatics software is to find a particular codon within a gene.


In [7]:
from typing import List, Tuple
from enum import IntEnum

Nucleotide: IntEnum = IntEnum('Nucleotide', ('A', 'C', 'G', 'T'))
Codon = Tuple[Nucleotide, Nucleotide, Nucleotide]
Gene = List[Codon]

gene_str: str = "ACGTGGCTCTCTAAGTAGTAGTAGGGGTTTATATATACCCTAGGACTCCCTTTAA"
    
def str_to_gene(s: str) -> Gene:
    gene: Gene = []
    for i in range(0, len(s), 3):
        if (i+2) >= len(s):
            return gene
        codon: Codon = (Nucleotide[s[i]], Nucleotide[s[i+1]], Nucleotide[s[i+2]])
        gene.append(codon)
    return gene

acg: Codon = (Nucleotide.A, Nucleotide.C, Nucleotide.G)
gat: Codon = (Nucleotide.G, Nucleotide.A, Nucleotide.T)
my_gene = str_to_gene(gene_str)

def binary_contains(key_codon: Codon, gene: Gene) -> bool:
    low = 0
    high = len(gene)
    while low <= high:
        mid: int = (low + high)//2
        if key_codon > gene[mid]:
            low = mid+1
        elif key_codon < gene[mid]:
            high = mid-1
        else:
            print(mid)
            return True
    return False

binary_contains(acg, my_gene)
        

False

In [8]:
from typing import List, Tuple
from enum import IntEnum

In [9]:
Nucleotide: IntEnum = IntEnum('Nucleotide', ('A', 'C', 'G', 'T'))
Codon = Tuple[Nucleotide, Nucleotide, Nucleotide]
Gene = List[Codon]

In [10]:
gene_str: str = "ACGTGGCTCTCTAAGTAGTAGTAGGGGTTTATATATACCCTAGGACTCCCTTTAA"

In [11]:
def str_to_gene(s: str) -> Gene:
    gene: Gene = []
    for i in range(0, len(s), 3):
        if (i+2) >= len(s):
            return gene
        codon: Codon = (Nucleotide[s[i]], Nucleotide[s[i+1]], Nucleotide[s[i+2]])
        gene.append(codon)
    return gene

In [12]:
str_to_gene(gene_str)

[(<Nucleotide.A: 1>, <Nucleotide.C: 2>, <Nucleotide.G: 3>),
 (<Nucleotide.T: 4>, <Nucleotide.G: 3>, <Nucleotide.G: 3>),
 (<Nucleotide.C: 2>, <Nucleotide.T: 4>, <Nucleotide.C: 2>),
 (<Nucleotide.T: 4>, <Nucleotide.C: 2>, <Nucleotide.T: 4>),
 (<Nucleotide.A: 1>, <Nucleotide.A: 1>, <Nucleotide.G: 3>),
 (<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.G: 3>),
 (<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.G: 3>),
 (<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.G: 3>),
 (<Nucleotide.G: 3>, <Nucleotide.G: 3>, <Nucleotide.G: 3>),
 (<Nucleotide.T: 4>, <Nucleotide.T: 4>, <Nucleotide.T: 4>),
 (<Nucleotide.A: 1>, <Nucleotide.T: 4>, <Nucleotide.A: 1>),
 (<Nucleotide.T: 4>, <Nucleotide.A: 1>, <Nucleotide.T: 4>),
 (<Nucleotide.A: 1>, <Nucleotide.C: 2>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.T: 4>, <Nucleotide.A: 1>),
 (<Nucleotide.G: 3>, <Nucleotide.G: 3>, <Nucleotide.A: 1>),
 (<Nucleotide.C: 2>, <Nucleotide.T: 4>, <Nucleotide.C: 2>),
 (<Nucleotide.C: 2>, <Nucleotide.C: 2>, 

In [13]:
acg: Codon = (Nucleotide.A, Nucleotide.C, Nucleotide.G)
gat: Codon = (Nucleotide.G, Nucleotide.A, Nucleotide.T)
my_gene = str_to_gene(gene_str)

In [14]:
def linear_contains(key_codon: Codon, gene: Gene)-> bool:
    for codon in gene:
        if codon == key_codon:
            return True
    return False
    

In [15]:
linear_contains(acg, my_gene)

True

In [16]:
linear_contains(gat, my_gene)

False

In [17]:
linear_contains(acg, my_gene)

True

In [24]:
def binary_contains(key_codon: Codon, gene: Gene) -> bool:
    low = 0
    high = len(gene)
    while low <= high:
        mid: int = (low + high)//2
        if key_codon > gene[mid]:
            low = mid+1
        elif key_codon < gene[mid]:
            high = mid-1
        else:
            return True
    return False
        

In [25]:
my_sorted_gene = sorted(my_gene)

In [26]:
binary_contains(gat, my_sorted_gene)

False

In [27]:
binary_contains(acg, my_sorted_gene)

True

In [54]:
from __future__ import annotations
from typing import Protocol, Iterable, TypeVar, Any, Sequence
T = TypeVar('T')

In [15]:
def linear_contains(iterable: Iterable[T], key: T) -> bool:
    for item in iterable:
        if key == item:
            return True
    return False

In [16]:
C = TypeVar('C', bound='Comparable')

In [17]:
class Comparable(Protocol):
    def __eq__(self, other: Any) -> bool:
        ...
        
    def __lt__(self: C, other: C) -> bool:
        ...
    
    def __gt__(self: C, other: C) -> bool:
        return (not self < other) and self != other
    
    def __le__(self: C, other: C) -> bool:
        return self < other or self==other
    
    def __ge__(self: C, other: C) -> bool:
        return (not self < other)
    
    

In [18]:
def binary_contains(sequence: Sequence, key: C) -> bool:
    low: int = 0
    high: int = len(sequence) - 1
    while low <= high:
        mid: int = (low+high)//2
        if sequence[mid] < key:
            low = mid + 1
        elif sequence[mid] > key:
            high = mid - 1
        else:
            return True
    return False
    

In [19]:
print(linear_contains([1, 5, 15, 15, 15, 15, 20], 5)) # True
print(binary_contains(["a", "d", "e", "f", "z"], "f")) # True
print(binary_contains(["john", "mark", "ronald", "sarah"], "sheila")) # False

True
True
False
