In [2]:
def load_fasta(filepath):
    """
    Load a FASTA file and return the DNA sequence as a single continuous string.
    Header lines starting with '>' are skipped.
    """
    sequence = []
    with open(filepath, "r") as f:
        for line in f:
            line = line.strip()
            if not line.startswith(">"):
                sequence.append(line)
    return "".join(sequence)

In [None]:
def make_cDNA(genome):
    """
    Extract the three exons from HBB genomic DNA and concatenate them to create cDNA.
    Exon coordinates are based on NG_000007.3 RefSeqGene reference from NCBI.
    Reference URL: https://www.ncbi.nlm.nih.gov/nuccore/NG_000007.3
    The coordinates are converted from global genomic positions to local coordinates
    relative to our HBB_region.fasta slice (offset: 70,545).
    Args:
        genome (str): Genomic DNA sequence from HBB_region.fasta
    Returns:
        str: cDNA sequence (386 bp) with exons concatenated and introns removed
    """
    offset = 70545 # Genomic start position of the HBB_region.fasta slice (example)

    # The genomic coordinates on NG_000007.3 are converted to local slice coordinates by substracting the defined offset.
    exons = [
        (70573 - offset, 70714 - offset), # exon 1: convert the global genomic coordinates into the local slice coordinates
        (70955 - offset, 71083 - offset), # exon 2: convert the global genomic coordinates into the local slice coordinates
        (71144 - offset, 71261 - offset), # exon 3: convert the global genomic coordinates into the local slice coordinates
    ]

    print("=== Extracting Exons ===")
    cDNA = ""
    for idx, (start, end) in enumerate(exons, 1):
        exon_seq = genome[start:end]
        print(f"Exon {idx}: positions {start} ~ {end}")
        print(f"Sequence ({len(exon_seq)} bp): {exon_seq[:50]}...\n")  # print firtst 50 sequence
        cDNA += exon_seq

    print("=== Done ===")
    print(f"Final cDNA length: {len(cDNA)} bp")

    return cDNA

#-------------------------

def trim_from_first_ATG(cDNA):
    """
    Return coding sequence starting from the first ATG in ther cDNA.
    If no ATG is found, return the original cDNA.
    """
    start_index = cDNA.find("ATG")
    if start_index != -1:
        return cDNA[start_index:]
    return cDNA


In [None]:
class BruteForce:

    """
    This is a simple brute-force substring search for DNA sequences.
    Checks every starting position in the text.
    Returns all match positions
    """
    def __init__(self, genome, pattern):
        self.genome = genome     # FASTA or TXT file
        self.pattern = pattern # the codon
        
    def search(self):
        M = len(self.pattern)     # M = codon
        N = len(self.genome)      # N = Genome
        matches = []

        #Check every starting position
        for i in range(N - M + 1):
            j = 0

            # Compare the characters one by one
            while j < M and self.genome[i + j] == self.pattern[j]:
                j += 1

            #If we matched the full pattern, record the index
            if j == M:
                matches.append(i)
        return matches
    #Returns true if pattern is appears at least 1 time
    def contains(self, pattern, text):
        return len(self.search(pattern)) > 0
           


In [5]:
### Sanity Check Using a simple example

# Example DNA sequence (10 chars)
genome = "ACGTACGTAA"   # N = 10

# Example pattern (3 chars)
pattern = "CGT"   # M = 3

# Create an instance of BruteForce
bf = BruteForce(genome, pattern)

# Run search
matches = bf.search()

print("Genome:  ", genome)
print("Pattern: ", pattern)
print("Matches: ", matches)

Genome:   ACGTACGTAA
Pattern:  CGT
Matches:  [1, 5]


# Source:  https://www.ncbi.nlm.nih.gov/nuccore/NG_000007.3?report=fasta (where we obtained the `HBB_region.fasta`)
# Source: https://www.omim.org/entry/141900 (Documents codon 6 mutation (GAG→GTG, Glu→Val) in HBB gene)


The genomic region we selected on the NCBI page from the RefSeqGene record NG_000007.3, corresponds to the human β-globin (HBB) gene cluster. This interval includes the entire HBB gene, all of its exons and introns, and the surrounding 5′ and 3′ flanking regulatory DNA. Most importantly, this region contains codon 6 of the HBB gene, where the classic sickle-cell mutation occurs (the DNA change GAG → GTG, causing Glu → Val in the protein). Because this FASTA segment captures the complete gene plus adjacent regulatory sequences, it provides everything we need for searching and detecting the sickle-cell mutation.

## This exact genomic region has been saved in our file `HBB_region.fasta` for use in our analysis pipeline.

In [6]:
if __name__ == "__main__":
    genome = load_fasta("HBB_region.fasta") 
    # genome = load_fasta("HBB_gene.txt")
    cDNA = make_cDNA(genome)
    coding_seq = trim_from_first_ATG(cDNA)
    pattern = "GTG"   # sickle-cell mutation codon
  

    bf = BruteForce(coding_seq, pattern)
    result = bf.search()

    print("cDNA length (after ATG trim):", len(coding_seq))
    print("Found matches at positions (in coding sequence):", result)

=== Extracting Exons ===
Exon 1: positions 28 ~ 169
Sequence (141 bp): TGTAACAGAATAAAAAATCAATTATGTATTCAAGTTGCTAGTGTCTTAAG...

Exon 2: positions 410 ~ 538
Sequence (128 bp): CAAGGGATGGATGAAGGCAGGTGACTCTAACAGAAAGGGAAAGGATGTTG...

Exon 3: positions 599 ~ 716
Sequence (117 bp): ACTTTGAGTTTGTAAGTATATACTTCTCTGTAATGTGTCTGAATATCTCT...

=== Done ===
Final cDNA length: 386 bp
cDNA length (after ATG trim): 363
Found matches at positions (in coding sequence): [17, 138, 280, 328]


## Testing E.coli genome and lac operon genes


In [10]:
# --- Testing E. coli genome and lac operon genes ---

# Load E. coli full genome
ecoli_genome = load_fasta("ecoli_genome.fasta")

# Load each lac operon gene
lacZ = load_fasta("lacZ_gene.fna")
lacY = load_fasta("lacY_gene.fna")
lacI = load_fasta("lacI_gene.fna")

print("\n=== E. coli Genome Tests ===")

def test_gene(gene_name, gene_seq):
    pattern = gene_seq[:30]  # use 30bp signature
    bf = BruteForce(ecoli_genome, pattern)
    matches = bf.search()
    print(f"{gene_name}: Found at positions {matches}")

# Run tests
test_gene("lacZ", lacZ)
test_gene("lacY", lacY)
test_gene("lacI", lacI)




=== E. coli Genome Tests ===
lacZ: Found at positions []
lacY: Found at positions []
lacI: Found at positions []


In [11]:
print("lacZ start:", lacZ[:60])
print("lacY start:", lacY[:60])
print("lacI start:", lacI[:60])
print("Genome start:", ecoli_genome[:60])

print("\nLengths:")
print("lacZ:", len(lacZ))
print("lacY:", len(lacY))
print("lacI:", len(lacI))
print("Genome:", len(ecoli_genome))


lacZ start: ATGACCATGATTACGGATTCACTGGCCGTCGTTTTACAACGTCGTGACTGGGAAAACCCT
lacY start: ATGTACTATTTAAAAAACACAAACTTTTGGATGTTCGGTTTATTCTTTTTCTTTTACTTT
lacI start: GTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGACCGTT
Genome start: AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC

Lengths:
lacZ: 3075
lacY: 1254
lacI: 1083
Genome: 4641652


# Time and Space Complexity (Brute-Force Search)

## Time Complexity: O(P × N × M)

Brute-force pattern matching becomes costly when we want to test more than one pattern.  
For each pattern, the algorithm scans the entire gene of length **N**, and at each position compares up to **M** characters.

Let:
- **P** = number of patterns (codons) we want to test  
- **N** = gene length  
- **M** = pattern length  

Then brute-force must do **O(N × M)** work *for every pattern*, resulting in:

**O(P × N × M)**

Right now, in the sickle-cell example, we only test **P = 1** pattern (“GTG”).  
But once we move to a real gene such as **E. coli lacZ**, we often want to check **many** codons or multiple variants.  
In that situation, brute-force must rescan the entire lacZ sequence *P times*, and the cost grows quickly as P increases.

This is exactly where brute-force becomes inefficient and why alternative structures like TST/Trie start to show practical benefits.

---

## Space Complexity: O(1)

Brute-force matching uses constant extra memory.  
It only keeps a couple of index variables and does not store any additional data structures.

Thus the space complexity stays:

**O(1)**