<a href="https://colab.research.google.com/github/ttderessa/Temesgen-Deressa/blob/main/Deciphering_Life's_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
'''
Deciphering Life's Code: A Pythonic Approach to DNA Sequencing and Mutation Analysis

Author: Dr. Temesgen Deressa
Date: December 31, 2025
Note: This material is for educational purposes only.

'''
import random

# --- 1. Base Class for Genetic Material ---
class GeneticMaterial:
    """Common base for sequence-like biological strings (DNA, RNA, Protein).

    The sequence is normalised once at construction time: it is upper-cased
    and surrounding whitespace is stripped. The result is stored in the
    public ``sequence`` attribute.
    """

    def __init__(self, sequence: str):
        # Normalise so that later lookups and comparisons are case-insensitive
        # with respect to the caller's input.
        normalized = sequence.upper()
        self.sequence = normalized.strip()

    def __len__(self):
        """Return the number of characters in the stored sequence."""
        return len(self.sequence)

    def __str__(self):
        """Return the sequence, shortened to 50 characters plus '...' if longer."""
        full = self.sequence
        if len(full) <= 50:
            return full
        return f"{full[:50]}..."

# --- 2. DNA Class with Transcription, Translation, and Mutation ---
class DNA(GeneticMaterial):
    """
    Handles DNA-specific operations, including transcription to mRNA,
    translation to protein, and simulating point mutations.
    """

    # The Standard Genetic Code (Codon Table), all 64 codons.
    # Maps 3-base DNA codons to their corresponding amino acid (or stop '_').
    CODON_TABLE = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGA':'_', # Stop codons
        # Cysteine and Tryptophan (previously missing, so they came out as '?')
        'TGC':'C', 'TGT':'C', 'TGG':'W',
    }

    # The only characters accepted as a replacement base in point_mutation.
    VALID_BASES = frozenset("ATCG")

    def transcribe(self) -> 'RNA':
        """
        Converts the DNA sequence to an RNA sequence by replacing every T
        with U. No complementing or reversing is performed.

        Returns:
            An RNA object wrapping the converted sequence.
        """
        mrna_sequence = self.sequence.replace('T', 'U')
        return RNA(mrna_sequence)

    def translate(self) -> 'Protein':
        """
        Translates the DNA sequence into a Protein object.

        Translation starts at the first "ATG" found in the sequence and reads
        consecutive 3-base codons until a stop codon is reached or fewer than
        three bases remain. Codons that are not in CODON_TABLE (for example
        ones containing an ambiguous base) are emitted as '?'.

        Returns:
            A Protein built from the translated residues. If the sequence has
            no "ATG", a Protein constructed from the placeholder text
            "Non-coding DNA" is returned instead.
        """
        # Find the first start codon (ATG). Translation always begins here.
        start_idx = self.sequence.find("ATG")

        if start_idx == -1:
            # If no start codon, this DNA region is likely non-coding for a protein.
            return Protein("Non-coding DNA")

        amino_acids = []

        # Step through complete codons only; the "- 2" bound skips a trailing
        # fragment of one or two bases.
        for i in range(start_idx, len(self.sequence) - 2, 3):
            codon = self.sequence[i:i+3]
            residue = self.CODON_TABLE.get(codon, "?") # Use '?' for unknown codons

            if residue == "_":  # Stop codon reached, terminate translation
                break

            amino_acids.append(residue)

        return Protein("".join(amino_acids))

    def point_mutation(self, position: int, new_base: str):
        """
        Simulates a point mutation (single base substitution) at a given position.
        Modifies the DNA sequence in place.

        Args:
            position: Zero-based index of the base to replace.
            new_base: The replacement base; exactly one of A, T, C, G
                (case-insensitive).

        Invalid input is reported with a printed error message and leaves the
        sequence unchanged; nothing is raised and nothing is returned.
        """
        if not (0 <= position < len(self.sequence)):
            print(f"Error: Position {position} is out of bounds for sequence of length {len(self.sequence)}.")
            return

        new_base = new_base.upper()
        # Membership in a set of single characters, not a substring test:
        # `"" in "ATCG"` and `"AT" in "ATCG"` are both True, which would let an
        # empty or multi-base string through and change the sequence length.
        if new_base not in self.VALID_BASES:
            print(f"Error: '{new_base}' is not a valid DNA base (A, T, C, G).")
            return

        old_base = self.sequence[position]
        # Strings are immutable, so rebuild the sequence around the new base.
        self.sequence = self.sequence[:position] + new_base + self.sequence[position + 1:]
        print(f"INFO: Point mutation at index {position}: '{old_base}' -> '{new_base}'")

# --- 3. RNA Class (Simple for now, could be expanded for splicing etc.) ---
class RNA(GeneticMaterial):
    """
    An RNA sequence.

    Currently adds nothing on top of GeneticMaterial; it exists as a distinct
    type and as a place for future RNA-specific operations such as splicing.
    """

# --- 4. Protein Class to hold the amino acid sequence ---
class Protein(GeneticMaterial):
    """
    An amino-acid sequence, stored as a string of one-letter residue codes.
    """

    def __str__(self):
        """Return 'Protein: <residues> (Length: n)'.

        Sequences longer than 30 residues are shown as their first 27
        residues followed by '...'; the reported length is always the full
        length.
        """
        residues = self.sequence
        count = len(residues)
        shown = residues if count <= 30 else residues[:27] + "..."
        return f"Protein: {shown} (Length: {count})"

# --- Main Execution Block: Solving the Research Question ---
if __name__ == "__main__":
    print("--- DNA Sequencing and Mutation Analysis ---")

    # Sample DNA sequence from our 'unknown organism'.
    # The first ATG is at index 9. Reading in frame from there, translation
    # ends at the stop codon TAG formed by the last three bases (45-47).
    raw_dna_sequence = "GATCGTACGATGCATAGCTACGACGTAGCCCTAGTAAGGCCTGACTAG"

    # 1. Create the original DNA object
    original_dna = DNA(raw_dna_sequence)
    print(f"\nOriginal DNA (full): {original_dna.sequence}")
    print(f"Length of original DNA: {len(original_dna)} bases")

    # 2. Transcribe to RNA
    mrna_sequence = original_dna.transcribe()
    print(f"Transcribed mRNA: {mrna_sequence}")

    # 3. Translate to Protein
    original_protein = original_dna.translate()
    print(f"Translated Protein: {original_protein}")

    print("\n" + "="*50 + "\n")
    print("--- Simulating a Point Mutation and its Impact ---")

    # Build a second, independent DNA object so the original stays untouched.
    mutated_dna = DNA(raw_dna_sequence)

    # Locate the start codon (before mutating) to understand the coding region.
    start_index = mutated_dna.sequence.find("ATG")
    print(f"Gene starts at index (ATG): {start_index}")

    # Example mutation: index 7 holds 'C' and lies just upstream of the ATG
    # at index 9. Changing it to 'T' turns bases 6-8 from 'ACG' into 'ATG',
    # i.e. it creates a new start codon three bases earlier and in frame with
    # the original one, so the translated protein gains a leading 'M'.
    mutation_position = 7
    new_base_for_mutation = 'T'

    # Perform the mutation
    mutated_dna.point_mutation(mutation_position, new_base_for_mutation)

    # Show the new DNA sequence after mutation
    print(f"Mutated DNA (full):  {mutated_dna.sequence}")

    # Translate the mutated DNA and observe the new protein
    mutated_protein = mutated_dna.translate()
    print(f"Mutated Protein:     {mutated_protein}")

    print("\n--- Comparing Proteins ---")
    # Compare the full sequences. str(protein) is a display form that is
    # truncated for long proteins, so comparing it could report two different
    # proteins as identical.
    if original_protein.sequence == mutated_protein.sequence:
        print("RESULT: The mutation was SILENT - no change in protein sequence!")
    else:
        print("RESULT: The mutation caused a change in protein sequence (Missense or Nonsense).")
        print(f"  Original: {original_protein.sequence}")
        print(f"  Mutated:  {mutated_protein.sequence}")

    # Example of a Nonsense Mutation (creating an early stop codon)
    print("\n--- Another Mutation: Creating a Nonsense Mutation ---")
    # Codons: ATG CAT ACG TAC GTA CGT + trailing 'GA'  ->  M H T Y V R
    # (the trailing two bases do not form a codon, so there is no in-frame stop)
    nonsense_dna = DNA("ATGCATACGTACGTACGTGA")
    print(f"Original Nonsense Test DNA: {nonsense_dna.sequence}")
    print(f"Original Nonsense Test Protein: {nonsense_dna.translate()}")

    # Turn the second codon, CAT (indices 3-5, Histidine), into the stop codon
    # TAA. That takes two substitutions:
    #   index 3: C -> T   (CAT -> TAT)
    #   index 5: T -> A   (TAT -> TAA)
    # Translation then stops right after the initial Methionine.
    nonsense_dna_example = DNA("ATGCATACGTACGTACGTGA")
    nonsense_dna_example.point_mutation(3, 'T') # C -> T
    nonsense_dna_example.point_mutation(5, 'A') # T -> A
    print(f"Nonsense Mutant DNA: {nonsense_dna_example.sequence}")
    print(f"Nonsense Mutant Protein: {nonsense_dna_example.translate()}")

    print("\n" + "="*50 + "\n")
    print("--- Similarity Comparison ---")

    # Compare the original and first mutated protein position by position.
    # For real similarity scoring, alignment algorithms such as
    # Needleman-Wunsch or Smith-Waterman are used instead.

    if original_protein.sequence == mutated_protein.sequence:
        print("The original and first mutated proteins are IDENTICAL.")
    else:
        print("The original and first mutated proteins are DIFFERENT.")
        # Position-wise mismatches over the shared length, plus the residues
        # that exist in only one of the two proteins. zip() stops at the
        # shorter sequence, so the length gap has to be added explicitly.
        mismatches = sum(
            a != b
            for a, b in zip(original_protein.sequence, mutated_protein.sequence)
        )
        length_gap = abs(len(original_protein) - len(mutated_protein))
        diff_count = mismatches + length_gap
        print(f"Number of differing amino acids: {diff_count}")

--- DNA Sequencing and Mutation Analysis ---

Original DNA (full): GATCGTACGATGCATAGCTACGACGTAGCCCTAGTAAGGCCTGACTAG
Length of original DNA: 48 bases
Transcribed mRNA: GAUCGUACGAUGCAUAGCUACGACGUAGCCCUAGUAAGGCCUGACUAG
Translated Protein: Protein: MHSYDVALVRPD (Length: 12)


--- Simulating a Point Mutation and its Impact ---
Gene starts at index (ATG): 9
INFO: Point mutation at index 7: 'C' -> 'T'
Mutated DNA (full):  GATCGTATGATGCATAGCTACGACGTAGCCCTAGTAAGGCCTGACTAG
Mutated Protein:     Protein: MMHSYDVALVRPD (Length: 13)

--- Comparing Proteins ---
RESULT: The mutation caused a change in protein sequence (Missense or Nonsense).
  Original: MHSYDVALVRPD
  Mutated:  MMHSYDVALVRPD

--- Another Mutation: Creating a Nonsense Mutation ---
Original Nonsense Test DNA: ATGCATACGTACGTACGTGA
Original Nonsense Test Protein: Protein: MHTYVR (Length: 6)
INFO: Point mutation at index 3: 'C' -> 'T'
INFO: Point mutation at index 5: 'T' -> 'A'
Nonsense Mutant DNA: ATGTAAACGTACGTACGTGA
Nonsense Mutant Prot