In [None]:
# Advanced Genomic Analysis and Pattern Recognition (CCA4)

**Assignment 3**  
**Name:** Vaansh Asija  
**PRN:** 1032240032  
**Subject:** Computational and Cognitive Analytics (CCA4)  
**Date:** 28 October 2025


In [None]:
# ---------------------------------------------------------------
# Question 1: FASTA File Processing
# ---------------------------------------------------------------
# This program reads and writes FASTA files, extracts metadata,
# and efficiently handles large genomic data.
# ---------------------------------------------------------------

from typing import Dict, List

def read_fasta(file_path: str) -> Dict[str, str]:
    """
    Parse a FASTA file and return a dictionary {header: sequence}.

    A line starting with ">" opens a new record keyed by the rest of that
    line. Every following non-blank line (with surrounding whitespace
    stripped) is appended to that record's sequence. Blank lines are ignored.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        Mapping of header text (without the leading ">") to the concatenated
        sequence. A header with no sequence lines maps to "". If the same
        header occurs more than once, the last record replaces earlier ones.

    Raises:
        ValueError: If sequence data appears before the first header line.
        OSError: If the file cannot be opened.
    """
    # Collect line fragments per record and join once at the end; this avoids
    # rebuilding an ever-growing string for long sequences.
    chunks: Dict[str, List[str]] = {}
    header = None
    with open(file_path, "r") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (including leading/trailing ones) carry no data.
                continue
            if line.startswith(">"):
                header = line[1:]
                chunks[header] = []
            elif header is None:
                raise ValueError(
                    f"{file_path}: line {line_number} contains sequence data "
                    "before any '>' header"
                )
            else:
                chunks[header].append(line)
    return {name: "".join(parts) for name, parts in chunks.items()}

def write_fasta(sequences: Dict[str, str], output_path: str, line_width: int = 70) -> None:
    """
    Write a dictionary of sequences to a FASTA file.

    Each record is written as a ">header" line followed by the sequence
    wrapped into lines of at most `line_width` characters. A record with an
    empty sequence produces only its header line.

    Args:
        sequences: Mapping {header: sequence}; records are written in the
            mapping's iteration order.
        output_path: Destination file path; an existing file is overwritten.
        line_width: Maximum number of sequence characters per line
            (default 70).

    Raises:
        ValueError: If `line_width` is smaller than 1.
        OSError: If the file cannot be opened for writing.
    """
    # Validate before opening so a bad width never truncates an existing file.
    if line_width < 1:
        raise ValueError("line_width must be a positive integer")

    with open(output_path, "w") as f:
        for header, seq in sequences.items():
            f.write(f">{header}\n")
            f.writelines(
                seq[i:i + line_width] + "\n"
                for i in range(0, len(seq), line_width)
            )

# Example test (you can create a file test.fasta with 2–3 sequences)
# Round-trip demo: write two short records to disk, then parse them back.
example_sequences = dict(
    Human_gene1="ATGCGTACGTAGCTAGCGTAGCTAGCTA",
    Mouse_gene2="ATGCCGTACGTAGCTAACGTTAGC",
)
write_fasta(sequences=example_sequences, output_path="output_example.fasta")

# Read the file that was just written and show the parsed mapping.
read_data = read_fasta(file_path="output_example.fasta")
print("Parsed FASTA Data:", read_data)


In [None]:
# ---------------------------------------------------------------
# Question 2: Genomic Data Integration
# ---------------------------------------------------------------
# Create a mini database to store sequence info, with search and validation.
# ---------------------------------------------------------------

import pandas as pd

class GenomicDatabase:
    """A simple in-memory genomic sequence database using pandas.

    Records are kept in ``self.db``, a DataFrame with the columns "ID",
    "Header" and "Sequence". Sequences are stored upper-cased.
    """

    # Alphabet accepted by add_sequence (checked after upper-casing).
    VALID_BASES = frozenset("ATGC")

    def __init__(self):
        self.db = pd.DataFrame(columns=["ID", "Header", "Sequence"])

    def add_sequence(self, seq_id: str, header: str, sequence: str) -> None:
        """Validate a DNA sequence and append it as a new row.

        Args:
            seq_id: Identifier stored in the "ID" column.
            header: Free-text description stored in the "Header" column.
            sequence: DNA string; case-insensitive, stored upper-cased.

        Raises:
            ValueError: If the sequence is empty or contains characters
                other than A, T, G, C.
        """
        normalized = sequence.upper()
        # An empty string would pass the subset check below and would then be
        # indistinguishable from "ID not found" in sequence_length (both 0).
        if not normalized:
            raise ValueError(f"Empty sequence for {seq_id}")
        if not set(normalized).issubset(self.VALID_BASES):
            raise ValueError(f"Invalid characters in sequence {seq_id}")
        # Append at the next positional label.
        self.db.loc[len(self.db)] = [seq_id, header, normalized]

    def search_by_id(self, seq_id: str) -> pd.DataFrame:
        """Return the rows whose "ID" equals `seq_id` (empty frame if none)."""
        return self.db[self.db["ID"] == seq_id]

    def sequence_length(self, seq_id: str) -> int:
        """Return the length of the first sequence stored under `seq_id`.

        Returns 0 when no row matches `seq_id`.
        """
        matches = self.search_by_id(seq_id)
        if matches.empty:
            return 0
        return len(matches["Sequence"].iloc[0])

# Example
# Build a two-record database, then exercise lookup and length queries.
db = GenomicDatabase()
db.add_sequence(seq_id="SEQ001", header="Human Alpha Gene", sequence="ATGCGTACGTAGC")
db.add_sequence(seq_id="SEQ002", header="Mouse Beta Gene", sequence="ATGCCGATCGTAA")

print("Database:\n", db.db)
print(
    "Search SEQ001:\n",
    db.search_by_id(seq_id="SEQ001"),
)
print(
    "Sequence Length:",
    db.sequence_length(seq_id="SEQ002"),
)


In [None]:
# ---------------------------------------------------------------
# Question 3: Multiple Sequence Problems
# ---------------------------------------------------------------
# Implements consensus sequence generation and evolutionary distance.
# ---------------------------------------------------------------

from collections import Counter

def consensus_sequence(sequences: List[str]) -> str:
    """Generate a consensus sequence from multiple aligned DNA sequences.

    For every alignment column the most frequent character is chosen. When
    several characters tie, the one that appears first in the column (i.e. in
    the earliest input sequence) wins.

    Args:
        sequences: Aligned sequences; all must have the same length.

    Returns:
        The consensus string, or "" when `sequences` is empty.

    Raises:
        ValueError: If the sequences do not all have the same length.
    """
    if not sequences:
        return ""

    expected_len = len(sequences[0])
    if any(len(seq) != expected_len for seq in sequences):
        raise ValueError("All sequences must have the same length")

    # zip(*sequences) yields one tuple per alignment column.
    return "".join(
        Counter(column).most_common(1)[0][0] for column in zip(*sequences)
    )

def evolutionary_distance(seq1: str, seq2: str) -> float:
    """Compute simple evolutionary distance based on mismatches.

    The distance is the fraction of positions at which the two sequences
    differ (number of mismatches divided by the sequence length).

    Args:
        seq1: First aligned sequence.
        seq2: Second aligned sequence; must be as long as `seq1`.

    Returns:
        A value between 0.0 (identical) and 1.0 (different at every
        position). Two empty sequences have distance 0.0.

    Raises:
        ValueError: If the sequences differ in length.
    """
    # zip() would silently drop the unmatched tail, so reject ragged input.
    if len(seq1) != len(seq2):
        raise ValueError("Sequences must have the same length")
    if not seq1:
        return 0.0
    mismatches = sum(1 for a, b in zip(seq1, seq2) if a != b)
    return mismatches / len(seq1)

# Example
# Three aligned toy sequences used to demo both helpers.
seqs = ["ATGCC", "ATGCA", "ATGCG"]
print(
    "Consensus Sequence:",
    consensus_sequence(sequences=seqs),
)
# Compare only the first two sequences.
print(
    "Evolutionary Distance (seq1 vs seq2):",
    evolutionary_distance(*seqs[:2]),
)


In [None]:
# ---------------------------------------------------------------
# Question 4: Advanced Pattern Analysis
# ---------------------------------------------------------------
# Detects repeated patterns and palindromes in DNA.
# ---------------------------------------------------------------

def find_repeats(sequence: str, length: int = 4) -> dict:
    """Find all repeating patterns of given length.

    Every window of `length` consecutive characters is counted, so
    occurrences may overlap (e.g. "AAAA" contains "AA" three times).

    Args:
        sequence: String to scan.
        length: Window size in characters; must be at least 1.

    Returns:
        Dict mapping each fragment that occurs more than once to its number
        of occurrences. Empty when `sequence` is shorter than `length` or
        nothing repeats.

    Raises:
        ValueError: If `length` is smaller than 1.
    """
    # A zero or negative window would only "find" empty fragments.
    if length < 1:
        raise ValueError("length must be a positive integer")

    counts = {}
    for start in range(len(sequence) - length + 1):
        fragment = sequence[start:start + length]
        counts[fragment] = counts.get(fragment, 0) + 1
    return {fragment: n for fragment, n in counts.items() if n > 1}

def find_palindromes(sequence: str, min_len: int = 4) -> list:
    """Find palindromic substrings (reads same forward and backward).

    This is plain string symmetry, not reverse-complement symmetry.

    Args:
        sequence: String to scan.
        min_len: Minimum substring length to report; must be at least 1.

    Returns:
        List of palindromic substrings, ordered by start position and then by
        increasing length. The same text is listed once per occurrence.

    Raises:
        ValueError: If `min_len` is smaller than 1.
    """
    # A zero or negative minimum would report empty strings as palindromes.
    if min_len < 1:
        raise ValueError("min_len must be a positive integer")

    palindromes = []
    total = len(sequence)
    for start in range(total):
        for stop in range(start + min_len, total + 1):
            candidate = sequence[start:stop]  # slice once, compare once
            if candidate == candidate[::-1]:
                palindromes.append(candidate)
    return palindromes

# Example
# Demo input: a repeated ATGC motif followed by a GC-alternating tail.
sequence = "ATGCATGCATGCGCGC"
print(
    "Repeats:",
    find_repeats(sequence=sequence),
)
print(
    "Palindromes:",
    find_palindromes(sequence=sequence),
)


In [None]:
# ---------------------------------------------------------------
# Question 5: Performance and Scalability
# ---------------------------------------------------------------
# Demonstrates parallel processing for large sequence tasks.
# ---------------------------------------------------------------

from multiprocessing import Pool
import time

def gc_calc(seq):
    """Helper for parallel GC calculation.

    Args:
        seq: DNA sequence string; bases are counted case-insensitively.

    Returns:
        GC content as a percentage (0-100) of the sequence length, or 0.0
        for an empty sequence.
    """
    # Avoid dividing by zero on empty input.
    if not seq:
        return 0.0
    # Upper-case so lowercase "g"/"c" are counted too.
    normalized = seq.upper()
    gc = normalized.count("G") + normalized.count("C")
    return (gc / len(seq)) * 100

# Two synthetic sequences of 6,000,000 bases each, used as the workload.
large_sequences = ["ATGCGT" * 1000000, "ATGCCG" * 1000000]

# Starting worker processes must be guarded by the __main__ check: with the
# "spawn" start method each worker re-imports the main module, and unguarded
# pool creation would run again inside every worker.
# NOTE(review): on spawn-based platforms, workers may be unable to locate
# gc_calc when it is defined in a notebook cell rather than an importable
# module - confirm on the target platform.
if __name__ == "__main__":
    # perf_counter is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is not affected by system clock adjustments.
    start = time.perf_counter()
    with Pool(processes=2) as pool:
        gc_results = pool.map(gc_calc, large_sequences)
    end = time.perf_counter()

    print("Parallel GC Content:", gc_results)
    print("Execution Time:", round(end - start, 3), "seconds")


In [None]:
# ---------------------------------------------------------------
# Question 6: Integration and Documentation
# ---------------------------------------------------------------
# Example CLI-ready function and modular design concept.
# ---------------------------------------------------------------

def analyze_sequence(sequence: str):
    """
    Comprehensive analysis for CLI tools.

    Combines the notebook's individual helpers into one report.

    Args:
        sequence: DNA sequence string to analyze.

    Returns:
        Dict with three keys:
            "GC_Content": result of gc_calc (GC percentage),
            "Repeats": result of find_repeats with its default length,
            "Palindromes": result of find_palindromes with its default
                minimum length.
    """
    return {
        # gc_calc is the GC helper defined in this notebook; the previously
        # referenced name gc_content is not defined anywhere in it.
        "GC_Content": gc_calc(sequence),
        "Repeats": find_repeats(sequence),
        "Palindromes": find_palindromes(sequence)
    }

# Example run
# Run the combined analysis on a short repeated motif and show the report.
result = analyze_sequence(sequence="ATGCATGCATGC")
print(
    "CLI Analysis Result:",
    result,
)
