In [None]:
# Advanced Genomic Analysis and Pattern Recognition (CCA4)

**Assignment 2**  
**Name:** Vaansh Asija  
**PRN:** 1032240032  
**Subject:** Computational and Cognitive Analytics (CCA4)  
**Date:** 28 October 2025


In [2]:
# Question 1: GC Content Calculation
# -----------------------------------
# This program calculates the GC content of DNA sequences overall,
# in sliding windows, and visualizes GC skew.

from typing import List, Optional, Tuple

import matplotlib.pyplot as plt

def gc_content(sequence: str) -> float:
    """Calculate the overall GC content of a sequence as a percentage.

    Args:
        sequence: DNA sequence. G and C are matched case-insensitively, so
            lower-case bases are counted as well. Every character, whatever
            it is, counts towards the sequence length.

    Returns:
        Percentage of characters that are G or C, rounded to two decimal
        places. An empty sequence yields ``0.0``.
    """
    if not sequence:
        # Nothing to count; returning 0.0 avoids a ZeroDivisionError below.
        return 0.0
    # Count both cases explicitly rather than upper-casing the whole string,
    # so the length used in the ratio is always that of the original input.
    gc_count = sum(sequence.count(base) for base in "GCgc")
    return round((gc_count / len(sequence)) * 100, 2)

def sliding_window_gc(
    sequence: str, window_size: int = 50, step: Optional[int] = None
) -> List[float]:
    """Calculate GC content (%) for fixed-size windows along the sequence.

    Args:
        sequence: DNA sequence to scan.
        window_size: Number of bases in each window; must be positive.
        step: Distance between the starts of consecutive windows. ``None``
            (the default) uses ``window_size``, giving adjacent,
            non-overlapping windows. A smaller value such as ``1`` gives
            overlapping windows that slide along the sequence.

    Returns:
        One ``gc_content`` percentage per complete window, in sequence
        order. Trailing bases that do not fill a whole window are ignored,
        so a sequence shorter than ``window_size`` yields an empty list.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if step is None:
        step = window_size
    elif step <= 0:
        raise ValueError("step must be a positive integer")

    # Last index at which a complete window still fits inside the sequence.
    last_start = len(sequence) - window_size
    return [
        gc_content(sequence[start:start + window_size])
        for start in range(0, last_start + 1, step)
    ]

def gc_skew(sequence: str) -> Tuple[List[int], List[float]]:
    """Compute the running GC skew (G - C) / (G + C) for visualization.

    The value at position ``i`` is computed from the G and C counts
    accumulated over ``sequence[:i + 1]``, so it describes the prefix ending
    at ``i`` rather than a local window. Bases are matched
    case-insensitively; characters other than G/C leave the counts unchanged.

    Args:
        sequence: DNA sequence to scan.

    Returns:
        A ``(positions, skew)`` pair of equal-length lists: the 0-based
        index of every character and the running skew at that index. The
        skew is ``0.0`` until the first G or C has been seen.
    """
    positions: List[int] = []
    skew: List[float] = []
    g_count = c_count = 0
    for index, base in enumerate(sequence):
        symbol = base.upper()
        if symbol == "G":
            g_count += 1
        elif symbol == "C":
            c_count += 1
        total = g_count + c_count
        # No G or C seen yet: the ratio is undefined, so report a neutral 0.0.
        skew.append((g_count - c_count) / total if total else 0.0)
        positions.append(index)
    return positions, skew

# Demo input: a short DNA fragment used by every step below
sequence = "ATGCGCGATCGATCGCGCGTATGCGCGCGCGCGCGTATGCG"

# Whole-sequence GC percentage
print("Overall GC Content:", gc_content(sequence), "%")

# GC percentage of each consecutive 10-base window
gc_values = sliding_window_gc(sequence, window_size=10)
print("Sliding Window GC Content:", gc_values)

# Running GC skew, drawn through the explicit Figure/Axes interface
positions, skew = gc_skew(sequence)
fig, ax = plt.subplots()
ax.plot(positions, skew)
ax.set(title="GC Skew Plot", xlabel="Position", ylabel="GC Skew")
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
### Observation:
- The overall GC content represents the proportion of guanine and cytosine bases in DNA.
- Sliding window GC content helps identify regions rich in G and C bases.
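- The GC skew plot shows the running value of (G - C) / (G + C) along the sequence: a positive value means G has outnumbered C up to that position, and a negative value means C has outnumbered G.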
- GC-rich regions are more thermally stable: each G-C base pair forms three hydrogen bonds, whereas an A-T pair forms only two, so the two strands bind more strongly.
