In [None]:
# DNA Fundamentals and Basic Tools (CCA3)
### Assignment 1

**Name:** Vaansh Asija  
**PRN:** 1032240032 
**Subject:** Computational and Cognitive Analytics (CCA3)  
**Date:** 28 October 2025


In [7]:
# ==========================================================
# Assignment 1: DNA Fundamentals and Basic Tools (CCA3)
# Name: Vaansh Asija
# PRN: 1032240032
# ==========================================================
# This notebook implements a DNA class, nucleotide counting and
# frequency analysis, and string-manipulation helpers for DNA sequences.
# ==========================================================


In [10]:
# ==========================================================
# Question 1: DNA Data Structures
# ==========================================================

class DNA:
    """
    A class to represent and analyze DNA sequences.

    The sequence is converted to uppercase on construction and must consist
    solely of the nucleotides A, T, G and C (an empty sequence is accepted).

    Attributes:
        sequence (str): The uppercase DNA sequence.
        valid_nucleotides (set): The allowed nucleotide letters.
    """

    # Fixed reporting order. Iterating the `valid_nucleotides` set directly
    # yields a different key order from one interpreter session to the next
    # (string hashing is randomized), which makes printed results unstable.
    _NUCLEOTIDE_ORDER = "ATGC"

    def __init__(self, sequence):
        """
        Initialize the DNA object.

        Parameters:
            sequence (str): DNA sequence (should contain only A, T, G, C)

        Raises:
            ValueError: If the sequence contains any other character.
        """
        self.sequence = sequence.upper()  # Convert to uppercase for consistency
        self.valid_nucleotides = {'A', 'T', 'G', 'C'}

        # Validate sequence
        if not self._validate_sequence():
            raise ValueError("Invalid DNA sequence! Sequence must contain only A, T, G, C.")

    def _validate_sequence(self):
        """Check if the DNA sequence contains only valid nucleotides."""
        return all(base in self.valid_nucleotides for base in self.sequence)

    def length(self):
        """Return the length of the DNA sequence."""
        return len(self.sequence)

    def count_nucleotides(self):
        """
        Count the occurrences of each nucleotide.

        Returns:
            dict: Mapping of 'A', 'T', 'G', 'C' (in that order) to their counts.
        """
        return {nuc: self.sequence.count(nuc) for nuc in self._NUCLEOTIDE_ORDER}

    def nucleotide_statistics(self):
        """
        Return the percentage composition of each nucleotide.

        Returns:
            dict: Mapping of each nucleotide to its share of the sequence in
            percent. For an empty sequence every share is 0.0 instead of
            raising ZeroDivisionError.
        """
        length = self.length()
        counts = self.count_nucleotides()
        if length == 0:
            # Nothing to divide by: report an all-zero composition.
            return {nuc: 0.0 for nuc in counts}
        return {nuc: (count / length) * 100 for nuc, count in counts.items()}

    def __str__(self):
        """String representation of the DNA sequence."""
        return f"DNA Sequence: {self.sequence}"


In [9]:
# Demonstration of the DNA class on a short, valid sequence.
# A ValueError (raised for invalid sequences) is reported instead of
# aborting the cell.
try:
    dna1 = DNA("ATGCGTAGCTAG")
    print(dna1)
    print(f"Length: {dna1.length()}")
    print(f"Counts: {dna1.count_nucleotides()}")
    print(f"Statistics: {dna1.nucleotide_statistics()}")
except ValueError as err:
    print(err)


DNA Sequence: ATGCGTAGCTAG
Length: 12
Counts: {'T': 3, 'A': 3, 'C': 2, 'G': 4}
Statistics: {'T': 25.0, 'A': 25.0, 'C': 16.666666666666664, 'G': 33.33333333333333}


In [None]:
### Question 2: Nucleotide Counting and Analysis


In [11]:
# Question 2: Nucleotide Counting and Analysis
# --------------------------------------------------------------
def count_nucleotides(seq):
    """Return a dict with the number of A, T, G and C in seq (case-insensitive)."""
    upper_seq = seq.upper()
    tally = {}
    for base in "ATGC":
        tally[base] = upper_seq.count(base)
    return tally

def nucleotide_frequencies(seq):
    """
    Calculate nucleotide frequencies as percentages.

    Parameters:
        seq (str): DNA sequence; case is ignored.

    Returns:
        dict: Mapping of 'A', 'T', 'G', 'C' to their percentage of the
        sequence. An empty sequence yields 0.0 for every nucleotide instead
        of raising ZeroDivisionError.

    Note:
        The denominator is the full length of seq, so any characters other
        than A/T/G/C still count toward the total and the percentages may
        sum to less than 100.
    """
    seq = seq.upper()
    total = len(seq)
    counts = count_nucleotides(seq)
    if total == 0:
        # Nothing to divide by: report an all-zero composition.
        return {nuc: 0.0 for nuc in counts}
    return {nuc: (count / total) * 100 for nuc, count in counts.items()}

def compare_sequences(seq1, seq2):
    """
    Report how far apart two sequences are in nucleotide composition.

    Returns a dict mapping each of A, T, G, C to the absolute difference
    between its percentage in seq1 and its percentage in seq2.
    """
    first = nucleotide_frequencies(seq1)
    second = nucleotide_frequencies(seq2)
    gaps = {}
    for base in "ATGC":
        gaps[base] = abs(first[base] - second[base])
    return gaps


# 🧪 Example Test
# Two sample sequences of equal length that differ in the middle bases.
seq1 = "ATGCGTAA"
seq2 = "ATGCCCAA"
print(f"Sequence 1 frequencies: {nucleotide_frequencies(seq1)}")
print(f"Sequence 2 frequencies: {nucleotide_frequencies(seq2)}")
print(f"Difference between sequences: {compare_sequences(seq1, seq2)}")


Sequence 1 frequencies: {'A': 37.5, 'T': 25.0, 'G': 25.0, 'C': 12.5}
Sequence 2 frequencies: {'A': 37.5, 'T': 12.5, 'G': 12.5, 'C': 37.5}
Difference between sequences: {'A': 0.0, 'T': 12.5, 'G': 12.5, 'C': 25.0}


In [None]:
### Question 3: String Manipulation for Genomics


In [12]:
# Question 3: String Manipulation for Genomics
# --------------------------------------------------------------
def to_uppercase(seq):
    """Return seq with every letter uppercased."""
    uppercased = seq.upper()
    return uppercased

def to_lowercase(seq):
    """Return seq with every letter lowercased."""
    lowercased = seq.lower()
    return lowercased

def clean_sequence(seq):
    """Uppercase seq and keep only the characters A, T, G and C."""
    allowed = "ATGC"
    kept = []
    for symbol in seq.upper():
        if symbol in allowed:
            kept.append(symbol)
    return ''.join(kept)

def split_into_codons(seq):
    """
    Break a sequence into consecutive 3-base codons.

    The sequence is cleaned with clean_sequence first; a trailing fragment
    shorter than three bases is dropped.
    """
    cleaned = clean_sequence(seq)
    codons = []
    for start in range(0, len(cleaned), 3):
        triplet = cleaned[start:start + 3]
        if len(triplet) == 3:
            codons.append(triplet)
    return codons

def merge_fragments(fragments):
    """Concatenate the given DNA fragments, in order, into one string."""
    joiner = ''
    return joiner.join(fragments)


# 🧪 Example Test
# A deliberately noisy input: lowercase letters mixed with punctuation and a space.
seq = "atg-cg@t a#gc!"
print(f"Uppercase: {to_uppercase(seq)}")
print(f"Cleaned: {clean_sequence(seq)}")
print(f"Codons: {split_into_codons(seq)}")
print(f"Merged: {merge_fragments(['ATG', 'CGA', 'TGC'])}")


Uppercase: ATG-CG@T A#GC!
Cleaned: ATGCGTAGC
Codons: ['ATG', 'CGT', 'AGC']
Merged: ATGCGATGC


In [None]:
### Aim:
To understand the fundamental concepts of DNA structure and write a Python program that counts the occurrences of each nucleotide (A, T, G, C) in a given DNA sequence.


In [None]:
### Theory:
DNA (Deoxyribonucleic Acid) is a molecule that carries genetic information in all living organisms.  
It consists of four nucleotide bases:
- **Adenine (A)**
- **Thymine (T)**
- **Guanine (G)**
- **Cytosine (C)**

Each DNA strand is made up of these bases connected by a sugar-phosphate backbone.  
Understanding DNA base composition is the first step in computational biology and bioinformatics.


In [13]:
# DNA Nucleotide Counting Program

def count_nucleotides(dna_sequence):
    """
    Count how often each of A, T, G and C occurs in dna_sequence.

    The sequence is uppercased first, so the count is case-insensitive.
    Returns a dict keyed 'A', 'T', 'G', 'C'.
    """
    normalized = dna_sequence.upper()
    return {base: normalized.count(base) for base in ('A', 'T', 'G', 'C')}

# Example DNA sequence
# Sample sequence used for the counting demonstration
dna_seq = "ATGCTTAGCTAGCTTACGATCGATCGATCG"

# Tally the bases, then show the input alongside the result
nucleotide_counts = count_nucleotides(dna_seq)
print(f"DNA Sequence: {dna_seq}")
print(f"Nucleotide Counts: {nucleotide_counts}")



DNA Sequence: ATGCTTAGCTAGCTTACGATCGATCGATCG
Nucleotide Counts: {'A': 7, 'T': 9, 'G': 7, 'C': 7}


In [14]:
# Calculate percentage composition of nucleotides

# Share of each base, in percent of all counted bases.
# Relies on `nucleotide_counts` produced by the previous cell.
total = sum(nucleotide_counts.values())
percentages = {nuc: (nucleotide_counts[nuc] / total) * 100 for nuc in nucleotide_counts}

print("Nucleotide Percentages:")
for nuc, pct in percentages.items():
    print(nuc + ": " + format(pct, ".2f") + "%")


Nucleotide Percentages:
A: 23.33%
T: 30.00%
G: 23.33%
C: 23.33%


In [None]:
### Conclusion:
This experiment successfully demonstrates the use of Python to analyze a DNA sequence.  
The program counts each nucleotide and calculates their percentage composition,  
which is a fundamental step in computational genomics and DNA analysis.