Requirements

In [1]:
pip install biopython


Note: you may need to restart the kernel to use updated packages.


In [4]:
from Bio import SeqIO

# Define file paths for fna and faa.
# Forward slashes are used instead of backslashes: "\C" is an invalid escape
# sequence in a normal string literal, and a backslash-separated path only
# resolves on Windows. Forward slashes work on Windows, macOS and Linux.
fna_file = "Dataset/CCDS_nucleotide.current.fna"  # Coding sequences
faa_file = "Dataset/CCDS_protein.current.faa"  # Protein sequences

# Number of records shown in the preview below; printing every record floods
# the cell output.
PREVIEW_COUNT = 10

# Parse fna (nucleotide) sequences into {record id: sequence string}
cds_sequences = {record.id: str(record.seq) for record in SeqIO.parse(fna_file, "fasta")}

# Parse faa (amino acid) sequences into {record id: sequence string}
aa_sequences = {record.id: str(record.seq) for record in SeqIO.parse(faa_file, "fasta")}

# Ensure there is a correspondence between nucleotide and protein sequences
assert set(cds_sequences.keys()) == set(aa_sequences.keys()), "Mismatch in sequence IDs!"

# Example: Accessing matched sequences (first PREVIEW_COUNT records only)
for seq_id in list(cds_sequences)[:PREVIEW_COUNT]:
    dna_sequence = cds_sequences[seq_id]
    protein_sequence = aa_sequences[seq_id]
    print(f"ID: {seq_id}")
    print(f"DNA: {dna_sequence[:30]}...")  # Display part of the DNA
    print(f"Protein: {protein_sequence[:30]}...\n")  # Display part of the protein

ID: CCDS2.2|Hs110|chr1
DNA: ATGTCCAAGGGGATCCTGCAGGTGCATCCT...
Protein: MSKGILQVHPPICDCPGCRISSPVNRGRLA...

ID: CCDS3.1|Hs110|chr1
DNA: ATGGCAGCTGCGGGGAGCCGCAAGAGGCGC...
Protein: MAAAGSRKRRLAELTVDEFLASGFDSESES...

ID: CCDS4.1|Hs110|chr1
DNA: ATGGGGAACAGCCACTGTGTCCCTCAGGCC...
Protein: MGNSHCVPQAPRRLRASFSRKPSLKGNRED...

ID: CCDS5.1|Hs110|chr1
DNA: ATGGCCGCAGACACGCCGGGGAAACCGAGC...
Protein: MAADTPGKPSASPMAGAPASASRTPDKPRS...

ID: CCDS6.1|Hs110|chr1
DNA: ATGGGCTGGGACCTGACGGTGAAGATGCTG...
Protein: MGWDLTVKMLAGNEFQVSLSSSMSVSELKA...

ID: CCDS7.2|Hs110|chr1
DNA: ATGGCGCTGCGGCACCTCGCCCTCCTGGCT...
Protein: MALRHLALLAGLLVGVASKSMENTAQLPEC...

ID: CCDS8.1|Hs110|chr1
DNA: ATGGGGAGCAGCCAGGAGGAGGGACTCCGG...
Protein: MGSSQEEGLRCQPSQPDHDADGHCGPDLEG...

ID: CCDS9.1|Hs110|chr1
DNA: ATGGCACAGCACGGGGCGATGGGCGCGTTT...
Protein: MAQHGAMGAFRALCGLALLCALSLGQRPTG...

ID: CCDS10.1|Hs110|chr1
DNA: ATGGCACAGCACGGGGCGATGGGCGCGTTT...
Protein: MAQHGAMGAFRALCGLALLCALSLGQRPTG...

ID: CCDS11.1|Hs110|chr1
DNA: ATGTGCGTGGGGGCTC

In [None]:
# Function to split a DNA sequence into codons
def split_into_codons(dna_sequence):
    """Return the consecutive, non-overlapping 3-letter chunks of ``dna_sequence``.

    Chunks are taken from the start of the sequence. A trailing remainder of
    one or two characters is dropped, so every returned chunk has length 3.
    """
    # Length of the longest prefix that is a whole number of triplets.
    in_frame_length = len(dna_sequence) - len(dna_sequence) % 3

    triplets = []
    for start in range(0, in_frame_length, 3):
        triplets.append(dna_sequence[start:start + 3])
    return triplets

# Validate codon translation using the Biopython genetic code
from Bio.Data import CodonTable
genetic_code = CodonTable.unambiguous_dna_by_id[1]  # Standard genetic code

# Maximum number of individual mismatches to print; the total is always reported.
MAX_REPORTED_MISMATCHES = 20

# Compare DNA codons and protein sequences
mismatched_ids = []
for seq_id, dna_sequence in cds_sequences.items():
    # The codon table keys are upper-case, so normalise the case first.
    codons = split_into_codons(dna_sequence.upper())
    protein_sequence = aa_sequences[seq_id]

    # forward_table has no entries for stop codons. Without this step a
    # terminal stop codon would translate to '?' and make the translation
    # differ from a protein sequence that carries no stop symbol.
    if codons and codons[-1] in genetic_code.stop_codons:
        codons = codons[:-1]

    # Translate codons to amino acids; '?' marks any codon missing from the
    # table (ambiguous bases or an internal stop codon).
    translated_protein = ''.join(genetic_code.forward_table.get(codon, '?') for codon in codons)

    # Check if the translation matches
    if translated_protein != protein_sequence:
        mismatched_ids.append(seq_id)

# Report a bounded list of mismatches plus a summary instead of one line per sequence.
for seq_id in mismatched_ids[:MAX_REPORTED_MISMATCHES]:
    print(f"Mismatch in sequence {seq_id}!")
if len(mismatched_ids) > MAX_REPORTED_MISMATCHES:
    print(f"... and {len(mismatched_ids) - MAX_REPORTED_MISMATCHES} more mismatches not shown.")

matched_count = len(cds_sequences) - len(mismatched_ids)
print(f"{matched_count} of {len(cds_sequences)} sequences match successfully; {len(mismatched_ids)} mismatched.")


NameError: name 'fna_sequences' is not defined