# Python Assignment

## Script for Translating Penguin Sequence Data for Dr. X

This script processes and visualizes data for 12 penguin species. More specifically, cytochrome-b sequence data are translated into amino acid sequences, which are then used to compute simple summaries for visualization.

Part of this script was created by Dr. X.

## Libraries/Installations

In [16]:
# IMPORTANT: install BioPython for script to be functional

from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd

## Functions

In [3]:
# Assigns sequences with the species name to a dictionary variable
def get_sequences_from_file(fasta_fn):
    """Map each species name found in a FASTA file to its sequence.

    The species name is built from the second and third whitespace-separated
    tokens of each record's description line, joined by a single space.
    When two records yield the same name, the later record replaces the
    earlier one.

    Args:
        fasta_fn: FASTA source handed straight to ``SeqIO.parse``.

    Returns:
        A dict keyed by species name whose values are the records' ``seq``
        attributes.

    Raises:
        IndexError: If a description line has fewer than three tokens.
    """
    sequences_by_species = {}

    for fasta_record in SeqIO.parse(fasta_fn, "fasta"):
        header_tokens = fasta_record.description.split()

        # Tokens 1 and 2 make up the name; token 0 is not used.
        species = " ".join((header_tokens[1], header_tokens[2]))

        sequences_by_species[species] = fasta_record.seq

    return sequences_by_species

In [11]:
#Translate a nucleotide sequence into an amino acid sequence
def translate_sequence(string_nucleotides):
    """Translate a nucleotide sequence with the Vertebrate Mitochondrial code.

    Codons are read in frame starting at the first base. Translation ends at
    the first stop codon, which is left out of the result. Trailing bases
    that do not fill a whole codon are ignored.

    Args:
        string_nucleotides: Nucleotide sequence. It is converted with
            ``str()`` and upper-cased before lookup, so lower-case input is
            translated the same way as upper-case input.

    Returns:
        The amino acid sequence as a ``str`` (empty if the input holds fewer
        than three bases or starts with a stop codon).

    Raises:
        KeyError: If a codon is neither a stop codon nor a key of the
            table's ``forward_table``.
    """
    mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

    # Hoisted out of the loop: set membership instead of scanning a list
    # for every codon.
    stop_codons = frozenset(mito_table.stop_codons)
    forward_table = mito_table.forward_table

    # Normalise once so the table lookups always see plain upper-case str.
    nucleotides = str(string_nucleotides).upper()

    amino_acids = []
    for start in range(0, len(nucleotides) - 2, 3):
        codon = nucleotides[start:start + 3]

        # The stop codon terminates translation and is not emitted.
        if codon in stop_codons:
            break

        amino_acids.append(forward_table[codon])

    # Join once at the end rather than growing a string inside the loop.
    return "".join(amino_acids)

In [19]:
#Separate function to translate the nucleotide sequence into an amino acid sequence using the Seq class from BioPython
from Bio.Seq import Seq
def alternative_translation(string_nucleotides):
    """Translate a nucleotide sequence by delegating to ``Seq.translate``.

    Args:
        string_nucleotides: Nucleotide sequence used to build a ``Seq``.

    Returns:
        The translation as a plain ``str``. ``translate`` is called with the
        "Vertebrate Mitochondrial" table and ``to_stop = True``.
    """
    # Build the Seq, translate it, and convert the result to str in one chain.
    return str(
        Seq(string_nucleotides).translate(
            table="Vertebrate Mitochondrial",
            to_stop=True,
        )
    )
    

In [26]:
#Calculates the molecular weight for each amino acid sequence using the ProtParam BioPython module
from Bio.SeqUtils.ProtParam import ProteinAnalysis
def compute_molecular_weight(aa_seq):
    """Return the molecular weight of an amino acid sequence.

    Args:
        aa_seq: Amino acid sequence; it is passed through ``str()`` before
            being handed to ``ProteinAnalysis``.

    Returns:
        The value returned by ``ProteinAnalysis(...).molecular_weight()``.
    """
    # ProteinAnalysis is given a plain string, whatever type aa_seq arrived as.
    protein = ProteinAnalysis(str(aa_seq))
    return protein.molecular_weight()
    