# Project: DNATools
This program implements various Bioinformatics tools in a tutorial-style Jupyter Notebook.

## Background
[INSERT TEXT]

## Find the length of a DNA strand
Given a sample DNA strand, calculate the number of nucleotides in the strand.

In [5]:
def DNA_length(input_strand):
    # Initialize a counter used to keep track of the number of nucleotides in the strand
    nucleotide_count = 0
    
    # Use a loop to iterate through the strand
    for char in input_strand:
        # for every nucleotide in the input strand, increment the counter
        nucleotide_count += 1
    
    return nucleotide_count

In [6]:
# Initialize a string that will serve as a DNA strand
sample_dataset = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'

# Call the function we created 
number_of_nucleotides = DNA_length(sample_dataset)
print('This DNA strand consists of ' + str(number_of_nucleotides) + ' nucleotides.')

This DNA strand consists of 70 nucleotides.


## A Step Further: Calculate the number of 'A' 'T' 'G' 'C' occurences in the DNA strand
A DNA strand is made up of four nucleotides: A, T, G, C. We are going to write a function that counts the number of occurences for these four symbols in a given DNA strand.

In [7]:
def individual_nucleotides(input_string):
    # Initialize a dictionary to keep track of the counts for each symbol
    nucleotide_dict = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
    
    # Iterate through each symbol in the DNA strand
    for char in input_string:
        if char == 'A':
            nucleotide_dict[char] += 1
            
        if char == 'T':
            nucleotide_dict[char] += 1
            
        if char == 'G':
            nucleotide_dict[char] += 1
            
        if char == 'C':
            nucleotide_dict[char] += 1
            
    return nucleotide_dict

In [8]:
## TEST for individual_nucleotides

# Initialize a string consisting of the acceptable nucleotides 'A', 'T', 'G', 'C'
sample = 'ATTGGGCCCC'
individual_count = individual_nucleotides(sample)

# Format the output
print('The DNA strand contains: \n' + 
            'A: ' + str(individual_count['A']) +
            '\nT: ' + str(individual_count['T']) +
            '\nG: ' + str(individual_count['G']) +
            '\nC: ' + str(individual_count['C'] ))

The DNA strand contains: 
A: 1
T: 2
G: 3
C: 4


# The Central Dogma
[BACKGROUND INFO]

# DNA Replication: complementing a strand of DNA
[BACKGROUND INFO]

In [9]:
def complement(template_strand):
    # Empty string to store the complement of the template strand
    complement_strand = ''
    
    complement_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    
    # For each nucleotide in the sequence, add its complement to a new string
    for char in template_strand:
        
        # Append the complement of the current nucleotide using our dictionary complement_dict
        complement_strand += complement_dict[char]
                    
    return complement_strand

In [10]:
## TEST for  complement

template_strand = 'ATCTGACC'
complement_strand = complement(template_strand)

print('Template strand: ' + "'5' " + template_strand + " 3'" + '\nComplement strand: ' + "3' "+ complement_strand + " 5'")

Template strand: '5' ATCTGACC 3'
Complement strand: 3' TAGACTGG 5'


### Extra: Reverse Complement of a strand of DNA

In [11]:
def reverse_complement(complement_strand):
    # Empty string to store the reverse of the complement strand
    reverse_output = ''
    
    # Calculate the length of the template strand using our DNA_length function
    length = DNA_length(complement_strand)
    
    while(length > 0):
        reverse_output += complement_strand[length - 1]
        length -= 1
        
    return reverse_output        

In [12]:
## TEST for reverse_complement

sample_dataset = 'AAAACCCGGT'
comp = complement(sample_dataset)
reverse_comp = reverse_complement(comp) 

# Reverse complement should be 'ACCGGGTTTT'
assert reverse_comp == 'ACCGGGTTTT'
print('Template Strand: ' + sample_dataset + '\nReverse Complement Strand: ' + reverse_comp)

Template Strand: AAAACCCGGT
Reverse Complement Strand: ACCGGGTTTT


# Transcription: transcribing a complement strand into RNA
[BACKGROUND INFO]

In [13]:
def transcribe(dna_sequence):
    
    # Use the python function replace to replace instances of 'A' with 'U'
    output_RNA = dna_sequence.replace('A', 'U')
    
    return output_RNA

In [14]:
## TEST for transcribe
sample_DNA = 'TTCCATA'
rna_strand = transcribe(sample_DNA)

assert rna_strand == 'TTCCUTU'
print('DNA strand: ' + sample_DNA + '\nRNA strand: ' + rna_strand)

DNA strand: TTCCATA
RNA strand: TTCCUTU


# Translation: translating RNA into Protein
[BACKGROUND INFO]

In [36]:
def translate(rna_sequence):
    rna_codon = { "UUU" : "F", "CUU" : "L", "AUU" : "I", "GUU" : "V",
                  "UUC" : "F", "CUC" : "L", "AUC" : "I", "GUC" : "V",
                  "UUA" : "L", "CUA" : "L", "AUA" : "I", "GUA" : "V",
                  "UUG" : "L", "CUG" : "L", "AUG" : "M", "GUG" : "V",
                  "UCU" : "S", "CCU" : "P", "ACU" : "T", "GCU" : "A",
                  "UCC" : "S", "CCC" : "P", "ACC" : "T", "GCC" : "A",
                  "UCA" : "S", "CCA" : "P", "ACA" : "T", "GCA" : "A",
                  "UCG" : "S", "CCG" : "P", "ACG" : "T", "GCG" : "A",
                  "UAU" : "Y", "CAU" : "H", "AAU" : "N", "GAU" : "D",
                  "UAC" : "Y", "CAC" : "H", "AAC" : "N", "GAC" : "D",
                  "UAA" : "STOP", "CAA" : "Q", "AAA" : "K", "GAA" : "E",
                  "UAG" : "STOP", "CAG" : "Q", "AAG" : "K", "GAG" : "E",
                  "UGU" : "C", "CGU" : "R", "AGU" : "S", "GGU" : "G",
                  "UGC" : "C", "CGC" : "R", "AGC" : "S", "GGC" : "G",
                  "UGA" : "STOP", "CGA" : "R", "AGA" : "R", "GGA" : "G",
                  "UGG" : "W", "CGG" : "R", "AGG" : "R", "GGG" : "G"}
    
    # Contains the start codon that begins the translation process
    start_codon = 'AUG'
    
    # Contains the three stop codons that terminate the translation process
    stop_codon = ['UAA', 'UAG', 'UGA']
    
    # String that stores the protein being formed
    protein = ''
    
    # Iterate through the rna sequence is steps of 3 to analyze one codon at a time
    for i in range(0, len(rna_sequence), 3):
        # Look at the next 3 nucleotides from our starting point
        codon = rna_sequence[i:i+3]
        
        # Add the amino acid for the current codon to our protein string
        protein += rna_codon[codon]
        
    return protein

In [38]:
## TEST for translate
sample_rna = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'

protein = translate(sample_rna)
#assert protein == 'MAMAPRTEINSTRING'
print(protein)

MAMAPRTEINSTRINGSTOP
