<a href="https://colab.research.google.com/github/willfinnigan/simple_codon/blob/master/Simple_Codon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lea==2.0.0



In [0]:
# the lea library makes it easy to make a weighted random choice from a dictionary
import lea

def make_new_seq(protein_seq, codon_table, low_cuttoff):
    """
    Back-translate a protein sequence into a DNA sequence.

    protein_seq is an iterable of amino acid symbols, eg "MGACREDATY...".
    codon_table maps each amino acid symbol to a dict of triplet -> frequency,
    eg {"N": {"AAT": 27, "AAC": 973}, "D": {"GAT": 48, "GAC": 952}, ...}.
    low_cutoff (parameter name: low_cuttoff) is the threshold a triplet's
    frequency must be strictly greater than to be eligible (eg 100).

    For every amino acid one triplet is picked at random, weighted by its
    frequency in the codon table, considering only the eligible triplets.
    This gives the same distribution as repeatedly drawing from all triplets
    and rejecting those at or below the cutoff, but cannot loop forever.

    Returns the concatenated triplets as a single string
    ("" for an empty protein sequence).

    Raises ValueError if an amino acid is missing from codon_table, or if
    none of its triplets has a frequency above low_cuttoff.
    """
    # imported locally so the function only depends on the standard library
    import random

    # eligible (triplets, weights) per amino acid, computed once per symbol
    # rather than once per position in the protein
    eligible_by_aminoacid = {}

    # triplets are collected in a list and joined once at the end
    chosen_triplets = []

    for aminoacid in protein_seq:
        if aminoacid not in eligible_by_aminoacid:
            codon_dict = codon_table.get(aminoacid)
            if codon_dict is None:
                raise ValueError(
                    "amino acid {!r} is not in the codon table".format(aminoacid)
                )

            # keep only the triplets strictly above the cutoff
            eligible = {
                triplet: freq
                for triplet, freq in codon_dict.items()
                if freq > low_cuttoff
            }
            if not eligible:
                # the original rejection loop would spin forever here
                raise ValueError(
                    "no triplet for amino acid {!r} has a frequency above {!r}".format(
                        aminoacid, low_cuttoff
                    )
                )

            eligible_by_aminoacid[aminoacid] = (
                list(eligible.keys()),
                list(eligible.values()),
            )

        triplets, weights = eligible_by_aminoacid[aminoacid]

        # weighted random pick among the eligible triplets
        chosen_triplets.append(random.choices(triplets, weights=weights)[0])

    # having iterated through the protein sequence, the dna sequence is complete
    return "".join(chosen_triplets)


In [0]:
# Codon usage table: amino acid symbol ("*" is the stop entry) mapped to a
# dict of DNA triplet -> frequency. The frequencies are used as relative
# weights when a triplet is picked at random.
codon_table = {
    "A": {"GCT": 24, "GCG": 227, "GCC": 733, "GCA": 17},
    "R": {"AGA": 10, "CGA": 15, "CGT": 19, "AGG": 189, "CGC": 340, "CGG": 426},
    "N": {"AAT": 27, "AAC": 973},
    "D": {"GAT": 48, "GAC": 952},
    "C": {"TGT": 50, "TGC": 950},
    "*": {"TAA": 195, "TAG": 368, "TGA": 438},
    "Q": {"CAA": 130, "CAG": 870},
    "E": {"GAA": 125, "GAG": 875},
    "G": {"GGT": 25, "GGA": 61, "GGC": 400, "GGG": 515},
    "H": {"CAT": 50, "CAC": 950},
    "I": {"ATA": 46, "ATT": 94, "ATC": 860},
    "L": {"TTA": 8, "CTA": 22, "TTG": 66, "CTT": 114, "CTG": 285, "CTC": 505},
    "K": {"AAA": 85, "AAG": 915},
    "M": {"ATG": 1000},
    "F": {"TTT": 177, "TTC": 823},
    "P": {"CCA": 23, "CCT": 68, "CCG": 176, "CCC": 733},
    "S": {"TCA": 9, "AGT": 11, "TCT": 24, "TCG": 110, "AGC": 399, "TCC": 446},
    "T": {"ACA": 11, "ACT": 12, "ACG": 250, "ACC": 726},
    "W": {"TGG": 1000},
    "Y": {"TAT": 43, "TAC": 957},
    "V": {"GTA": 17, "GTT": 31, "GTC": 336, "GTG": 615},
}

In [8]:
# example protein sequence to back-translate; the trailing "*" is looked up
# in the codon table like any other symbol
protein = 'MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYHGKIPVLPGSVLGHEFVGQVEAVGEGIQDLQPGDWVVGPFHIACGTCPYCRRHQYNLCERGGVYGYAIPINAEQENP*'

# triplets whose frequency is not above this value are never used
codon_cuttoff = 100

# generate one randomised dna sequence for the protein and show it
new_seq = make_new_seq(
    protein_seq=protein,
    codon_table=codon_table,
    low_cuttoff=codon_cuttoff,
)
print(new_seq)

ATGCGGGCCGTGGTGTTCGAGAACAAGGAGCGCGTGGCGGTGAAGGAAGTGAACGCCCCCCGCCTTCAACACCCGCTGGACGCCCTGGTGCGCGTCCACCTCGCGGGGATCTGCGGGTCCGACCTGCACCTCTACCACGGCAAGATCCCGGTCCTCCCCGGCTCCGTCCTCGGCCACGAATTCGTGGGCCAGGTGGAGGCGGTGGGGGAGGGCATCCAGGACCTCCAGCCCGGCGACTGGGTGGTGGGCCCCTTCCACATCGCGTGCGGCACCTGCCCCTACTGCAGGCGGCACCAGTACAACCTGTGCGAGCGCGGCGGCGTCTACGGGTACGCGATCCCCATCAACGCCGAACAGGAGAACCCCTAG
