The 20 commonly occurring amino acids are abbreviated by using 20 letters from the English alphabet (all letters except for B, J, O, U, X, and Z). Protein strings are constructed from these 20 symbols. Henceforth, the term genetic string will incorporate protein strings along with DNA strings and RNA strings.

The RNA codon table dictates the details regarding the encoding of specific codons into the amino acid alphabet.

Given: An RNA string s corresponding to a strand of mRNA (of length at most 10 kbp).

Return: The protein string encoded by s.

In [9]:
# Lookup table: RNA codon -> amino-acid symbol (or "Stop").
codon_dict = {}

# Each line of the table file is a whitespace-separated run of alternating
# "codon amino_acid" tokens, so walk the tokens two at a time.
with open("./codon.txt", 'r') as table:
    for row in table:
        tokens = row.split()
        for pos in range(0, len(tokens), 2):
            # Even positions hold the codon, the following token its product.
            codon_dict[tokens[pos]] = tokens[pos + 1]

# Show the finished mapping for a quick visual check.
print(codon_dict)

{'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', 'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', 'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', 'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', 'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', 'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', 'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', 'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', 'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', 'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', 'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', 'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', 'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', 'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', 'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', 'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'}


In [10]:
# Sample RNA string used to try out the translation.
with open("./sample_input.txt", "r") as sample_in:
    input = sample_in.read()

# Contents of the sample output file, kept for reference.
with open("./sample_output.txt", "r") as sample_out:
    output = sample_out.read()

In [11]:
# Inspect the raw sample: split on newlines to see the lines it contains.
input.split("\n")

['AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA']

In [13]:
def split_into_segments(string, size=3):
    """Split *string* into consecutive chunks of *size* characters.

    The final chunk is shorter when ``len(string)`` is not a multiple of
    *size*; an empty string yields an empty list.

    Args:
        string: The sequence to cut up (for example an RNA string).
        size: Chunk length. Defaults to 3, the width used for codons.

    Returns:
        A list of the chunks in their original order.

    Raises:
        ValueError: If *size* is less than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently yields nothing, so reject both up front.
    if size < 1:
        raise ValueError("size must be a positive integer")
    return [string[start:start + size]
            for start in range(0, len(string), size)]

In [15]:
def translate_ran_into_protein(input):
    """Translate the RNA string on the first line of *input* into a protein.

    The first line is cut into three-character codons, each codon is looked
    up in the module-level ``codon_dict``, and translation ends at the first
    codon that maps to ``"Stop"`` (the stop marker itself is not emitted).

    NOTE: ``ran`` in the function name is presumably a typo for ``rna``; the
    name is kept so existing calls keep working.

    Args:
        input: Text whose first line is the RNA string. Anything after the
            first newline is ignored.

    Returns:
        The protein string: the amino-acid symbols for the codons that
        precede the first stop codon, concatenated in order.

    Raises:
        KeyError: If a segment (for example an incomplete trailing codon) is
            not a key of ``codon_dict``.
    """
    # strip() removes a trailing "\r" (CRLF files) or padding that would
    # otherwise end up inside the last codon and fail the table lookup.
    rna_string = input.split("\n")[0].strip()

    residues = []
    for codon in split_into_segments(rna_string):
        amino_acid = codon_dict[codon]
        if amino_acid == "Stop":
            break
        residues.append(amino_acid)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(residues)


# Translate the sample RNA text held in `input` and print the protein.
print(translate_ran_into_protein(input))


MAMAPRTEINSTRING


In [16]:
# Load the downloaded Rosalind dataset for this problem.
with open("./rosalind_prot.txt", "r") as dataset:
    real_input = dataset.read()

# Print the translated protein so it can be submitted as the answer.
print(translate_ran_into_protein(real_input))


MAIARVSKELNTGCFSDRAIPIYRYNPSSLGKSPLYTLFKRSVTQRRPRVGCAPNSSCYWNAHLKVSPAGISTTRAADPPRSLRIPQRAQYVDTRSVDTVKEKSVNFRWFPSRFRQYSKVMLTLVVQRPAHGPTLAIRPISMNEPLSTKPGALRSCDPYTFVAFMYLGHTLKRYQILIYPGYSGPRWRNNHRVPPTTTKSQGCSSGSRKVEPISQRTAHSGEKQQHTRGTYGLLARYSSKFLCQPGYQPEINLHMPVFESRGGMSETERQALVLCGQYLTRGASTILYKPCWGDTKPIPAMFPAILSLKAIVAIRVVPVSTGDLHKNISYVAQSHKSACGVRRVQNGSLWPMVTTGLSTLHLMARVLGREYPLPFILVITQIKSRILPAYVELRVFDHVKEKLSLDRCQNGRPAASTCPHRSVDRIADDYSFFPYKAPIHDNLWCNWKGPGSPVVRRPPCKSTVSQTTTAVVLCKRCLKVNDEICATRRFISRWLCRCRIGCPILLSVHRTAVCDASSSALQTMLEVPDCYIDHDTFFLAGNLRAGECLDFQILVVLNAQDDPTVAHHPADDNWAPVVLTGVVRRKSHVTIGVTQLRSAATEQLISKLDDTLVGTVSRGPHSKLYALYGHCNDINVPTVLSIKNSDHIAPIKLLTNLASVNKPHLTKTNRAAGSGRHRGEKQLPRLHPGNGPVVSCCWIAAQLATHATGQGILRKHPSSGAFDCKLLMRPNHFKCGNRGGAIKPTDRLYYMDRLEPLSYDEIYNLAGRIEYHVKCPRVSGCGVKIVGLLHHCPFPEMAALYRHIMLPSGLSCFIKSNQVRMLRGSTLRIPSRRWRLRAIGTGQLGLDNKSGSAESWGHSHTYSRSRVADDAPKTGISRSASGLSDHMGVRTGYKRTATLPCGRPSSPALATDKTLQQALYVKVARLAESDSLGSRICEARLAEIVGSKHIHKTQRPPGLYLTGKFPSSLRRQRPNNRNNDSKGPDNCLGMADARYGFLGL