In [3]:
#reading in fasta function same as before
#taken from P5 solution

#takes a filename as input and returns a dictionary 
#where the keys are sequence IDs and the values are the corresponding sequences
def read_fasta(filename):
    """Parse a FASTA file into a dict of header -> sequence.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each header line (everything after the leading '>', with
        surrounding whitespace stripped) to the concatenation of the
        sequence lines that follow it. If a header occurs more than once,
        the last occurrence wins.

    Notes
    -----
    Blank lines are skipped. Any text that appears before the first header
    belongs to no record and is ignored, so it cannot end up glued onto the
    front of the first sequence.
    """
    sequences = {}
    sequence_id = None
    chunks = []  # pieces of the current record, joined once per record

    with open(filename, "r") as fasta_file:
        for line in fasta_file:
            # Drop the newline and any surrounding whitespace
            line = line.strip()
            if not line:
                continue

            if line.startswith(">"):
                # A new header closes the record that was being read
                if sequence_id is not None:
                    sequences[sequence_id] = "".join(chunks)
                sequence_id = line[1:]
                chunks = []
            elif sequence_id is not None:
                # Sequence data only counts once a header has been seen
                chunks.append(line)

    # Save the last record
    if sequence_id is not None:
        sequences[sequence_id] = "".join(chunks)

    return sequences


In [4]:
# Load the records of test.txt as a {header: sequence} dict.
seqs = read_fasta('test.txt')

In [38]:
#from P2
def transcribe(dna_in):
    """Return dna_in with every 'T' replaced by 'U' (DNA -> RNA).

    All other characters are passed through unchanged.
    """
    return dna_in.replace("T", "U")

In [13]:
#taken from P3
def revc(dna_in):
    """Return the reverse complement of a DNA string.

    Parameters
    ----------
    dna_in : str
        DNA made of the upper-case bases 'A', 'C', 'G' and 'T'.

    Returns
    -------
    str
        The complemented bases in reverse order. If a character other than
        A/C/G/T is met, 'non-canonical base!' is printed, processing stops,
        and only the reverse complement of the bases before that character
        is returned.
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    # Collect complements in reading order and reverse once at the end;
    # prepending to a string on every base copies the whole string each time.
    complemented = []
    for base in dna_in:
        if base not in complement:
            print ('non-canonical base!')
            break
        complemented.append(complement[base])

    return ''.join(reversed(complemented))

In [30]:
#taken from P8
# Build `codons`: a dict from each 3-character token in codontable.txt to the
# amino-acid token paired with it ('Stop' entries are stored as '!Stop').
with open('codontable.txt', "r") as codon_table:
    codons = {}

    # Halves of the pair currently being assembled; None means "not seen yet"
    sequence = amino = None

    for line in codon_table:

        # walk the whitespace-separated tokens of the line
        for i in line.split():

            if i == 'Stop':
                # stop codons get a marker that cannot clash with an amino acid letter
                amino = '!Stop'

            elif len(i) == 1:
                # one-letter token: amino acid
                amino = i

            elif len(i) == 3:
                # three-letter token: codon
                sequence = i

            else:
                print('ERROR unexpected string length: ', i)

            # wait until both halves of the pair are known
            if sequence is None or amino is None:
                continue

            # store the pair and start collecting the next one
            codons[sequence] = amino
            sequence = amino = None

In [68]:
#also from P8

def translate(sequence):
    """Translate an RNA string into a protein, stopping at the first stop codon.

    Codons are read from index 0 in steps of three and looked up in the
    module-level `codons` table, where stop codons map to '!Stop'.

    Parameters
    ----------
    sequence : str
        RNA string whose first codon starts at index 0.

    Returns
    -------
    str or None
        The amino acids before the first stop codon (the stop itself is not
        included), or None when no stop codon is reached. Leftover bases that
        do not form a complete codon are ignored, so inputs shorter than
        three bases return None.

    Raises
    ------
    KeyError
        If a complete codon is not present in `codons`.
    """
    residues = []

    # len(sequence) - 2 keeps the loop on complete codons only, so a trailing
    # 1- or 2-base remainder is never looked up or translated
    for start in range(0, len(sequence) - 2, 3):
        aa = codons[sequence[start:start + 3]]

        # halt translation at the stop codon: this is the only successful exit
        if aa == '!Stop':
            return ''.join(residues)

        residues.append(aa)

    # ran off the end without meeting a stop codon
    return None
    

In [48]:
# Display the codon lookup table used by translate and orf.
codons

{'UUU': 'F',
 'CUU': 'L',
 'AUU': 'I',
 'GUU': 'V',
 'UUC': 'F',
 'CUC': 'L',
 'AUC': 'I',
 'GUC': 'V',
 'UUA': 'L',
 'CUA': 'L',
 'AUA': 'I',
 'GUA': 'V',
 'UUG': 'L',
 'CUG': 'L',
 'AUG': 'M',
 'GUG': 'V',
 'UCU': 'S',
 'CCU': 'P',
 'ACU': 'T',
 'GCU': 'A',
 'UCC': 'S',
 'CCC': 'P',
 'ACC': 'T',
 'GCC': 'A',
 'UCA': 'S',
 'CCA': 'P',
 'ACA': 'T',
 'GCA': 'A',
 'UCG': 'S',
 'CCG': 'P',
 'ACG': 'T',
 'GCG': 'A',
 'UAU': 'Y',
 'CAU': 'H',
 'AAU': 'N',
 'GAU': 'D',
 'UAC': 'Y',
 'CAC': 'H',
 'AAC': 'N',
 'GAC': 'D',
 'UAA': '!Stop',
 'CAA': 'Q',
 'AAA': 'K',
 'GAA': 'E',
 'UAG': '!Stop',
 'CAG': 'Q',
 'AAG': 'K',
 'GAG': 'E',
 'UGU': 'C',
 'CGU': 'R',
 'AGU': 'S',
 'GGU': 'G',
 'UGC': 'C',
 'CGC': 'R',
 'AGC': 'S',
 'GGC': 'G',
 'UGA': '!Stop',
 'CGA': 'R',
 'AGA': 'R',
 'GGA': 'G',
 'UGG': 'W',
 'CGG': 'R',
 'AGG': 'R',
 'GGG': 'G'}

In [69]:
#function that takes the output of read_fasta (any number of FASTA records) and returns every distinct protein string obtained by translating from a start codon to the next stop codon, on both the given strand and its reverse complement (record names are not reported)

def orf(fasta_dict):
    """Collect every distinct protein encoded by an open reading frame.

    For each DNA record, the strand as given and its reverse complement are
    transcribed to RNA and scanned one base at a time, which covers all three
    reading frames of each strand. Every position whose codon maps to 'M' in
    `codons` is passed to translate(); candidates for which translate()
    returns None (no stop codon reached) are skipped.

    Parameters
    ----------
    fasta_dict : dict
        Maps record name -> DNA string, as returned by read_fasta. The names
        are not used, and proteins from all records share one result list.

    Returns
    -------
    list of str
        Distinct protein strings in order of first discovery (within each
        record: given strand first, then reverse complement, scanning left
        to right). Each protein is also printed on its own line.

    Raises
    ------
    KeyError
        If a scanned triplet is missing from `codons`, for example when the
        DNA contains a character other than A, C, G or T.
    """
    protein_list = []   # keeps discovery order for the output
    seen = set()        # constant-time "already reported?" check

    for seq in fasta_dict.values():

        # same scan for the strand as given and for its reverse complement
        for strand in (seq, revc(seq)):
            rna = transcribe(strand)

            # slide one base at a time so every reading frame is examined
            for start in range(len(rna) - 2):

                # only a methionine codon can open a reading frame
                if codons[rna[start:start + 3]] != 'M':
                    continue

                protein = translate(rna[start:])

                # keep only proteins that reached a stop codon, once each
                if protein is not None and protein not in seen:
                    seen.add(protein)
                    protein_list.append(protein)

    #easier to copy output
    for protein in protein_list:
        print (protein)

    return protein_list

In [65]:
# Run the ORF search on the records currently bound to seqs. orf prints each
# protein and, as the cell's last expression, its returned list is echoed too.
orf(seqs)

M
MGMTPRLGLESLLE
MTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS


['M', 'MGMTPRLGLESLLE', 'MTPRLGLESLLE', 'MLLGSFRLIPKETLIQVAGSSPCNLS']

In [66]:
# Rebind seqs to the records of rosalind_orf.txt (replaces whatever seqs held before).
seqs = read_fasta('rosalind_orf.txt')

In [70]:
# Run the ORF search on the records currently bound to seqs. orf prints each
# protein and, as the cell's last expression, its returned list is echoed too.
orf(seqs)

MNRSNLS
M
MGRQV
MTELAPRPCLADIILPRRGF
MPMYHEI
MYHEI
MTGLADLDVRRRQLKVLSDGRKRR
MLVIWYEVTL
MEAQLSVCEPGAVSSIVSVMLVLPARGRNQGLESAR
MRTRCGIVHSQCHARASSERKKSGS
MLVLPARGRNQGLESAR
MVITRLRGHMDSMQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS
MDSMQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS
MQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS
MWPRNRVMTIFGLRHLLW
MTIFGLRHLLW
MT
MTLTMDDTAPGSHTDN
MDDTAPGSHTDN
MNV
MSARQGLGANSVMRSASYRPNMLNLNMMAKVRCI
MRSASYRPNMLNLNMMAKVRCI
MLNLNMMAKVRCI
MMAKVRCI
MAKVRCI
MITLRTRGGVKPRSNFLLSVYAGSLISLAAYGEVGPRDCCTQDKFERFMRI
MLGHLYHLQPTARLAHGIAVLKTSLNGS
MRI


['MNRSNLS',
 'M',
 'MGRQV',
 'MTELAPRPCLADIILPRRGF',
 'MPMYHEI',
 'MYHEI',
 'MTGLADLDVRRRQLKVLSDGRKRR',
 'MLVIWYEVTL',
 'MEAQLSVCEPGAVSSIVSVMLVLPARGRNQGLESAR',
 'MRTRCGIVHSQCHARASSERKKSGS',
 'MLVLPARGRNQGLESAR',
 'MVITRLRGHMDSMQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS',
 'MDSMQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS',
 'MQSSPARPYSQLPVQQPRTQIAGASLLESTRREEIKS',
 'MWPRNRVMTIFGLRHLLW',
 'MTIFGLRHLLW',
 'MT',
 'MTLTMDDTAPGSHTDN',
 'MDDTAPGSHTDN',
 'MNV',
 'MSARQGLGANSVMRSASYRPNMLNLNMMAKVRCI',
 'MRSASYRPNMLNLNMMAKVRCI',
 'MLNLNMMAKVRCI',
 'MMAKVRCI',
 'MAKVRCI',
 'MITLRTRGGVKPRSNFLLSVYAGSLISLAAYGEVGPRDCCTQDKFERFMRI',
 'MLGHLYHLQPTARLAHGIAVLKTSLNGS',
 'MRI']