In [1]:
#reading in fasta function same as before
#taken from P5 solution

#takes a filename as input and returns a dictionary 
#where the keys are sequence IDs and the values are the corresponding sequences
def read_fasta(filename):
    """Parse a FASTA file into a dict mapping sequence IDs to sequences.

    Args:
        filename: path of the FASTA file to read.

    Returns:
        dict whose keys are the header lines without the leading ">" and
        whose values are the sequence lines of that record joined together.
        If the same header occurs more than once, the last record wins.
        Lines that appear before the first header are ignored.
    """
    sequences = {}
    sequence_id = None
    chunks = []  # sequence lines of the record currently being read

    with open(filename, "r") as fasta_file:
        for line in fasta_file:
            # Remove the newline and any surrounding whitespace
            line = line.strip()

            if line.startswith(">"):
                # A new header closes the previous record, if there was one
                if sequence_id is not None:
                    sequences[sequence_id] = "".join(chunks)

                # Everything after ">" is used as the sequence ID
                sequence_id = line[1:]
                # Always restart from an empty buffer so nothing read earlier
                # can leak into this record
                chunks = []
            elif sequence_id is not None:
                # Sequence line belonging to the current record
                chunks.append(line)

    # Save the last record
    if sequence_id is not None:
        sequences[sequence_id] = "".join(chunks)

    return sequences


In [40]:
#load the FASTA records from text.txt into a dict of {sequence ID: sequence}
#NOTE(review): seqs is assigned again from 'rosalind_splc.txt' further down, so this load
#looks like a leftover trial run - confirm before relying on it
seqs = read_fasta('text.txt')
#note that the first sequence in seqs is the original transcript and all other ones are introns

In [6]:
#taken from P8
#builds the codon lookup table from codontable.txt
with open('codontable.txt', "r") as codon_table:
    codons = {} #maps each length-3 codon string to its amino acid ('!Stop' for stop codons)
    pending_codon = None
    pending_amino = None

    #split() treats newlines like any other whitespace, so the whole file is one token stream
    for token in codon_table.read().split():
        width = len(token)

        if width == 3:
            #a codon
            pending_codon = token
        elif width == 1:
            #a one-letter amino acid
            pending_amino = token
        elif token == 'Stop':
            #a stop codon marker
            pending_amino = '!Stop'
        else:
            print('ERROR unexpected string length: ', token)

        #wait until both halves of a pair have been seen
        if pending_codon is None or pending_amino is None:
            continue

        #store the pair and start collecting the next one
        codons[pending_codon] = pending_amino
        pending_codon = None
        pending_amino = None

In [8]:
#from P18

def translate(sequence, codon_table=None, require_stop=True):
    """Translate an RNA string into a protein string.

    The sequence is read in consecutive, non-overlapping codons starting at
    index 0, and translation halts at the first stop codon. One or two
    leftover characters at the end that do not form a complete codon are
    ignored.

    Args:
        sequence: RNA string to translate.
        codon_table: dict mapping each codon to an amino acid, with stop
            codons mapped to '!Stop'. Defaults to the module-level `codons`.
        require_stop: when True (the default), return None if no stop codon
            was reached; when False, return whatever was translated.

    Returns:
        The protein string without the stop marker, or None when
        require_stop is True and no stop codon was found.

    Raises:
        KeyError: if a complete codon is missing from the codon table.
    """
    if codon_table is None:
        codon_table = codons

    residues = []
    found_stop = False

    #stopping the range at len-2 yields only start indices of complete codons
    for start in range(0, len(sequence) - 2, 3):
        aa = codon_table[sequence[start:start + 3]]

        #halt translation at stop codon
        if aa == '!Stop':
            found_stop = True
            break

        residues.append(aa)

    if require_stop and not found_stop:
        return None

    return ''.join(residues)

In [72]:
#from P2
def transcribe(dna_in):
    """Return the RNA form of a DNA string: every "T" becomes "U".

    Only uppercase "T" is replaced; all other characters pass through
    unchanged.
    """
    return dna_in.replace("T", "U")

In [67]:
#takes in a fasta dict holding the original seq plus its introns and outputs the spliced sequence
#the position of the original seq inside the dict is not relied upon,
#so you must input the name of the original seq; every other entry is treated as an intron

def splc(seqs, seq_name):
    """Remove every intron from the transcript stored under seq_name.

    Args:
        seqs: dict of {sequence ID: sequence}. The value stored under
            seq_name is the transcript; every other value is treated as an
            intron.
        seq_name: key of the transcript inside seqs.

    Returns:
        The transcript with all occurrences of each intron cut out. Introns
        are removed one after another in the dict's iteration order, each
        from the result left by the previous removal.

    Raises:
        KeyError: if seq_name is not a key of seqs.
    """
    #fails with KeyError when the transcript name is not in the dict
    spliced_seq = seqs[seq_name]

    for name, intron in seqs.items():
        #the transcript itself is not an intron
        if name == seq_name:
            continue

        #str.replace scans the whole string, so every occurrence is removed,
        #including one that ends exactly at the end of the transcript
        spliced_seq = spliced_seq.replace(intron, "")

    return spliced_seq
        

In [76]:
#load the FASTA records from rosalind_splc.txt into a dict of {sequence ID: sequence}
seqs = read_fasta('rosalind_splc.txt')
#note that the first sequence in seqs is the original transcript and all other ones are introns

In [77]:
#cut the introns out of the transcript stored under the ID 'Rosalind_4142'
DNA = splc(seqs, 'Rosalind_4142')

In [78]:
#convert the spliced DNA to RNA by replacing T with U
RNA = transcribe(DNA)

In [79]:
#translate the RNA into a protein string; translation halts at the first stop codon
prot = translate(RNA)

In [80]:
#show the translated protein string
print(prot)

MVIIVTPPNYLVDSPLTILDSELSYACSVRPRTKRLFWVPPIADHWLDPNNTNIAVMYCLRLSFLTTFLSGGSVRLRGSSGETFLWCIVKVIAEGLVTHENWDYRILAPTSLRAPALTAQRLQPTQTSAIRACWPSCFCTGLLVLVRGTQSIMDPLARDFLRVGVGVLFAVWPSGLRSPDAIWRTPSLSL
