In [2]:
# http://rosalind.info/problems/locations/

def validate_base_sequence(base_sequence, RNAflag = False):
    valid_bases = 'UCAG' if RNAflag else 'TCAG'
    return all([(base in valid_bases) for base in base_sequence.upper()])

In [3]:
RNA_codon_table = {
    "UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"s", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
    "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",
}

def translate_RNA_codon(codon):
    return RNA_codon_table[codon]

In [21]:
from numpy.random import randint

def random_base(RNAflag = False):
    return ('UCAG' if RNAflag else 'TCAG')[randint(0,3)]

def random_codon(RNAflag = False):
    return random_base(RNAflag) + random_base(RNAflag) + random_base(RNAflag)

def random_codons(minlength = 3, maxlength = 10, RNAflag = False):
    """Generate a random list of codons (RNA if RNAflag, else DNA)
    between minlength and maxlength, inclusive"""
    return [random_codon(RNAflag) for n in range(randint(minlength, maxlength))]

def random_codons_translation(minlength = 3, maxlength = 10):
    """Generate a random list of codons between minlength and 
    maxlength, inclussive"""
    return [translate_RNA_codon(codon) for codon in random_codons(minlength, maxlength, True)]

In [28]:
random_codons_translation()

['L', 'P', 'N', 'N', 'N', 'F', 'T', 'S', 'STOP']

In [6]:
# Split the file contents at '>' to get a list of strings representing entries
def read_FASTA_strings(filename):
    with open(filename) as file:
        return file.read().split('>')[1:]

# Partition the strings to seperate the first line from the rest
def read_FASTA_entries(filename):
    return [seq.partition('\n') for seq in read_FASTA_strings(filename)]

# Remove the newlines from the sequence data
def read_FASTA_sequences(filename):
    return [(info[1:], seq.replace('\n', '')) 
            for info, ignore, seq in #ignor is ignores (!)
            read_FASTA_entries(filename)]

# Split the description line into peices where vertical bars appear
def read_FASTA_sequences_and_info(filename):
    return [[seq[0].split('|'), seq[1]] for seq in read_FASTA_sequences(filename)]

In [7]:
def make_indexed_sequences_dictionary(filename):
    return {info[0]: seq for info, seq in read_FASTA(filename)}

In [10]:
def aa_generator(rnaseq):
    """Return a generator object that produces an amino acid by translating
    the next three characters of rnaseq each time nex is called on it"""
    return (translate_RNA_codon(rnaseq[n:n+3])
           for n in range(0, len(rnaseq), 3))

In [11]:
aagen= aa_generator('CCACCGCACCAACAGCGC')
next(aagen, None)

'P'

In [12]:
next(aagen, None)

'P'

In [13]:
next(aagen, None)

'H'

In [20]:
def dr(name):
    """Return the result of the dir(name), ommitting any names beginning
    with an underscore"""
    return [nm for nm in dir(name) if nm[0] != '_']

# Extract sequence descriptions from a FASTA file and split them into fields
# at their vertical bars
def get_FASTA_descriptions(filename):
    with open(filename) as file:
        return [line[1:].split('|') for lin in file if line[0] == '>']

# Returns a list of all the different codes it finds in the third field    
def get_FASTA_codes(filename):
    with open(filename) as file:
        if len(line.split('|')) < 3:
            return []
        return {line.split('|')[2] for line in file if line [0] == '>'}

# No provision was made for entries whose descriptions begin with something
# other than 'gi'
def make_gi_indexed_sequences_dictionary(filename):
    return {info[1]: seq for info, seq in read_FASTA(filename)
            if len(info) >= 2 and info[0] == 'gi'}

In [15]:
# Using a generator to find the first common element
def first_common(list1, list2):
    """Return the first element in list1 that is in list2"""
    return next((item for item in list1 if item in list2), None)

In [16]:
def generate_triples(chars='TCAG'):
    """Return a list of all three-character combinations of unique
    characters in chars"""
    chars = set(chars)
    return [b1 + b2 + b3 for b1 in chars for b2 in chars for b3 in chars]

In [19]:
generate_triples('01')

['111', '110', '101', '100', '011', '010', '001', '000']