http://rosalind.info/problems/list-view/

## Read FASTA file

In [1]:
# Split the file contents at '>' to get a list of strings representing entries
def read_FASTA_strings(filename):
    with open(filename) as file:
        return file.read().split('>')[1:]

# Partition the strings to seperate the first line from the rest
def read_FASTA_entries(filename):
    return [seq.partition('\n') for seq in read_FASTA_strings(filename)]

# Remove the newlines from the sequence data
def read_FASTA_sequences(filename):
    return [(info[0:], seq.replace('\n', '')) 
            for info, ignore, seq in #ignor is ignores (!)
            read_FASTA_entries(filename)]

# Split the description line into peices where vertical bars appear
def read_FASTA_sequences_and_info(filename):
    return [[seq[0].split('|'), seq[1]] for seq in read_FASTA_sequences(filename)]

# Create an sequence dictionary from sequence data
def make_indexed_sequences_dictionary(filename):
    return {info: seq for info, seq in read_FASTA_sequences(filename)}

### Counting DNA Nucleotides

In [2]:
# Counting DNA Nucleotides
# http://rosalind.info/problems/dna/
def count_bases(DNA_string):
    DNA_string.upper()
    base_count = {}
    base_count['A'] = DNA_string.count('A')
    base_count['C'] = DNA_string.count('C')
    base_count['G'] = DNA_string.count('G')
    base_count['T'] = DNA_string.count('T')
    return base_count

dna = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
count_bases(dna)

{'A': 20, 'C': 12, 'G': 17, 'T': 21}

### Transcribing DNA into RNA

In [3]:
# Transcribing DNA into RNA
# http://rosalind.info/problems/rna/
def transcribe_base_sequence(base_sequence, RNAflag = False):
    if RNAflag:
        base_sequence = base_sequence.replace('U', 'T')
    else:
        base_sequence = base_sequence.replace('T', 'U')
    return base_sequence

dna = 'GAGTGTATATCACTTCGACACGACGTGTCCTCAGTCCTCCCTGTTGCTGCTTTTAACATTCGTGTATGACGGGGCCGGTCCATATAAGCAACGGACTTGGACGCGCTAGTCTTGGCCCGGCCGTTTTAGGCTCGACGAAACTCTATATTTTTACCTCGTCCACGAAACTACTGACTACTAACATATATCTACTCACGCGCATCCGTACCTCATAGTGAGAGATTCGCCCTGTTTTGATGCTGAATTGAAAGCATACGTACAGTCAGTCCATCGAATCCACCCGCATATAGTGTAAACGCCCTCGCATGCCCGACCTGGACACTAGCGTAGTATGGCATCTAGAGAGCCGGACCCCTCCATGAACGCGAGCTGCCAACTCCTCTCTTATTGCCACGAGTAGAGACCCCGTCATATTTCCCGGACAGGGCGCACCTGGGTTGTCGTGATTGCCAGATAGACCTAGTGGTTTTCCATAACCTTTCCTACGTTGTCCACCCGTATGTGTAGACAGTTCCACCTTGCTTCGATGCCATCACGATGAGGAACTTAGTCCCATTAGGGTCAAAGCTCATGGTTTATGCATTTTAGATCCGAAGCGGCATACCGGTTTTTGGCATAGATCCCAATAGTAGCTCATTACACATAATTGCGGTCGTGCTTACGATATAATCCCGGTGAACCGTTGATATCACGCGTAAAGATCACTAGGAAAGTTCCGACCGTGTCGTTGCGGATCATTATTTGCTTCTACAGCTGAAGCGTGGGATGCGCTACCCGGCACAGCAGCACACGCAAATCCAAATTTTTGGAGTCTTACGACAGTTACCTGGGCGGTATTGGTTATCACAGCCTCGTTTTATTAGTATTGAATACTCGGCTCGTCCTGGTAGGCGCAGGTGAAGAGCCTCACGTTCTTCAACACTGGGTCGGA'
transcribe_base_sequence(dna)

'GAGUGUAUAUCACUUCGACACGACGUGUCCUCAGUCCUCCCUGUUGCUGCUUUUAACAUUCGUGUAUGACGGGGCCGGUCCAUAUAAGCAACGGACUUGGACGCGCUAGUCUUGGCCCGGCCGUUUUAGGCUCGACGAAACUCUAUAUUUUUACCUCGUCCACGAAACUACUGACUACUAACAUAUAUCUACUCACGCGCAUCCGUACCUCAUAGUGAGAGAUUCGCCCUGUUUUGAUGCUGAAUUGAAAGCAUACGUACAGUCAGUCCAUCGAAUCCACCCGCAUAUAGUGUAAACGCCCUCGCAUGCCCGACCUGGACACUAGCGUAGUAUGGCAUCUAGAGAGCCGGACCCCUCCAUGAACGCGAGCUGCCAACUCCUCUCUUAUUGCCACGAGUAGAGACCCCGUCAUAUUUCCCGGACAGGGCGCACCUGGGUUGUCGUGAUUGCCAGAUAGACCUAGUGGUUUUCCAUAACCUUUCCUACGUUGUCCACCCGUAUGUGUAGACAGUUCCACCUUGCUUCGAUGCCAUCACGAUGAGGAACUUAGUCCCAUUAGGGUCAAAGCUCAUGGUUUAUGCAUUUUAGAUCCGAAGCGGCAUACCGGUUUUUGGCAUAGAUCCCAAUAGUAGCUCAUUACACAUAAUUGCGGUCGUGCUUACGAUAUAAUCCCGGUGAACCGUUGAUAUCACGCGUAAAGAUCACUAGGAAAGUUCCGACCGUGUCGUUGCGGAUCAUUAUUUGCUUCUACAGCUGAAGCGUGGGAUGCGCUACCCGGCACAGCAGCACACGCAAAUCCAAAUUUUUGGAGUCUUACGACAGUUACCUGGGCGGUAUUGGUUAUCACAGCCUCGUUUUAUUAGUAUUGAAUACUCGGCUCGUCCUGGUAGGCGCAGGUGAAGAGCCUCACGUUCUUCAACACUGGGUCGGA'

### Complementing a strand of DNA

In [4]:
# Complementing a strand of DNA
# http://rosalind.info/problems/revc/
def reverse_complement(base_sequence):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return "".join(complement.get(base, base) for base in reversed(base_sequence))

dna = 'CAGCCTCTGTGTACCTGCATAAAGGATGAAATACATGTAACGATAGGCAGTGAGATAAGATTGGAATTTGAGAACTAATCATCCGTCGTGATTCTAAGTTCAGATTGCAGCAGATAACATGTATATCTTATGCTGGAGGCCACCTTACCGACGATTACTGCCTGTGGGGCGAAGTCTACCTTGATGCTAACCGACGCCAACCTGGCAAAATAGTGTCCGCAGAAACAAACATTGGCCTGGGGATTACGGGTTCTACGCACATGATCCGTCATGGAGCCTGACGGCTTATGCACATACGTCGCTAAGGCCCTGCGCTTCTATATATCTAGAGGTGTCGGTCTTCTAACGTGGTGCATAACCATTGGATCTATATCCATGCCTCCAGTAAAGGAATATAACGCTTCAAACGGCAGTGAGAATTGGGGGTGCTGATCCTACTAGATCACCTGGCTGACTTCTTGAGTAGTTGCTGACGCTCCTCTACATATTATCACGTGAGCTTTGCCTATCCCACGATCGACTCGAGTACTATCTGCCGTACAAGATTCCTCACAGCCGGGCCATGCCTTACACCCAAGTTATGCTCCAATTGCGACGCGGGATGTCATTCCCGACCAGTCCTAGACATTGTGAACGGCCTGTGATGAGTCGACGTATGGGTTTCTGCAGTGAGGGTGATACTCGCGACGATTCGCCGTTACGTCTAAGCCTAAGGTTCGGGGAGGTTCACTACGTCCTTCGCTTACAACGATCGTCTCGACCGGGATTTCACATGCTATCGGAGACCAAGTGATCGTGAGGTCCATGGGTACTCCTGGGTGGGCTTTCCGGTCTTGGATCTCCTGTTCGTCCTATACTCTATGTTAGTACCGTATCTTAACGCCTTTTTCACAGCTACGTGTTGTGCCAACATA'
reverse_complement(dna)

'TATGTTGGCACAACACGTAGCTGTGAAAAAGGCGTTAAGATACGGTACTAACATAGAGTATAGGACGAACAGGAGATCCAAGACCGGAAAGCCCACCCAGGAGTACCCATGGACCTCACGATCACTTGGTCTCCGATAGCATGTGAAATCCCGGTCGAGACGATCGTTGTAAGCGAAGGACGTAGTGAACCTCCCCGAACCTTAGGCTTAGACGTAACGGCGAATCGTCGCGAGTATCACCCTCACTGCAGAAACCCATACGTCGACTCATCACAGGCCGTTCACAATGTCTAGGACTGGTCGGGAATGACATCCCGCGTCGCAATTGGAGCATAACTTGGGTGTAAGGCATGGCCCGGCTGTGAGGAATCTTGTACGGCAGATAGTACTCGAGTCGATCGTGGGATAGGCAAAGCTCACGTGATAATATGTAGAGGAGCGTCAGCAACTACTCAAGAAGTCAGCCAGGTGATCTAGTAGGATCAGCACCCCCAATTCTCACTGCCGTTTGAAGCGTTATATTCCTTTACTGGAGGCATGGATATAGATCCAATGGTTATGCACCACGTTAGAAGACCGACACCTCTAGATATATAGAAGCGCAGGGCCTTAGCGACGTATGTGCATAAGCCGTCAGGCTCCATGACGGATCATGTGCGTAGAACCCGTAATCCCCAGGCCAATGTTTGTTTCTGCGGACACTATTTTGCCAGGTTGGCGTCGGTTAGCATCAAGGTAGACTTCGCCCCACAGGCAGTAATCGTCGGTAAGGTGGCCTCCAGCATAAGATATACATGTTATCTGCTGCAATCTGAACTTAGAATCACGACGGATGATTAGTTCTCAAATTCCAATCTTATCTCACTGCCTATCGTTACATGTATTTCATCCTTTATGCAGGTACACAGAGGCTG'

### Mendel's First Law

In [5]:
# Mendel's First Law
# http://rosalind.info/problems/iprb/
# Import comb (combination operation) from the scipy library 
from scipy.special import comb

def calculateProbability(k, m, n):
    # Calculate total number of organisms in the population:
    totalPop = k + m + n 
    # Calculate the number of combos that could be made (valid or not):
    totalCombos = comb(totalPop, 2)
    # Calculate the number of combos that have a dominant allele therefore are valid:
    validCombos = comb(k, 2) + k*m + k*n + .5*m*n + .75*comb(m, 2)
    probability = validCombos/totalCombos
    return probability

In [6]:
calculateProbability(21,24,29)
calculateProbability(2, 2, 2)

0.78333333333333333

### Rabbits and Recurrence Relations

In [63]:
# Rabbits and Recurrence Relationsdef fib_rabbits(n, m):
# http://rosalind.info/problems/fib/
# https://medium.com/algorithms-for-life/rosalind-walkthrough-rabbits-and-recurrence-relations-4812c0c2ddb3

def fib_rabbits(n, k):
    # n = number of Months
    # k = number of offspring
    
    # base case: in the first month there is only one pair of rabbits
    if n == 1:
        return 1
    # base case: in the second month there is k amount of rabbits + 1
    elif n == 2:
        return k
    
    # Fibonacci's sequence F(n) = F(n-1) + F(n-2)
    one_gen = fib_rabbits(n - 1, k) # F(n-1)
    two_gen = fib_rabbits(n - 2, k) # f(n-2)
    
    # Size of populaiton up to the 4th gen can be predicted by adding gen1 and gen2
    if n <= 4:
        return one_gen + two_gen

    # Multiplying the number of offspring born during each mating with the 
    # number of rabbits two generations ago, gives us the number of offspring 
    # in the current generation. We simply add this to the number of rabbits 
    # one generation ago to get our answer.
    return (one_gen + (two_gen * k))
    

In [66]:
# Rabbits and Recurrence Relations
fib_rabbits(5, 3)

19

### Computing GC Content

In [9]:
# Computing GC Content
# http://rosalind.info/problems/gc/

def gc_content(base_sequence):
    # Return the precentage of G and C characters in base_sequence
    base_sequence = str(base_sequence)
    seq = base_sequence.upper()
    g = float(seq.count('G'))
    c = float(seq.count('C'))
    l = float(len(seq))
    return (g + c) / l

def key_with_max_val(d):
    # a) create a list of the dict's keys and values; 
    # b) return the key with the max value 
    v=list(d.values())
    k=list(d.keys())
    return k[v.index(max(v))]

def highest_gc_content(filename):
    fasta_list = read_FASTA_sequences(filename)
    gc_dict = {}
    for i in range(0, len(fasta_list)):
        name = fasta_list[i][0]
        gc_dict[name] = gc_content(fasta_list[i][1])
    max_key = key_with_max_val(gc_dict)
    print max_key
    print gc_dict[max_key] * 100

In [10]:
fasta_list = highest_gc_content('rosalind_gc.txt')
fasta_list

Rosalind_2
62.5


### Translating RNA into Protien

In [4]:
# Translating RNA into Protien
RNA_codon_table = {
    "UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"Stop", "UAG":"Stop",
    "UGU":"C", "UGC":"C", "UGA":"Stop", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",
}

def translate_RNA_codon(codon):
    return RNA_codon_table[codon]

def aa_generator(rnaseq):
    """Return a generator object that produces an amino acid by translating
    the next three characters of rnaseq each time nex is called on it"""
    return (translate_RNA_codon(rnaseq[n:n+3])
           for n in range(0, len(rnaseq), 3))

def translate(rnaseq):
    """Translate rnaseq into amino acid symbols"""
    gen = aa_generator(rnaseq)
    seq = ''
    aa = next(gen, None)
    while aa:
        if aa == 'Stop':
            aa = next(gen, None)
        else:
            seq += aa
            aa = next(gen, None)
    return seq

In [12]:
rnaseq = 'AUGGACCAUUUUGGGAGACUUCAAGGACCACUCUCGCGAGAGCUACGUCCCUGUAAGGACAAACAGACCUCACUCUCGAUUAAGGCACUAAUGCCGGACCGAUUUUCCAACAGAAUGUGGAAAGGCCAUCUCGAUUGCGACAUAGUGCAGCUACUCGGAGGGCGUAUGUUCUUAUUGACUCAUGAGUUAAUCAAAUUUUCGGGGCGUUUUACGCAGACAAGCUUUGCAGCAGUUGAUUCGGCCCAAGAAACCAACGAAGUCUAUAAUACACAAUCGUCUCCAGCUCACGGAGGACCCAAAGAAGUGUCGGAGACGACACUUCUACAAAAGCGUUCCCAUAACACAUGCGCCCUUGGCUCACCCCAGAGAACUAUCUAUAGAAGAUAUUUGAGAUAUUACUGCAUUAAGGUUUCCCAGUGUAUUGACGCGAUUGUGGAUCACGUUCCAAACACGGCUUUCAUAUUUUUGUAUCGGUGGACCGAUUGCGGUACAAUUACGAGUGCGAUACACGGGUACCGAGGUACGAUAACAUACAUGCGGCACUGGUCACAGCACUGCUGGCUCCCCCAAAACCAACGUACACUGGGGUGGAGUGCUACCCUGGGCUCUUUGCUGGGGACACGGAAGCCCAAUCACGAAUCCGUGUCCCAUUGGGCCUUUUACGUUCACCAGAAUUCUUGGAACUGCUUGUACUUUACUACACAUUUUGCAGCCUCUGCACAUGCGAUUGCCCGGAGUCCUAGAAAGCAGGCAAAGUCUGACUUACGUUCCAUGAGGCGUAUUCUCUACUCGGCGUUGUUAAGUCUGAAUGGGGGGAGGAAAGAGUGGCAAAGGCUGAUUAGCAAAUGCCAUCGGCAUUCAUAUAAUUGUCUGUACUUAACUUUCCGAUUGCUAGUUUCACCACCGGUCGAUCCCACGACCGGAUGUUGGAGCGAACAAAUUAGAACGACUCGAUCCACGCAGCGCGUAAUAAGAUCCCUUCACACCCGUGUCCUCUCUUUGGCUCUGCGGGUCUCACUGGACUUUUCUGUCCGUGGCAAAGGUUCCACACGCUUUCCCCCCCCUACGUCUAGUAUGCGGACGCGCCCAAGGGGUAAAAACAAACCCUUGGGGCAAUCGCCACCAAAAGCCGUUCAGCUAGUCCGUGAGCACCAACGUCUUGAACGGCAAAAUACGCGGAACGGUUGCUGUGUAUGGGGAAAACGUGCAGAAACACACCCCUUAGGCGUCCGUAAAUUCGCCUUUCUGUGCUAUGGCCAACAUAGAGCAACAAAGGGGUACAGGAAUGUGUGUCCAAGCAGCUUUAAACAUGUAGUACCCAGGCAUGUGUCGGUUCGUCAAUUCAAAGUGGAUGGUGGGAGACUAUGCUUCAAGCCCCGAGUAAAUAGCUGGGCACGAGCGUUAUGUACCACUCGGAGGGAUAUCUAUAAGCAGGCGACAAACCCUCGCCUAGGGUAUUACGGAAGCAGUGUAAAUCUGGAUGUCGUCGCGAAUCUGAGCUCUGCGUGCCUGGGAAAGCUCACACCCUUAGGGACUACCACCUAUGGGUUACAGAAAGGCUGGACGGGGCAAUCAGUUGCGGACCUUACAAGCGUCCAGUUCUCUUUCAUCCGGUCUGCGUUUAAAUCAAAUUCAAUUGUGCGUCCGCCUCACAGCAUUGAGAUUGCUCUAAACCACACUCGGCCGUCCGCAGCCGAGUUGCGCCAAUGCGUCCUCUGUAGCCCUCUUACAUUCCAACAGACUAGAACUUGUCCCCUCCCAGGGUGUUUUUUUUCUCCUGCCCGUCACGAUGGUAUGUCAAAGGGUACAUGGCGAUAUAGAGGCUACUGCAAAACAUGCAUUAGUGGGACACCAGAUGGCAGACCCUACUAUGCAAACUUGUGUGAUGAGCAGCCGGGACCGGUGUUACGGCUCACGCGCGUUGGUUUGUGCUUAUUACGGCAAGCUAUAAGUGUGCUUCCCCUCUCUAAACUCAUUUUGUUCUCCUGCAGUCCAGGUAGCUCCCUUGGGGCGCAUGAACCGGUCUUAUUCAGAACAGCCAGUAACUUUGUGAAUUCUGCAUUAUUUCGCAUAGAGGAUGACGCGAUACCUUAUCGUAGAGGAAUGUUACUCAAGUCGUCGAGGGUUUGUAUUUAUUUAGUCAUCCCACCGCAAGAGCCAGAAUCAAAGGCUGUUAGUUACAGCAUCUCCAAGGACUGUGUUGCCCGUGCCACGCAAGUAAUAUCAAAAUCAGGUCUCGUGCCAUUCUUACCAAGCGUGUACACUACCCCCAGGCCCGAGCUAGAUGACGUUCUGUGGCGAGUCAAAUUCGUCAGCGUUCCCCCGAGACACAGAAGGCCUUACAGCUCCACCAUUCCAUCCGAAGCGCGCGAAGCGUAUUGCACCACGGGGAGUAUAGAAUGGCGCGCUAGGUACUUUCGACGGAAUAUAUGCCUUGCUAUGUGGGAGGGCCAAAAUCAUCGGGGCUUGCGUGAGGGCUAUGAGUAUGGUUCACACCUGACCUUACUUUAUACGACCGGAAGUCGUUCCUGUUCCAUUGCCGGAUUGUCCAUGCCUCUUGUAAGGGUACAUGCUGAAUAUGCGCAUGCAGAAUCGACGCCCGCGUCAGGCUAUACGGUUUCCACGAGUGAGUACGGUAUGCUUCGAUGGUAUGGCAAGCCACUGGUGAUCCUAUACUCUAUGGAGAUCGAAUGGGGCGGGAGAAUAAGGAGCAAGUCACAAACGCUGACUAACCGCACGUUGGAUGUCGAUAUCUCAGUCAGACCGCCUCUUAAACGGUCAUUAGCCAAAAUAGGGAUGCGUGAAUUAAAAGCGAACAAGUUGGAGAGAUUGGUUAGAAACGCGCCGGAGCGUUGUUUAGACUUUGACUAUACGUCGAUUUGUGUUGUGGGGAAAUUUCUGAAACUCUCAACAUCUGCAUGCUUUAGGGGUAUUACUUAUCGUUACGUUAACGGGGGGUCAAAAAGUACUUUUGAAAUAGAACUGAUGUCCCGCAGUUCUGGGUCCGAACAGUGGAGAUAUUGGCCCAAUCCUACUCAUAAGCUGCCGUUCAACUUAUAUGAUCUGGAAUGCCUUACAGAGCUAAGCAUCCGGCGUCGGGUUGCUGCGAGCCGUCAGUUCCGGAACCGCAGUACCUAUCCGUGCAUAUCGGACUUCCCGUACGUACAGAGGGGAGAUAGGCAUAAAGCGGUCAGCAUCUCUUGCCGAGAGUGGAUGACUGCCAACAAGUUGAAUCUAGGACGCGUAGUGCCACGGUACACCCAGGCAGCAAUACCUUCUCGGUCGUUCACAUCGACUACUUCAGGGAGCUGCAUGCUUGUAGAGAAGAGCGGCAGAACGAUCUGCUUCCAUAAGUUAGUAUACUCUCUUGCAUGGCCGCACUACCGGAGUAGUCCUUGUCCACUCCAGAUCAUGAUUACCUUUACACCCACACUUCAUACAACCGUUGAAAAAGUCGGCGGGCAUAGAACGUUCGACUUUUUACAUUCCAUCAGUCAGCCAUGGGAUGGCUAUCUAGUUAGCGCCCACGGAAAUCGUGAAGUGACAUCACUUGUGCGGGCUCUUAGAAUUACUUUUAUAGACGUGAAGAUCGUGGUAGAUAAGGAAUCCAGUCAAAUAUACCUUAGCUGGAGGCCGGAUAAGUCCAAGGCCGAUAGCUUGUUGUCAUUGACACUCGAAUGUACUCGGAAAACAUCACGACCUAGCUGGAAACACUGUAAGCUAAUGGCUUGCACAGGGAUGUGCCCUUUAUCACCGUAUGAUACGGUACUACGCCUAUACAGCAAUGGACAGGGGCGGCUGAUUCCGGCUCCUUGUAAAGGUUCCUCGGGGAACAGGUCCGCUAAGUUUGGAUGGAAGACAUAUCAGACAGAGCGAAUAUACUACGUAAGAGAGUCCAGAGUAGUAAACGUUCUUCAGGGUCUCGCAUGGAAACACCAGAAAGAGCUUCCCACUCUCCUUGGAUCGAAACAUUCAAUAAAUAUACGUAUCCACAGUGGAGCCAGAUGGGCGGUCAUUUUACACAAGUGUUGGAGUGCGCCUCGUUUGGCGGAGUUGUUUUGUGUACAUCCAGUCUCGCUACGCGGUGCCGGCAUCGCGUACACGCAUAUGCCUCGCCACCGCGCAACCACUAGGCUGGGUCCUAUUCGACGACCCGGCUCUGUUCCCAGGACGCAAAUUCACGACAAAGGGUAUGUACGCUUACGUGGCGUAUAUAUACGUGGGUGUAGUUGUCUCAGAGCGUACAAUAUUCGACGCACUUAUUUACGCUCCGUCAACGCGAGCAUUACAUGCCAUAGUAGUUACUUUAAUAACACCCCCCCAGCGACCCGCAAGAGAACCGAAGUCAGCACGUCCUGCUGCAGCGUGAUUUCCAGGGAAAAUGUGGCCUCGCUAUUGUCGUUAGCUCUUUUUCCUCUGCCUGCUGGCAGAUCUCAACGAACAGCCAGGUUGAGACACCUGACCUCCAGGCAUCCGUCAGAAUCGUCUACACCAGCGGGAAUUUCAACUAGCGCCCGGAUAUAUUGGCCCCGUCCUCCAGGAAAGCAUGUGUGGCUGUACGUCACUGGGACACAUGGGAAAGGAACCCACGAACAAAUACCGGCCGGUCGCUCAUUUCUGACUUCACGACGACCCUUCGCCAUUGAGAGAUACAAGUGUUAUAAGUCAGAACCUACCAGUACACGCUUUUAUAAACUGCCCACCCACGAUGUGAGGAACGACCUCGCACGCGGCCGGAAUAGACGGGACUUCCAAGUAGUGUUGAUGAUGACAUUAGGAGCUUGUGUAGGAAACGCAUACGCGAGUUUCAGACGGGAGCGAGUGAAUGUACGGUGUCAGGAAUCAGGCCAGGGGAGGCUGUUAGCGAUUCAGGUUAAUGUGACUGGACCUCGCCAAGCUGCGUCCAAAGCAACGAGAAAAUACGUCAGCCGAACAAGGGACCUGUGUAUGGUACCGCACAGUAGCGUUUCAUUUCAACUUCGAUCUUCAAAGCGGGUAGGACAGAAUCAGGUGAAUAUCGCGCGAUGUAAACUAGUUAGCGCAGCAACGUCGGCUCGAGGUGCCGAAAAAAUGGAGACGCUGAAGCAGGCGACCUUCGACACUCUUAUGAGUAUCGUAAGUGAAAUAAAAGUGCGCGUUCUGGCACCUAAUAGUUUGAGGACGGCGAGCUUUCGUAUCCACGAUUUUCUAGAGCAUCAAAAGGUAGUAAGGCAUACGACGAAAAAUACCCCAGAGGUCAUGCCACGGCUAAUUCCGUGCGUUCUACCGAGCGGGACUCAGCCUUGGUUUGCCCCCAAUGUAAAACUGCUUUCCAAGAUGUUCACCCAUCGUGGGUUGCAUGGUCGGGUAUCUACUAUCCCUACGAGCGCUCCUCCAACAGUCUUUGUUCAAGAUUACGCCACGUGCUGGACGAAAGAAGUAACUACAACGCCUGAGCGGUAUCGCGGCCCUAUGGGUGGAUGCGACGGAAGCAGUACAGCACCAUACACAGACCGACUUAGCAAGCUUCGACCCAUUAGGAUUCGCUUACUAGUCUGCGCAUUGGACGCACACUCACUAAUGCCUUGGUGUUGCUUGUGGCCGCAUAGUAGUCCACACCGUAGUUCUCGCAGACCUAAUAAGUUGAACGAGUACCCGAUAGUUACGGAUUUGUCAAUCAACAAUGGUUGCCGUAGUACGCGCGCUUACUCAAUCUCUUACCUUAUCGACUCGGGCGACAACUUUAAAGGUUCGUGUAGUCGCGCUUCUACGUUUAGCGAUGCGAAUCAUGCCCGCGUAAACUGCAAUUGUUACCGUUUAGAUCAUAGCGCCGACCACCGGUUUAGCCGGCGCAGGACAACGCUCCGACUUGCGGCUCUAUUUACUCACAGCUAUGUUCAUCCAGUGACGUCAGUGACUGUCGUAGGUAUAAGUAAUACACAUAGGUCUGGCUUACAACCCCUUUCGCGGGCUCUCCAACCGUGGAGGAUGCGAGGUACGCACACUAUAGAGCUUUCCACUGAGCUCACCACAUUCUCUCGGCUUUGGAAAUGUAAAACAUAUGUCGCCCCUACUGAACAUUUUCUAUGUGUGUUAGCGACCGGAUGGACAAUGAUAUCGCUGCAGCACGCAAGAGAGCGGUGUCACACCGAGCGAAAGACAGCGGCAAUCGUUGCGUUCGGCAGAACUAGGCAGUCAAAUGCUUCCCACGGAAAUCAUACAUGUGGUCUAGGCCUCAUAGGUACUUGUAUUUAUAAAUGCAGUACCGCCGUGGUAAUUUGGAUCGGACUAGGUCCCAUACCUCGCUCUUGUCGGGCAAGCGUACUUAAUGGCUAUGUGCCGUUGUCUUUAGAGCAUACUCUGCACGUUAGUAAUUCAAAAGGACCCAUCUCAAAACGCGACAGUCGUUGGGAUUUUUGGGAGCCAUAUUUAAUACGGAAGGGACACGCAACGCGACCCACCUGCAGCAAAUCUAGUGCCUUCGGGUUCGCUAGGAGCGGCGUACCACACUCACUGCCUGCUUUAAUGGUUUGCACGUCGGAAGAGAUAAUAAGGUUGAUUGGAAAUCUUAAAAGAAGUGCCGAUUUCCAGGGCGGGUGUGAAGAGUGUUUGAACGCAUUGAUCCCCGCUCGGGAGGGGCAGACGAACGUUCGAGCUAGCAGUUUCUUAAGUGGAGAGGGGAUGCUUGCCACGCGUGCUCCUUCGAGCGAUGUAACAUGUAAAAUCUUGCAGGUCGAUAUUGUGGACCGAAAUUUGAACUUUAAAAGUCCUUCCCUAUUCGGCGGUGCUUUGAAAUACAUCGGUAAAUAUCGAAGGCAGACCCGGAUAGGUAAUGGCAUUCGGCGAUUCGUACGGACUGAUUAUCACAGGCCCGGGCACCACAUUGAGUUGGGAGAACAACUGACAUGGCGUCAAAAUGAACCCACUGGGAUAAUUGCCGUAUGCAAUUGGUUUGUAGCCGUCAGUGACACCUCCUCUUUUCAGUUGUACAUAUCGGUCCAUCUCUUAAAAGUUGGAACAAGACAGCACACGGGAUCAAUCGAUGGAAGAUCCACAGUUCCCUUAUUACAUUCUGAUCUAUCCACGACUGAGGGUAUUAGAAUUAAGGGUGGCCGUUUUGAGGACUGUCUCCGGCCUCAUGGCUGGGUGACCGCGUCGAGAGGCGGAGUCCCCCGGCUACGCUAUUGGCCUGGACGUAGAGAGGAAGGCACGCUAAGUGGGAUUUGCAAGGAAGGCACCCCAACUGCCCUUCCCCAUUCCGCGCUGUGUUUGAACGAGCCACAAGUAAUUGGCAGCCCUAGGGCCCACCUUAUACGACGAGACGGUAUAUUGUUCCGAAAGGCAGGUUUAAUAUGUGUGCUGGUCGGUAUGUCGAGAAACUCUCAUAUCCGUUGUUUAAUCCCUUACGAGAGACGGCCUAGCCAUUCGUUAAGGCGUCUAGAACAGCUUGUCUCACACGCGAAACCUACCAUGACCCAGACCGCGGUGUUAGCUAUUCUCCUAAGUACUUACACAGGUCAGCUGCUCUUGGCCCGGUUUGGCGGCUUUGCAGUAGCCGGAUUGAUUGAGGCCAGAUUUUGGGAUAGCAUUCUCUUGGUAGCACUGCCUACCACCUGCCGUAGCGCUCCGCGUAUGUCGACGCGACCGCGAAUAAGACCGCCACGUACCCUCCCAAUAGAAGCCUCAUCGGACUCUGACGCCUCCGCGAAUUUUCAGCUAGAUGUCGCGUCACCUGACUACACUUACAAUACGACAGUCCAUCUUUGCUUCUGCAUAAUUAGUCCGCAGUUCGGGAGGACUCCCUUAAAGACCAACGAAGGGAGUAAAAGUUCUGCAGACAGAACCGUCAGUAUUACUUACGAGUCCGGGGGCAGUAGUAGAUCGACGCGCCCGGUUCAGUUGGUAAGGAUUGCCCCGUUGGGGAAGGAUACUCGCUACUUGACGAGCAUGUCUAGGAUCGUCAACUUACAGGUGGACGGUGUCGCCCAGUGCAGUACCAGUGUAAAUGAAGGUGUCCAUUCUCUUGUGCGACGAGUCACAGGCGGUGUUAUUGGUGCUUCAUACGCUUAUUCCAUGGAAGAAGUUAGUCCCGACCCAACGUUCGUCGGAAAGCCAACUACAUCGUGCCAAGAGGUAAUCCUCUCCGUUGGUAGGAUGGGUAAAGCAAGCAUAUCAUGA'

translate(rnaseq)

'MDHFGRLQGPLSRELRPCKDKQTSLSIKALMPDRFSNRMWKGHLDCDIVQLLGGRMFLLTHELIKFSGRFTQTSFAAVDSAQETNEVYNTQSSPAHGGPKEVSETTLLQKRSHNTCALGSPQRTIYRRYLRYYCIKVSQCIDAIVDHVPNTAFIFLYRWTDCGTITSAIHGYRGTITYMRHWSQHCWLPQNQRTLGWSATLGSLLGTRKPNHESVSHWAFYVHQNSWNCLYFTTHFAASAHAIARSPRKQAKSDLRSMRRILYSALLSLNGGRKEWQRLISKCHRHSYNCLYLTFRLLVSPPVDPTTGCWSEQIRTTRSTQRVIRSLHTRVLSLALRVSLDFSVRGKGSTRFPPPTSSMRTRPRGKNKPLGQSPPKAVQLVREHQRLERQNTRNGCCVWGKRAETHPLGVRKFAFLCYGQHRATKGYRNVCPSSFKHVVPRHVSVRQFKVDGGRLCFKPRVNSWARALCTTRRDIYKQATNPRLGYYGSSVNLDVVANLSSACLGKLTPLGTTTYGLQKGWTGQSVADLTSVQFSFIRSAFKSNSIVRPPHSIEIALNHTRPSAAELRQCVLCSPLTFQQTRTCPLPGCFFSPARHDGMSKGTWRYRGYCKTCISGTPDGRPYYANLCDEQPGPVLRLTRVGLCLLRQAISVLPLSKLILFSCSPGSSLGAHEPVLFRTASNFVNSALFRIEDDAIPYRRGMLLKSSRVCIYLVIPPQEPESKAVSYSISKDCVARATQVISKSGLVPFLPSVYTTPRPELDDVLWRVKFVSVPPRHRRPYSSTIPSEAREAYCTTGSIEWRARYFRRNICLAMWEGQNHRGLREGYEYGSHLTLLYTTGSRSCSIAGLSMPLVRVHAEYAHAESTPASGYTVSTSEYGMLRWYGKPLVILYSMEIEWGGRIRSKSQTLTNRTLDVDISVRPPLKRSLAKIGMRELKANKLERLVRNAPERCLDFDYTSICVVGKFLKLSTSACFRGITYRYVNGGSKSTFEIELMSRS

### Finding a Motif in DNA

In [20]:
# Finding a Motif in DNA
# http://rosalind.info/problems/subs/

def find_all(a_str, sub):
    start = 0
    while True:
        print "start before find:", start
        start = a_str.find(sub, start)
        if start == -1: return
        print "start after find:", start
        yield start
        print "start after yield:", start
        start += 1

In [22]:
import numpy as np

dna = 'GATATATGCATATACTT'
motif = 'ATAT'
# dna = 'CCGACCTCCGACCTCCCGGCACCGACCTCCGACCTCATCAACCGACCTCCGACCTCCCGACCTAAACGCCGACCTAGTCCGACCTAGCCCGACCTCCGACCTCCGACCTCCACCGACCTCGGCCGACCTCCGACCTTCCCGACCTCCCGACCTCTCCCGACCTACCGACCTGACGCCGACCTTCCGACCTCCGACCTTCCGACCTCTCCGACCTGGCCGACCTGTACCGACCTACTGAATGCTCTTAGCCGACCTGTGCTTTTACCGACCTGCCGACCTGTCCGACCTGTTTCACAAGGATCCGACCTCCGACCTCCCGACCTCCGACCTGATTCCGACCTAAACCGACCTATCGGACCCGACCTCCGACCTCCGACCTCATCGCCGACCTGTCCGACCTGCCCGACCTCCGACCTCCCGACCTCCGACCTTTACTCCCGACCTCATACCGACCTACCGACCTGTACCGACCTAACCGACCTCCGACCTCACCGACCTATACCCCGACCTTTCCGACCTCCGACCTACCCGACCTCCCCCGACCTCCGACCTTCGGGCGCCCGACCTACTCCCGACCTGTGGACCGACCTCCCGACCTATTGCCGACCTCCGACCTCCCCGACCTTGATACCCGACCTCTATCCGACCTTCCGACCTTCCGACCTATCGCCGACCTGCGCCGACCTAGCACCCGACCTCCGACCTGGAGCCGACCTTTATCTGGTCCGACCTCCGACCTCCGACCTGAGTAAACCGACCTGCCGACCTCCGACCTTCGCCGACCTACCCGACCTGATCCGACCTGCCGACCTCCCGACCTGGCCGACCT'
# motif = 'CCGACCTCC'

y = list(find_all(dna, motif))
y = np.array(y)
# y = y + 1
y

start before find: 0
start after find: 1
start after yield: 1
start before find: 2
start after find: 3
start after yield: 3
start before find: 4
start after find: 9
start after yield: 9
start before find: 10


array([1, 3, 9])

### Counting Point Mutatnions

In [15]:
# Counting Point Mutatnions
def hamming_distance(s1, s2):
    #Return the Hamming distance between equal-length sequences
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))

In [16]:
s1 = 'CTTTTTCCATCGCTCGATGTTAAACCTCTACGAGGGCCCCTGTGGTCCGGTGATGGCTCTAGGCTCTACGCTAACAGATCCGAAACGCCAGAGGTGCCCCATAGCTGCTAACTCACAATCATTACTCTCACATCAGCTTACATACCTAGCCGCAATCCCTTACATTAAATTGCCCCAACGACAAGGTTCAATCATACTAAAATCCTACTACGTGTGAGACTGATCGTACATAACTCAAGCTCAAGCTATATGTGTCCCGGCAGGCTTGCGACTCGATCATACATCATAGTTACGGTAATTAAAAACCAGTTGATCGGACGCAAGTGATCGGCCGCTGTATGCTTGACACCCTGTAATACTACCGCATTCTAGTCACATCCCTAAATCTAATGAAGCAGTTGTAAAAGACGCGAACATAAGAGCATCCAGCCGAGCGGGACACATGGCGTTGACCTCTATTCAGAGCCGCTACCATAAATACAGGGTATGTCCTTCAATCCTTGAGCGCGAGCCCTAATCCGGAAGGATAGTTTCGGCAATCAATTCCATCACTGCGTGTAACCCGGGAGTAGCAGGCTATTGGTCATTATGTGACATCCTATCATGGGGTTATAAGGCAATCACCTGGAATAAGACCTAGTGTGGAAGGGCCTACTGGAGAGCTAGCCACTGGATGGCTCTTGCCCAGGCCAACTGACGAAATACCCGATTGGTGCCAGTACCGTTCTAGTCGCACGTATTAGCGGGGGGGCCACGCCGCGCCGCGATTGACATAGTGCCCGGGTCTCGTATGTTTGAAACCGTACCTGTGATGCGCCTACACGAGTACTACTTTATATTACCGACAACCATACACGCGAGCTTTAAATAGCGATGTTCTGGAGTGCTCTCACATATGATGTCGATGCTCAACAAGAAGTATTGCCTGGAAGTAGTGAAGTAATTGCTAAATTGAGTGATTTTATTCTTGGTAACAGACCCGAAGTTCCTTGCG'
s2 = 'GTATTAAGCTCACGACATGATGACACCTTCCCTGTTCCCCTACGGTCTGCTAAATGGTATAAACTCTACGCTATCACATCCACATCGTAAGGGATGACGAAGAACATATAAAGGCGGCTCCGTAAGCTAACCTTAACATACCTACACAATTCCAATACCGAAAGTTGAACTGCCGGAAAGCCGAGGGCCGATTCGACCAAAATCCTTGGGCGTGTTAGCATGATGGCTTTTTATTCAAAATCTAAGTACATTTCCCCAGACAACGAAGCGAGTCGGAATTACATAAGACTAACGGAACTTCCAAACCACTCGATTGAATGCAAGTGCTAGGCCGCGCTAGCAAGCCATCGCTCCGCTGCTCGATCAGAGTGCAGAGGCCCATTAAATGGTTGAGGCTCACAAAAGATCCGGGAGCTTCAAACGATCAAGACTCCGTGGACACGTGAAGTTCACCCCTATTCATCTCCGTGGTTATGGCTACCTGGTTGGCTTTGCAATTTCCGACCGCCTGCCCAGATACGGAAACTTGGGTTACCCAACCAATTCCCCGCCTGCGTCTCATGCGGCTGCAGCTATCTGGCCGCTTTTGTATAAAATTATATCACCGGGTTATTAACCACTGTCTTTGACTCCGACCGCAGCGAAAACGGCCAGGTTGGTAGGTGGCGACTAACCCGTTTGGCTCGAGTCACGCTCGCGACATAAGCAGTTGAATTATGATTTGCACACGTCGAACGTGTCAGCGGGTTGGACGCTCAGTTGCACGCCGTCCATATGTCTCCATTCTCATCTCCTTACAGCGGTACCGTGGAGGCATATAGGGGAGTACTGCCTGACATATCCATTACTCAAGAACGGGAGGTGTGTATAGAAGGTCTCTCGAGAGGCCTAAGCCTGGATATGACTGGTTAAATACGCGGACTGATCGGAGTAATTCAAGTAATTGTCATAGCAATCGTGAAATTCCTTCTTATCGAGTCCGAAGTTCTTGGAA'

hamming_distance(s1, s2)

466

### Calculating Expected Offspring

In [38]:
# http://rosalind.info/problems/iev/
# array = (1, 0, 0, 1, 0, 1)

geno_dict = {"AA-AA": 17804, 
             "AA-Aa": 19798, 
             "AA-Aa": 19570, 
             "Aa-Aa": 19533, 
             "Aa-aa": 19633, 
             "aa-aa": 19957}

def cal_exp_offspring(geno_dict):
    for i in geno_dict:
        geno_dict[i] = float(geno_dict[i])
        
    multiplier = {"AA-AA": 1.0, 
                  "AA-Aa": 1.0, 
                  "AA-Aa": 1.0, 
                  "Aa-Aa": 0.75, 
                  "Aa-aa": 0.50, 
                  "aa-aa": 0.0}
    exp = 0
    for x in geno_dict.keys():
        exp = exp + 2 * geno_dict[x] * multiplier[x]
    return exp

cal_exp_offspring(geno_dict)

123680.5

### Mortal Fibonacci Rabbits

In [96]:
# http://rosalind.info/problems/fibd/
def fib_rabbits(n, k):
    # n = number of Months
    # k = number of offspring
    
    # base case: in the first month there is only one pair of rabbits
    if n == 1:
        return 1
    # base case: in the second month there is k amount of rabbits + 1
    elif n == 2:
        return k
    
    # Fibonacci's sequence F(n) = F(n-1) + F(n-2)
    one_gen = fib_rabbits(n - 1, k) # F(n-1)
    two_gen = fib_rabbits(n - 2, k) # f(n-2)
    
    # Size of populaiton up to the 4th gen can be predicted by adding gen1 and gen2
    if n <= 4:
        return one_gen + two_gen

    # Multiplying the number of offspring born during each mating with the 
    # number of rabbits two generations ago, gives us the number of offspring 
    # in the current generation. We simply add this to the number of rabbits 
    # one generation ago to get our answer.
    return (one_gen + (two_gen * k))

def dying_fib_rabbits(n, m):
    # n = number of Months
    # m = number of months to live
    print n, m
    # base cases: 
    if n <= 0:
        print 'return 0'
        return 0
    if n == 1:
        print 'return 1'
        return 1
    # Fibonacci's sequence F(n) = F(n-1) + F(n-2)
    # one_gen = dying_fib_rabbits(n-1, m) # F(n-1)
    # two_gen = dying_fib_rabbits(n-2, m) # f(n-2)
    x = m+1
    if m <= n:
        print 'm <= n:'
        return dying_fib_rabbits(n-1, m) + dying_fib_rabbits(n-2, m)
    elif n == x:
        print 'n == (m + 1)'
        return dying_fib_rabbits(n-1, m) + dying_fib_rabbits(n-2, m) - 1
    else:
        print 'else'
        return dying_fib_rabbits(n-1, m) + dying_fib_rabbits(n-2, m) + dying_fib_rabbits(n-(m+1), m)
    

In [97]:
dying_fib_rabbits(6, 3)

6 3
m <= n:
5 3
m <= n:
4 3
m <= n:
3 3
m <= n:
2 3
else
1 3
return 1
0 3
return 0
-2 3
return 0
1 3
return 1
2 3
else
1 3
return 1
0 3
return 0
-2 3
return 0
3 3
m <= n:
2 3
else
1 3
return 1
0 3
return 0
-2 3
return 0
1 3
return 1
4 3
m <= n:
3 3
m <= n:
2 3
else
1 3
return 1
0 3
return 0
-2 3
return 0
1 3
return 1
2 3
else
1 3
return 1
0 3
return 0
-2 3
return 0


8

### Inferring mRNA from Protein

In [10]:
# http://rosalind.info/problems/mrna/

# 
def codon_frequencies():
    frequencies = {}
    for k, v in RNA_codon_table.iteritems():
        if not frequencies.has_key(v):
            frequencies[v] = 0
        frequencies[v] += 1
    return frequencies

def possible_rna_strings(input):
    f = codon_frequencies()
    n = f['Stop']

    for c in input:
        n *= f[c]

    return n

small_dataset = 'MA'
large_dataset = 'MQMADTLSMFIQPWGGVKKCTQLSYLLVATDQHHKTAAANERCRPENIGINGTTLAQDSTDMDPPWTREYIIDNANPKPFCCRCCAYCHSLISQEVSNDQDNANNWRIAMGVMMKEERNYATGSVGLKELTIFTHSDKTDPGVEQSYIICFLIPNRASLYLGDLNDMMPIYEMGTNSWDNHWCRQCIYHKPDFFGAGHEVDKYMVHEIEMDDMVPARDCDPEHPKMCDPYENRHCNIYWGAMHAFYNNVGTWHNPCIMFCTAPFFCIAVLIAENHWHTNVLGDHMWLRTFKGVREHNCEIKEGCTTYAYSTQKWYGKKCCNQEQSLISENMTRGRHGCQDRARYRATANCEPNINPPDEHPQIWLTCTRGVSSAMHFFWWSMMSLDESTPNYACPHCYGVTDLFAWREQKPYRFVKVKQELDSDIPVRESDESTMHSKPDKEFVSKQLQDGLFPVTMHAANIVGIHYVNVPQNLFQYMGPKCHNRAAQHWHLVCCRLRRVQHWNANRCEHGMWRNWYTKHVSEYYMHSYTHKGHFTHYNSITWQYRDFMYKMVLVKQYLGCVQARQQFWPELHWDEYEANHAWYTRHLMNYDCYIDETQVEWYTFYGMYDTQNRTVKTNWRAHVDETMKTMHLGVNMSWIIWEDTFTLVRYVHFFVFVYRIHVIQFTCNRHYEHTTRSHHIIHIPCSRTSKHFDLGGYYPPMKCDREMQQNLKYYDAHQWYTIELNFEWHYVMEYVPQEAQQCDPTDLIVPLLYCGVRHQLLVGAPQENDVGRNYMMWYISCEHEKLQDYKMIYIDVTEEPEFTSINKNHFCAHFTNQGCAPRMCIVRLYSWDIGEEHHDVMTVDASFVENMRYPTHAHMGISWWMIMRRRECAINPRQTKHFNSNIACSSQAVCTTTNKPWWWTLAIMQITWHTHCDTNKHPKDDWDKTFHEMKRWTRADPVPVNVNFKSHDAQVDLDNCGRMWLDWKLRVMYMVTPERDGAIANNQSWNVQFWGQL'
print possible_rna_strings(small_dataset)
print possible_rna_strings(large_dataset) % 1000000


12
849472


### Independent Alleles

In [None]:
# http://rosalind.info/problems/lia/


### Consesnsus and Profile

In [17]:
# Consesnsus and Profile
import pandas as pd

def string_to_list(string):
    lis = []
    for i in range(len(string)):
        lis.append(string[i])
    return lis

def dict_values_to_list(d):
    for i in d.keys():
        string = d[i]
        l = string_to_list(string)
        d[i] = l
    return d

In [18]:
def value_counts(series):
    return series.value_counts()
    
def series_to_int(series):
    return series.astype(int)
    
def profile_maker(df):
    profile_dict = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    profile_df = pd.DataFrame.from_dict(profile_dict,orient='index').sort_index()
    profile_df = df.apply(value_counts, axis=0).fillna(0)
    for i in range(len(profile_df.columns.values)):
        profile_df[i] = profile_df[i].astype(int)
    return profile_df

def max_value_counts(series):
    return series.idxmax()

def common_ancestor(df):
    profile_df = profile_maker(df)
    base_max_values = profile_df.apply(max_value_counts, axis=0)
    seq_list = base_max_values.tolist()
    seq = ''.join(seq_list)
    print seq
    print profile_df

In [19]:
fasta_dict = make_indexed_sequences_dictionary('rosalind_cons.txt')
fasta_dict = dict_values_to_list(fasta_dict)
fasta_df = pd.DataFrame.from_dict(fasta_dict,orient='index').sort_index()
common_ancestor(fasta_df)

AAGATTACGAGGCTGCTCCTTAAAACGTGTCCCGCCAGGACGGACTTCCCACATAAGCAAACACAGGGCTATTAAACCGCCGAGGTAAAACAATATAATGCGCAATTTCAAAATACACACAGAAAGCTACAGAAAAATCATAGATGGACTAAGATAATATAAGCTACGTCCGCAAGCAGCCTGGCGTTAAGGCGCGAAAGAAAAGGCGTGGGCCCCTCATGTTAGCACGACGCCTACAGTGTCGCCCTACGCAATATTAAACCAAGTATTAAATGGATGCGAACAAAACAACGTCAAACGAAAACACAGTTCGCGATGACCTGCGGGCAAACAACAAGGCGCCCAAAACAGAACCAGGAATCAAGGAGAGGCTCAAGAGTGTACAACTCAGCACGAAACAATCCCCGGAAGCAGTACTTGACGCCGGAATAAAATACCACGACCTATAAATTCCACTACCCAAAGGTGACCTCGCAACTCGGAATTGGGAAAGGGAGGGGCACTCAATAACTGCAAGAAACAGTCAGACAATACCCACACGGAGGTCATACACATACTTACAAACAACGACGAAGAGTCACTGGTACAGAACACGGCAACAGTCAGCAACTGACAAATTGACATGCCTCCGCGAACTTACCGTTGACTCATTACGAGATACTCTAAACCACAATGGTTTTGCCAGGGTGAACTGACAAGAGCATTTGGAGATGTGAGCAAGTCCACAAGGGCAGACATCAATGGGTACAAAAAAGAAACCCAACAGGCTGGCGGCAAGTGCTCGACCCATAGAAACCCCTACAGTAAGACTGCAGTATGGCGCCATTTCGACACCATTTGGAGAGTGGGTTACCCGGTCAGCACCAGAAACCTACAGCCACTCCGCCAGTTGGAGCTTTCCTACTCATTAGCATAGGACATTTAAACGACAAAACTCGCCGGCATGGGATGTAGGGCAGCGTAA
   0    1    2    3    4    5    6 

### Introduction to Random Strings

In [69]:
import math

def AT_GC_counts(seq):
    base_counts = count_bases(seq)
    AT = base_counts['A'] + base_counts['T']
    GC = base_counts['G'] + base_counts['C']
    return AT, GC

def log_10_probabilites(value, AT, GC):
    return round(math.log10((((1 - value) / 2)**AT) * (value / 2)**GC), 3)

def probabilities(gc_content, seq):
    AT, GC = AT_GC_counts(seq)
    probabilities = []
    for i in gc_content:
        prob = log_10_probabilites(i, AT, GC)
        probabilities.append(prob)
    return probabilities

def random_strings(file_name):
    seq = ''
    bases = ['A', 'T', 'G', 'C']
    with open(file_name, 'r') as f:
        for line in f:
            if any(base not in line for base in bases):
                numbers = line.split()
                gc_content = [float(x) for x in numbers]
            seq = seq + line
    
    return probabilities(gc_content, seq)

random_strings('rosalind_prob.txt')   

[-80.03,
 -66.018,
 -61.306,
 -56.78,
 -55.96,
 -52.405,
 -51.632,
 -50.033,
 -48.703,
 -48.236,
 -48.162,
 -48.276,
 -49.734,
 -51.622,
 -54.09,
 -57.34,
 -64.008]

### Speeding Up Motif Finding

SyntaxError: invalid syntax (<ipython-input-73-84a4e24fdb36>, line 11)