### BA3A | String Composition Problem

In [1]:
def Composition(text, k):
    """Return the k-mer composition of ``text`` in lexicographic order.

    Every length-``k`` window of ``text`` contributes one entry, so a k-mer
    that occurs several times in ``text`` is listed several times.

    Args:
        text: the string to decompose.
        k: window length.

    Returns:
        A sorted list with one k-mer per window (``len(text) - k + 1``
        entries); empty when ``k`` is larger than ``len(text)``.
    """
    # A list (not a set) is used on purpose: repeated k-mers must be kept.
    return sorted(text[i:i + k] for i in range(len(text) - k + 1))

In [2]:
Composition('CAATCCAAC', k=5)

['AATCC', 'ATCCA', 'CAATC', 'CCAAC', 'TCCAA']

### BA3B | Reconstruct a String from its Genome Path

In [3]:
def StringFromGenomePath(patterns):
    """Spell the string described by a genome path.

    The result is the first pattern followed by the last character of every
    later pattern, i.e. consecutive patterns are taken to overlap in all but
    one character.

    Args:
        patterns: sequence of strings in path order.

    Returns:
        The spelled string, or ``''`` when ``patterns`` is empty.
    """
    if not patterns:
        return ''
    return patterns[0] + ''.join(pattern[-1] for pattern in patterns[1:])

In [4]:
# Read one k-mer per line; blank lines (e.g. a stray empty line at the end of
# the file) are skipped so that no empty string ends up in `patterns`.
with open('../data/rosalind_ba3b.txt') as f:
    patterns = [line.strip() for line in f if line.strip()]

In [5]:
StringFromGenomePath(patterns)

'ACCGAAGCT'

### BA3C | Overlap Graph Problem

In [7]:
# Read one k-mer per line; blank lines are skipped so that no empty string
# ends up in `patterns`.
with open('../data/rosalind_ba3c.txt', 'r') as f:
    patterns = [line.strip() for line in f if line.strip()]

In [8]:
def OverlapGraph(patterns):
    """Print the overlap graph of ``patterns`` as ``a -> b`` lines.

    An edge ``a -> b`` is printed whenever ``a`` without its first character
    equals the first ``k - 1`` characters of ``b``, where ``k`` is the length
    of the first pattern. Edges are printed in the order of ``patterns``
    (outer loop over ``a``, inner order of ``b`` as they appear in
    ``patterns``). Nothing is printed for an empty input.

    Args:
        patterns: sequence of k-mers.

    Returns:
        None; the edges are written to standard output.
    """
    if not patterns:
        return
    k = len(patterns[0])

    # Index the patterns by prefix once so each suffix lookup is a dict hit
    # instead of a scan over every pattern.
    by_prefix = {}
    for pattern in patterns:
        by_prefix.setdefault(pattern[:k-1], []).append(pattern)

    for pattern_a in patterns:
        for pattern_b in by_prefix.get(pattern_a[1:], []):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(pattern_a + ' -> ' + pattern_b)

In [9]:
OverlapGraph(sorted(patterns))

AGGCA -> GGCAT
CATGC -> ATGCG
GCATG -> CATGC
GGCAT -> GCATG


### BA3D | De Bruijn Graph from a String Problem

In [10]:
def DeBruijnGraphOfKmers(patterns):
    """Build and print the De Bruijn graph of a collection of k-mers.

    Each k-mer adds one edge from its first ``k - 1`` characters to its last
    ``k - 1`` characters (``k`` is the length of the first pattern). Repeated
    k-mers produce repeated edges.

    The graph is printed as an adjacency list, one ``node -> a,b,...`` line
    per source node, with nodes and successors in sorted order.

    Args:
        patterns: sequence of k-mers.

    Returns:
        dict mapping each source node to the list of its successors, in the
        order the k-mers were given (not sorted). Empty dict for empty input.
    """
    graph_dict = {}
    if not patterns:
        return graph_dict

    k = len(patterns[0])
    for pattern in patterns:
        graph_dict.setdefault(pattern[:k-1], []).append(pattern[1:])

    for node in sorted(graph_dict):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(node + ' -> ' + ','.join(sorted(graph_dict[node])))
    return graph_dict

In [11]:
def DeBruijnGraphOfString(text, k):
    """Print the De Bruijn graph built from every length-``k`` window of ``text``.

    The windows are collected left to right and handed to
    ``DeBruijnGraphOfKmers``, which does the printing. Returns None.
    """
    kmers = []
    window_count = len(text) - k + 1
    for start in range(window_count):
        kmers.append(text[start:start + k])
    DeBruijnGraphOfKmers(kmers)

In [12]:
k = 4
text = 'AAGATTCTCTAC'
DeBruijnGraphOfString(text, k)

AAG -> AGA
AGA -> GAT
ATT -> TTC
CTA -> TAC
CTC -> TCT
GAT -> ATT
TCT -> CTA,CTC
TTC -> TCT


### BA3E | De Bruijn Graph from k-mers Problem

In [13]:
def DeBruijnGraphOfKmers(patterns):
    """Build, print and return the De Bruijn graph of a collection of k-mers.

    Each k-mer adds one edge from its first ``k - 1`` characters to its last
    ``k - 1`` characters (``k`` is the length of the first pattern). Repeated
    k-mers produce repeated edges.

    The graph is printed as an adjacency list, one ``node -> a,b,...`` line
    per source node, with nodes and successors in sorted order.

    Args:
        patterns: sequence of k-mers.

    Returns:
        dict mapping each source node to the list of its successors, in the
        order the k-mers were given (not sorted). Empty dict for empty input.
    """
    graph_dict = {}
    if not patterns:
        return graph_dict

    k = len(patterns[0])
    for pattern in patterns:
        graph_dict.setdefault(pattern[:k-1], []).append(pattern[1:])

    for node in sorted(graph_dict):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(node + ' -> ' + ','.join(sorted(graph_dict[node])))
    return graph_dict

In [14]:
# Read one k-mer per line; blank lines are skipped so that no empty string
# ends up in `patterns`.
with open('../data/rosalind_ba3e.txt', 'r') as f:
    patterns = [line.strip() for line in f if line.strip()]

In [15]:
DeBruijnGraphOfKmers(patterns)

AGG -> GGG
CAG -> AGG,AGG
GAG -> AGG
GGA -> GAG
GGG -> GGA,GGG


{'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GAG': ['AGG'],
 'GGA': ['GAG'],
 'GGG': ['GGG', 'GGA']}

### BA3F | Find an Eulerian Cycle in a Graph

In [28]:
from random import choice

In [29]:
def EulerianCycle(graph):
    """Walk an Eulerian cycle of ``graph`` from a randomly chosen start node.

    Args:
        graph: dict mapping each node to the list of its successors. It is
            not modified; the walk runs on a copy.

    Returns:
        The nodes of the walk in traversal order with the leading copy of the
        start node dropped, so the list ends at the start node. Callers that
        need the closed form ``start ... start`` must prepend the last
        element. Returns ``[]`` for an empty graph.
    """
    if not graph:
        return []
    # WalkEulerian deletes edges as it uses them, so give it a private copy;
    # otherwise the caller's graph is emptied and the call cannot be repeated.
    remaining = dict((node, list(successors)) for node, successors in graph.items())
    # list(...) is required on Python 3, where dict views cannot be indexed.
    start = choice(list(remaining))
    walk = WalkEulerian(remaining, [], start)
    # WalkEulerian appends nodes as it backtracks, so reverse to get traversal order.
    return walk[::-1][1:]

In [30]:
def WalkEulerian(graph, path, node):
    """Consume edges reachable from ``node`` and record nodes as they finish.

    From the current node a random unused outgoing edge is followed until a
    node with no unused edges is reached; that node is appended to ``path``
    and the walk backs up one step. The traversal is therefore recorded in
    reverse (reverse ``path`` to read it in walking order).

    An explicit stack is used instead of recursion so that long walks are not
    limited by the interpreter's recursion limit.

    Args:
        graph: dict node -> list of successors. Used edges are REMOVED from
            these lists in place. A node without an entry is treated as
            having no outgoing edges.
        path: list that finished nodes are appended to.
        node: start node.

    Returns:
        ``path`` (the same list object that was passed in).
    """
    stack = [node]
    while stack:
        current = stack[-1]
        successors = graph.get(current)
        if successors:
            new_start = choice(successors)
            successors.remove(new_start)
            stack.append(new_start)
        else:
            # No unused edge left here: the node is finished.
            path.append(stack.pop())
    return path

In [31]:
# Parse an adjacency list with lines of the form "node -> succ1,succ2,...".
# Blank lines are skipped instead of raising IndexError.
with open('../data/rosalind_ba3f.txt', 'r') as f:
    graph = {}
    for line in f:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' -> ')
        graph[fields[0]] = fields[1].split(',')

In [32]:
('->').join(EulerianCycle(graph))

'5->4->2->1->0->3->2->6->8->7->9->6'

### BA3G | Find an Eulerian Path in a Graph

In [33]:
from random import choice
import sys

In [34]:
def EulerianPath(graph):
    """Return an Eulerian path of ``graph`` as a list of nodes.

    Approach: add one artificial edge from the path's end node (one more
    incoming than outgoing edge) to its start node (one more outgoing than
    incoming edge), walk an Eulerian cycle of the balanced graph from a random
    node, then cut the cycle open at the artificial edge.

    The cycle walk is done inline with an explicit stack, so this function
    does not depend on a separately defined walker and is not limited by the
    interpreter's recursion limit.

    Args:
        graph: dict mapping each node to the list of its successors. Nodes may
            be any hashable value. The dict is not modified.

    Returns:
        The nodes of the path in traversal order. If every node is already
        balanced, a closed walk (first node == last node) is returned instead.
        Returns ``[]`` for an empty graph.

    Note:
        The input is assumed to actually have an Eulerian path; if several
        nodes are unbalanced, one of them is picked arbitrarily and the result
        is not meaningful.
    """
    if not graph:
        return []

    # Private copy: the walk below deletes edges as it uses them.
    remaining = dict((node, list(successors)) for node, successors in graph.items())

    # surplus[node] = out-degree minus in-degree.
    surplus = {}
    for node, successors in graph.items():
        surplus[node] = surplus.get(node, 0) + len(successors)
        for successor in successors:
            surplus[successor] = surplus.get(successor, 0) - 1
            # Nodes that only ever appear as successors still need an entry.
            remaining.setdefault(successor, [])

    # The surpluses sum to zero, so either both lists are empty or neither is.
    path_ends = [node for node, value in surplus.items() if value < 0]
    path_starts = [node for node, value in surplus.items() if value > 0]
    balanced = not path_ends
    if not balanced:
        node_from = path_ends[-1]    # artificial edge leaves the path's end...
        node_to = path_starts[-1]    # ...and enters the path's start
        remaining[node_from].append(node_to)

    # list(...) is required on Python 3, where dict views cannot be indexed.
    start = choice(list(remaining))
    path = []            # nodes in the order they are finished (reverse walk order)
    stack = [start]
    while stack:
        current = stack[-1]
        if remaining[current]:
            new_start = choice(remaining[current])
            remaining[current].remove(new_start)
            stack.append(new_start)
        else:
            path.append(stack.pop())

    if balanced:
        return path[::-1]

    # `path` is reversed, so the artificial edge node_from -> node_to shows up
    # as the adjacent pair [node_to, node_from]. Every adjacent pair must be
    # inspected, including the last one (the artificial edge is the first edge
    # of the cycle when the walk happened to start at node_from).
    idx = 0
    for i in range(len(path) - 1):
        if path[i:i+2] == [node_to, node_from]:
            idx = i
            break
    # Walking order from node_to to the end of the cycle, then from the
    # cycle's beginning (skipping the duplicated start node) up to node_from.
    return path[:idx+1][::-1] + path[idx+1:-1][::-1]

In [23]:
def WalkEulerian(graph, path, node):
    """Consume edges reachable from ``node`` and record nodes as they finish.

    From the current node a random unused outgoing edge is followed until a
    node with no unused edges is reached; that node is appended to ``path``
    and the walk backs up one step. The traversal is therefore recorded in
    reverse (reverse ``path`` to read it in walking order).

    An explicit stack is used instead of recursion so that long walks are not
    limited by the interpreter's recursion limit.

    Args:
        graph: dict node -> list of successors. Used edges are REMOVED from
            these lists in place. A node without an entry is treated as
            having no outgoing edges and is still recorded in ``path``.
        path: list that finished nodes are appended to.
        node: start node.

    Returns:
        ``path`` (the same list object that was passed in).
    """
    stack = [node]
    while stack:
        current = stack[-1]
        successors = graph.get(current)
        if successors:
            new_start = choice(successors)
            successors.remove(new_start)
            stack.append(new_start)
        else:
            # No unused edge left here (or no entry at all): node is finished.
            path.append(stack.pop())
    return path

In [39]:
# Parse an adjacency list with lines of the form "node -> succ1,succ2,...".
# Blank lines are skipped instead of raising IndexError.
with open('../data/rosalind_ba3g.txt', 'r') as f:
    graph = {}
    for line in f:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' -> ')
        graph[fields[0]] = fields[1].split(',')

In [40]:
path = EulerianPath(graph)

In [41]:
('->').join(path)

'6->7->8->9->6->3->0->2->1->3->4'

### BA3H | Reconstruct a String from its k-mer Composition

In [42]:
# First line holds k; every following non-blank line is one k-mer.
with open('../data/rosalind_ba3h.txt', 'r') as f:
    k = int(f.readline().strip())
    patterns = [line.strip() for line in f if line.strip()]

In [43]:
G = DeBruijnGraphOfKmers(patterns)

ACC -> CCA
CTT -> TTA
GCT -> CTT
GGC -> GCT
TAC -> ACC
TTA -> TAC


In [44]:
res = EulerianPath(G)

In [45]:
StringFromGenomePath(res)

'GGCTTACCA'

### BA3I | Find a k-Universal Circular String

In [46]:
def KUniversalCircularString(k):
    """Return a binary k-universal circular string.

    All ``2**k`` binary k-mers are turned into a De Bruijn graph, an Eulerian
    cycle of that graph is spelled out as a string, and the string is cut to
    ``2**k`` characters (the remaining characters only repeat the beginning
    once the string is read circularly).

    Side effect: ``DeBruijnGraphOfKmers`` prints the graph's adjacency list.

    Args:
        k: k-mer length. NOTE(review): intended for k >= 2; k = 1 yields
           empty-string graph nodes, which the path-spelling step is not
           written for.

    Returns:
        A string of ``2**k`` characters over ``'0'``/``'1'``.
    """
    width_spec = '0{0}b'.format(k)           # zero-padded binary, k digits wide
    patterns = [format(i, width_spec) for i in range(2 ** k)]
    G = DeBruijnGraphOfKmers(patterns)
    cycle = EulerianCycle(G)
    result = StringFromGenomePath(cycle)
    # Slice by target length rather than result[:-(k-2)]: for k == 2 the
    # latter is result[:0], i.e. the empty string.
    return result[:2 ** k]

In [47]:
KUniversalCircularString(3)

00 -> 00,01
01 -> 10,11
10 -> 00,01
11 -> 10,11


'11100010'

### BA3J | Reconstruct a String from its Paired Composition

In [59]:
import sys
from random import choice
sys.setrecursionlimit(10000)

In [60]:
# First line holds "k d"; every following non-blank line is one "first|second"
# read pair.
with open('../data/rosalind_ba3j.txt','r') as f:
    k, d = [int(num) for num in f.readline().split()]
    gapped_patterns = [line.strip() for line in f if line.strip()]

In [61]:
def DeBruijnGraphOfGappedKmers(gapped_patterns):
    """Build the De Bruijn graph of ``first|second`` read pairs.

    Nodes are 2-tuples of strings. Each read pair contributes one edge from
    (prefix of first, prefix of second) to (suffix of first, suffix of
    second), where prefixes are the leading ``k - 1`` characters, suffixes
    drop the first character, and ``k`` is the length of the first half of
    the first pattern.

    Returns:
        dict mapping each source node to the list of its successor nodes,
        in input order.
    """
    adjacency = {}
    k = len(gapped_patterns[0].split('|')[0])
    for paired_read in gapped_patterns:
        halves = paired_read.split('|')
        first_half = halves[0]
        second_half = halves[1]
        source = (first_half[:k-1], second_half[:k-1])
        target = (first_half[1:], second_half[1:])
        adjacency.setdefault(source, []).append(target)
    return adjacency

In [62]:
def StringSpelledByGappedPatterns(gapped_patterns, k, d):
    """Spell the string described by an ordered list of ``first|second`` pairs.

    The first halves and the second halves are each spelled with
    ``StringFromGenomePath``; the second string is taken to start ``k + d``
    positions after the first. The result is the first string followed by the
    last ``k + d`` characters of the second.

    If the two strings disagree anywhere in their overlap, a warning is
    printed once; the concatenation is still returned.

    Args:
        gapped_patterns: list of ``'first|second'`` strings in path order.
        k: length of each half.
        d: gap between the two halves.

    Returns:
        The spelled string.
    """
    first_patterns = [pattern.split('|')[0] for pattern in gapped_patterns]
    second_patterns = [pattern.split('|')[1] for pattern in gapped_patterns]
    prefix_str = StringFromGenomePath(first_patterns)
    suffix_str = StringFromGenomePath(second_patterns)
    offset = k + d
    # prefix_str[i] and suffix_str[i - offset] describe the same position.
    if any(prefix_str[i] != suffix_str[i - offset]
           for i in range(offset, len(prefix_str))):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('There is no string spelled by the gapped pattern.')
    return prefix_str + suffix_str[-offset:]

In [63]:
G = DeBruijnGraphOfGappedKmers(gapped_patterns)

In [64]:
gp = [edge[0]+'|'+edge[1] for edge in EulerianPath(G)]

In [65]:
StringSpelledByGappedPatterns(gp, k, d)

'GTGGTCGTGAGATGTTGA'

### BA3K | Generate Contigs from a Collection of Reads

In [74]:
# Read one k-mer per line; blank lines are skipped so that no empty string
# ends up in `patterns`.
with open('../data/rosalind_ba3k.txt', 'r') as f:
    patterns = [line.strip() for line in f if line.strip()]

In [75]:
G = DeBruijnGraphOfKmers(patterns)

AG -> GA
AT -> TG,TG
CA -> AT
GA -> AT
GG -> GA
TG -> GG,GT


In [76]:
# NOTE: MaxNonBranchingPaths is defined in the BA3M section further down this
# notebook, so that cell has to be executed before this one.
contigs = MaxNonBranchingPaths(G)

In [77]:
# Print all contigs on one line, separated by single spaces.
# (Single-argument print(...) behaves the same on Python 2 and 3.)
print(' '.join(StringFromGenomePath(contig) for contig in contigs))

AGA CAT ATG ATG GAT TGT TGGA


### BA3L | Construct a String Spelled by a Gapped Genome Path

In [81]:
# First line holds "k d"; every following non-blank line is one "first|second"
# read pair.
with open('../data/rosalind_ba3l.txt','r') as f:
    k, d = [int(num) for num in f.readline().split()]
    gapped_patterns = [line.strip() for line in f if line.strip()]

In [82]:
def StringSpelledByGappedPatterns(gapped_patterns, k, d):
    """Spell the string described by an ordered list of ``first|second`` pairs.

    The first halves and the second halves are each spelled with
    ``StringFromGenomePath``; the second string is taken to start ``k + d``
    positions after the first. The result is the first string followed by the
    last ``k + d`` characters of the second.

    If the two strings disagree anywhere in their overlap, a warning is
    printed once; the concatenation is still returned.

    Args:
        gapped_patterns: list of ``'first|second'`` strings in path order.
        k: length of each half.
        d: gap between the two halves.

    Returns:
        The spelled string.
    """
    first_patterns = [pattern.split('|')[0] for pattern in gapped_patterns]
    second_patterns = [pattern.split('|')[1] for pattern in gapped_patterns]
    prefix_str = StringFromGenomePath(first_patterns)
    suffix_str = StringFromGenomePath(second_patterns)
    offset = k + d
    # prefix_str[i] and suffix_str[i - offset] describe the same position.
    if any(prefix_str[i] != suffix_str[i - offset]
           for i in range(offset, len(prefix_str))):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('There is no string spelled by the gapped pattern.')
    return prefix_str + suffix_str[-offset:]

In [83]:
StringSpelledByGappedPatterns(gapped_patterns, k, d)

'GACCGAGCGCCGGA'

### BA3M | Generate All Maximal Non-Branching Paths in a Graph

In [84]:
from random import choice

In [85]:
# Parse an adjacency list with lines of the form "node -> succ1,succ2,...".
# Blank lines are skipped instead of raising IndexError.
with open('../data/rosalind_ba3m.txt', 'r') as f:
    graph = {}
    for line in f:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' -> ')
        graph[fields[0]] = fields[1].split(',')

In [86]:
def MaxNonBranchingPaths(graph):
    """Return all maximal non-branching paths of ``graph``.

    A node is "1-in-1-out" when it has exactly one incoming and one outgoing
    edge. Two kinds of paths are produced:

    * For every node that is not 1-in-1-out and has outgoing edges, one path
      per outgoing edge, extended through 1-in-1-out nodes until a node that
      is not 1-in-1-out is reached.
    * Isolated cycles made only of 1-in-1-out nodes that no path above
      touched. Each is reported once as a closed list (first == last); the
      node at which the cycle is opened is chosen at random.

    Args:
        graph: dict mapping each node to the list of its successors. The dict
            is not modified.

    Returns:
        list of paths, each a list of nodes.
    """
    # Count degrees; nodes that only appear as successors get out-degree 0.
    out_degree = {}
    in_degree = {}
    for node, successors in graph.items():
        out_degree[node] = len(successors)
        in_degree.setdefault(node, 0)
        for successor in successors:
            in_degree[successor] = in_degree.get(successor, 0) + 1
            out_degree.setdefault(successor, 0)

    def is_one_in_one_out(node):
        return in_degree[node] == 1 and out_degree[node] == 1

    paths = []
    nodes_added = set()
    for v in graph:
        if is_one_in_one_out(v) or out_degree[v] == 0:
            continue
        for w in graph[v]:
            non_branching_path = [v, w]
            nodes_added.add(v)
            nodes_added.add(w)
            while is_one_in_one_out(w):
                # A 1-in-1-out node has exactly one successor.
                w = graph[w][0]
                non_branching_path.append(w)
                nodes_added.add(w)
            paths.append(non_branching_path)

    # 1-in-1-out nodes not reached above can only lie on isolated cycles.
    separate_edges = {}
    for node in in_degree:
        if is_one_in_one_out(node) and node not in nodes_added:
            separate_edges[node] = graph[node][0]

    while separate_edges:
        # list(...) is required on Python 3, where dict views cannot be indexed.
        node = choice(list(separate_edges))
        cycle = []
        while node in separate_edges:
            node = separate_edges.pop(node)
            cycle.append(node)
        # Close the cycle by repeating its first node.
        paths.append(cycle + [cycle[0]])

    return paths

In [87]:
path_list = MaxNonBranchingPaths(graph)

In [88]:
# One path per line, nodes joined by " -> ".
# (Single-argument print(...) behaves the same on Python 2 and 3.)
for path in path_list:
    print(' -> '.join(path))

1 -> 2 -> 3
3 -> 4
3 -> 5
7 -> 6 -> 7
