# See the last cell for BA3J:
Reconstruct a String from its Paired Composition

# Eulerian Cycle

In [3]:
from random import shuffle

def get_edges(graph):
    """Flatten an adjacency mapping into a list of directed edges.

    Args:
        graph: Mapping from each node to an iterable of its successors.

    Returns:
        A list with one ``(node, successor)`` tuple per adjacency entry, in
        the mapping's iteration order. A successor listed twice yields the
        same edge twice.
    """
    return [(node, succ) for node, succs in graph.items() for succ in succs]

def eulerian_cycle(graph):
    """Return an Eulerian cycle of a directed (multi)graph as a node list.

    Uses Hierholzer's algorithm, so every edge is handled once instead of
    being searched for and removed from a flat edge list.

    Args:
        graph: Mapping from each node to the list of its successors. A
            successor listed several times counts as that many parallel
            edges. The mapping is not modified.

    Returns:
        A list of nodes whose consecutive pairs use every edge exactly once;
        the first and last entries are the same node. An empty list is
        returned when the graph has no edges.

    Raises:
        ValueError: If some node's in-degree differs from its out-degree, or
            if the edges cannot all be reached from one start node, i.e. no
            Eulerian cycle exists.
    """
    # Work on shuffled copies of the successor lists: the caller's graph stays
    # intact and repeated calls can return different valid cycles.
    unused = {}
    balance = {}
    edge_count = 0
    for node, successors in graph.items():
        pending = list(successors)
        shuffle(pending)
        unused[node] = pending
        edge_count += len(pending)
        balance[node] = balance.get(node, 0) + len(pending)
        for succ in pending:
            balance[succ] = balance.get(succ, 0) - 1

    if edge_count == 0:
        return []
    # A dead end would otherwise strand the walk away from its start node.
    if any(balance.values()):
        raise ValueError('graph has no Eulerian cycle: unbalanced node degrees')

    # shuffle to start with a different vertex every time
    starts = [node for node, pending in unused.items() if pending]
    shuffle(starts)

    # Hierholzer: walk until stuck, then backtrack; nodes are emitted in
    # reverse cycle order as they run out of unused outgoing edges.
    stack = [starts[0]]
    cycle = []
    while stack:
        pending = unused[stack[-1]]
        if pending:
            stack.append(pending.pop())
        else:
            cycle.append(stack.pop())
    cycle.reverse()

    # Each traversed edge adds one node, so a short result means some edges
    # were unreachable from the start node.
    if len(cycle) != edge_count + 1:
        raise ValueError('graph has no Eulerian cycle: edges are not connected')
    return cycle

def adjList2graph(filename):
    """Parse an adjacency-list file into a successor mapping.

    Each non-blank line is read as whitespace-separated tokens: the first
    token is the source node, the second (e.g. ``->``) is ignored, and the
    rest is a comma-separated list of successors, e.g. ``0 -> 3,1``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping every node seen (as source or successor) to the list
        of its distinct successors, in order of first appearance. Nodes that
        only appear as successors map to an empty list.
    """
    graph = {}
    with open(filename, 'r') as f:
        for line in f:
            tokens = line.split()
            # Blank lines (e.g. at the end of the file) carry no edge.
            if not tokens:
                continue
            successors = graph.setdefault(tokens[0], [])
            # Re-join the tail so "3, 1" parses like "3,1"; a line with no
            # successor list simply registers the source node.
            for v in ''.join(tokens[2:]).split(','):
                if not v:
                    continue
                graph.setdefault(v, [])
                if v not in successors:
                    successors.append(v)
    return graph

###  Test 

In [6]:
# Parse the sample adjacency list and print one Eulerian cycle as a->b->...
testGraph = adjList2graph('eulerianCycle_test1.txt')
print(*eulerian_cycle(testGraph), sep='->')

4->2->1->0->3->2->6->8->7->9->6->5->4


In [7]:
# Same as the sample run, on the quiz dataset.
quizGraph = adjList2graph('dataset_203_2.txt')
print(*eulerian_cycle(quizGraph), sep='->')

318->2081->2082->2080->318->317->316->1181->1182->1180->316->57->55->742->743->744->55->874->876->875->1358->1357->1359->875->55->359->1849->1850->1851->359->360->856->857->858->2319->2317->2318->858->360->1305->1303->1304->360->1549->1774->1776->1775->1549->1551->1550->360->358->55->441->439->440->55->508->510->509->55->516->1035->1034->1033->1063->2570->2571->2569->1063->1064->1065->1033->516->515->514->2906->2907->2905->514->55->56->504->643->645->644->504->502->503->594->592->1100->1823->1822->1824->1100->1101->1099->592->593->503->56->39->119->2904->2902->2903->119->176->987->1544->1545->1543->987->985->986->176->175->177->2039->2040->2038->177->119->1538->1537->1539->119->120->245->246->2438->2439->2437->246->244->1574->1573->1575->244->314->1172->1940->1939->1941->2879->2880->2878->1941->1172->1171->1173->314->315->313->737->1278->2525->2524->2865->2863->2864->2524->2526->1278->1276->1277->2686->2688->2687->1277->737->736->738->313->1020->1018->1019->313->244->1428->2017->2018->

# Eulerian Path

In [2]:
def in_degree(k, graph):
    """Count the edges of ``graph`` that end at node ``k``.

    Args:
        k: The node whose incoming edges are counted.
        graph: Mapping from each node to the list of its successors; a
            successor listed several times counts once per occurrence.

    Returns:
        The number of occurrences of ``k`` across all successor lists. A
        self-loop ``k -> k`` is included, so it adds one to both the
        in-degree and the out-degree and leaves the node balanced.
    """
    return sum(successors.count(k) for successors in graph.values())


def out_degree(k, graph):
    """Return how many successors ``graph`` lists for node ``k``.

    Repeated successors are counted once per occurrence. A ``KeyError``
    propagates if ``k`` is not a key of ``graph``.
    """
    successors = graph[k]
    return len(successors)


def add_edge(graph):
    """Find the endpoints of the edge that would balance ``graph``.

    Despite its name this function does not modify the graph; it only
    reports which edge a caller should add.

    Args:
        graph: Mapping from each node to the list of its successors.

    Returns:
        A tuple ``(outNode, inNode)``: ``outNode`` is a key with more
        incoming than outgoing edges and ``inNode`` is a key with more
        outgoing than incoming edges. If several keys qualify, the last one
        in iteration order is reported for each role.

    Raises:
        ValueError: If the graph has no node of one of the two kinds, so
            there is no edge to suggest.
    """
    # One pass over all edges instead of rescanning the graph for every node.
    # Self-loops are counted too, so they do not unbalance a node.
    indegree = {}
    for successors in graph.values():
        for succ in successors:
            indegree[succ] = indegree.get(succ, 0) + 1

    surplus_in = []
    surplus_out = []
    for node, successors in graph.items():
        diff = indegree.get(node, 0) - len(successors)
        if diff > 0:
            surplus_in.append(node)
        elif diff < 0:
            surplus_out.append(node)

    if not surplus_in or not surplus_out:
        raise ValueError('graph has no unbalanced pair of nodes to connect')
    return surplus_in[-1], surplus_out[-1]


def eulerian_path(graph):
    """Return an Eulerian path of ``graph`` as a list of nodes.

    The path is obtained by closing the graph with the edge reported by
    ``add_edge``, finding an Eulerian cycle, and cutting that cycle open at
    the added edge.

    Args:
        graph: Mapping from each node to the list of its successors. The
            mapping is not modified.

    Returns:
        A list of nodes starting at the node with surplus outgoing edges and
        ending at the node with surplus incoming edges.

    Raises:
        ValueError: If the closing edge does not appear in the cycle.
    """
    outNode, inNode = add_edge(graph)
    # Add the closing edge to a copy so the caller's graph is left intact and
    # the function can safely be called again on the same graph.
    closed = {node: list(successors) for node, successors in graph.items()}
    closed[outNode].append(inNode)
    eCycle = eulerian_cycle(closed)
    for i in range(len(eCycle) - 1):
        if eCycle[i] == outNode and eCycle[i + 1] == inNode:
            # Rotate so the path begins just after the closing edge; the
            # cycle's repeated first node is dropped by starting at index 1.
            return eCycle[i + 1:] + eCycle[1:i + 1]
    raise ValueError('closing edge not found in the Eulerian cycle')

### Test

In [9]:
# Show the parsed sample graph, the edge that would balance it, and one
# Eulerian path through it.
testGraph3 = adjList2graph('eulerianPath_test1.txt')
print(testGraph3)
print(add_edge(testGraph3))
print(*eulerian_path(testGraph3), sep='->')

{'0': ['2'], '2': ['1'], '1': ['3'], '3': ['0', '4'], '4': [], '6': ['3', '7'], '7': ['8'], '8': ['9'], '9': ['6']}
('4', '6')
6->7->8->9->6->3->0->2->1->3->4


In [22]:
# Eulerian path for the larger test dataset.
testGraph4 = adjList2graph('eulerianPath_test2.txt')
print(*eulerian_path(testGraph4), sep='->')

1630->1631->1587->1817->1816->1818->1886->1887->1885->1818->1587->1114->1124->1123->1125->1114->1115->1269->1785->1784->1783->1269->1268->1267->1115->1116->1232->1953->1952->1951->1232->1231->1233->1675->1677->1676->1744->1745->1746->1676->1233->1116->70->383->382->446->1131->1129->1799->1800->1798->1129->1130->446->447->631->633->1695->1694->1693->633->632->1399->1400->1629->1636->1637->1638->1629->1628->1627->1400->1401->632->447->445->638->639->637->1112->1741->1743->1742->1112->1111->1113->1541->1540->1542->1113->637->445->382->384->70->651->650->649->70->71->72->59->606->604->605->59->46->376->1257->1255->1256->376->377->378->46->47->11->26->80->92->977->978->1391->1392->1390->978->976->92->91->428->893->892->1351->1352->1353->892->894->428->427->429->91->93->351->548->549->547->351->349->917->918->1822->1823->1824->918->916->349->350->93->80->1157->1156->1461->1478->1477->1479->1978->1979->1980->1479->1461->1459->1460->1156->1158->80->1611->1609->1610->80->81->297->295->296->832-

In [10]:
# Eulerian path for the quiz dataset.
quizGraph2 = adjList2graph('dataset_203_6.txt')
print(*eulerian_path(quizGraph2), sep='->')

615->613->2300->2299->2301->613->542->2332->2334->2333->542->541->87->11->0->2->2031->2030->2029->2->1->790->1747->1748->1749->790->791->911->912->910->791->792->1->326->325->327->1701->2088->2086->2087->1701->1700->1699->327->2214->2708->2709->2707->2214->2212->2213->327->1->31->32->1202->1203->1201->32->140->256->257->258->1452->1451->1450->258->1041->1040->1039->258->140->141->260->267->1331->1330->1332->267->266->1306->1307->1308->266->265->2434->2435->2770->2771->2772->2435->2436->265->260->338->337->799->800->801->337->339->260->261->259->141->703->705->704->141->196->976->978->977->1577->1578->1576->977->196->198->991->1740->1738->2115->2114->2113->1738->1739->2197->2198->2199->1739->991->993->1656->1655->1654->993->992->198->1871->1872->1870->2679->2678->2677->1870->198->197->358->2596->2597->2598->358->359->2125->2127->2126->359->360->197->433->485->2303->2302->2304->485->486->1382->1383->1381->486->484->1282->1421->1420->1422->1282->1283->1313->1314->1312->1283->1284->484->43

# StringConstruction

In [1]:
def deBruijn(kmers):
    """Build a de Bruijn graph from a collection of k-mers.

    Args:
        kmers: Iterable of equal-role strings; each contributes one edge from
            its prefix (all but the last character) to its suffix (all but
            the first character).

    Returns:
        A dict mapping every prefix and suffix to the list of suffixes it
        points to. Repeated k-mers produce repeated entries.
    """
    graph = {}
    for kmer in kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        successors = graph.setdefault(prefix, [])
        # Register the suffix as a node even if nothing leaves it.
        graph.setdefault(suffix, [])
        successors.append(suffix)
    return graph


def get_seq(path):
    """Spell the string described by a path of overlapping strings.

    The result is the first element of ``path`` followed by the last
    character of every later element. ``path`` must be non-empty.
    """
    head, tail = path[0], path[1:]
    return head + ''.join(node[-1] for node in tail)

### Test

In [12]:
# Read k and the k-mers (one per line), then spell the string along an
# Eulerian path of their de Bruijn graph.
with open('stringConstruction_test1.txt', 'r') as f:
    k = int(f.readline().rstrip())
    kmers = [line.rstrip() for line in f]
testGraph5 = deBruijn(kmers)
testPath = eulerian_path(testGraph5)
print(get_seq(testPath))

GGCTTACCA


In [33]:
# Same reconstruction as the sample run, on the quiz dataset.
with open('dataset_203_7.txt', 'r') as f:
    k = int(f.readline().rstrip())
    kmers = [line.rstrip() for line in f]
testGraph6 = deBruijn(kmers)
testPath2 = eulerian_path(testGraph6)
print(get_seq(testPath2))

TCCTCCTAATAAAAAACCCTCCGTAAATAGTTGTAAACACACTATTGTACTGTAGCTCCAATGAGGTAATGAGCGGTAGAGTGAGGTATCCGCGCCTTAAACCCTGTAATGCCACTTGGGCGTGCGAATGCCCTACCTCATTTGCCAGCCCGTTGTTAGCCGGAATCAGCCCTTCTCGCCATCTTGATACGCTTAGTATGATCAGGCCTCCTACCACGGGTCCAACCGTGTGGCTGCCCTGTTGTACGCGGTGCCGCGGTTCGCCCGTCCACTACCACTCGTTCCAGTGTCCCCTGGTTAAGGGAATAGTATGATTTAGGATCGAACTATCGGTGATTATCAAGATGATACCTGCCGTTTTATCATGGATTTCTACCCCAGCCCTCGTGTAAATCTACTACGTCTCCCCGCCCTGAGAGAGGTTCCACATCGTCAGCCATCCCTCATCACAGATTACAAATTGTCGACAGAGAGACAGTTCGTTGTCGTGAATCTCGAGAGTGCTCTAATTTTCGAGCGCGTGGAATGGAAGCAATGGATTAGACGAATTTGCGCCCGGTTGTAGTCACTCTGACCTAATGTAAAGCCAAAGCCGAATTCGATCGAAGCAGTCCTGGCGATGGCGCACAGTTTGGCATAATCGAATGCGATCGGTAAATTTTCCCTGGGGACTTGCTCAGCGGGATCCACTGTGGGAGACTGCACCTGATGAATGCCGGAGCGCCAACACAAATTCAAAGACTTCGCTCATTGGATTCTCCCAGTTAGCGGTGCTCTCAAAGGCGACGTTTTATAGCCGTGTTTGATACAAACGCGTGGATAACACGAGTGCTTATAGCCCAATGTAGTCGGGTCGCTGCTTTTTGCCACCCAAGGGCAAAAACTCCCAGCACCCCCACTCGACGGGACGACCATATTTAAACGAACTTGGACAGAGTTATACGTTCGCGCATGGCGAAAAGGGAATCACTGTAAGTACAGACTGGGTGCGTATGGCCGG

# K-Universal String Problem

In [34]:
def allBinaryKmers(k):
    """List all binary strings of length ``k``.

    Starts from ``['0', '1']`` and extends ``k - 1`` times; each round
    appends '0' to every current string, then '1' to every current string.
    For ``k`` below 1 no round runs and the one-character strings are
    returned.
    """
    kmers = ['0', '1']
    for _ in range(k - 1):
        kmers = [prefix + bit for bit in '01' for prefix in kmers]
    return kmers

In [35]:
# k-universal circular string for k = 4: take an Eulerian cycle of the
# de Bruijn graph, drop the repeated closing node, and keep each node's
# last character.
testBinaryKmers = allBinaryKmers(4)
testGraph7 = deBruijn(testBinaryKmers)
testeCycle = eulerian_cycle(testGraph7)[:-1]
print(*(node[-1] for node in testeCycle), sep='')

0101001101111000


In [36]:
# k-universal circular string for k = 8, built the same way as for k = 4.
testBinaryKmers = allBinaryKmers(8)
testGraph7 = deBruijn(testBinaryKmers)
testeCycle = eulerian_cycle(testGraph7)[:-1]
print(*(node[-1] for node in testeCycle), sep='')

0100010000100100101001000110010011100100000101000101010100110101011101010000110100101101011011010001110100111101011111010000001100010011001100001011001010110001101100111011011110110000011100010111001101110111000011110010111100011111001111110111111110000000


# StringSpelledByGappedPatterns

In [9]:
def consistent(seq1, seq2, k, d):
    """Check that ``seq1`` and ``seq2`` agree where they overlap.

    ``seq2`` is treated as starting ``k + d`` positions after ``seq1``:
    every ``seq1[i]`` with ``i >= k + d`` must equal ``seq2[i - k - d]``.

    Returns:
        True if all compared positions match (including when there is
        nothing to compare), False at the first mismatch.
    """
    offset = k + d
    return all(seq1[i] == seq2[i - offset] for i in range(offset, len(seq1)))


def deBruijnPE(pe_kmers):
    """Build a paired de Bruijn graph from paired k-mers.

    Args:
        pe_kmers: Iterable of pairs ``(first, second)`` of strings.

    Returns:
        A dict whose nodes are pairs of strings. Each input pair adds one
        edge from ``(first[:-1], second[:-1])`` to
        ``(first[1:], second[1:])``; both endpoints are registered as keys,
        and repeated pairs produce repeated entries.
    """
    graph = {}
    for pe in pe_kmers:
        first, second = pe[0], pe[1]
        prefix_node = (first[:-1], second[:-1])
        suffix_node = (first[1:], second[1:])
        successors = graph.setdefault(prefix_node, [])
        # Register the suffix pair as a node even if nothing leaves it.
        graph.setdefault(suffix_node, [])
        successors.append(suffix_node)
    return graph

#Test 1
# Reconstruct a string from paired k-mers "first|second" read with gap d.
with open('pairedDebruijn_test1.txt', 'r') as f:
    k, d = [int(num) for num in f.readline().rstrip().split()]
    # Blank lines carry no read pair and would break the '|' unpacking.
    pe_kmers = [tuple(line.rstrip().split('|')) for line in f if line.strip()]

# Not every Eulerian path spells a valid string: the strings spelled by the
# first and second components must agree where they overlap. eulerian_cycle
# is randomised, so retry until a consistent path turns up.
max_attempts = 100
for attempt in range(max_attempts):
    # Rebuild the graph each time so no attempt sees changes made by another.
    peGraph = deBruijnPE(pe_kmers)
    pePath = eulerian_path(peGraph)
    firstPath = [node[0] for node in pePath]
    secondPath = [node[1] for node in pePath]
    firstSeq = get_seq(firstPath)
    secondSeq = get_seq(secondPath)
    if consistent(firstSeq, secondSeq, k, d):
        break
else:
    raise ValueError('no consistent reconstruction found in %d attempts' % max_attempts)
# The second string starts k + d positions later, so only its last k + d
# characters extend the first string.
print(firstSeq + secondSeq[len(secondSeq) - k - d:])

GTGGTCGTGAGATGTTGA


In [11]:
#Quiz
# Reconstruct a string from paired k-mers "first|second" read with gap d.
with open('dataset_204_16.txt', 'r') as f:
    k, d = [int(num) for num in f.readline().rstrip().split()]
    # Blank lines carry no read pair and would break the '|' unpacking.
    pe_kmers = [tuple(line.rstrip().split('|')) for line in f if line.strip()]

# Not every Eulerian path spells a valid string: the strings spelled by the
# first and second components must agree where they overlap. eulerian_cycle
# is randomised, so retry until a consistent path turns up.
max_attempts = 100
for attempt in range(max_attempts):
    # Rebuild the graph each time so no attempt sees changes made by another.
    peGraph = deBruijnPE(pe_kmers)
    pePath = eulerian_path(peGraph)
    firstPath = [node[0] for node in pePath]
    secondPath = [node[1] for node in pePath]
    firstSeq = get_seq(firstPath)
    secondSeq = get_seq(secondPath)
    if consistent(firstSeq, secondSeq, k, d):
        break
else:
    raise ValueError('no consistent reconstruction found in %d attempts' % max_attempts)
# The second string starts k + d positions later, so only its last k + d
# characters extend the first string.
print(firstSeq + secondSeq[len(secondSeq) - k - d:])

CGTATTGTCGAGATTAATTGTATATAGTGCGAAGCGCCAGCAAGGCCGTTACAATGAGGGCTTATTCTTCACTTACTGTCCATGGTAGTGTCATGGTGACCATGGATTCCCGCCGCTCCTCCTGTGTGTAGACTTGATGTCTCTTTGCCACCGTTACACAGCTTATATGACCTACGCCACACCTGTCGGTATGGATGCGTCTACAGAGCGATCGGGTTCGCGTCGTTCCGGGGACTCTTTGCCATGCCTGCTACCTCACGGACCACGAATAGACCTAGTCAAATAGGAAGTCTCCTGGTTGTTGAGAGTGCAGGGTACGCACGTGGCTCACCATGAATCCATAGGGCACACGTCCTTGACATGAGCCTTCCTCCCCACTCATGACATGATTCAGACCCTGGCACGGCGCAAGATCACGTCCCTCACTGCCGTAGCAAGCCTAACCTTGGGTGGCGGAAGAGGGCTGTACGATTGTAAACCACTAGACGGATGCGAGCCAGTTAGACAGTCCTCTCCGTGGTCCGATCCTAAGAGTAAACACCCATGCGTCTACAGAGCGATCGGGTTCGCGTCGTTCCGGGGACTCTTTGCCAGGGACACCTGTAATCAGGCCGTACGAAGTCAGCTAACTCCGCAGAACGGCCTTTTAACGAGTAGTCAGGTTTGGTTGGAACGTTTTCCCGCGACTACTGAATGCGTCTACAGAGCGATCGGGTTCGCGTCGTTCCGGGGACATGCGTCTACAGAGCGATCGGGTTCGCGTCGTTCCGGGGACTCTTTGCCATCTTTGCCATTACCTGCATGGGGAACATCTATAGCGGGCGGAGAGACACAAATGGAAAAGTCATGCGCGGATCCCCAGCACAAACGAGTCGCCATGCCTCAGGCCTCGTGGATCGGGACGCCGTGCAATAGATAAATTAGACAGTCCGGACACGTTGATGTACCTACCATCGTATTCCAAGTCGGTGCCCAACTTTGCTTTACGATTGGCCTCAGG