In [371]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re


In [372]:
# Captures the run of word characters that sits between "requires a total of"
# (plus whitespace) and a literal "." -- e.g. "9" out of "... total of      9.000".
p = re.compile(r'requires a total of\s+(\w+)\.')
# Sanity check on a made-up snippet; the cell displays the parsed integer.
int(p.search('fdas fwej \nrequires a total of      9.000').group(1))

9

In [373]:
def show_graph_with_labels(adjacency_matrix, mylabels):
    """
    Draw the undirected graph described by an adjacency matrix.

    adjacency_matrix -- array; every entry equal to 1 becomes an edge
                        between its row index and its column index
    mylabels         -- passed to nx.draw as `labels`

    Only nodes that appear in at least one edge are added to the graph.
    Side effects: reseeds NumPy's global random generator with 42 before
    drawing and shows the figure with plt.show().

    NOTE(review): `nx` is not imported in the visible cells -- presumably
    `import networkx as nx` is expected at module level; confirm.
    """
    src, dst = np.where(adjacency_matrix == 1)
    graph = nx.Graph()
    graph.add_edges_from(zip(src.tolist(), dst.tolist()))
    # Seed is set right before drawing, exactly as the layout call expects it.
    np.random.seed(42)
    nx.draw(graph, node_size=500, labels=mylabels, with_labels=True)
    plt.show()

In [374]:
def tokenize_newick(infile):
    """
    Iterate over the tokens of a tree written in Newick format.

    infile -- a string holding the Newick text, or a file-like object
              (anything with a read() method) that is read one character
              at a time

    Yields, in input order:
      * each of the characters ; ( ) , : ] as a one-character token
      * a bracketed comment "[...]" as a single token, brackets and any
        inner whitespace included
      * every other run of non-whitespace characters (names, numbers)

    Space, tab and newline outside a comment only separate words and are
    never yielded.  If the input ends inside a comment, the partial comment
    is yielded and iteration stops.
    """
    if hasattr(infile, "read"):
        # Two-argument iter(): call read(1) until it returns "" (EOF).  The
        # iterator then really ends, so the comment-scanning loop below cannot
        # spin forever on an unterminated "[" the way an endless stream would.
        stream = infile
        chars = iter(lambda: stream.read(1), "")
    else:
        chars = iter(infile)

    word = []
    for c in chars:
        if c in " \t\n":
            # white space ends the current word
            if word:
                yield "".join(word)
                word = []

        elif c in ";(),:[]":
            # a special character also ends the current word
            if word:
                yield "".join(word)
                word = []

            if c == "[":
                # Consume up to and including the closing bracket.  This pulls
                # from the same iterator, so the outer loop resumes after it.
                comment = [c]
                for c in chars:
                    comment.append(c)
                    if c == "]":
                        break
                yield "".join(comment)
            else:
                yield c

        else:
            # ordinary character: extend the current word
            word.append(c)

    # flush a trailing word that was not followed by a separator
    if word:
        yield "".join(word)

In [375]:
def gettree(tokens):
    """
    Build a Tree from a stream of Newick tokens.

    tokens -- an iterable of tokens such as the ones produced by
              tokenize_newick

    Token handling:
      '('  create a child of the current node and descend into it
      ','  create another child under the innermost open '(' and move to it
      ')'  return to the node that opened the matching '('
      ':'  ignored
      ';'  ends the tree
    A token that directly follows '(' or ',' becomes the name of the current
    node.  Every other token (e.g. whatever follows ')' or ':') is discarded.

    After parsing, nodes whose name is still empty are named 'Idx0', 'Idx1',
    ... in creation order, and every node's `id` is set to its position in
    tree.nodes.

    Returns the populated Tree.
    Raises StopIteration if the tokens run out before ';' is seen, and
    IndexError if ',' or ')' appears with no open '('.
    """
    # next() works on any iterator (and on both Python 2 and 3); iter() lets
    # plain lists of tokens be passed as well.
    tokens = iter(tokens)

    # nodes whose '(' has been seen but whose ')' has not
    ancestors = []

    # create tree
    tree = Tree()

    # create root
    node = TreeNode()
    tree.root = node
    nodes = [node]

    token = None
    while True:
        prev_token = token
        token = next(tokens)
        if token == '(':  # new branchset
            child = TreeNode()
            nodes.append(child)
            child.parent = node
            node.children.append(child)
            ancestors.append(node)
            node = child

        elif token == ',':  # another branch
            parent = ancestors[-1]
            child = TreeNode()
            nodes.append(child)
            child.parent = parent
            parent.children.append(child)
            node = child

        elif token == ')':  # back up to the node that opened this branchset
            node = ancestors.pop()

        elif token == ':':  # a length may follow; lengths are not stored
            pass

        elif token == ';':  # end of tree
            break

        elif prev_token in ('(', ','):
            # Tuple membership instead of a substring test: prev_token is None
            # for the very first token, and `None in '(,'` raises TypeError.
            node.name = token

        # any other token is dropped on purpose

    tree.nodes = nodes
    unnamed = 0
    for i, current in enumerate(nodes):
        if current.name == '':
            current.name = 'Idx' + str(unnamed)
            unnamed += 1
        current.id = i
    return tree

In [376]:
class Tree:
    """
    A rooted tree kept as a root node plus a flat list of all its nodes.

    root  -- the root node (None until assigned)
    nodes -- list of every node in the tree; toadj() uses each child's `id`
             as a row/column index, so ids are expected to equal the node's
             position in this list
    """

    def __init__(self):
        self.root = None
        self.nodes = []

    def __str__(self):
        return self.recursive(self.root)

    def recursive(self, node, level=0):
        """Return `node` and all its descendants, one per line, tab-indented by depth."""
        ret = "\t" * level + str(node) + "\n"
        for child in node.children:
            ret += self.recursive(child, level + 1)
        return ret

    def toadj(self):
        """Return a symmetric len(nodes) x len(nodes) array with 1 for every parent/child pair."""
        size = len(self.nodes)
        adj = np.zeros([size, size])
        for i, node in enumerate(self.nodes):
            for child in node.children:
                adj[i, child.id] = 1
                adj[child.id, i] = 1
        return adj

    def findroot(self, name='G.L.'):
        """Return the index in self.nodes of the first node called `name`, or None if there is none."""
        for i, node in enumerate(self.nodes):
            if node.name == name:
                return i
        return None

    def reorder(self, name='G.L.'):
        """
        Return a new Tree holding copies of this tree's nodes, re-rooted at
        the node called `name`.

        The nodes are visited breadth-first from the new root, following both
        parent and child links, so the new `nodes` list (and the new ids,
        which are list positions) are in breadth-first order.  This tree is
        not modified.

        Raises ValueError if no node is called `name`.
        """
        start = self.findroot(name)
        if start is None:
            raise ValueError("cannot re-root tree: no node named %r" % (name,))

        origin = self.nodes[start]
        origin_copy = origin.copy()
        newnodes = [origin_copy]
        # Visited nodes are tracked by object identity rather than by name, so
        # distinct nodes that happen to share a name are all kept.
        seen = set([id(origin)])
        # (original node, its copy) pairs in visiting order; walked with an
        # index so nothing has to be popped from the front of a list.
        pending = [(origin, origin_copy)]
        head = 0
        while head < len(pending):
            source, target = pending[head]
            head += 1
            for neighbour in source.connectedids():
                if id(neighbour) in seen:
                    continue
                seen.add(id(neighbour))
                twin = neighbour.copy()
                twin.parent = target
                target.children.append(twin)
                newnodes.append(twin)
                pending.append((neighbour, twin))

        newtree = Tree()
        newtree.nodes = newnodes
        newtree.root = newnodes[0]
        for i, node in enumerate(newnodes):
            node.id = i
        return newtree

    def show_ntree(self, name='G.L.'):
        """
        Return the tree, re-rooted at `name`, as text: one line per node that
        has children, holding the node id followed by its children's ids, all
        tab-separated.  Lines are joined with newlines (no trailing newline).
        """
        rerooted = self.reorder(name)
        out = []
        for node in rerooted.nodes:
            if node.children:
                child_ids = '\t'.join(str(child.id) for child in node.children)
                out.append('{}\t{}'.format(node.id, child_ids))
        return '\n'.join(out)

    def show_graph(self):
        """Plot the tree with show_graph_with_labels, labelling each node with its name."""
        labels = dict((i, node.name) for i, node in enumerate(self.nodes))
        show_graph_with_labels(self.toadj(), labels)
class TreeNode:
    """
    One node of a Tree.

    name     -- label of the node ('' while unnamed)
    id       -- identifier assigned by whoever builds the tree (None at first)
    parent   -- parent node, or None
    children -- list of child nodes
    """

    def __init__(self):
        self.name = ''
        self.id = None
        self.parent = None
        self.children = []

    def connectedids(self):
        """Return the neighbouring nodes themselves (parent first, then children), skipping a missing parent."""
        neighbours = []
        for candidate in [self.parent] + self.children:
            if candidate:
                neighbours.append(candidate)
        return neighbours

    def __repr__(self):
        # Unnamed nodes are shown with a fixed placeholder.
        return 'iternode' if self.name == '' else self.name

    def copy(self):
        """Return a detached node with the same name and id, but no parent and no children."""
        clone = TreeNode()
        clone.name, clone.id = self.name, self.id
        clone.parent = None
        clone.children = []
        return clone

In [377]:
# Parse one tree file and print its re-rooted child table.
# tokenize_newick reads lazily, so the tree must be built while the file is
# still open; the with-block then closes the handle, which was left open before.
with open('data/Simulated/sim1/00001.phy.outtree', 'r') as f:
    token = tokenize_newick(f)
    tree = gettree(token)
print(tree.show_ntree())

0	1
1	2	3	4
2	5	6	7
3	8	9
4	10	11
5	12	13	14
7	15	16
12	17	18
14	19	20
15	21	22
16	23	24	25	26
19	27	28
20	29	30	31
26	32	33
33	34	35


In [378]:
# For simulations 1-9 and files 00001-00500 in each: read the size reported in
# the matching ".outfile", parse the ".outtree" Newick tree, and write
# "<size>\n<child table>\n" to the matching ".out.tree" file.
for isim in range(1, 10):
    for i in range(1, 501):
        fname = 'data/Simulated/sim%d/%05d.phy.outtree' % (isim, i)
        outfile_name = fname.replace('outtree', 'outfile')

        # Every handle is closed by its with-block, including the report file
        # that was previously left open on each of the iterations.
        with open(outfile_name, 'r') as finn:
            found = p.search(finn.read())
        if found is None:
            raise ValueError("no 'requires a total of' line found in %s" % outfile_name)
        nsize = int(found.groups()[0])

        # The tokenizer is lazy, so the tree is built before the file closes.
        with open(fname, 'r') as fin:
            token = tokenize_newick(fin)
            tree = gettree(token)
        table = tree.show_ntree()

        # Open the output only after parsing succeeded, so a failure does not
        # leave an empty result file behind.
        with open(fname.replace('outtree', 'out.tree'), 'w') as fout:
            fout.write('%d\n' % nsize)
            fout.write(table + '\n')

In [345]:
# Child table of the module-level `tree` (the last tree parsed by the cells above).
# NOTE(review): this cell's execution count (345) is lower than that of the
# cells above, and the saved output below also lists nodes without children,
# which the `len(...children) > 0` filter in show_ntree() would skip -- the
# stored output appears to come from an earlier version; re-run to refresh.
tree.show_ntree()

0	1
1	2	3	4
2	5	6	7
3	8	9
4	10	11
5	12	13	14
6	
7	15	16
8	
9	
10	
11	
12	17	18
13	
14	19	20
15	21	22
16	23	24	25	26
17	
18	
19	27	28
20	29	30	31
21	
22	
23	
24	
25	
26	32	33
27	
28	
29	
30	
31	
32	
33	34	35
34	
35	


In [346]:
# Number of nodes held by the module-level `tree`.
# NOTE(review): execution count 346 is lower than that of the defining cells
# above, so the saved value may predate them -- re-run to confirm.
len(tree.nodes)

36