In [1]:
import ast
import json
import time

import numpy as np
import pandas as pd
from kerMIT.tree import Tree
from stanfordcorenlp import StanfordCoreNLP

In [2]:
from nltk import tree

In [3]:
# Shared CoreNLP client passed to test() by the example cells below.
# NOTE(review): this path is absolute (filesystem root), whereas parse() falls
# back to the relative './stanford-corenlp-full-2018-10-05' when no client is
# given -- confirm which location is the intended one.
nlp = StanfordCoreNLP('/stanford-corenlp-full-2018-10-05')

In [4]:
def _strip_root_wrapper(parse_str):
    """Remove the outer ``(ROOT ...)`` wrapper from a CoreNLP bracketed parse.

    Everything up to and including the first space is dropped, then one more
    leading character and the final character are removed. This relies on the
    layout CoreNLP emits (``"(ROOT\\n  (S ..."``), where the first space is
    followed by a second one and the last character closes ``ROOT``.
    """
    inner = parse_str.strip('\n').split(' ', 1)[1]
    return inner[1:-1]


def parse(text, nlp=None, **kwargs):
    """Parse ``text`` with Stanford CoreNLP and return a bracketed tree string.

    Args:
        text: text to annotate; CoreNLP may split it into several sentences.
        nlp: object exposing ``annotate(text, properties=...)`` that returns
            the server's JSON response as a string (e.g. a ``StanfordCoreNLP``
            client). When ``None`` a temporary client is started from
            ``./stanford-corenlp-full-2018-10-05`` and closed again once the
            annotation call has finished. A caller-supplied client is never
            closed here.
        **kwargs:
            annotator: ``'parse'`` (default) or ``'depparse'``. Any other
                value silently falls back to ``'parse'``.
            pos_tags: ``depparse`` only; forwarded to ``ParseDependencies``
                (default ``False``).
            verbose: ``depparse`` only; when true (the default, matching the
                historical behaviour) every raw dependency dict is printed.

    Returns:
        str: for ``'parse'``, the constituency tree without its ``ROOT``
        wrapper and with newlines removed; several sentences are concatenated
        under one extra ``(S ...)`` node, and no sentences at all yields
        ``"(S)"``. For ``'depparse'``, the string form of the dependency tree;
        anything other than exactly one sentence is wrapped in a ``ROOT``
        ``Tree``. If the annotation call itself raises, the error is printed
        and ``"(S)"`` / ``"(ROOT)"`` is returned.

    Raises:
        json.JSONDecodeError: if the server response is not valid JSON.
    """
    owns_client = nlp is None
    if owns_client:
        nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-10-05')

    annotator = kwargs.get('annotator')
    if annotator not in {'parse', 'depparse'}:
        annotator = 'parse'
    fallback = "(S)" if annotator == 'parse' else "(ROOT)"

    try:
        props = {'annotators': annotator, 'outputFormat': 'json'}
        output = nlp.annotate(text, properties=props)
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back an empty tree.
        print("Exception during parsing!!")
        print(e)
        return fallback
    finally:
        # Only shut down a server this call started itself; otherwise every
        # call without an explicit client would leave one running.
        if owns_client:
            nlp.close()

    # The response is JSON, not a Python literal: json.loads understands
    # true/false/null and JSON escapes, which ast.literal_eval rejects.
    sentences = json.loads(output)['sentences']

    if annotator == 'parse':
        if not sentences:
            return fallback
        subtrees = [_strip_root_wrapper(sentence['parse']) for sentence in sentences]
        if len(subtrees) == 1:
            bracketed = subtrees[0]
        else:
            # Every sentence is kept: the per-sentence trees become siblings
            # under a single synthetic S node.
            bracketed = "(S" + "".join(subtrees) + ")"
        return bracketed.replace("\n", "")

    # annotator == 'depparse'
    pos_tags = kwargs.get('pos_tags', False)
    verbose = kwargs.get('verbose', True)

    trees = []
    for sentence in sentences:
        dependencies = sentence['basicDependencies']
        tokens = sentence['tokens']

        if verbose:
            for d in dependencies:
                print(d)

        # The smallest governor index is used as the entry point of the tree
        # (0 in the CoreNLP output shown in this notebook's examples).
        root = min(d['governor'] for d in dependencies)
        parsing = ParseDependencies(root, dependencies, tokens, pos_tags=pos_tags)
        trees.append(parsing.tree())

    if len(trees) == 1:
        return str(trees[0])
    return str(Tree(root="ROOT", children=trees))


# NOTE(review): the triple-quoted string below is a bare expression statement
# with no effect at runtime. It holds a disabled copy of an exception handler;
# parse() already returns the same "(S)" / "(ROOT)" fallbacks from its own
# try/except, so this block looks safe to delete.
"""except Exception as e:
    print(e)
    print("Except")
    if annotator == 'parse':
        return "(S)"
    elif annotator == 'depparse':
        return "(ROOT)"""

class ParseDependencies:
    """Build a ``Tree`` from one sentence's dependency list.

    Args:
        root: index the traversal starts from. If it is not a token index it
            is treated as a virtual root and the tree of its first child is
            returned.
        dependencies: list of dicts with the keys ``'dep'``, ``'governor'``,
            ``'dependent'`` and ``'dependentGloss'``.
        tokens: list of dicts with an ``'index'`` key (and a ``'pos'`` key
            when ``pos_tags`` is enabled).
        **kwargs:
            pos_tags: when true, every word is wrapped in a node carrying its
                part-of-speech tag (default ``False``).

    Each visited token becomes ``(label word child...)`` or, with ``pos_tags``,
    ``(label (pos word) child...)``. Children are ordered by relation label.
    """

    def __init__(self, root, dependencies, tokens, **kwargs):
        self.dependencies = dependencies
        self.tokens = {token["index"]: token for token in tokens}
        self.root = root

        self._pos_tags = kwargs.get("pos_tags", False)

        self.nodes = self._nodes()
        self.adj = self._adj()

    def _find_dependency(self, idx):
        """Return ``{'label', 'token'}`` for the first dependency whose
        dependent is ``idx``, or ``None`` when there is none."""
        for d in self.dependencies:
            if d["dependent"] == idx:  # TODO one root per word
                return {"label": d["dep"], "token": d["dependentGloss"]}
        return None

    def _nodes(self):
        """Map every token index to its node description.

        Tokens without an incoming dependency map to ``None`` in both modes;
        the POS tag is only attached when a description exists, so such a
        token no longer raises ``TypeError`` when ``pos_tags`` is enabled.
        """
        nodes = {}
        for idx, token in self.tokens.items():
            node = self._find_dependency(idx)
            if node is not None and self._pos_tags:
                node["pos"] = token['pos']
            nodes[idx] = node
        return nodes

    def _adj(self):
        """Map the root and every token index to its dependents.

        Dependencies are grouped by governor in a single pass, then each group
        is ordered by relation label. The sort is stable, so dependents that
        share a label keep their order from ``self.dependencies``.
        Dependencies whose governor is neither the root nor a token are
        ignored.
        """
        grouped = {self.root: []}
        for idx in self.tokens:
            grouped[idx] = []

        for dep in self.dependencies:
            bucket = grouped.get(dep["governor"])
            if bucket is not None:
                bucket.append(dep)

        return {
            governor: [dep["dependent"] for dep in sorted(deps, key=lambda dep: dep['dep'])]
            for governor, deps in grouped.items()
        }

    def to_str(self):
        """Return the string form of :meth:`tree`."""
        return str(self.tree())

    def tree(self) -> "Tree":
        """Build the tree starting from ``self.root``."""
        return self._rec_visit(self.root)

    def _rec_visit(self, root):
        """Recursively build the subtree for index ``root``.

        Raises:
            Exception: if ``root`` is neither a token index nor ``self.root``.
        """
        if root not in self.nodes:
            if root != self.root:
                raise Exception(f"Unknown node {root}")
            # Virtual root: it has no word of its own, so the tree is the
            # subtree of its first child.
            return self._rec_visit(self.adj[root][0])

        entry = self.nodes[root]
        word = Tree(root=entry['token'])
        if self._pos_tags:
            word = Tree(root=entry['pos'], children=[word])

        node = Tree(root=entry['label'], children=[word])
        for child in self.adj[root]:
            node.children.append(self._rec_visit(child))
        return node

In [5]:
def test(text, nlp):
    """Dependency-parse ``text`` and print the result in three forms.

    Printed, each preceded by a blank line:
      1. the raw bracketed string returned by ``parse(..., annotator='depparse')``;
      2. the same string re-read through the kerMIT ``Tree`` constructor,
         followed by each of its direct children on its own line;
      3. the nltk pretty-printed rendering of that string.
    """
    dep_string = parse(text, nlp=nlp, annotator='depparse')

    # 1. raw output of parse()
    print()
    print(dep_string)

    # 2. round-trip through kerMIT's Tree
    print()
    kermit_tree = Tree(string=dep_string)
    print(kermit_tree)
    for subtree in kermit_tree.children:
        print(subtree)

    # 3. ASCII drawing via nltk
    print()
    tree.Tree.fromstring(dep_string).pretty_print()

In [6]:
# Copular sentence: in the output below the ROOT relation points at the noun
# "table", and "is" is attached to it with the `cop` label.
text = "The cat is on the table"
test(text, nlp)

{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 6, 'dependentGloss': 'table'}
{'dep': 'det', 'governor': 2, 'governorGloss': 'cat', 'dependent': 1, 'dependentGloss': 'The'}
{'dep': 'nsubj', 'governor': 6, 'governorGloss': 'table', 'dependent': 2, 'dependentGloss': 'cat'}
{'dep': 'cop', 'governor': 6, 'governorGloss': 'table', 'dependent': 3, 'dependentGloss': 'is'}
{'dep': 'case', 'governor': 6, 'governorGloss': 'table', 'dependent': 4, 'dependentGloss': 'on'}
{'dep': 'det', 'governor': 6, 'governorGloss': 'table', 'dependent': 5, 'dependentGloss': 'the'}

(ROOT table (case on) (cop is) (det the) (nsubj cat (det The)))

(ROOT table (case on) (cop is) (det the) (nsubj cat (det The)))
table
(case on)
(cop is)
(det the)
(nsubj cat (det The))

           ROOT                  
   _________|_____________        
  |    |    |    |      nsubj    
  |    |    |    |    ____|____   
  |   case cop  det  |        det
  |    |    |    |   |         |  
table  on   is  the ca

In [7]:
# Same frame with a lexical verb: in the output below "sleeps" receives the
# ROOT relation and "table" hangs under it with the `nmod` label.
text = "The cat sleeps on the table"
test(text, nlp)

{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 3, 'dependentGloss': 'sleeps'}
{'dep': 'det', 'governor': 2, 'governorGloss': 'cat', 'dependent': 1, 'dependentGloss': 'The'}
{'dep': 'nsubj', 'governor': 3, 'governorGloss': 'sleeps', 'dependent': 2, 'dependentGloss': 'cat'}
{'dep': 'case', 'governor': 6, 'governorGloss': 'table', 'dependent': 4, 'dependentGloss': 'on'}
{'dep': 'det', 'governor': 6, 'governorGloss': 'table', 'dependent': 5, 'dependentGloss': 'the'}
{'dep': 'nmod', 'governor': 3, 'governorGloss': 'sleeps', 'dependent': 6, 'dependentGloss': 'table'}

(ROOT sleeps (nmod table (case on) (det the)) (nsubj cat (det The)))

(ROOT sleeps (nmod table (case on) (det the)) (nsubj cat (det The)))
sleeps
(nmod table (case on) (det the))
(nsubj cat (det The))

             ROOT                  
   ___________|_____________        
  |          nmod         nsubj    
  |       ____|____     ____|____   
  |      |   case det  |        det
  |      |    |    |   | 

In [8]:
# Longer sentence with punctuation and a contraction: in the output below
# "they're" appears as two tokens, "they" (nsubj) and "'re" (aux), and the
# comma gets a `punct` edge.
text = "This time around, they're moving even faster."
test(text, nlp)

{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 7, 'dependentGloss': 'moving'}
{'dep': 'det', 'governor': 2, 'governorGloss': 'time', 'dependent': 1, 'dependentGloss': 'This'}
{'dep': 'nmod:tmod', 'governor': 7, 'governorGloss': 'moving', 'dependent': 2, 'dependentGloss': 'time'}
{'dep': 'advmod', 'governor': 2, 'governorGloss': 'time', 'dependent': 3, 'dependentGloss': 'around'}
{'dep': 'punct', 'governor': 7, 'governorGloss': 'moving', 'dependent': 4, 'dependentGloss': ','}
{'dep': 'nsubj', 'governor': 7, 'governorGloss': 'moving', 'dependent': 5, 'dependentGloss': 'they'}
{'dep': 'aux', 'governor': 7, 'governorGloss': 'moving', 'dependent': 6, 'dependentGloss': "'re"}
{'dep': 'advmod', 'governor': 9, 'governorGloss': 'faster', 'dependent': 8, 'dependentGloss': 'even'}
{'dep': 'advmod', 'governor': 7, 'governorGloss': 'moving', 'dependent': 9, 'dependentGloss': 'faster'}
{'dep': 'punct', 'governor': 7, 'governorGloss': 'moving', 'dependent': 10, 'dependentGloss':