In [1]:
# %load debug
import sys
import render_tree, init, treebanks, parse_errors, head_finder, tree_transform
import pstree_debug
from collections import defaultdict
from StringIO import StringIO
from difflib import SequenceMatcher
from nltk.tree import Tree

In [12]:
def replace_words(ptb_tree, ms_words, i1, i2, j1, j2):
    """Overwrite the words on leaves i1 .. i2-1 of ptb_tree in place.

    The leaf at position i1 + k receives ms_words[j1 + k]. The caller is
    expected to pass ranges of equal length (i2 - i1 == j2 - j1); j2 is
    accepted for symmetry with the other edit helpers but is not read.

    ptb_tree  -- tree object providing get_nodes(start=, end=) and
                 check_consistency()
    ms_words  -- list of replacement words
    i1, i2    -- half-open range of leaf positions in ptb_tree
    j1, j2    -- half-open range of positions in ms_words

    Raises AssertionError if the last node returned for a one-word span is
    not a terminal.
    """
    for k in range(i2 - i1):
        # Leaf position is i1 + k. (The previous code used
        # enumerate(range(i1, i2)) and then added k again, so it visited
        # i1 + 2k and skipped every other word.)
        position = i1 + k
        covering = ptb_tree.get_nodes(start=position, end=position + 1)
        # The last node returned for the one-word span is taken as the leaf.
        leaf = covering[-1]
        assert leaf.is_terminal()
        leaf.word = ms_words[j1 + k]
    ptb_tree.check_consistency()

# delete one word at a time
def delete_word(ptb_tree, i1):
    """Remove the word at position i1 from ptb_tree and renumber spans.

    Every node whose span is exactly (i1, i1+1) is passed to
    tree_transform.remove_node_by_node(node, True). Afterwards each span
    boundary in the tree that lies strictly to the right of i1 is moved one
    position left, and the tree's consistency check is run.
    """
    for doomed in ptb_tree.get_nodes(start=i1, end=i1+1):
        # NOTE(review): the meaning of the second argument (True) is defined
        # in tree_transform, which is not visible here.
        tree_transform.remove_node_by_node(doomed, True)

    # Close the gap left by the deleted word.
    for node in ptb_tree.get_nodes():
        start, end = node.span
        shift_start = start > i1
        shift_end = end > i1
        if shift_start or shift_end:
            node.span = (start - 1 if shift_start else start,
                         end - 1 if shift_end else end)

    ptb_tree.check_consistency()

def delete_words(ptb_tree, i1, i2):
    """Delete the words at positions i1 .. i2-1 from ptb_tree, one by one.

    Positions are visited from right to left, so removing a word never
    shifts the index of a word that is still waiting to be deleted.
    """
    for position in range(i2 - 1, i1 - 1, -1):
        delete_word(ptb_tree, position)

def insert_words(ptb_tree, ms_words, i1, i2, j1, j2):
    """Insert ms_words[j1:j2] into ptb_tree in front of word position i1.

    The new words are wrapped as '(X (XX w1) (XX w2) ... )', parsed with
    pstree_debug.tree_from_text, and attached immediately after the highest
    existing node whose span ends at the insertion point, as a child of that
    node's parent. Spans of the existing tree are shifted right by j2 - j1
    to make room. i2 is accepted for symmetry with the other edit helpers
    but is not read.
    """
    # Build the placeholder subtree for the inserted words.
    wrapped = ['(XX ' + word + ')' for word in ms_words[j1:j2]]
    node_to_add = pstree_debug.tree_from_text('(X ' + ' '.join(wrapped) + ' )')

    # Move the new subtree's spans from 0-based to the insertion point.
    for fresh in node_to_add.get_nodes():
        lo, hi = fresh.span
        fresh.span = (lo + i1, hi + i1)

    # Make room in the original tree: starts at or after i1 and ends
    # strictly after i1 move right by the number of inserted positions.
    offset = j2 - j1
    for node in ptb_tree.get_nodes():
        start, end = node.span
        moves_start = start >= i1
        moves_end = end > i1
        if moves_start or moves_end:
            node.span = (start + offset if moves_start else start,
                         end + offset if moves_end else end)

    # Attach next to the left neighbour, under the neighbour's parent.
    sibling = ptb_tree.get_nodes(request='highest', end=node_to_add.span[0])
    print('%s %s' % (sibling.span, sibling.label))

    parent = sibling.parent
    node_to_add.parent = parent
    slot = parent.subtrees.index(sibling) + 1
    parent.subtrees.insert(slot, node_to_add)
    ptb_tree.check_consistency()

In [19]:
# Alternative sample, kept for quick switching:
#ms_file = '../samples/4936_A9_0.ms.bkout'
ms_file = '../samples/2015_B30_2.ms.bkout'
# Read inside a `with` block so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(ms_file) as ms_handle:
    ms_tree_candidates = ms_handle.readlines()
ms_tree_candidates = [x.strip() for x in ms_tree_candidates]
# Replace the first character of every candidate line with "(ROOT".
ms_tree_candidates = ["(ROOT"+ x[1:] for x in ms_tree_candidates]

# Bracketed PTB tree to be edited; the commented line matches the alternative sample.
#ptb_tree_mrg = "(ROOT (S (INTJ (UH Uh) ) (INTJ (UH uh) ) (NP-SBJ-1 (PRP$ my) (NN wife) ) (VP (VBZ has) (VP (VBN picked) (PRT (RB up) ) (NP (NP (DT a) (NN couple) ) (PP (IN of) (NP (NNS things) ) ) ) (S-ADV (VP (VBG saying) (INTJ (UH uh) ) (S-SEZ (INTJ (UH boy) ) (SBAR-ADV (IN if) (S (NP-SBJ (PRP we) ) (VP (MD could) (VP (VB refinish) (NP (DT that) ) ) ) ) ) (NP-SBJ (DT that) ) (VP (MD would) (VP (VB be) (NP-PRD (NP (DT a) (JJ beautiful) (NN piece) ) (PP (IN of) (NP (NN furniture) ) ) ) ) ) ) ) ) ) ) ) )"
ptb_tree_mrg = "(ROOT (S (NP-SBJ (PRP you) ) (VP (VP (VBD started) (PRT (RP off) ) ) (CC and) (VP (VBD said) ) ) ) )"

# Only the first candidate is parsed; its word sequence is the edit target.
temp = pstree_debug.tree_from_text(ms_tree_candidates[0])
ms_words = temp.word_yield(as_list=True)
ptb_tree = pstree_debug.tree_from_text(ptb_tree_mrg)
ptb_words = ptb_tree.word_yield(as_list=True)
# Lower-case the PTB words before they are compared with ms_words.
ptb_words = [x.lower() for x in ptb_words]


In [20]:
# Parse both bracketed strings with nltk's Tree so they can be drawn with
# pretty_print(); t2 is rendered in a later cell for comparison.
t1 = Tree.fromstring(ptb_tree_mrg)
#t1.pretty_print()

# Only the first candidate tree is parsed.
t2 = Tree.fromstring(ms_tree_candidates[0])
#t2.pretty_print()

In [21]:
# Align the two word sequences. get_opcodes() yields (tag, i1, i2, j1, j2)
# tuples describing how to turn a (ptb_words) into b (ms_words); the 'equal'
# stretches need no edit, so only the remaining opcodes are kept.
sseq = SequenceMatcher(None, ptb_words, ms_words)
ops = [info for info in sseq.get_opcodes() if info[0] != 'equal']

print(ops)

[('insert', 4, 4, 4, 5)]


In [22]:
# Apply the edits from the last opcode to the first, so the positions of the
# opcodes still to be processed are not shifted by earlier changes.
for op in reversed(ops):
    tag, i1, i2, j1, j2 = op
    print(tag)
    if tag == 'replace' and (i2 - i1) == (j2 - j1):
        # easy case: same number of words --> just substitute
        replace_words(ptb_tree, ms_words, i1, i2, j1, j2)
    elif tag == 'delete':
        delete_words(ptb_tree, i1, i2)
    else:
        # harder 'replace' case: swap ptb_words[i1:i2] for ms_words[j1:j2]
        # by deleting first; any other remaining tag only inserts.
        if tag == 'replace':
            delete_words(ptb_tree, i1, i2)
        insert_words(ptb_tree, ms_words, i1, i2, j1, j2)
    ptb_tree.check_consistency()

insert
(3, 4) CC


In [23]:
# Round-trip the edited tree through its string form so nltk can draw it.
s_new = str(ptb_tree)
t_new = Tree.fromstring(s_new)
t_new.pretty_print()

# Draw the first candidate tree (parsed earlier as t2) underneath for comparison.
t2.pretty_print()

                   ROOT             
                    |                
                    S               
   _________________|____            
  |                      VP         
  |              ________|_______    
  |             VP       |   |   |  
  |        _____|___     |   |   |   
NP-SBJ    |        PRT   |   X   VP 
  |       |         |    |   |   |   
 PRP     VBD        RP   CC  XX VBD 
  |       |         |    |   |   |   
 you   started     off  and you said

                ROOT                 
                 |                    
                 S                   
        _________|____________        
       S              |       |      
  _____|_____         |       |       
 |           VP       |       S      
 |      _____|___     |    ___|___    
 NP    |        PRT   |   NP      VP 
 |     |         |    |   |       |   
PRP   VBD        RP   CC PRP     VBD 
 |     |         |    |   |       |   
you started     off  and you     said



In [26]:
import transform_search
from classify_english import classify

# Collect the initial errors first, then run the greedy transformation
# search; the order is kept in case the search touches its arguments.
init_errors = parse_errors.get_errors(ptb_tree, temp)
iters, path = transform_search.greedy_search(temp, ptb_tree, classify)

# One error per line, a blank separator line, then one search step per line.
for e in init_errors:
    print(e)
print('')
for p in path:
    print(p)

('extra', (0, 1), 'NP-SBJ', (NP-SBJ (PRP you)))
('extra', (1, 6), 'VP', (VP (VP (VBD started) (PRT (RP off))) (CC and) (X (XX you)) (VP (VBD said))))
('extra', (4, 5), 'X', (X (XX you)))
('missing', (0, 1), 'NP', (NP (PRP you)))
('crossing', (0, 3), 'S', (S (NP (PRP you)) (VP (VBD started) (PRT (RP off)))))
('missing', (4, 5), 'NP', (NP (PRP you)))
('missing', (4, 6), 'S', (S (NP (PRP you)) (VP (VBD said))))

((ROOT (S (NP-SBJ (PRP you)) (VP (VP (VBD started) (PRT (RP off))) (CC and) (X (XX you)) (VP (VBD said))))), {'classified_type': 'UNSET init', 'type': 'init'}, 0)
((ROOT (S (NP (PRP you)) (VP (VP (VBD started) (PRT (RP off))) (CC and) (X (XX you)) (VP (VBD said))))), {'family': ['NP', 'VP'], 'auto preterminal span': (0, 1), 'span': (0, 1), 'subtrees': ['PRP'], 'parent': 'S', 'over_word': True, 'auto preterminals': ('PRP',), 'classified_type': 'Single Word Phrase', 'type': 'relabel', 'change': ('NP-SBJ', 'NP')}, 2)
((ROOT (S (NP (PRP you)) (VP (VP (VBD started) (PRT (RP off))) (CC 