# "Better" boundary-based feature for constituency parses

Following suggestions brought up in meeting on 7/9/20.

Andrew Yang 7/11/20

In [1]:
import sys, os, json, string
import spacy
import numpy as np
import pandas as pd
from collections import defaultdict
from benepar.spacy_plugin import BeneparComponent

# Load spaCy's small English pipeline, then append the benepar component so that
# parsed documents also carry a constituency parse.
# NOTE(review): the `._.parse_string` and `._.parent` attributes used in later cells
# are presumably registered by this component -- confirm against the benepar docs.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(BeneparComponent('benepar_en'))

Instructions for updating:
non-resource variables are not supported in the long term


First, load an example sentence and generate a constituency parse for it.

In [2]:
# Read the passages file: one "<item>\t<passage>" record per line. Only the text
# before the first '#' of each passage is kept.
item_to_passage = {}
with open('data/moby-passages-36/passages-with-line-breaks.tsv') as tsv_file:
    for record in tsv_file:
        item_id, raw_passage = record.strip().split('\t')
        cutoff = raw_passage.index('#')
        item_to_passage[item_id] = raw_passage[:cutoff]

# Strip the '$' markup: '$$' becomes a space, a single '$' is removed.
# NOTE(review): presumably these mark line breaks (see the file name) -- confirm.
item_to_passage_cleaned = {}
for item_id, text in item_to_passage.items():
    item_to_passage_cleaned[item_id] = text.replace('$$', ' ').replace('$', '')

# Parse passage 330 and keep its last sentence as the running example.
doc_330 = nlp(item_to_passage_cleaned['330'])
sent = list(doc_330.sents)[-1]

The following constituency parse is hard to interpret:

In [3]:
# Print the bracketed parse of the example sentence, with parentheses swapped for square brackets.
print(sent._.parse_string.replace('(', '[').replace(')', ']'))

[S [NP [NNP Sam] [CC and] [NNP Jo]] [VP [VBD smiled] [, ,] [S [VP [VBG knowing] [SBAR [S [NP [DT the] [NNS squirrels]] [VP [VBD were] [ADJP [JJ safe] [PP [IN with] [NP [PRP$ their] [NN mother]]]]]]]]]] [. .]]


<img src='misc/fig-parse-tree-last-sentence.png' width=600>

Some of the constituents identified in the parse tree are arcane. Why does it matter that *the squirrels were safe with their mother* is an **S** contained inside an **SBAR**?

Below is a more interpretable way of analyzing constituents: group them by their depth beneath the sentence root. The resulting groups are more "reasonable-sounding" than the constituents read directly off the constituency parse.

In [4]:
def compile_parent_lists(sent):
    '''
    Group the constituents of a parsed sentence by their depth beneath the root.

    For every token the chain of ancestors is collected by following
    ``._.parent`` until it is ``None``. The topmost ancestor of a token is
    at depth 0, the ancestor just below it at depth 1, and so on.

    Args:
        sent: iterable of tokens; each token (and each ancestor) exposes
            ``._.parent``, which is the enclosing constituent or ``None``.

    Returns:
        defaultdict(set) mapping depth -> set of constituents at that depth.
        Keys are created in increasing order (0, 1, 2, ...). The mapping is
        empty when ``sent`` has no tokens or no token has a parent.
    '''
    # ._.children field is hard to use, so walk upwards from every token instead
    level_to_constituents = defaultdict(set)
    for token in sent:
        ancestors = []
        curr = token._.parent
        while curr is not None:
            ancestors.append(curr)
            curr = curr._.parent
        # `ancestors` runs from the nearest parent up to the root; reversing it
        # makes the position in the list equal to the depth beneath the root.
        for level, constituent in enumerate(reversed(ancestors)):
            level_to_constituents[level].add(constituent)
    return level_to_constituents

In [5]:
# For every sentence of passage 330, list the constituents found at each depth.
for sentence in doc_330.sents:
    constituents_by_depth = compile_parent_lists(sentence)
    for depth, constituents in constituents_by_depth.items():
        print('depth', depth, constituents)
    print()

depth 0 {Sam and Jo went for a hike.}
depth 1 {went for a hike, Sam and Jo}
depth 2 {for a hike}
depth 3 {a hike}

depth 0 {They took a path through the woods.}
depth 1 {took a path through the woods}
depth 2 {a path through the woods}
depth 3 {a path, through the woods}
depth 4 {the woods}

depth 0 {Suddenly, Sam heard a noise coming from the tree above their heads.}
depth 1 {heard a noise coming from the tree above their heads}
depth 2 {a noise coming from the tree above their heads}
depth 3 {coming from the tree above their heads, a noise}
depth 4 {from the tree above their heads}
depth 5 {the tree above their heads}
depth 6 {the tree, above their heads}
depth 7 {their heads}

depth 0 {Jo climbed up to see what the noise was and found two baby squirrels.}
depth 1 {climbed up to see what the noise was and found two baby squirrels}
depth 2 {climbed up to see what the noise was, found two baby squirrels}
depth 3 {two baby squirrels, to see what the noise was}
depth 4 {see what the nois

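Using these depth-grouped constituents, I propose the following family of features: for each depth *k*, a token is marked 1 if it and the preceding token fall inside the same depth-*k* constituent, and 0 otherwise (so a 0 marks a boundary).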

In [6]:
def is_contained_in_constituent(token, const):
    '''
    Return True if `const` is one of the ancestors of `token`.

    Ancestors are reached by repeatedly following ``._.parent``; the token
    itself is never compared. An ancestor only matches when it has exactly
    the same type as `const` and compares equal to it.
    '''
    ancestor = token._.parent
    while ancestor is not None:
        # Only compare objects of the same type.
        if type(ancestor) == type(const) and ancestor == const:
            return True
        ancestor = ancestor._.parent
    return False

In [7]:
def depth_k_sisters(sent, k=1):
    '''
    Flag, for each token, whether it shares a depth-k constituent with the
    token immediately before it.

    Args:
        sent: parsed sentence; must support ``len()``, integer indexing and
            iteration over its tokens.
        k: depth beneath the root (0 = root level) whose constituents are
            used, as grouped by `compile_parent_lists`.

    Returns:
        A list with one int per token: 1 if the token and its predecessor
        are both contained in the same depth-k constituent, else 0. The
        first token has no predecessor and is always 0. An empty sentence
        yields an empty list.
    '''
    # One flag per token: nothing to report (and nothing to parse) without tokens.
    if len(sent) == 0:
        return []
    levels_to_constituents = compile_parent_lists(sent)
    # The candidate constituents are the same for every token pair, so look them up once.
    constituents = levels_to_constituents[k]
    rv = [0]
    for idx in range(1, len(sent)):
        token = sent[idx]
        prev_token = sent[idx - 1]
        shared = any(
            is_contained_in_constituent(token, c) and is_contained_in_constituent(prev_token, c)
            for c in constituents
        )
        rv.append(1 if shared else 0)
    return rv

In [8]:
def pretty_print_sisters(sent, k):
    '''
    Print the depth-k sister flags of `sent`, then the sentence itself with a
    line break before every token whose flag is 0.

    Tokens are separated by single spaces; no newline is printed after the
    last token.
    '''
    flags = depth_k_sisters(sent, k)
    print(flags)
    for position, flag in enumerate(flags):
        # A 0 flag starts a new group, hence a new output line.
        lead = '' if flag != 0 else '\n'
        print(lead + str(sent[position]), end=' ')

In [9]:
# k=0 is the root level: every token after the first is grouped with its predecessor.
pretty_print_sisters(sent, 0)

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Sam and Jo smiled , knowing the squirrels were safe with their mother . 

In [10]:
# k=1: the sentence splits into "Sam and Jo", the rest of the clause, and the final period.
pretty_print_sisters(sent, 1)

[0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

Sam and Jo 
smiled , knowing the squirrels were safe with their mother 
. 

In [11]:
# k=2: only "knowing the squirrels were safe with their mother" is still a multi-token group.
pretty_print_sisters(sent, 2)

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing the squirrels were safe with their mother 
. 

In [12]:
# k=3: "knowing" splits off; "the squirrels were safe with their mother" stays together.
pretty_print_sisters(sent, 3)

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing 
the squirrels were safe with their mother 
. 

In [13]:
# k=4: "the squirrels" and "were safe with their mother" become separate groups.
pretty_print_sisters(sent, 4)

[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing 
the squirrels 
were safe with their mother 
. 

In [14]:
# k=5: only "safe with their mother" remains grouped.
pretty_print_sisters(sent, 5)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing 
the 
squirrels 
were 
safe with their mother 
. 

In [15]:
# k=6: only "with their mother" remains grouped.
pretty_print_sisters(sent, 6)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing 
the 
squirrels 
were 
safe 
with their mother 
. 

In [16]:
# k=7: only "their mother" remains grouped.
pretty_print_sisters(sent, 7)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

Sam 
and 
Jo 
smiled 
, 
knowing 
the 
squirrels 
were 
safe 
with 
their mother 
. 

In [17]:
def generate_table(doc):
    '''
    Build the token-by-depth table of sister flags for a whole document.

    Args:
        doc: parsed document whose ``.sents`` yields the sentences accepted
            by `compile_parent_lists` and `depth_k_sisters`.

    Returns:
        pandas.DataFrame with one row per token (indexed by the token text,
        in document order; labels may repeat) and one column
        ``depth_sister_<k>`` per depth k, for k from 0 up to the deepest
        level found in any sentence. Each cell is the flag
        `depth_k_sisters` assigns to that token at that depth. A document
        without sentences yields an empty frame.
    '''
    # Materialise the sentences once instead of re-reading doc.sents on every pass.
    sentences = list(doc.sents)
    tokens = [str(t) for s in sentences for t in s]
    # default=0 keeps a document without sentences from crashing max().
    doc_deepest = max((len(compile_parent_lists(s)) for s in sentences), default=0)
    # One row per depth, each the concatenation of the per-sentence flag lists.
    rows = [
        [flag for s in sentences for flag in depth_k_sisters(s, depth)]
        for depth in range(doc_deepest)
    ]
    # The explicit reshape keeps the array 2-D even when there are no rows,
    # and fails loudly if the flag count ever disagrees with the token count.
    matrix = np.array(rows, dtype=int).reshape(doc_deepest, len(tokens)).transpose()
    columns = ['depth_sister_' + str(depth) for depth in range(doc_deepest)]
    return pd.DataFrame(matrix, index=tokens, columns=columns)

In [18]:
# Feature table for passage 330: one row per token, one depth_sister_<k> column per depth.
df = generate_table(doc_330)

In [19]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,depth_sister_0,depth_sister_1,depth_sister_2,depth_sister_3,depth_sister_4,depth_sister_5,depth_sister_6,depth_sister_7
Sam,0,0,0,0,0,0,0,0
and,1,1,0,0,0,0,0,0
Jo,1,1,0,0,0,0,0,0
went,1,0,0,0,0,0,0,0
for,1,1,0,0,0,0,0,0


In [20]:
# df.to_csv('output/20200711_text_matrix_330_addendum_depth_k_sisters.tsv', sep='\t')