In [130]:
import sys, os, json, string

import spacy
import pandas as pd
from benepar.spacy_plugin import BeneparComponent

In [2]:
# Read the passages TSV into {item: passage}. Every row must be exactly
# "<item>\t<passage>"; only the text before the passage's first '#' is kept.
item_to_passage = dict()
with open('data/moby-passages-36/passages-with-line-breaks.tsv') as f:
    for line in f:
        item, passage = line.strip().split('\t')
        marker_at = passage.index('#')  # ValueError if the row has no '#'
        item_to_passage[item] = passage[:marker_at]

In [3]:
# Same mapping with the '$' markup removed: '$$' becomes a single space and
# any remaining '$' is deleted.
# NOTE(review): presumably '$'/'$$' encode the line breaks named in the source
# file — confirm against the data.
item_to_passage_cleaned = {
    key: text.replace('$$', ' ').replace('$', '')
    for key, text in item_to_passage.items()
}

In [4]:
# Load the 'en_core_web_sm' spaCy model and add the benepar constituency
# parser ('benepar_en') to its pipeline.
# NOTE(review): passing a component instance to add_pipe looks like the
# spaCy 2.x API — confirm the pinned spaCy/benepar versions.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(BeneparComponent('benepar_en'))

In [5]:
# Run the pipeline over the cleaned text stored under item '330'.
doc_330 = nlp(item_to_passage_cleaned['330'])

In [105]:
def parse_string_to_list(s):
    """Convert a bracketed parse string into the source text of a nested list.

    ``"(NP (NNP Moby))"`` becomes ``"['NP', ['NNP', 'Moby']]"``: every
    parenthesised group turns into a list literal and every
    whitespace-separated atom (label, tag or word) into a string literal.

    Atoms are quoted with ``repr``, so atoms that contain quote characters
    (``'s``, ``n't``), hyphens (``-LRB-``) or any other symbol still yield a
    valid literal, and a word sitting directly before the final ``)`` is
    quoted like any other.

    Args:
        s: Parse string using ``(`` / ``)`` for structure and whitespace
            between atoms.

    Returns:
        A string holding a Python list literal (``''`` for empty input). It
        contains only lists and string literals, so ``ast.literal_eval`` is
        sufficient to evaluate it.
    """
    # Classic s-expression tokenisation: isolate the brackets, then split on
    # whitespace so each token is '(', ')' or one atom.
    tokens = s.replace('(', ' ( ').replace(')', ' ) ').split()

    pieces = []
    prev = None
    for tok in tokens:
        # A separator goes before every item except the first one of a list.
        if tok != ')' and prev is not None and prev != '(':
            pieces.append(', ')
        if tok == '(':
            pieces.append('[')
        elif tok == ')':
            pieces.append(']')
        else:
            pieces.append(repr(tok))
        prev = tok
    return ''.join(pieces)

In [106]:
# Worked example: turn the first sentence's parse string into nested lists.
parse_string = list(doc_330.sents)[0]._.parse_string
list_source = parse_string_to_list(parse_string)
# NOTE(review): eval() runs list_source as Python code. That is tolerable for
# parser output derived from a local file, but ast.literal_eval would be the
# safer way to read a pure list-of-strings literal.
lst = eval(list_source)

In [60]:
def read_pos(root, pos_list):
    """Append the tag of every pre-terminal node under ``root`` to ``pos_list``.

    A node is an indexable ``[tag, ...]``. When its second element is a
    string the node is treated as a pre-terminal and its tag is recorded;
    otherwise every element after the tag is visited as a child node.
    Tags are appended in left-to-right (pre-order) order. ``pos_list`` is
    mutated in place and nothing is returned.
    """
    pending = [root]
    while pending:
        node = pending.pop()
        label = node[0]
        if isinstance(node[1], str):
            pos_list.append(label)
            continue
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(node[1:]))

In [61]:
# Collect the tags read_pos finds in the example tree `lst`; the bare name on
# the last line makes the notebook display the resulting list.
pos_list = []
read_pos(lst, pos_list)
pos_list

['NNP', 'CC', 'NNP', 'VBD', 'IN', 'DT', 'NN', '.']

In [95]:
def read_parent_tags(child):
    """Collect ancestor labels for ``child``, nearest ancestor first.

    Starting at ``child``, follow ``._.parent`` upwards until it is falsy.
    At each step the parent's last label (``parent._.labels[-1]``) is
    recorded, but only when the node being left has at most one label of
    its own. The last recorded label is dropped from the result.

    NOTE(review): dropping the last entry presumably removes the root
    label — confirm this is still right when the topmost step was skipped
    because the node below the root carried several labels.
    """
    collected = []
    node = child
    parent = node._.parent
    while parent:
        # Read the parent's label before the check, as any lookup error
        # should surface regardless of whether the label is kept.
        label = parent._.labels[-1]
        if len(node._.labels) <= 1:
            collected.append(label)
        node = parent
        parent = node._.parent
    return collected[:-1]

In [96]:
# Print the ancestor-label list of each token in the last sentence of the
# passage, one list per line.
last_sentence = list(doc_330.sents)[-1]
for token in last_sentence:
    print(read_parent_tags(token))

['NP']
['NP']
['NP']
['VP']
['VP']
['VP']
['NP', 'S']
['NP', 'S']
['VP', 'S']
['ADJP', 'VP', 'S']
['PP', 'ADJP', 'VP', 'S']
['NP', 'PP', 'ADJP', 'VP', 'S']
['NP', 'PP', 'ADJP', 'VP', 'S']
[]


In [128]:
# Build one feature row per token of passage 330:
#   [token text, ADJP, ADVP, NP, PP, S-like, VP membership flags (0/1),
#    parent_depth, POS tag]
matrix = []
for sentence in doc_330.sents:
    # POS tags are read from the sentence's parse string, one per pre-terminal.
    # NOTE(review): eval() executes the text returned by parse_string_to_list;
    # acceptable for parser output from a local file, but ast.literal_eval
    # would be the safer way to read it.
    parse_string = sentence._.parse_string
    parse_list = eval(parse_string_to_list(parse_string))
    pos_list = []
    read_pos(parse_list, pos_list)

    # Verify alignment *before* pairing, so a tokenisation mismatch fails here
    # and cannot shift tags onto the wrong tokens.
    assert len(pos_list) == len(sentence), (
        'parse yielded %d POS tags for %d tokens' % (len(pos_list), len(sentence))
    )

    for token, pos_tag in zip(sentence, pos_list):
        ancestor_tags = read_parent_tags(token)
        # Set for fast membership tests only; the depth must come from the
        # list, because a set would collapse repeated labels (e.g. nested NPs).
        parent_tags = set(ancestor_tags)

        row = [
            str(token),
            int('ADJP' in parent_tags),
            int('ADVP' in parent_tags),
            int('NP' in parent_tags),
            int('PP' in parent_tags),
            int(bool({'S', 'SBAR'} & parent_tags)),
            int('VP' in parent_tags),
            len(ancestor_tags),
            pos_tag,
        ]
        matrix.append(row)

In [144]:
# One DataFrame row per entry of `matrix`; no column names are given here.
df = pd.DataFrame(matrix)

In [145]:
# Name the columns in the same order the row values were assembled:
# token text, six constituent-membership flags, depth, POS tag.
df.columns = [
    'token',
    'constituent_of_ADJP', 'constituent_of_ADVP', 'constituent_of_NP',
    'constituent_of_PP', 'constituent_of_S_like', 'constituent_of_VP',
    'parent_depth',
    'pos_tag',
]

In [146]:
# Append one 0/1 indicator column per POS tag ('pos_<TAG>').
# dtype=int keeps the indicators numeric on every pandas version (newer
# releases default to bool), matching the other 0/1 feature columns.
pos_dummies = pd.get_dummies(df['pos_tag'], prefix='pos', dtype=int)
# Drop indicator columns left over from an earlier run of this cell so that
# re-executing it does not duplicate them.
df = pd.concat(
    [df.drop(columns=pos_dummies.columns, errors='ignore'), pos_dummies],
    axis=1,
)

In [148]:
# Write the feature table as tab-separated text.
# NOTE(review): `index` is not passed, so the row index is presumably written
# as a leading unnamed column — confirm downstream readers expect it.
df.to_csv('data/20200709_330_text_matrix_addendum.tsv', sep='\t')