# Auxiliary Sentences from Dependency Parsing Results

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from copy import deepcopy
import pickle
import json
from tqdm import tqdm
from pprint import pprint

import spacy

In [2]:
# Plot styling: apply the seaborn dark-grid theme, then set the matplotlib
# defaults used by every figure in this notebook (same order as before, so the
# explicit values below are applied after the theme).
sns.set(style='darkgrid')
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.size': 18,
    'figure.figsize': (10, 5),
})

In [3]:
# Setup for spacy
# Downloads the small English pipeline via a shell command, then loads it.
# NOTE(review): `!python` may resolve to a different interpreter than the one
# running this kernel — confirm the model lands in the kernel's environment.
# NOTE: the global is spelled `scapy_nlp` (sic); create_auxiliary_text looks it
# up under exactly this name, so do not rename it here alone.
!python -m spacy download en_core_web_sm
scapy_nlp = spacy.load("en_core_web_sm")

2023-04-11 10:39:48.403004: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-11 10:39:49.261909: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/home/utkarsh-am/opt/openmpi/lib
2023-04-11 10:39:49.262007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/home/utkarsh-am/opt/openmpi/lib
2023-04-11 1

In [4]:
# For caching objects

def load_obj(file_path):
    """Read back an object that was previously pickled to disk.

    :param file_path: Location of the pickle file to read
    :type file_path: string
    :return: The unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

def save_obj(obj, file_path):
    """Pickle an object and write it to the given path.

    :param obj: Object to serialise
    :param file_path: Destination path; an existing file is overwritten
    :type file_path: string
    :return: None
    """
    with open(file_path, mode='wb') as handle:
        pickle.dump(obj, handle)

In [5]:
# Read the three splits of the LOGIC dataset (train, dev, test) from CSV.

train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/train.csv',
        './dataset/dev.csv',
        './dataset/test.csv',
    )
)

In [6]:
# Pull the text, label and masked-text columns of each split into plain lists.
train_texts, train_labels, train_masked_texts = (
    list(train_df[column]) for column in ('text', 'label', 'masked_text')
)

dev_texts, dev_labels, dev_masked_texts = (
    list(dev_df[column]) for column in ('text', 'label', 'masked_text')
)

test_texts, test_labels, test_masked_texts = (
    list(test_df[column]) for column in ('text', 'label', 'masked_text')
)

In [7]:
# Load the results from dependency parsing
# The pickle is unpacked as (train, test, dev) — note that this differs from
# the train/dev/test order in which the CSV splits are read.
# NOTE: unpickling can execute arbitrary code; this file is assumed to be a
# trusted, locally produced cache.

train_parsed, test_parsed, dev_parsed = load_obj('./dataset/dependency_parsing_results.pkl')
print(f'#train: {len(train_parsed)}, #test: {len(test_parsed)}, #dev: {len(dev_parsed)}')

#train: 1849, #test: 300, #dev: 300


In [8]:
# Check that every parsed entry sits at the same index as the row of the
# corresponding dataframe it was produced from. All three splits are checked,
# because each of them is later paired with its parsed results by position.

for split_name, split_texts, split_parsed in (
    ('train', train_texts, train_parsed),
    ('test',  test_texts,  test_parsed),
    ('dev',   dev_texts,   dev_parsed),
):
    # Without this, a shorter parsed list raises IndexError and a longer one
    # passes silently.
    assert len(split_texts) == len(split_parsed), (
        f'{split_name}: {len(split_texts)} texts but {len(split_parsed)} parsed entries'
    )
    for i, split_text in enumerate(split_texts):
        # The first element of a parsed entry is compared against the
        # lower-cased dataframe text.
        assert split_text.lower() == split_parsed[i][0], (
            f'{split_name}: parsed entry {i} does not match the dataframe text'
        )

In [9]:
def transform_data_structure(parsed_result):
    """Group each text's dependency paths and order them by frequency.

    Every entry of ``parsed_result`` has the form::

        (text, ((path_1, words_1, indices_1, orient_1),
                (path_2, words_2, indices_2, orient_2), ...))

    and is turned into a dictionary::

        {
            path_i: [(words_1, indices_1, orient_1), (words_2, indices_2, orient_2), ...],
        }

    whose keys are ordered by decreasing number of occurrences; paths with the
    same count keep the order in which they were first seen. The ``text`` is
    NOT kept in the output: results line up with the input by position only.

    :param parsed_result: iterable of ``(text, dependency_paths)`` pairs; every
        path is used as a dictionary key and must therefore be hashable
    :return: one ``path -> occurrences`` dictionary per input entry
    :rtype: list of dict
    """
    transformed_result = []

    for _text, dependency_paths in parsed_result:
        # Collect all (words, indices, orient) occurrences under their path.
        occurrences_by_path = {}
        for path, words, indices, orient in dependency_paths:
            occurrences_by_path.setdefault(path, []).append((words, indices, orient))

        # sorted() is stable even with reverse=True, so equally frequent paths
        # stay in first-seen order; dict() then preserves that ranking.
        ranked = sorted(
            occurrences_by_path.items(),
            key=lambda item: len(item[1]),
            reverse=True,
        )
        transformed_result.append(dict(ranked))

    return transformed_result

In [10]:
# Re-shape the parsed results of every split into frequency-ordered dicts.
train_parsed_ord, test_parsed_ord, dev_parsed_ord = map(
    transform_data_structure,
    (train_parsed, test_parsed, dev_parsed),
)

In [15]:
def create_auxiliary_text(text, parsed, n=-1, masked=False):
    """Create auxiliary sentences from dependency parsing

    Each dependency path in ``parsed`` becomes one auxiliary sentence: every
    label along the path is expanded with ``spacy.explain`` and the
    explanations are joined with ' leads to '. Labels for which
    ``spacy.explain`` returns nothing are skipped.

    text:
        input text; it is lower-cased and tokenized with the spaCy pipeline

    parsed:
        mapping whose keys are dependency paths (iterables of labels); paths
        are consumed in the mapping's iteration order

    n:
        number of auxiliary texts to generate, -1 for all

    masked:
        accepted for interface compatibility, but currently has NO effect:
        the sentences are built from the path labels only, so no words of the
        text are referenced and nothing is replaced by [MASK_i]

    returns:
        (main_txt, aux_txt): the tokens of the lower-cased text joined by
        single spaces, and the auxiliary sentences joined by '. ' with a
        trailing '.'
    """
    doc = scapy_nlp(text.lower())
    main_txt = ' '.join(token.text for token in doc)

    auxiliary_texts = []
    for path in parsed.keys():
        # n == -1 never equals a length, so every path is used in that case.
        if len(auxiliary_texts) == n:
            break

        explanations = (spacy.explain(label) for label in path)
        # spacy.explain gives None for labels it does not know; drop those.
        auxiliary_texts.append(
            ' leads to '.join(explained for explained in explanations if explained)
        )

    aux_txt = '. '.join(auxiliary_texts) + '.'

    return main_txt, aux_txt


In [17]:
# Preview the output for a single training example, using only its most
# frequent dependency path.
main_text, aux_text = create_auxiliary_text(
    text=train_texts[11],
    parsed=train_parsed_ord[11],
    n=1,
    masked=False,
)

pprint(main_text)
pprint(aux_text)

("you oppose a senator 's proposal to extend government - funded health care "
 'to poor minority children because that senator is a liberal democrat .')
'adverbial clause modifier leads to nominal subject.'


In [18]:
def create_auxiliary_texts(texts, parsed_ord, n=-1, masked=False):
    """Build the (main text, auxiliary text) pair for every entry of a split.

    texts:
        train_texts / test_texts / dev_texts

    parsed_ord:
        train_parsed_ord / test_parsed_ord / dev_parsed_ord; must have the
        same length as ``texts`` and be aligned with it by position

    n:
        number of auxiliary texts to generate per entry, -1 for all

    masked:
        forwarded unchanged to create_auxiliary_text

    returns:
        list with one create_auxiliary_text result per entry, in input order
    """
    assert len(texts) == len(parsed_ord)

    return [
        create_auxiliary_text(entry_text, entry_paths, n, masked)
        for entry_text, entry_paths in zip(texts, parsed_ord)
    ]

In [19]:
# Generate and cache the auxiliary sentences built from the top-1, top-2,
# top-3, top-4, top-5, top-10, top-20 and all (-1) dependency paths, once per
# value of the `masked` flag. Each pickle holds a (train, test, dev) tuple.
# NOTE(review): create_auxiliary_text does not appear to read `masked`, so the
# two files written for a given n are presumably identical — confirm.

N = [1, 2, 3, 4, 5, 10, 20, -1]

for n in tqdm(N):
    for masked in (False, True):
        train_aux, test_aux, dev_aux = (
            create_auxiliary_texts(split_texts, split_parsed_ord, n=n, masked=masked)
            for split_texts, split_parsed_ord in (
                (train_texts, train_parsed_ord),
                (test_texts,  test_parsed_ord),
                (dev_texts,   dev_parsed_ord),
            )
        )
        result = (train_aux, test_aux, dev_aux)
        save_obj(result, f'./dataset/aux-sentences-n-{n}-masked-{masked}.pkl')

100%|██████████| 8/8 [05:15<00:00, 39.45s/it]
