# Learning representations of grammatical relations in dependency-parsed trees using node2vec

In [1]:
# Install node2vec
!pip install node2vec

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Install spacy
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from collections import deque
from copy import deepcopy
import pickle
import json
from tqdm import tqdm
from pprint import pprint
from functools import lru_cache

import spacy
import nltk 
import networkx as nx
from node2vec import Node2Vec

In [4]:
# Plot styling: the seaborn theme is applied first, then the matplotlib
# defaults are overridden in a single update.
sns.set(style='darkgrid')
matplotlib.rcParams.update({
    'figure.dpi': 300,
    'font.size': 18,
    'figure.figsize': (10, 5),
})

In [5]:
# Setup for nltk
nltk.download('punkt') # For tokenizers
nltk.download('stopwords')
nltk.download('wordnet') # For lemmatizers
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

# Setup for spacy
!python -m spacy download en_core_web_sm
scapy_nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /home/utkarsh-am/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/utkarsh-
[nltk_data]     am/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/utkarsh-
[nltk_data]     am/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/utkarsh-
[nltk_data]     am/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


2023-05-08 23:10:18.264183: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-08 23:10:19.075755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/home/utkarsh-am/opt/openmpi/lib
2023-05-08 23:10:19.075830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/home/utkarsh-am/opt/openmpi/lib
2023-05-08 2

In [6]:
# For caching objects

def load_obj(file_path):
    """Read back an object that was previously pickled to disk.

    :param file_path: Location of the pickle file to read
    :type file_path: string
    :return: The object reconstructed from the pickle file
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(file_path, mode='rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

def save_obj(obj, file_path):
    """Pickle an object and write it to the given location.

    :param obj: Object to serialise
    :param file_path: Destination of the pickle file (overwritten if present)
    :type file_path: string
    :return: Whatever ``pickle.dump`` returns (``None``)
    """
    with open(file_path, mode='wb') as pickle_file:
        outcome = pickle.dump(obj, pickle_file)
    return outcome

In [7]:
# Load the LOGIC dataset: one DataFrame per split, read in train / dev / test order

train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/dev.csv', './dataset/test.csv')
)

In [8]:
# Pull the three columns of interest out of every split as plain Python lists
train_texts, train_labels, train_masked_texts = (
    list(train_df[column]) for column in ('text', 'label', 'masked_text')
)

dev_texts, dev_labels, dev_masked_texts = (
    list(dev_df[column]) for column in ('text', 'label', 'masked_text')
)

test_texts, test_labels, test_masked_texts = (
    list(test_df[column]) for column in ('text', 'label', 'masked_text')
)

## Dependency parsing

In [9]:
# NLTK WordNet lemmatizer instance (not referenced again in the visible cells)
lemmatizer = WordNetLemmatizer()

In [10]:
# Relation co-occurrence graph accumulated over all parsed texts.
#   key   -> (relation_1, relation_2)
#   value -> weight of the edge between the two relations
global_relation_graph = {}

In [11]:
def add_to_global_graph(local_relation_graph):
    """Merge a per-text relation graph into `global_relation_graph`.

    Every edge weight of the local graph is added to the weight already
    stored globally for the same relation pair; pairs not seen before are
    created starting from zero.

    :param local_relation_graph: Local relation graph for a text,
        mapping (relation_1, relation_2) to an edge weight
    :type local_relation_graph: dict
    """
    for pair in local_relation_graph:
        accumulated = global_relation_graph.get(pair, 0)
        global_relation_graph[pair] = accumulated + local_relation_graph[pair]

In [12]:
def build_relation_graph(doc):
    """Build the relation co-occurrence graph of one parsed text and merge it
    into the global graph via `add_to_global_graph`.

    Two dependency relations co-occur when they touch the same token (as the
    token's incoming relation or as one of its outgoing relations). Every
    ordered pair of distinct relation occurrences at a token adds 1 to the
    weight of that (relation_1, relation_2) edge.

    :param doc: spaCy doc instance (an iterable of tokens exposing ``i``,
        ``head.i`` and ``dep_``)
    """
    tokens = list(doc)

    # Token index -> labels of all dependency edges incident to that token.
    incident = {index: [] for index in range(len(tokens))}
    for tok in tokens:
        label = tok.dep_
        if label != 'ROOT':  # the root has no incoming edge of its own
            incident[tok.head.i].append(label)  # parent side
            incident[tok.i].append(label)       # child side

    pair_weights = {}  # (rel1, rel2) => weight
    for labels in incident.values():
        for first_pos, first in enumerate(labels):
            for second_pos, second in enumerate(labels):
                # Compare positions, not labels: the same label occurring
                # twice at a token still forms a (label, label) pair.
                if first_pos != second_pos:
                    key = (first, second)
                    pair_weights[key] = pair_weights.get(key, 0) + 1

    add_to_global_graph(pair_weights)

In [13]:
def dependency_parsing(texts, n_process=8, batch_size=1000):
    """Dependency-parse a collection of texts and fold each parsed document
    into the relation graph through `build_relation_graph`.

    :param texts: list of comment body (text)
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    """
    parsed_stream = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    # `total` is passed so the progress bar can show completion over the stream.
    progress = tqdm(parsed_stream, total=len(texts))
    for parsed in progress:
        build_relation_graph(parsed)

In [14]:
# Lower-cased copies of every split; these are what gets dependency-parsed
train_texts_lc, test_texts_lc, dev_texts_lc = (
    [text.lower() for text in split_texts]
    for split_texts in (train_texts, test_texts, dev_texts)
)

In [15]:
# The parsing helpers accumulate into `global_relation_graph` in place, so
# start from an empty graph: otherwise re-running this cell would add every
# co-occurrence weight a second time.
global_relation_graph.clear()

# Split order is kept as train, test, dev: node ids are later assigned in
# order of first appearance in the graph.
dependency_parsing(train_texts_lc)
dependency_parsing(test_texts_lc)
dependency_parsing(dev_texts_lc)

100%|██████████| 1849/1849 [00:04<00:00, 461.40it/s] 
100%|██████████| 300/300 [00:01<00:00, 261.92it/s]
100%|██████████| 300/300 [00:01<00:00, 267.81it/s]


In [16]:
# Number of distinct ordered (relation_1, relation_2) pairs, i.e. directed edges
print(len(global_relation_graph))

1406


In [17]:
# Give every distinct relation label an integer id, numbered from 0 in order
# of first appearance among the edges of the global graph.
node_encoding = {}

for relation_pair in global_relation_graph:
    for relation in relation_pair:
        if relation not in node_encoding:
            node_encoding[relation] = len(node_encoding)

node_count = len(node_encoding)
print(node_count)

44


In [23]:
# Persist the relation-label -> node-id mapping; the graph (and hence the
# saved embeddings) identify nodes only by these integer ids.
save_obj(node_encoding, './dataset/node-encoding.pkl')

In [18]:
# Directed, weighted graph over relation ids: one edge per (relation_1,
# relation_2) pair, weighted by its accumulated co-occurrence count.
graph = nx.DiGraph()
graph.add_weighted_edges_from(
    (node_encoding[src_rel], node_encoding[dst_rel], weight)
    for (src_rel, dst_rel), weight in global_relation_graph.items()
)

In [19]:
# Set up node2vec on the relation graph: 64-dimensional embeddings, walks of
# length 30, `num_walks=200`, 8 workers.
# NOTE(review): no seed is passed, so the embeddings presumably differ between
# runs — confirm and consider fixing a seed for reproducibility.
nv = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=8)

Computing transition probabilities:   0%|          | 0/44 [00:00<?, ?it/s]

In [20]:
# Train the embedding model. The keyword arguments are presumably forwarded to
# the underlying gensim Word2Vec — TODO confirm against the node2vec docs.
nv_model = nv.fit(window=10, min_count=1, batch_words=4)

In [21]:
# Sanity check: nearest neighbours of node id 2 in the embedding space.
# Nodes are looked up by the string form of their integer id; translate ids
# back to relation labels with `node_encoding`.
nv_model.wv.most_similar('2')

[('13', 0.9178491234779358),
 ('12', 0.9029867649078369),
 ('42', 0.8844952583312988),
 ('15', 0.8844112753868103),
 ('41', 0.872951090335846),
 ('39', 0.8655798435211182),
 ('4', 0.8630475997924805),
 ('10', 0.8588102459907532),
 ('5', 0.8575478196144104),
 ('3', 0.8512221574783325)]

In [22]:
# Export the learned node vectors in word2vec format; rows are keyed by node
# id, so `./dataset/node-encoding.pkl` is needed to recover relation labels.
nv_model.wv.save_word2vec_format('./dataset/node2vec.emb')