# Dependency Parsing
In this notebook, we perform dependency parsing on the texts of the **LOGIC** dataset.

In [1]:
# Install `spacy` package to perform dependency parsing
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from collections import deque
from copy import deepcopy
import pickle
import json
from tqdm import tqdm
from pprint import pprint
from functools import lru_cache

import spacy
import nltk 

In [None]:
# Plot styling: seaborn dark grid, plus matplotlib defaults for resolution,
# font size and figure size applied in one update
sns.set(style='darkgrid')
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.size': 18,
    'figure.figsize': (10, 5),
})

In [None]:
# Setup for nltk: fetch the resources this notebook relies on
nltk.download('punkt') # For tokenizers
nltk.download('stopwords') # Stop-word lists (judging by the resource name)
nltk.download('wordnet') # For lemmatizers
nltk.download('omw-1.4') # presumably required alongside 'wordnet' by the lemmatizer -- TODO confirm
from nltk.stem import WordNetLemmatizer

# Setup for spacy: download the small English pipeline first, then load it.
# NOTE: the variable is spelled `scapy_nlp`; `dependency_parsing` refers to it by this exact name.
!python -m spacy download en_core_web_sm
scapy_nlp = spacy.load("en_core_web_sm")

In [None]:
# For caching objects

def load_obj(file_path):
    """Read a pickle file and return the object stored in it.

    :param file_path: Location of the pickle file to read
    :type file_path: string
    :return: The unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(file_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

def save_obj(obj, file_path):
    """Pickle an object and write it to the given path.

    An existing file at ``file_path`` is overwritten.

    :param obj: Object to serialise
    :param file_path: Destination of the pickle file
    :type file_path: string
    :return: None
    """
    with open(file_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)
    return None

In [None]:
# Load the LOGIC dataset

# One DataFrame per partition, read in train / dev / test order
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/dev.csv', './dataset/test.csv')
)

In [None]:
# Pull the text, label and masked-text columns of each partition into plain lists
train_texts, train_labels, train_masked_texts = (
    list(train_df[column]) for column in ('text', 'label', 'masked_text')
)

dev_texts, dev_labels, dev_masked_texts = (
    list(dev_df[column]) for column in ('text', 'label', 'masked_text')
)

test_texts, test_labels, test_masked_texts = (
    list(test_df[column]) for column in ('text', 'label', 'masked_text')
)

In [None]:
# Get the distribution of sentence length:
# number of whitespace-separated words per text, tagged with its partition

text_length_dist = {'length': [], 'dataset': []}

# A single loop replaces the three copy-pasted ones; entries are still
# appended in train, test, dev order.
for partition, partition_texts in (('train', train_texts),
                                   ('test', test_texts),
                                   ('dev', dev_texts)):
    for text in partition_texts:
        # split() without arguments already ignores leading/trailing whitespace
        text_length_dist['length'].append(len(text.split()))
        text_length_dist['dataset'].append(partition)

ax = sns.barplot(data=text_length_dist, x='dataset', y='length')
ax.set_xlabel('Partition')
ax.set_ylabel('Word length')
ax.set_title('Word length distribution across different partition')
# Render the figure explicitly; the former bare `ax.plot()` drew nothing and
# only left an empty-list repr in the cell output.
plt.show()

In [None]:
# Report the number of rows in each partition
print(f'#train: {len(train_df)}, #test: {len(test_df)}, #dev: {len(dev_df)}')

##  Dependency Parsing

In [None]:
# Get empath dictionary

!wget -O empath-dictionary.tsv https://drive.google.com/uc?id=1kH7_FVkIvF0NajMp8E09fssRDb4_h4j6

In [None]:
def load_empath_dictionary(file_path='./empath-dictionary.tsv'):
    """Load the empath lexicon from a tab-separated file.

    Every non-blank line holds a category name in its first column followed by
    the words associated with that category.

    :param file_path: Path to the TSV file (defaults to the downloaded dictionary)
    :type file_path: string
    :return: dict[str, list] where keys are categories and values are the
        associated words, de-duplicated and kept in order of first appearance
        so that the result is identical from run to run
    """
    empath_dict = dict()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            cols = line.strip().split("\t")
            name = cols[0]
            if not name:
                # Blank line: would otherwise register an empty-string category
                continue
            # dict.fromkeys removes duplicates while preserving order (a set
            # would not); empty columns are not words and are dropped
            empath_dict[name] = list(dict.fromkeys(t for t in cols[1:] if t))
    return empath_dict

In [None]:
# Category -> word-list mapping parsed from ./empath-dictionary.tsv
empath_dict = load_empath_dictionary()

In [None]:
# Number of words per empath category, summarised by mean and standard deviation
tokens_count = [len(words) for words in empath_dict.values()]
print(f'Average token count {np.average(tokens_count)}, Std. dev {np.std(tokens_count)}')

In [None]:
# Get Slur words dictionary 

!wget -O slur-word-dictionary.json https://drive.google.com/uc?id=1q_cEj_qlAEDnSpY5_dV-Bl3Nvm74bL9K

In [None]:
# SOTA slur word dictionary (from Punyajoy)
# Read as UTF-8 explicitly instead of relying on the platform's default encoding
with open('./slur-word-dictionary.json', encoding='utf-8') as f:
    slur_words_dict = json.load(f)

In [None]:
# Get hate categories

!wget -O hate-categories.json https://drive.google.com/uc?id=1zsuchbPYSoTUfplkw7G6kVW8omLyf8tU

In [None]:
# Hate-targets broad categories
# Paper: "A Measurement Study of Hate Speech in Social Media", Mainack Mondal
# Read as UTF-8 explicitly instead of relying on the platform's default encoding
with open('./hate-categories.json', encoding='utf-8') as f:
    hate_targets_dict = json.load(f)
pprint(hate_targets_dict)

In [None]:
# Raw (unprocessed) vocabulary per hate target
# key: hate_targets
# value: list of raw tokens associated with that target
hate_targets_raw = dict()

for k, v in hate_targets_dict.items():
    raw_tokens = list()
    for token_type in v:
        # A token type may name a slur-word category, an empath category, or both;
        # slur words are added before empath words
        raw_tokens += slur_words_dict.get(token_type, [])
        raw_tokens += empath_dict.get(token_type, [])
    hate_targets_raw[k] = raw_tokens

In [None]:
lemmatizer = WordNetLemmatizer()


def replace_underscores_with_whitespaces(z):
    """Turn an underscore-joined token such as ``a_b`` into ``a b``."""
    return z.replace('_', ' ')


# Processed vocabulary per hate target
# key: hate_targets
# value: set of processed tokens (lemmatized, underscores replaced by spaces)
hate_targets = dict()

for k, v in hate_targets_raw.items():
    hate_targets[k] = {
        replace_underscores_with_whitespaces(lemmatizer.lemmatize(token))
        for token in v
    }

# pprint(hate_targets)

In [None]:
def create_dependency_graph(doc):
    """Build an adjacency-list view of the dependency tree of a parsed document.

    Every dependency edge is stored in both directions so the tree can be
    traversed as an undirected graph.

    :param doc: spaCy doc instance (iterated token by token)
    :return: tuple ``(dependency_graph, id_to_text, id_to_token, root)``:
        ``dependency_graph`` maps a token index to a list of
        ``(neighbour_index, relationship, orientation)`` triples, where
        orientation is 0 when the neighbour is the child of the token and 1
        when the neighbour is its head;
        ``id_to_text`` maps a token index to its lemmatized text;
        ``id_to_token`` maps a token index to the token object;
        ``root`` is the index of the token labelled ``ROOT`` (the last one if
        several tokens carry that label, ``None`` if none does)
    """
    id_to_text, id_to_token = {}, {}
    edges = []  # (head index, token index, relationship)
    root = None
    n_tokens = 0

    for n_tokens, token in enumerate(doc, start=1):
        head_idx, token_idx, relationship = token.head.i, token.i, token.dep_
        id_to_text[token_idx] = lemmatizer.lemmatize(token.text)
        id_to_token[token_idx] = token
        if relationship == 'ROOT':
            # The root has no incoming edge; just remember where it is
            root = token_idx
        else:
            edges.append((head_idx, token_idx, relationship))

    # One (initially empty) neighbour list per token position
    dependency_graph = {node: [] for node in range(n_tokens)}
    for head_idx, token_idx, relationship in edges:
        dependency_graph[head_idx].append((token_idx, relationship, 0))
        dependency_graph[token_idx].append((head_idx, relationship, 1))

    return dependency_graph, id_to_text, id_to_token, root

In [None]:
def get_noun_ids(id_to_token):
    """Yield the indices of tokens whose coarse POS is noun, proper noun or pronoun.

    :param id_to_token: mapping of token index -> token (each exposing ``pos_``)
    """
    wanted_pos = ("NOUN", "PROPN", "PRON")
    for token_id, token in id_to_token.items():
        if token.pos_ in wanted_pos:
            yield token_id

In [None]:
def get_personal_pronoun_ids(id_to_token):
    """Yield the indices of tokens whose fine-grained tag is exactly ``PRP``.

    :param id_to_token: mapping of token index -> token (each exposing ``tag_``)
    """
    yield from (
        token_id
        for token_id, token in id_to_token.items()
        if token.tag_ == 'PRP'  # personal pronoun tag in spaCy
    )

In [None]:
def get_pronoun_ids(id_to_token):
    """Yield the indices of tokens whose coarse POS is ``PRON``.

    :param id_to_token: mapping of token index -> token (each exposing ``pos_``)
    """
    yield from (
        token_id
        for token_id, token in id_to_token.items()
        if token.pos_ == 'PRON'  # pronoun tag in spaCy
    )

In [None]:
def get_trigger_ids(id_to_text, trigger_type):
    """Yield the indices of tokens whose text is a trigger of the given type.

    A token counts as a trigger when its text is a member of
    ``hate_targets[trigger_type]``.

    :param id_to_text: id_to_text returned by create_dependency_graph
    :type id_to_text: dict
    :param trigger_type: Key of ``hate_targets`` selecting the trigger vocabulary
    :type trigger_type: str
    """
    yield from (
        token_id
        for token_id, word in id_to_text.items()
        if word in hate_targets[trigger_type]
    )

In [None]:
def breadth_first_search(dependency_graph, source):
    """Run a breadth-first traversal of the dependency graph from ``source``.

    :param dependency_graph: Dependency graph returned by create_dependency_graph
    :type dependency_graph: dict
    :param source: Source node ID
    :type source: int
    :return: tuple ``(distance, bfs_parent, relation)`` of dicts keyed by the
        reachable node IDs: number of edges from the source, predecessor in
        the traversal (-1 for the source) and the ``(relationship,
        orientation)`` of the edge used to reach the node (the source has no
        entry in this last dict)
    """
    distance = {source: 0}
    bfs_parent = {source: -1}
    relation = {}
    frontier = deque([source])

    while frontier:
        node = frontier.popleft()
        for neighbour, rel, orient in dependency_graph[node]:
            # A node already has a distance exactly when it has been visited
            if neighbour in distance:
                continue
            distance[neighbour] = distance[node] + 1
            bfs_parent[neighbour] = node
            relation[neighbour] = (rel, orient)
            frontier.append(neighbour)

    return distance, bfs_parent, relation

In [None]:
def generate_path_from_bfs(source, dest, dist_dict, parent_dict, relation_dict):
    """Reconstruct the path between ``source`` and ``dest`` from BFS results.

    The path is walked backwards, from ``dest`` towards ``source``, so all
    three returned lists are ordered starting at ``dest``.

    :param source: Node the BFS was started from (must have distance 0)
    :param dest: Node reached by the BFS
    :param dist_dict: Distances returned by breadth_first_search
    :param parent_dict: BFS parents returned by breadth_first_search
    :param relation_dict: Edge relations returned by breadth_first_search
    :return: tuple ``(path, indices_list, orientation_list)``: the relationship
        of every edge on the path, the node indices along the path (ending
        with the source) and, per edge, the orientation flag recorded by the BFS
    """
    assert dist_dict[source] == 0
    assert dest in dist_dict

    hops = []  # (node, relationship, orientation) per edge walked, dest first
    node = dest
    while node != source:
        rel, orient = relation_dict[node]
        hops.append((node, rel, orient))
        node = parent_dict[node]

    path = [rel for _, rel, _ in hops]
    # The node on which the walk stopped (the source) closes the index list
    indices_list = [idx for idx, _, _ in hops] + [node]
    orientation_list = [orient for _, _, orient in hops]

    return path, indices_list, orientation_list

In [None]:
def dependency_parsing(texts, index_generator, n_process=8, batch_size=1000):
    """Perform dependency parsing and extract dependency paths per text.

    Each text is parsed with the spaCy pipeline. For every token index
    produced by ``index_generator``, the dependency path to every token
    reachable from it (including the token itself, with an empty path) is
    recorded, except paths that contain a ``punct`` relation.

    :param texts: iterable of comment bodies (text); it is materialised into a
        list, so generators are accepted as well
    :param index_generator: callable taking the ``id_to_token`` dict and
        yielding the token indices to start paths from, e.g. `get_noun_ids`,
        `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: list with one ``(text, paths)`` tuple per input text, where
        ``paths`` is a tuple of ``(path, words, word_indices, orientation)``
        tuples; all four are ordered from the reached token back to the
        starting token
    """
    # The progress bar needs a length, which plain iterators do not have
    texts = list(texts)

    result = list()
    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _root = create_dependency_graph(doc)

        parsed_result = list()
        # Extract the starting indices using the supplied generator
        for index in index_generator(id_to_token):
            dist, parent, relation = breadth_first_search(dependency_graph, index)
            for target_id in dist:
                path, word_indices, orientation = generate_path_from_bfs(
                    index, target_id, dist, parent, relation)
                if 'punct' in path:
                    # Ignore dependency paths related to punctuation marks
                    continue

                words = tuple(id_to_text[word_index] for word_index in word_indices)
                parsed_result.append(
                    (tuple(path), words, tuple(word_indices), tuple(orientation)))

        result.append((doc.text, tuple(parsed_result)))

    return result

In [None]:
# Lower-cased copies of the texts of every partition
train_texts_lc, test_texts_lc, dev_texts_lc = (
    [s.lower() for s in partition_texts]
    for partition_texts in (train_texts, test_texts, dev_texts)
)

In [None]:
# Parse every partition; paths start from the tokens selected by get_noun_ids.
# Kept as separate statements so finished partitions survive a later failure.
train_texts_parsed = dependency_parsing(texts=train_texts_lc, index_generator=get_noun_ids)
test_texts_parsed = dependency_parsing(texts=test_texts_lc, index_generator=get_noun_ids)
dev_texts_parsed = dependency_parsing(texts=dev_texts_lc, index_generator=get_noun_ids)

In [None]:
# Bundle the parsed partitions in (train, test, dev) order
dependency_parsing_results = (train_texts_parsed, test_texts_parsed, dev_texts_parsed)

# Cache the bundle on disk as a pickle; unpack in the same order after reloading
save_obj(dependency_parsing_results, './dataset/dependency_parsing_results.pkl')