# Empath results for For-against vs. Perspective debates

This notebook differs from the previous notebook in the following ways:
* Uses lemmatization for better matching
* Extracts noun phrases

In [None]:
# Mount Google Drive in Colab; every data path used later in this notebook
# lives under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone the `CreateDebateScraper` library from GitHub.
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
# Change into the package source directory; the later
# `from thread import Comment, Thread` presumably resolves from here
# (TODO confirm that the `thread` module lives in src/nested).
%cd CreateDebateScraper/src/nested

In [None]:
from   collections              import namedtuple
from   copy                     import deepcopy
# import cpnet
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
import spacy
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm
nltk.download('punkt') # For tokenizers
nltk.download('stopwords')
nltk.download('wordnet') # For lemmatizers
nltk.download('omw-1.4')
import matplotlib
from   nltk.stem                import WordNetLemmatizer
from   nltk.tokenize            import TweetTokenizer
from   nltk.corpus              import stopwords
from   pprint                   import pprint
# from   transformers             import BertModel, BertTokenizer
# import shifterator as sh
# import wordcloud
# import skbio
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)
STOP_WORDS = list(stopwords.words('english'))
!python -m spacy download en_core_web_sm

In [None]:
from spacy import displacy

In [None]:
# Load the small English spaCy pipeline. The variable is spelled `scapy_nlp`
# and the trigger-counting functions in this notebook refer to it by that name.
scapy_nlp = spacy.load("en_core_web_sm")

In [None]:
def load_empath_dictionary(path='/content/gdrive/MyDrive/DL/empath/dictionary.tsv'):
    """
    Load the Empath lexicon from a tab-separated file.

    Each non-blank line is expected to hold a category name followed by the
    terms of that category, all separated by tabs.

    :param path: Location of the TSV file; defaults to the copy on the mounted
        Google Drive.
    :type path: str
    :return: Mapping from category name to the list of its unique terms, in
        the order the terms first appear on the line.
    :rtype: dict[str, list]
    """
    empath_dict = dict()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register a bogus '' category.
                continue
            name, *terms = line.split("\t")
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # result is reproducible (iteration order of a set of str is not).
            empath_dict[name] = list(dict.fromkeys(terms))
    return empath_dict

In [None]:
# Category name -> list of associated terms (see load_empath_dictionary).
empath = load_empath_dictionary()

In [None]:
# Number of Empath categories that were loaded.
len(empath.keys())

In [None]:
# Number of terms listed under each Empath category.
tokens_count = [len(terms) for terms in empath.values()]

In [None]:
# Report the mean and standard deviation of the per-category term counts.
print(f'Average token count {np.average(tokens_count)}, Std. dev {np.std(tokens_count)}')

In [None]:
# Hand-picked Empath categories that are closest to ad hominem triggers.
# Despite its name, `empath_selected_tokens` holds category names (one per
# line of the file); they are later used as keys into `empath`.

empath_selected_tokens = list()

with open('/content/gdrive/MyDrive/DL/empath/empath_selected_categories.txt', 'r') as f:
    empath_selected_tokens.extend(line.strip() for line in f)

In [None]:
# SOTA slur word dictionary (from Punyajoy)
# Read inside a `with` block so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open('/content/gdrive/MyDrive/DL/slurwords/slur_dictionary.json') as f:
    slur_words_dict = json.load(f)

In [None]:
# Merge the slur dictionary with the selected Empath categories into a single
# trigger lexicon (trigger type -> list of terms). An Empath category replaces
# a slur entry stored under the same key.
# NOTE(review): underscores become spaces, so some entries are multi-word; the
# per-token lookup in get_trigger_ids presumably never matches those -- confirm.

triggers = dict()
triggers.update(slur_words_dict)
for key in empath_selected_tokens:
    triggers[key] = [token.replace('_', ' ') for token in empath[key]]

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatized trigger lexicon: trigger type -> set of lemmatized entries.
# Sets give constant-time membership tests when tokens are looked up later.
triggers_lemma = dict()
for k, entries in tqdm(triggers.items()):
    triggers_lemma[k] = {lemmatizer.lemmatize(entry) for entry in entries}

In [None]:
# Inspect the lemmatized trigger sets (this displays the whole dict).
triggers_lemma

In [None]:
# Custom routine to clean texts scraped from the Web.
# It removes hyperlinks and punctuation marks (except apostrophes).

tknz = TweetTokenizer()

def clean_text(text):
    """
    Normalise a raw comment and split it into word tokens.

    The text is lower-cased, URLs are dropped, hyphens become spaces and runs
    of whitespace are collapsed. Apostrophes (ASCII and curly) are temporarily
    replaced by 'X' -- unambiguous because the text is already lower-case -- so
    that they survive the punctuation filter, and are restored afterwards.

    :param text: Raw comment text
    :type text: str
    :return: Cleaned tokens; tokens that end up empty or consist of a lone
        apostrophe are dropped
    :rtype: list
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = text.replace("-", " ")
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    text = re.sub(r"\s+", " ", text)
    for quote in ("\u2018", "\u2019", "'"):
        text = text.replace(quote, "X")
    wordTokens = list()
    for token in tknz.tokenize(text):
        # Keep only alphanumerics (and spaces); this strips punctuation.
        token = ''.join(ch for ch in token if ch.isalnum() or ch == ' ')
        if len(token) > 0 and token != 'X':
            wordTokens.append(token.replace('X', '\''))
    return wordTokens

In [None]:
# Topical forums on CreateDebate. Comments were scraped for all of the
# following forums.
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# Only the forums below are analysed; each has at least 10k comments.
categories_selected = ['politics2', 'religion', 'world',
                       'science', 'law', 'technology']

# One (initially empty) comment list per selected forum.
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load logs produced by your own scraping / classification runs.

for cat in tqdm(categories_selected):
    # Start from an empty list so that re-running this cell does not append
    # the same comments a second time.
    comments[cat] = list()

    # Get all the `Thread` objects pickled while scraping. The log holds
    # consecutive pickles, so keep loading until the end of the file; the
    # `with` block closes the file even if unpickling fails part-way.
    threads = list()
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/threads.log', 'rb') as fp:
        while True:
            try:
                threads.append(pickle.load(fp))
            except EOFError:
                break

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            # Group (comment, comment key) pairs by author, in encounter order.
            authors.setdefault(v.author, list()).append((v, k))

    # Load the classification score of the comments.
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `ctr` walks `cws` in the
    # same author-grouped order in which the comments are visited here.
    ctr = 0
    for author_comments in authors.values():
        for comment, cid in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            # The numeric id is whatever follows the 3-character key prefix.
            foo['id'] = int(cid[3:])
            comments[cat].append(foo)
            ctr += 1

In [None]:
# Comment id -> ad hominem score, defined as 1 - classifier score. If an id
# occurs more than once, the last occurrence wins.
ah_score_comments = {
    comment['id']: 1 - comment['score']
    for cat in categories_selected
    for comment in comments[cat]
}

In [None]:
def parse_tstring(tstring):
    """
    Convert a comment's posting time into an integer so that comments can be
    ordered by when they were posted.

    :param tstring: Timestamp string whose fields are separated by '-', 'T',
        ':' or '+' (presumably ISO-8601 with a UTC offset -- TODO confirm)
    :type tstring: str
    :return: Integer formed by concatenating every field except the last two
    :rtype: int
    :raises ValueError: If the timestamp is the placeholder 'Not Available'
    """
    if tstring == 'Not Available':
        raise ValueError('Invalid posting time for parse_tstring')
    # Map 'T', ':' and '+' onto '-' so that one split separates every field.
    fields = tstring.translate(str.maketrans('T:+', '---')).split('-')
    # The trailing two fields are dropped before the digits are joined.
    return int(''.join(fields[:-2]))

In [None]:
# Split every forum's comments by debate type. A comment whose polarity is
# 'Not Available' is filed under perspective debates; every other comment is
# filed under for-against debates. Comments are deep-copied into the buckets.
for_against_debates = dict()
perspective_debates = dict()

for cat in categories_selected:
    fa_bucket, ps_bucket = list(), list()
    for comment in comments[cat]:
        bucket = ps_bucket if comment['polarity'] == 'Not Available' else fa_bucket
        bucket.append(deepcopy(comment))
    for_against_debates[cat] = fa_bucket
    perspective_debates[cat] = ps_bucket

In [None]:
# Sanity check: per forum, print the number of for-against comments followed
# by the number of perspective comments.
for cat in categories_selected:
    print(len(for_against_debates[cat]), len(perspective_debates[cat]))

In [None]:
def create_dependency_graph(doc):
    """Build an undirected dependency graph over the tokens of a spaCy doc.

    :param doc: Parsed document; iterating it must yield tokens exposing
        `i`, `head`, `dep_` and `text`
    :return: Tuple `(dependency_graph, id_to_text, id_to_token, root)` where
        `dependency_graph` maps a token index to a list of
        `(neighbour index, dependency label)` pairs (each edge is stored in
        both directions), `id_to_text` maps a token index to its lemmatized
        text, `id_to_token` maps a token index to the token itself, and
        `root` is the index of the last token labelled 'ROOT' (None if there
        is none)
    :rtype: tuple
    """
    tokens = list(doc)
    id_to_token = {token.i: token for token in tokens}
    id_to_text = {token.i: lemmatizer.lemmatize(token.text) for token in tokens}

    # One adjacency list per token position.
    dependency_graph = {node: list() for node in range(len(tokens))}

    root = None
    for token in tokens:
        label = token.dep_
        if label == 'ROOT':
            # A root has no incoming edge; remember it and move on.
            root = token.i
            continue
        head, child = token.head.i, token.i
        dependency_graph[head].append((child, label))
        dependency_graph[child].append((head, label))

    return dependency_graph, id_to_text, id_to_token, root

In [None]:
def get_personal_pronoun_ids(id_to_token):
    """Yield the ids of tokens whose `tag_` is 'PRP' (the personal pronoun
    tag in spaCy).

    :param id_to_token: id_to_token returned by create_dependency_graph
    :type id_to_token: dict
    """
    yield from (token_id for token_id, token in id_to_token.items()
                if token.tag_ == 'PRP')

In [None]:
def get_pronoun_ids(id_to_token):
    """Yield the ids of tokens whose `pos_` is 'PRON' (the pronoun
    part-of-speech tag in spaCy).

    :param id_to_token: id_to_token returned by create_dependency_graph
    :type id_to_token: dict
    """
    yield from (token_id for token_id, token in id_to_token.items()
                if token.pos_ == 'PRON')

In [None]:
def get_trigger_ids(id_to_text, trigger_type):
    """Yield the ids of tokens whose text belongs to one trigger set.

    :param id_to_text: id_to_text returned by create_dependency_graph
    :type id_to_text: dict
    :param trigger_type: Which trigger set to match against; must be a key of
        triggers_lemma
    :type trigger_type: str
    """
    vocabulary = triggers_lemma[trigger_type]
    yield from (token_id for token_id, text in id_to_text.items()
                if text in vocabulary)

In [None]:
from collections import deque

In [None]:
def breadth_first_search(dependency_graph, source):
    """Run a breadth-first search from `source`.

    :param dependency_graph: Dependency graph returned by create_dependency_graph
    :type dependency_graph: dict
    :param source: Source node ID
    :type source: int
    :return: Tuple `(distance, predecessor, edge_label)` of dicts keyed by
        node ID. Only nodes reachable from `source` appear. `distance` is the
        number of edges from `source`; `predecessor` is the node each node was
        discovered from (-1 for `source`); `edge_label` is the relation on
        that discovery edge (`source` has no entry)
    :rtype: tuple
    """
    distance = {source: 0}
    predecessor = {source: -1}
    edge_label = dict()

    frontier = deque([source])
    while frontier:
        node = frontier.popleft()
        for neighbour, rel in dependency_graph[node]:
            # A node with a recorded distance has already been discovered.
            if neighbour not in distance:
                distance[neighbour] = distance[node] + 1
                predecessor[neighbour] = node
                edge_label[neighbour] = rel
                frontier.append(neighbour)

    return distance, predecessor, edge_label

In [None]:
def generate_path_from_bfs(source, dest, dist_dict, parent_dict, relation_dict):
    """Describe the BFS path between `source` and `dest` as a string of
    relations.

    The relations are collected while walking from `dest` back to `source`,
    so the first label belongs to the edge touching `dest`. Labels are joined
    with '->'; the result is the empty string when `dest` equals `source`.

    :param source: Node the BFS was started from
    :param dest: Node reached by that BFS
    :param dist_dict: Distances returned by breadth_first_search
    :param parent_dict: BFS parents returned by breadth_first_search
    :param relation_dict: Relations returned by breadth_first_search
    :return: '->'-joined relation labels
    :rtype: str
    """
    assert dist_dict[source] == 0
    assert dest in dist_dict

    def _labels_back_to_source():
        node = dest
        while node != source:
            yield relation_dict[node]
            node = parent_dict[node]

    return '->'.join(_labels_back_to_source())

In [None]:
def get_trigger_count(texts, index_generator, n_process=2, batch_size=1000):
    """Count ad hominem triggers that are connected to anchor tokens.

    Every text is parsed with the spaCy pipeline. For each anchor token
    produced by `index_generator` and each trigger token reachable from it in
    the dependency graph, the counter of that trigger's type is incremented,
    so one trigger is counted once per anchor it is connected to.

    :param texts: Texts to analyse; must support `len`
    :type texts: list
    :param index_generator: Callable taking `id_to_token` and yielding the ids
        of the anchor tokens (e.g. get_personal_pronoun_ids)
    :param n_process: Number of processes handed to the spaCy pipeline
    :type n_process: int
    :param batch_size: Batch size handed to the spaCy pipeline
    :type batch_size: int
    :return: Mapping from trigger type to its count
    :rtype: dict
    """
    trigger_count = {trigger_type: 0 for trigger_type in triggers_lemma}

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, root = \
            create_dependency_graph(doc)

        anchor_ids = list(index_generator(id_to_token))
        if not anchor_ids:
            continue

        # Trigger positions depend only on the document, not on the anchor,
        # so look them up once per document instead of once per anchor.
        trigger_ids = dict()
        for trigger_type in triggers_lemma:
            ids = list(get_trigger_ids(id_to_text, trigger_type))
            if ids:
                trigger_ids[trigger_type] = ids
        if not trigger_ids:
            continue

        for index in anchor_ids:
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for trigger_type, ids in trigger_ids.items():
                # `dist` only contains nodes reachable from the anchor.
                trigger_count[trigger_type] += sum(1 for trigger_id in ids
                                                   if trigger_id in dist)

    return trigger_count

In [None]:
def get_trigger_count_by_path(texts, index_generator, n_process=2, batch_size=1000):
    """Count how often each dependency path links an anchor token to a trigger.

    Every text is parsed with the spaCy pipeline. For each anchor token
    produced by `index_generator` and each trigger token reachable from it in
    the dependency graph, the relation path between the two (as built by
    generate_path_from_bfs) has its counter incremented.

    :param texts: Texts to analyse; must support `len`
    :type texts: list
    :param index_generator: Callable taking `id_to_token` and yielding the ids
        of the anchor tokens (e.g. get_personal_pronoun_ids)
    :param n_process: Number of processes handed to the spaCy pipeline
    :type n_process: int
    :param batch_size: Batch size handed to the spaCy pipeline
    :type batch_size: int
    :return: Mapping from path string to the number of times it was observed
    :rtype: dict
    """
    trigger_count_by_path = dict()

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, root = \
            create_dependency_graph(doc)

        anchor_ids = list(index_generator(id_to_token))
        if not anchor_ids:
            continue

        # Trigger positions depend only on the document, not on the anchor,
        # so look them up once per document instead of once per anchor.
        trigger_ids = dict()
        for trigger_type in triggers_lemma:
            ids = list(get_trigger_ids(id_to_text, trigger_type))
            if ids:
                trigger_ids[trigger_type] = ids
        if not trigger_ids:
            continue

        for index in anchor_ids:
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for ids in trigger_ids.values():
                for trigger_id in ids:
                    if trigger_id not in dist:
                        # Not reachable from this anchor.
                        continue
                    path = generate_path_from_bfs(index, trigger_id,
                                                  dist, parent, relation)
                    trigger_count_by_path[path] = \
                        trigger_count_by_path.get(path, 0) + 1

    return trigger_count_by_path

In [None]:
def get_trigger_path_adhominem_score(texts, index_generator, n_process=2, batch_size=1000):
    """Collect, per dependency path, the scores of the comments it occurs in.

    Every text is parsed with the spaCy pipeline. For each anchor token
    produced by `index_generator` and each trigger token reachable from it in
    the dependency graph, the comment's `context['score']` is appended to the
    list kept for the relation path between the two (as built by
    generate_path_from_bfs). A comment contributes its score once per
    (anchor, trigger) pair.

    :param texts: `(text, context)` tuples where `context` is a dict with a
        'score' entry; must support `len`
    :type texts: list
    :param index_generator: Callable taking `id_to_token` and yielding the ids
        of the anchor tokens (e.g. get_personal_pronoun_ids)
    :param n_process: Number of processes handed to the spaCy pipeline
    :type n_process: int
    :param batch_size: Batch size handed to the spaCy pipeline
    :type batch_size: int
    :return: Mapping from path string to the list of collected scores
    :rtype: dict
    """
    trigger_path_adhominem_score = dict()

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size,
                          as_tuples=True)

    for doc, context in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, root = \
            create_dependency_graph(doc)

        anchor_ids = list(index_generator(id_to_token))
        if not anchor_ids:
            continue

        # Trigger positions depend only on the document, not on the anchor,
        # so look them up once per document instead of once per anchor.
        trigger_ids = dict()
        for trigger_type in triggers_lemma:
            ids = list(get_trigger_ids(id_to_text, trigger_type))
            if ids:
                trigger_ids[trigger_type] = ids
        if not trigger_ids:
            continue

        for index in anchor_ids:
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for ids in trigger_ids.values():
                for trigger_id in ids:
                    if trigger_id not in dist:
                        # Not reachable from this anchor.
                        continue
                    path = generate_path_from_bfs(index, trigger_id,
                                                  dist, parent, relation)
                    trigger_path_adhominem_score.setdefault(path, list()) \
                                                .append(context['score'])

    return trigger_path_adhominem_score

In [None]:
# (lower-cased body, context) pairs for the for-against politics comments.
# The context carries the ad hominem score, defined as 1 - classifier score.
fa_politics_texts = list()
for comment in for_against_debates['politics2']:
    context = {'score': 1 - comment['score']}
    fa_politics_texts.append((comment['body'].lower(), context))

In [None]:
# Despite the `_count` suffix, this holds the path -> list-of-scores mapping
# returned by get_trigger_path_adhominem_score, anchored on personal pronouns.
fa_politics_trigger_count = \
      get_trigger_path_adhominem_score(fa_politics_texts, get_personal_pronoun_ids)

In [None]:
# Cache the for-against result on Drive so the parsing step need not be re-run.
with open('/content/gdrive/MyDrive/Temp/61-for-against-debates-trigger-personal-pronoun-count-path-ahscore.pkl', 'wb') as f:
    pickle.dump(fa_politics_trigger_count, f)

In [None]:
# Reload the cached for-against result (overwrites the in-memory variable).
with open('/content/gdrive/MyDrive/Temp/61-for-against-debates-trigger-personal-pronoun-count-path-ahscore.pkl', 'rb') as f:
    fa_politics_trigger_count = pickle.load(f)

In [None]:
# (lower-cased body, context) pairs for the perspective politics comments.
# The context carries the ad hominem score, defined as 1 - classifier score.
pers_politics_texts = list()
for comment in perspective_debates['politics2']:
    context = {'score': 1 - comment['score']}
    pers_politics_texts.append((comment['body'].lower(), context))

In [None]:
# Despite the `_count` suffix, this holds the path -> list-of-scores mapping
# returned by get_trigger_path_adhominem_score, anchored on personal pronouns.
pers_politics_trigger_count = \
      get_trigger_path_adhominem_score(pers_politics_texts, get_personal_pronoun_ids)

In [None]:
# Cache the perspective result on Drive so the parsing step need not be re-run.
with open('/content/gdrive/MyDrive/Temp/61-perspective-debates-trigger-personal-pronoun-count-path-ahscore.pkl', 'wb') as f:
    pickle.dump(pers_politics_trigger_count, f)

In [None]:
# Reload the cached perspective result (overwrites the in-memory variable).
with open('/content/gdrive/MyDrive/Temp/61-perspective-debates-trigger-personal-pronoun-count-path-ahscore.pkl', 'rb') as f:
    pers_politics_trigger_count = pickle.load(f)

In [None]:
def get_ah_relations(trigger_count, n, th):
    """Yield the `n` relation paths with the highest average score.

    :param trigger_count: Mapping from path to a list of scores
    :type trigger_count: dict
    :param n: Maximum number of paths to yield
    :type n: int
    :param th: Minimum number of scores a path needs to be considered
    :type th: int
    :return: Generator of `(path, average score)` pairs, highest average first
    """
    ranked = sorted(
        ((path, np.average(scores))
         for path, scores in trigger_count.items()
         if len(scores) >= th),
        key=lambda pair: pair[1],
        reverse=True,
    )
    yield from ranked[:n]

In [None]:
# Print the 10 dependency paths (observed at least 100 times) with the highest
# average ad hominem score in perspective debates.
for path, avg_score in get_ah_relations(pers_politics_trigger_count, 10, 100):
    # Paths are stored from the trigger back to the pronoun; reverse them so
    # they read pronoun --> trigger. An empty path (the pronoun token is
    # itself the trigger) has no labels at all.
    labels = path.split('->')[::-1] if path else []
    # spacy.explain returns None for labels it does not know, which would make
    # str.join raise a TypeError; fall back to the raw label instead.
    readable = '-->'.join(spacy.explain(label) or label for label in labels)
    print(f'{avg_score:.3f}: {readable}')

In [None]:
def get_top_relations(trigger_count, th):
    """Return the `th` most frequent relation paths.

    The values of `trigger_count` may be plain counts or collections of
    observations (e.g. the score lists built by
    get_trigger_path_adhominem_score). A collection is ranked by its size;
    comparing the lists themselves would order paths lexicographically by
    their first scores rather than by frequency.

    :param trigger_count: Mapping from path to a count or to a sized collection
    :type trigger_count: dict
    :param th: Number of top paths to keep
    :type th: int
    :return: Set with (at most) the `th` most frequent paths
    :rtype: set
    """
    def _frequency(value):
        try:
            return len(value)
        except TypeError:
            # Plain numbers have no length; they already are the frequency.
            return value

    ranked = sorted(trigger_count.items(),
                    key=lambda item: _frequency(item[1]),
                    reverse=True)
    return {path for path, _ in ranked[:th]}

In [None]:
# For several cut-offs `th`, measure which fraction of the top-`th` relations
# is shared by for-against and perspective debates. The intersection size is
# always divided by `th`, even if fewer than `th` relations exist.
thresholds = (10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000)

x = list(thresholds)
y = [
    len(get_top_relations(fa_politics_trigger_count, th)
        & get_top_relations(pers_politics_trigger_count, th)) / th
    for th in thresholds
]

In [None]:
def plot(x, y, xlabel=None, ylabel='Overlap'):
    """Draw a bar chart with one bar per entry of `x`.

    :param x: Labels shown under the bars
    :param y: Bar heights, in the same order as `x`
    :param xlabel: Optional label for the x axis (no label when None)
    :param ylabel: Label for the y axis
    """
    ticks = np.arange(len(x))
    width = 0.60

    fig, ax = plt.subplots()
    ax.bar(ticks, y, width)

    ax.set_ylabel(ylabel)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_xticks(ticks)
    ax.set_xticklabels(x, rotation=45, ha='right')
    # No ax.legend() here: the single bar series carries no label, so the call
    # had nothing to show and only produced a "no labelled artists" warning.
    plt.show()

In [None]:
# Bar chart of the overlap values `y` against the cut-offs `x`.
plot(x, y)

In [None]:
# Inspect the for-against result (this displays the whole dict).
fa_politics_trigger_count

In [None]:
# Number of politics comments per debate type; plot_trigger_count divides the
# trigger counts by these totals.
for_against_comment_count = len(for_against_debates['politics2'])
perspective_comment_count = len(perspective_debates['politics2'])

In [None]:
import numpy as np
def plot_trigger_count(trigger_types, fa_counts=None, pers_counts=None,
                       fa_total=None, pers_total=None):
    """Plot per-comment trigger counts of for-against vs. perspective debates.

    :param trigger_types: Trigger types to plot, one pair of bars each
    :type trigger_types: list
    :param fa_counts: Trigger type -> count for for-against debates (the shape
        returned by get_trigger_count); defaults to the notebook-level
        `fa_politics_trigger_count`
    :param pers_counts: Trigger type -> count for perspective debates;
        defaults to the notebook-level `pers_politics_trigger_count`
    :param fa_total: Number of for-against comments used to normalise;
        defaults to `for_against_comment_count`
    :param pers_total: Number of perspective comments used to normalise;
        defaults to `perspective_comment_count`

    NOTE(review): earlier cells assign the path -> scores mapping returned by
    get_trigger_path_adhominem_score to `fa_politics_trigger_count` and
    `pers_politics_trigger_count`; that mapping is not keyed by trigger type,
    so pass get_trigger_count results explicitly unless those globals have
    been rebound.
    """
    # Fall back to the notebook-level politics variables for unspecified inputs.
    if fa_counts is None:
        fa_counts = fa_politics_trigger_count
    if pers_counts is None:
        pers_counts = pers_politics_trigger_count
    if fa_total is None:
        fa_total = for_against_comment_count
    if pers_total is None:
        pers_total = perspective_comment_count

    x = list(trigger_types)
    y1 = [fa_counts[trigger_type] / fa_total for trigger_type in x]
    y2 = [pers_counts[trigger_type] / pers_total for trigger_type in x]

    ticks = np.arange(len(x))
    width = 0.30

    fig, ax = plt.subplots()
    # Two bars per trigger type, placed side by side around each tick.
    ax.bar(ticks - width / 2, y1, width, label='for-against')
    ax.bar(ticks + width / 2, y2, width, label='perspective')

    ax.set_ylabel('Trigger count')
    ax.set_xticks(ticks)
    ax.set_xticklabels(x, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
# Every trigger type, in the insertion order of `triggers_lemma`.
trigger_types = list(triggers_lemma)

In [None]:
# Plot the trigger types from position 50 onwards.
# NOTE(review): 50 is an unexplained offset -- presumably it skips a first
# group of trigger types; confirm and name the constant.
plot_trigger_count(trigger_types[50:])

In [None]:
# Per-forum trigger counts anchored on personal pronouns, computed separately
# for for-against and perspective debates (forum -> trigger type -> count).
ForAgainstTriggerCount = dict()
PerspectiveTriggerCount = dict()

for category in categories_selected:
    fa_texts = list()
    for comment in for_against_debates[category]:
        fa_texts.append(comment['body'].lower())

    ps_texts = list()
    for comment in perspective_debates[category]:
        ps_texts.append(comment['body'].lower())

    ForAgainstTriggerCount[category] = get_trigger_count(
        fa_texts, get_personal_pronoun_ids)
    PerspectiveTriggerCount[category] = get_trigger_count(
        ps_texts, get_personal_pronoun_ids)