# Empath results: For-against vs. Perspective Debates
In this notebook, we use the dictionary from the Empath module and count the occurrences of personal pronouns in the vicinity of the words present in that dictionary.

For example, [*you*] **scum**.

**Runtime Type**: CPU

**Author**: Utkarsh Patel

**Date**: 2022-12-27

In [None]:
# Mount Google Drive into the Colab runtime at `/content/gdrive`.
# Every data path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone the `CreateDebateScraper` library from GitHub and change into its
# `src/nested` directory (presumably so that the `thread` module imported
# below can be found on the import path).
# NOTE(review): re-running this cell in the same runtime will attempt the
# clone again from inside the already-changed working directory.
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested

In [None]:
from   collections              import namedtuple
from   copy                     import deepcopy
# import cpnet
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm
nltk.download('punkt') # For tokenizers
nltk.download('stopwords')
import matplotlib
from   nltk.tokenize            import TweetTokenizer
from   nltk.corpus              import stopwords
from   pprint                   import pprint
# from   transformers             import BertModel, BertTokenizer
# import shifterator as sh
# import wordcloud
# import skbio
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)
STOP_WORDS = list(stopwords.words('english'))

## Setup for Empath

In [None]:
# Download the Empath dictionary
# !curl https://raw.githubusercontent.com/Ejhfast/empath-client/master/empath/data/categories.tsv -o /content/gdrive/MyDrive/DL/empath/dictionary.tsv

# NOTE: Dictionary already downloaded on first run!

In [None]:
def load_empath_dictionary(path='/content/gdrive/MyDrive/DL/empath/dictionary.tsv'):
    """
    Returns a dict[str, list] object where keys are categories and values are
    associated words for that category

    Each non-blank line of the TSV file is read as
    ``<category>\\t<term>\\t<term>...``.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated Empath dictionary. Defaults to the
        copy stored on the mounted Google Drive.

    Returns
    -------
    dict[str, list[str]]
        Category name -> list of unique terms, in the order in which the
        terms first appear on the line.

    Raises
    ------
    OSError
        If `path` cannot be opened.
    """
    empath_dict = dict()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            cols = line.strip().split("\t")
            name = cols[0]
            if not name:
                # Blank line: would otherwise register a bogus '' category.
                continue
            # `dict.fromkeys` de-duplicates while keeping file order, so the
            # result is identical across runs (a `set` is not, because string
            # hashing is randomized per process). Empty terms produced by
            # consecutive tabs are dropped.
            empath_dict[name] = list(dict.fromkeys(t for t in cols[1:] if t))
    return empath_dict

In [None]:
# Category -> terms mapping read from the Empath dictionary file.
empath = load_empath_dictionary()

In [None]:
# Number of categories in the Empath dictionary.
len(empath)

In [None]:
# Number of terms listed under each Empath category.
tokens_count = [len(terms) for terms in empath.values()]

In [None]:
# Summary statistics of the per-category term counts.
avg_token_count = np.average(tokens_count)
std_token_count = np.std(tokens_count)
print(f'Average token count {avg_token_count}, Std. dev {std_token_count}')

In [None]:
# Names of the Empath categories selected for this analysis, one per line of
# the text file.
empath_selected_tokens = list()

with open('/content/gdrive/MyDrive/DL/empath/empath_selected_categories.txt', 'r') as f:
    empath_selected_tokens.extend(raw_line.strip() for raw_line in f)

In [None]:
SLUR_WORDS = {
  "jews": [
    "jews",
    "oven dodger",
    "nazi",
    "dirty jew",
    "holocaust",
    "kikesucker",
    "hook nose",
    "kike"
  ],
  "homosexual": [
    "faggots usually",
    "fucking queer",
    "the biggest faggot",
    "dyke",
    "you fucking faggot",
    "hate faggots",
    "queer",
    "homosexual",
    "the faggots",
    "faggot",
    "faggots usually have",
    "gay",
    "faggots",
    "dykey",
    "ugly dyke",
    "faggots like you",
    "you a fag",
    "lesbian",
    "homo",
    "is a faggot",
    "like a faggot",
    "dykes",
    "faggots like",
    "faggot if you ever"
  ],
  "women": [
    "ugly dyke",
    "woman terrorist",
    "nigress",
    "bitch",
    "slut",
    "women",
    "sheeboon",
    "negress",
    "mud shark",
    "women threat",
    "you a lame bitch",
    "your a cunt",
    "white bitch",
    "niggeress",
    "hoe",
    "dykes",
    "niggress",
    "sheboon",
    "feminazi"
  ],
  "blacks": [
    "pavement ape",
    "the niggers",
    "negress",
    "porch monkey",
    "that nigger",
    "this nigger",
    "sheboon",
    "all niggers",
    "eurafrica",
    "shut up nigger",
    "picaninny",
    "african attack",
    "spearchucker",
    "how many niggers",
    "nigger",
    "africa",
    "niggers are in my",
    "dindu nuffin",
    "stupid nigger",
    "moolie",
    "niggers",
    "bluegum",
    "nigger ass",
    "you niggers",
    "fucking nigger",
    "nigger music",
    "niggress",
    "you a nigger",
    "many niggers are",
    "nigress",
    "blacks",
    "teenaper",
    "sheeboon",
    "dumb nigger",
    "niggeress",
    "pickaninny",
    "nigga"
  ],
  "muslim": [
    "muslim immigrant",
    "islam",
    "mudslime",
    "mooslem",
    "muslim refugee",
    "musslime",
    "shitlam",
    "muslim invasion",
    "moslime",
    "mooslamic",
    "muzzie",
    "allah akbar",
    "mooslime",
    "musloid",
    "mudslimes",
    "muslim",
    "muslimes",
    "moslum",
    "mussie",
    "muzrat",
    "muslim countries",
    "muzzy",
    "moslim",
    "jihadi",
    "muslim country",
    "moslem",
    "muzzrat",
    "mooslim"
  ],
  "arabs": [
    "towel head",
    "goatfucker",
    "arabs",
    "goathumper",
    "raghead",
    "rag head",
    "goathumping",
    "towelhead",
    "camel jockey",
    "sandnigger",
    "camel fucker",
    "sand nigger"
  ],
  "generic": [
    "to rape",
    "raped and",
    "shithole country",
    "get raped",
    "raped",
    "is a fucking",
    "shit skin",
    "raped by",
    "hate you",
    "fake empowerment",
    "abusive women",
    "fuck you too",
    "violence",
    "wit a lame nigga",
    "they all look",
    "alllivesmatter",
    "shithole countries",
    "fucking hate",
    "trailer trash",
    "kill all",
    "terrorist threat",
    "harassment",
    "kill yourself",
    "shitskin",
    "okay to be white",
    "fucking hate you"
  ],
  "white": [
    "full of white",
    "white trash",
    "white devil",
    "white",
    "are all white",
    "white boy",
    "white ass",
    "white bitch",
    "hillbilly",
    "whigger",
    "white christian",
    "white person",
    "all white",
    "white nigger",
    "redneck",
    "white honky",
    "wigger",
    "them white"
  ],
  "economy": [
    "ghetto"
  ],
  "immigrant": [
    "illegal immigrants",
    "immigrant not welcome",
    "immigrant terror",
    "mexcrement",
    "go back to where you come from",
    "muslim refugee",
    "illegal aliens",
    "refugee",
    "protect from immigrants",
    "negro",
    "refugees",
    "immigrant",
    "refugee invasion",
    "go back to where they come from",
    "refugees impact",
    "bring ebola",
    "immigrants",
    "illegal alien",
    "immigrant invasion",
    "bring disease"
  ],
  "mental": [
    "retard",
    "mongoloid",
    "retarded"
  ],
  "asians": [
    "asians",
    "ching chong",
    "chinaman"
  ]
}

In [None]:
# Lexicon used for counting: the slur groups plus the selected Empath
# categories. Empath terms use '_' between words, whereas the cleaned comment
# text is space-separated, so underscores are turned into spaces.
TOKEN_GROUP = dict(SLUR_WORDS)

for category in empath_selected_tokens:
    TOKEN_GROUP[category] = [term.replace('_', ' ') for term in empath[category]]

In [None]:
# Display the names of all token groups (slur groups + selected Empath categories).
TOKEN_GROUP.keys()

In [None]:
# Personal pronouns looked for directly before/after each lexicon term.
# Every pronoun appears exactly once: `count_tokens` iterates over this tuple,
# so a repeated entry (previously 'you' and 'they' were listed twice, for the
# singular and plural forms) made each of their matches count double.
personal_pronouns = ('i', 'you', 'she', 'he', 'it', 'they', 'we',  # subject pronouns
                     'me', 'her', 'him', 'them', 'us')             # object pronouns

## Setup for For-against and Perspective debates

In [None]:
# Custom routine to clean texts scraped from the Web.
# It removes hyperlinks and punctuation marks (except apostrophes).

# Shared tokenizer instance used by `clean_text`.
tknz = TweetTokenizer()

def clean_text(text):
    """
    Preprocessing text

    Lower-cases `text`, removes hyperlinks (`http...` / `www...`), replaces
    hyphens with spaces, collapses whitespace runs and tokenizes the result
    with the module-level `tknz` tokenizer. Within each token every character
    that is not alphanumeric, a space or an apostrophe is removed; tokens that
    end up empty or consist of a lone apostrophe are dropped.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list[str]
        Cleaned tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = re.sub("-", " ", text)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    # Protect apostrophes (curly and straight) with an upper-case 'X'
    # placeholder. The text has just been lower-cased, so 'X' cannot clash
    # with real content, and being alphanumeric it survives the filter below.
    text = re.sub("[\u2018\u2019']", "X", text)
    wordTokens = list()
    for x in tknz.tokenize(text):
        x = ''.join([v for v in x if v.isalnum() or v == ' '])
        if len(x) > 0 and x != 'X':
            # Turn the placeholder back into a plain apostrophe.
            wordTokens.append(x.replace('X', '\''))
    return wordTokens

In [None]:
# Topical forums on CreateDebate. We have scraped comments for all of the
# following forums.
categories = [
    'business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
    'politics2', 'religion', 'science', 'shopping', 'sports',
    'technology', 'travel', 'world',
]

# However, we will be analyzing comments from selected forums only!
# These forums have at least 10k comments each.
categories_selected = [
    'politics2', 'religion', 'world',
    'science', 'law', 'technology',
]

# One (initially empty) list of comment records per selected forum.
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums
#
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
# only run this cell on log files produced by our own scraping/classification.

for cat in tqdm(categories_selected):
    # Get all the `Thread` objects pickled while scraping. The log is a
    # sequence of pickles written back to back, so keep loading until the
    # stream is exhausted. `with` guarantees the file is closed even when
    # unpickling fails with something other than `EOFError`.
    threads = list()
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass  # reached the end of the pickle stream

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            # `setdefault` replaces the former bare `except:`, which would
            # also have swallowed unrelated errors.
            authors.setdefault(v.author, list()).append((v, k))

    # Load the classification score of the comments.
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Start from an empty list so that re-running this cell does not append
    # a second copy of every comment.
    comments[cat] = list()

    # Attach classification score with the comments. `cws` is indexed by a
    # running counter, i.e. it is assumed to be stored in exactly this
    # author-grouped iteration order — TODO confirm against the classifier run.
    ctr = 0
    for author_comments in authors.values():
        for comment, cid in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            # The comment key carries a three-character prefix before the numeric id.
            foo['id'] = int(cid[3:])
            comments[cat].append(foo)
            ctr += 1

In [None]:
# Comment id -> (1 - classification score), across all selected forums.
ah_score_comments = {
    record['id']: 1 - record['score']
    for forum in categories_selected
    for record in comments[forum]
}

In [None]:
def parse_tstring(tstring):
    """
    Converts a comment's posting-time string into an integer so that comments
    can be ordered by their time of posting.

    The separators 'T', ':' and '+' are treated like '-'; the string is split
    on '-', the last two fields are discarded (presumably the UTC-offset
    hours and minutes — TODO confirm) and the remaining fields are
    concatenated and parsed as one integer.

    Raises
    ------
    ValueError
        If `tstring` is the placeholder 'Not Available' (or the remaining
        fields do not form an integer).
    """
    if tstring == 'Not Available':
        raise ValueError('Invalid posting time for parse_tstring')
    unified = tstring.translate(str.maketrans('T:+', '---'))
    fields = unified.split('-')
    digits = ''.join(fields[:-2])
    return int(digits)

In [None]:
# Split every forum's comments by debate type: a comment whose `polarity` is
# 'Not Available' goes to the perspective debates, any other comment to the
# for-against debates. Records are deep-copied.
for_against_debates = dict()
perspective_debates = dict()

for cat in categories_selected:
    for_against_debates[cat] = [
        deepcopy(record) for record in comments[cat]
        if record['polarity'] != 'Not Available'
    ]
    perspective_debates[cat] = [
        deepcopy(record) for record in comments[cat]
        if record['polarity'] == 'Not Available'
    ]

In [None]:
# Sanity check: per selected forum, print the number of for-against comments
# followed by the number of perspective comments.
for cat in categories_selected:
    print(len(for_against_debates[cat]), len(perspective_debates[cat]))

In [None]:
# Reduce every comment to its id and its cleaned body (the tokens returned by
# `clean_text`, re-joined with single spaces).
for_against_debates_cleaned = dict()
perspective_debates_cleaned = dict()

for cat in categories_selected:
    for_against_debates_cleaned[cat] = [
        dict(cid=record['id'], body=' '.join(clean_text(record['body'])))
        for record in tqdm(for_against_debates[cat])
    ]
    perspective_debates_cleaned[cat] = [
        dict(cid=record['id'], body=' '.join(clean_text(record['body'])))
        for record in tqdm(perspective_debates[cat])
    ]

In [None]:
def _pad_words(text):
    """Returns `text` with each whitespace-separated word wrapped in single spaces."""
    return ''.join(f' {word} ' for word in text.split())


def count_tokens(comment, token_group):
    """
    Returns count of personal-pronoun-preceded-succeded words under `token_group`

    For every term of ``TOKEN_GROUP[token_group]`` and every distinct entry of
    `personal_pronouns`, counts the occurrences of "<term> <pronoun>" and
    "<pronoun> <term>" in the comment body and returns the total.

    Matching is done on whole words. Plain substring counting also matched
    inside longer words (e.g. "he white" inside "the white", or "i kill"
    inside "hi kill"). To avoid that, every word of the body and of the search
    phrase is wrapped in its own pair of spaces before `str.count` is applied;
    since adjacent words then never share a delimiter, consecutive matches
    are still counted separately.

    Parameters
    ----------
    comment : dict
        Must provide the cleaned, whitespace-separated text under 'body'.
    token_group : str
        Key into the module-level `TOKEN_GROUP` mapping.

    Returns
    -------
    int
        Number of pronoun-adjacent term occurrences.

    Raises
    ------
    KeyError
        If `token_group` is not in `TOKEN_GROUP` or `comment` has no 'body'.
    """
    tokens = TOKEN_GROUP[token_group]
    body = _pad_words(comment['body'])
    if not body:
        return 0
    # Drop repeated pronouns so that no phrase is counted twice.
    padded_pronouns = [_pad_words(pronoun) for pronoun in dict.fromkeys(personal_pronouns)]

    token_count = 0
    for token in tokens:
        padded_token = _pad_words(token)
        if not padded_token:
            # An empty term would degenerate into counting bare pronouns.
            continue
        for padded_pronoun in padded_pronouns:
            token_count += body.count(padded_token + padded_pronoun)
            token_count += body.count(padded_pronoun + padded_token)
    return token_count

In [None]:
# Per-comment counts of pronoun-adjacent lexicon terms, stored as
# counts[forum][token_group] -> list with one entry per comment.
for_against_empath_token_count = dict()
perspective_empath_token_count = dict()

for cat in ['politics2']:
    for_against_empath_token_count[cat] = dict()
    perspective_empath_token_count[cat] = dict()

    for token_group in tqdm(TOKEN_GROUP.keys()):
        for_against_empath_token_count[cat][token_group] = [
            count_tokens(record, token_group)
            for record in for_against_debates_cleaned[cat]
        ]
        perspective_empath_token_count[cat][token_group] = [
            count_tokens(record, token_group)
            for record in perspective_debates_cleaned[cat]
        ]

In [None]:
# with open('/content/gdrive/MyDrive/Temp/nb-60-for_against_empath_token_count.pkl', 'wb') as f:
#     pickle.dump(for_against_empath_token_count, f)

# with open('/content/gdrive/MyDrive/Temp/nb-60-perspective_empath_token_count.pkl', 'wb') as f:
#     pickle.dump(perspective_empath_token_count, f)

In [None]:
# Load previously pickled token counts from Drive. This rebinds both
# `*_empath_token_count` dicts, replacing whatever was computed in memory.
# NOTE(review): `pickle.load` can execute arbitrary code; only load cache
# files written by this notebook.
with open('/content/gdrive/MyDrive/Temp/nb-60-for_against_empath_token_count.pkl', 'rb') as f:
    for_against_empath_token_count = pickle.load(f)

with open('/content/gdrive/MyDrive/Temp/nb-60-perspective_empath_token_count.pkl', 'rb') as f:
    perspective_empath_token_count = pickle.load(f)

In [None]:
# Add the per-comment counts for further forums to the existing count dicts,
# in the same counts[forum][token_group] -> list-per-comment layout.
for cat in ['religion']:
    for_against_empath_token_count[cat] = dict()
    perspective_empath_token_count[cat] = dict()

    for token_group in tqdm(TOKEN_GROUP.keys()):
        for_against_empath_token_count[cat][token_group] = [
            count_tokens(record, token_group)
            for record in for_against_debates_cleaned[cat]
        ]
        perspective_empath_token_count[cat][token_group] = [
            count_tokens(record, token_group)
            for record in perspective_debates_cleaned[cat]
        ]

In [None]:
# Total number of token groups (slur groups + selected Empath categories).
len(TOKEN_GROUP)

In [None]:
def Plot(token_groups, cat='politics2'):
    """Compares for-against and perspective debates with a grouped bar chart.

    For every token group, the per-comment counts of pronoun-adjacent terms
    are averaged separately for the for-against and the perspective debates
    of forum `cat`, and the two averages are drawn as side-by-side bars.

    Parameters
    ----------
    token_groups : iterable of str
        Token-group names to plot; each must be a key of
        `for_against_empath_token_count[cat]` and
        `perspective_empath_token_count[cat]`.
    cat : str, optional
        Forum whose counts are plotted. Defaults to 'politics2', the forum
        that used to be hard-coded here.

    Raises
    ------
    KeyError
        If `cat` or one of `token_groups` has no counts.
    """
    x = list(token_groups)
    y1 = [np.average(for_against_empath_token_count[cat][group]) for group in x]
    y2 = [np.average(perspective_empath_token_count[cat][group]) for group in x]

    ticks = np.arange(len(x))
    width = 0.30

    fig, ax = plt.subplots()
    # Bars are shifted half a width to either side of each tick; the tick
    # labels themselves are set once below.
    ax.bar(ticks - width / 2, y1, width, label='for-against')
    ax.bar(ticks + width / 2, y2, width, label='perspective')

    ax.set_ylabel('Word counts (averaged)')
    ax.set_title('Word counts (preceded/succeded by personal pronouns)')
    ax.set_xticks(ticks)
    ax.set_xticklabels(x, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
# Token-group names as a list, so that they can be sliced for plotting.
lkeys = list(TOKEN_GROUP)

In [None]:
# Running offset that the plotting cell below increments by 10 on each run.
index = 0

In [None]:
# Plot every token group from position 50 onwards.
# NOTE(review): `index` is incremented each time this cell runs but is not
# used to select the slice, which is hard-coded to start at 50. Possibly
# `lkeys[index:index + 10]` was intended — confirm before relying on it.
Plot(lkeys[50:])
index += 10

In [None]:
# Average per-comment count of every token group in the for-against debates
# of the 'politics2' forum.
for group_name, counts in for_against_empath_token_count['politics2'].items():
    mean_count = np.average(counts)
    print(f'{group_name} - {mean_count}')