In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone `CreateDebateScraper` library from github
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested/

In [None]:
from   copy                     import deepcopy
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm
nltk.download('punkt') # For tokenizers
import matplotlib
from   nltk.tokenize            import TweetTokenizer
from   pprint                   import pprint
# import skbio
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)

# Helper functions

In [None]:
# Tokenizer instance shared by every `clean_text` call.
tknz = TweetTokenizer()

def clean_text(text):
    """
    @brief Preprocess a raw string into a list of cleaned tokens

    Steps, in order:
      1. lower-case the text,
      2. drop URLs (anything starting with `http` or `www`),
      3. turn hyphens into spaces and collapse whitespace runs,
      4. replace apostrophes (ASCII ' and the curly quotes U+2018 / U+2019)
         by the sentinel 'X',
      5. tokenize with `tknz`, strip every character that is neither
         alphanumeric nor a space from each token, and restore the
         apostrophes.

    The upper-case sentinel cannot collide with real text because the text
    has already been lower-cased in step 1.

    @param text: string to preprocess

    @return: list of non-empty tokens (str); tokens that consisted only of
             punctuation, or of a single apostrophe, are dropped
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = re.sub("-", " ", text)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on recent Pythons).
    text = re.sub(r"\s+", " ", text)
    # Protect all apostrophe variants from the alphanumeric filter below.
    text = re.sub("[\u2018\u2019']", "X", text)

    wordTokens = list()
    for token in tknz.tokenize(text):
        token = ''.join(ch for ch in token if ch.isalnum() or ch == ' ')
        # Skip empty tokens and tokens that were a lone apostrophe.
        if len(token) > 0 and token != 'X':
            wordTokens.append(token.replace('X', '\''))
    return wordTokens

In [None]:
SLUR_WORDS = {
  "jews": [
    "jews",
    "oven dodger",
    "nazi",
    "dirty jew",
    "holocaust",
    "kikesucker",
    "hook nose",
    "kike"
  ],
  "homosexual": [
    "faggots usually",
    "fucking queer",
    "the biggest faggot",
    "dyke",
    "you fucking faggot",
    "hate faggots",
    "queer",
    "homosexual",
    "the faggots",
    "faggot",
    "faggots usually have",
    "gay",
    "faggots",
    "dykey",
    "ugly dyke",
    "faggots like you",
    "you a fag",
    "lesbian",
    "homo",
    "is a faggot",
    "like a faggot",
    "dykes",
    "faggots like",
    "faggot if you ever"
  ],
  "women": [
    "ugly dyke",
    "woman terrorist",
    "nigress",
    "bitch",
    "slut",
    "women",
    "sheeboon",
    "negress",
    "mud shark",
    "women threat",
    "you a lame bitch",
    "your a cunt",
    "white bitch",
    "niggeress",
    "hoe",
    "dykes",
    "niggress",
    "sheboon",
    "feminazi"
  ],
  "blacks": [
    "pavement ape",
    "the niggers",
    "negress",
    "porch monkey",
    "that nigger",
    "this nigger",
    "sheboon",
    "all niggers",
    "eurafrica",
    "shut up nigger",
    "picaninny",
    "african attack",
    "spearchucker",
    "how many niggers",
    "nigger",
    "africa",
    "niggers are in my",
    "dindu nuffin",
    "stupid nigger",
    "moolie",
    "niggers",
    "bluegum",
    "nigger ass",
    "you niggers",
    "fucking nigger",
    "nigger music",
    "niggress",
    "you a nigger",
    "many niggers are",
    "nigress",
    "blacks",
    "teenaper",
    "sheeboon",
    "dumb nigger",
    "niggeress",
    "pickaninny",
    "nigga"
  ],
  "muslim": [
    "muslim immigrant",
    "islam",
    "mudslime",
    "mooslem",
    "muslim refugee",
    "musslime",
    "shitlam",
    "muslim invasion",
    "moslime",
    "mooslamic",
    "muzzie",
    "allah akbar",
    "mooslime",
    "musloid",
    "mudslimes",
    "muslim",
    "muslimes",
    "moslum",
    "mussie",
    "muzrat",
    "muslim countries",
    "muzzy",
    "moslim",
    "jihadi",
    "muslim country",
    "moslem",
    "muzzrat",
    "mooslim"
  ],
  "arabs": [
    "towel head",
    "goatfucker",
    "arabs",
    "goathumper",
    "raghead",
    "rag head",
    "goathumping",
    "towelhead",
    "camel jockey",
    "sandnigger",
    "camel fucker",
    "sand nigger"
  ],
  "generic": [
    "to rape",
    "raped and",
    "shithole country",
    "get raped",
    "raped",
    "is a fucking",
    "shit skin",
    "raped by",
    "hate you",
    "fake empowerment",
    "abusive women",
    "fuck you too",
    "violence",
    "wit a lame nigga",
    "they all look",
    "alllivesmatter",
    "shithole countries",
    "fucking hate",
    "trailer trash",
    "kill all",
    "terrorist threat",
    "harassment",
    "kill yourself",
    "shitskin",
    "okay to be white",
    "fucking hate you"
  ],
  "white": [
    "full of white",
    "white trash",
    "white devil",
    "white",
    "are all white",
    "white boy",
    "white ass",
    "white bitch",
    "hillbilly",
    "whigger",
    "white christian",
    "white person",
    "all white",
    "white nigger",
    "redneck",
    "white honky",
    "wigger",
    "them white"
  ],
  "economy": [
    "ghetto"
  ],
  "immigrant": [
    "illegal immigrants",
    "immigrant not welcome",
    "immigrant terror",
    "mexcrement",
    "go back to where you come from",
    "muslim refugee",
    "illegal aliens",
    "refugee",
    "protect from immigrants",
    "negro",
    "refugees",
    "immigrant",
    "refugee invasion",
    "go back to where they come from",
    "refugees impact",
    "bring ebola",
    "immigrants",
    "illegal alien",
    "immigrant invasion",
    "bring disease"
  ],
  "mental": [
    "retard",
    "mongoloid",
    "retarded"
  ],
  "asians": [
    "asians",
    "ching chong",
    "chinaman"
  ]
}

# Loading CreateDebate dataset

In [None]:
# Every topical forum on CreateDebate. Comments have been scraped for all of
# these forums.
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# Only the forums below are analyzed; each of them has at least 10k comments.
categories_selected = ['politics2', 'religion', 'world',
                       'science', 'law', 'technology']

# comments[forum] -> list of comment records; filled in by the loading cell.
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    forum_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # Start from an empty list so that re-running this cell does not append a
    # second copy of every comment.
    comments[cat] = list()

    # Get all the `Thread` objects pickled while scraping. The log holds
    # several pickles written back to back, so keep loading until EOF.
    # NOTE: `pickle.load` can execute arbitrary code -- only read trusted files.
    threads = list()
    with open(forum_dir + '/threads.log', 'rb') as fp:
        while True:
            try:
                threads.append(pickle.load(fp))
            except EOFError:
                break

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    authors = dict()
    for thread in threads:
        for v in thread.comments.values():
            authors.setdefault(v.author, list()).append(v)

    # Load the classification score of the comments.
    with open(forum_dir + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `cws` is indexed by the
    # running position of the comment in the author-grouped order built above.
    # NOTE(review): nothing checks that len(cws) equals the number of comments;
    # a longer `cws` would go unnoticed -- confirm the two files are in sync.
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            record = deepcopy(comment.__dict__)
            record['tag'] = cat
            record['score'] = cws[ctr][0]
            record['validation'] = cws[ctr][1][0]
            comments[cat].append(record)
            ctr += 1

# Loading slur word statistics for CreateDebate

In [None]:
# Loading computation from cache
#
# Shapes below are inferred from how later cells use the two objects:
#   ah_score:   category -> author -> sequence of ah scores
#   slur_count: category -> slur group -> author -> sequence of counts

def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

ah_score = _unpickle('/content/gdrive/MyDrive/Temp/47-ah-score.pkl')
slur_count = _unpickle('/content/gdrive/MyDrive/Temp/47-slur-count.pkl')

# Analysis

In [None]:
# ah_score_median[category][author] -> median of that author's ah scores
# in the given category.
ah_score_median = {
    category: {
        author: np.median(ah_scores)
        for author, ah_scores in author_data.items()
    }
    for category, author_data in ah_score.items()
}

In [None]:
# comment_count[category][author] -> number of comments written by the
# author in the given forum.
comment_count = dict()

for category in categories_selected:
    per_author = dict()
    for comment in comments[category]:
        author = comment['author']
        per_author[author] = per_author.get(author, 0) + 1
    comment_count[category] = per_author

In [None]:
# Bands of the per-author median ah score used by `partition_users`.
# Both bounds are inclusive; a median that falls in the gaps between bands
# (or below the lowest band) puts the author in no group at all.
LOW_AH_LOWER, LOW_AH_UPPER = 0.5, 0.6

MODERATE_AH_LOWER, MODERATE_AH_UPPER = 0.7, 0.8

EXTREME_AH_LOWER, EXTREME_AH_UPPER = 0.95, 1

In [None]:
def partition_users(category, slur_group, ignore_politics=False, ignore_religion=False):
    """
    @brief Split the authors of a category into the three ah bands

    An author lands in a band when their median ah score for `category` lies
    inside that band (bounds inclusive) and the sum of their counts for
    `slur_group` in that category is positive.

    @param category: Category to investigate
    @param slur_group: Slur group to investigate
    @param ignore_politics: if True, skip authors with at least one comment in
                            the 'politics2' forum
    @param ignore_religion: if True, skip authors with at least one comment in
                            the 'religion' forum

    @return: extreme_ah_users, moderate_ah_users, low_ah_users (set)
    """
    extreme_ah_users, moderate_ah_users, low_ah_users = set(), set(), set()

    # (lower bound, upper bound, destination set), tested in this order.
    bands = (
        (LOW_AH_LOWER, LOW_AH_UPPER, low_ah_users),
        (MODERATE_AH_LOWER, MODERATE_AH_UPPER, moderate_ah_users),
        (EXTREME_AH_LOWER, EXTREME_AH_UPPER, extreme_ah_users),
    )

    # Authors missing from a forum simply have a count of zero there.
    politics_counts = comment_count.get('politics2', {})
    religion_counts = comment_count.get('religion', {})

    for author, median_ah_score in ah_score_median[category].items():
        if ignore_politics and politics_counts.get(author, 0) > 0:
            continue

        if ignore_religion and religion_counts.get(author, 0) > 0:
            continue

        for lower, upper, bucket in bands:
            if lower <= median_ah_score <= upper:
                # Only authors who actually used the slur group are kept.
                if sum(slur_count[category][slur_group][author]) > 0:
                    bucket.add(author)
                break

    return extreme_ah_users, moderate_ah_users, low_ah_users

In [None]:
# disappeared_users_{e,m,l}[category] -> users of the extreme / moderate / low
# band (for any slur group) that drop out of the partition once authors who
# also comment in Politics or Religion are ignored.
disappeared_users_e, disappeared_users_m, disappeared_users_l = dict(), dict(), dict()

for category in categories_selected:
    gone_e, gone_m, gone_l = set(), set(), set()

    for slur_group in SLUR_WORDS:
        everyone = partition_users(category, slur_group)
        remaining = partition_users(category, slur_group,
                                    ignore_politics=True, ignore_religion=True)

        for gone, before, after in zip((gone_e, gone_m, gone_l), everyone, remaining):
            gone.update(before - after)

    disappeared_users_e[category] = gone_e
    disappeared_users_m[category] = gone_m
    disappeared_users_l[category] = gone_l

In [None]:
def get_top_20_ah_comments(category, user_list):
    """
    @brief Pick the 20 comments with the highest ah score among given users

    The ah score of a comment is `1 - comment['score']`. Twenty slots start
    out as (0, "", ""); each candidate replaces the first lowest-scoring slot
    when its ah score is strictly greater. Consequently the result is not
    sorted, and unused slots keep their placeholder values.

    @param category: Category whose comments are scanned
    @param user_list: Collection of authors whose comments are considered

    @return: three tuples of length 20 -- ah scores, comment bodies, authors
    """
    n_slots = 20
    slots = [(0, "", "") for _ in range(n_slots)]

    for entry in comments[category]:
        if entry['author'] not in user_list or entry['tag'] != category:
            continue

        candidate = (1 - entry['score'], entry['body'], entry['author'])

        # Index of the first slot holding the current minimum score.
        weakest = min(range(n_slots), key=lambda i: slots[i][0])
        if candidate[0] > slots[weakest][0]:
            slots[weakest] = candidate

    # Transpose the slots into (scores, texts, authors).
    return tuple(zip(*slots))

## Politics

In [None]:
# Politics users per band (extreme / moderate / low), over all slur groups,
# that drop out once authors who also comment in Religion are ignored.
e, m, l = set(), set(), set()

for slur_group in SLUR_WORDS:
    everyone = partition_users('politics2', slur_group)
    remaining = partition_users('politics2', slur_group, ignore_religion=True)

    for dropped, before, after in zip((e, m, l), everyone, remaining):
        dropped.update(before - after)

In [None]:
# Top ah comments in Politics by the extreme-band users in `e`, followed by
# the number of such users.
scores, texts, users = get_top_20_ah_comments('politics2', e)

for body in texts:
    print(body, end='\n\n\n')

print(len(e))

In [None]:
# Top ah comments in Politics by the moderate-band users in `m`, followed by
# the number of such users.
scores, texts, users = get_top_20_ah_comments('politics2', m)

for body in texts:
    print(body, end='\n\n\n')

print(len(m))

In [None]:
# Top ah comments in Politics by the low-band users in `l`, followed by the
# number of such users.
scores, texts, users = get_top_20_ah_comments('politics2', l)

for body in texts:
    print(body, end='\n\n\n')

print(len(l))

## Religion

In [None]:
# Religion users per band (extreme / moderate / low), over all slur groups,
# that drop out once authors who also comment in Politics are ignored.
e, m, l = set(), set(), set()

for slur_group in SLUR_WORDS:
    everyone = partition_users('religion', slur_group)
    remaining = partition_users('religion', slur_group, ignore_politics=True)

    for dropped, before, after in zip((e, m, l), everyone, remaining):
        dropped.update(before - after)

In [None]:
# Top ah comments in Religion by the extreme-band users in `e`, followed by
# the number of such users.
scores, texts, users = get_top_20_ah_comments('religion', e)

for body in texts:
    print(body, end='\n\n\n')

print(len(e))

In [None]:
# Top ah comments in Religion by the moderate-band users in `m`, followed by
# the number of such users.
scores, texts, users = get_top_20_ah_comments('religion', m)

for body in texts:
    print(body, end='\n\n\n')

print(len(m))

In [None]:
# Top ah comments in Religion by the low-band users in `l`, followed by the
# number of such users.
scores, texts, users = get_top_20_ah_comments('religion', l)

for body in texts:
    print(body, end='\n\n\n')

print(len(l))

## World news

In [None]:
# Top ah comments in World news by the disappeared extreme-band users,
# followed by the number of such users.
world_e = disappeared_users_e['world']
scores, texts, users = get_top_20_ah_comments('world', world_e)

for body in texts:
    print(body, end='\n\n\n')

print(len(world_e))

In [None]:
# Top ah comments in World news by the disappeared moderate-band users,
# followed by the number of such users.
world_m = disappeared_users_m['world']
scores, texts, users = get_top_20_ah_comments('world', world_m)

for body in texts:
    print(body, end='\n\n\n')

print(len(world_m))

In [None]:
# Top ah comments in World news by the disappeared low-band users, followed
# by the number of such users.
world_l = disappeared_users_l['world']
scores, texts, users = get_top_20_ah_comments('world', world_l)

for body in texts:
    print(body, end='\n\n\n')

print(len(world_l))

## Science

In [None]:
# Top ah comments in Science by the disappeared extreme-band users, followed
# by the number of such users.
science_e = disappeared_users_e['science']
scores, texts, users = get_top_20_ah_comments('science', science_e)

for body in texts:
    print(body, end='\n\n\n')

print(len(science_e))

In [None]:
# Top ah comments in Science by the disappeared moderate-band users, followed
# by the number of such users.
science_m = disappeared_users_m['science']
scores, texts, users = get_top_20_ah_comments('science', science_m)

for body in texts:
    print(body, end='\n\n\n')

print(len(science_m))

In [None]:
# Top ah comments in Science by the disappeared low-band users, followed by
# the number of such users.
science_l = disappeared_users_l['science']
scores, texts, users = get_top_20_ah_comments('science', science_l)

for body in texts:
    print(body, end='\n\n\n')

print(len(science_l))

## Law

In [None]:
# Top ah comments in Law by the disappeared extreme-band users, followed by
# the number of such users.
law_e = disappeared_users_e['law']
scores, texts, users = get_top_20_ah_comments('law', law_e)

for body in texts:
    print(body, end='\n\n\n')

print(len(law_e))

In [None]:
# Top ah comments in Law by the disappeared moderate-band users, followed by
# the number of such users.
law_m = disappeared_users_m['law']
scores, texts, users = get_top_20_ah_comments('law', law_m)

for body in texts:
    print(body, end='\n\n\n')

print(len(law_m))

In [None]:
# Top ah comments in Law by the disappeared low-band users, followed by the
# number of such users.
law_l = disappeared_users_l['law']
scores, texts, users = get_top_20_ah_comments('law', law_l)

for body in texts:
    print(body, end='\n\n\n')

print(len(law_l))

## Technology

In [None]:
# Top ah comments in Technology by the disappeared extreme-band users,
# followed by the number of such users.
technology_e = disappeared_users_e['technology']
scores, texts, users = get_top_20_ah_comments('technology', technology_e)

for body in texts:
    print(body, end='\n\n\n')

print(len(technology_e))

In [None]:
# Top ah comments in Technology by the disappeared moderate-band users,
# followed by the number of such users.
technology_m = disappeared_users_m['technology']
scores, texts, users = get_top_20_ah_comments('technology', technology_m)

for body in texts:
    print(body, end='\n\n\n')

print(len(technology_m))

In [None]:
# Top ah comments in Technology by the disappeared low-band users, followed
# by the number of such users.
technology_l = disappeared_users_l['technology']
scores, texts, users = get_top_20_ah_comments('technology', technology_l)

for body in texts:
    print(body, end='\n\n\n')

print(len(technology_l))