## Mounting Google Drive

In [None]:
# Mount Google Drive under /content/gdrive; the data files read later in this
# notebook live below /content/gdrive/MyDrive. force_remount=True is passed so
# the mount is redone even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
!pip install shifterator
!pip install langid
!pip install wordcloud
%cd CreateDebateScraper/src/nested/

In [None]:
from thread import Thread, Comment
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import json
from copy import deepcopy
from itertools import accumulate
import textwrap 
import nltk
nltk.download('punkt') # For tokenizers
from nltk.tokenize import TweetTokenizer
import shifterator as sh
from pprint import pprint
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import langid
import matplotlib
matplotlib.rcParams.update({'font.size': 18})

## Loading comments

In [None]:
# Every category name known to this notebook.
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# Categories analysed below, and the display name used for each in plot legends
# (same order in both lists).
categories_selected = ['politics2', 'religion', 'world', 'science', 'law', 'technology']
categories_labels = ['politics', 'religion', 'world', 'science', 'law', 'technology']

# category name -> list of comment records (filled by the loading cell)
comments = {name: list() for name in categories}

In [None]:
# Load the threads of every selected category and attach the precomputed scores.
# SECURITY NOTE: pickle.load can execute arbitrary code -- only load files you trust.
for cat in tqdm(categories_selected):
    # threads.log holds back-to-back pickled objects; read until end of file.
    # The `with` block closes the file even if unpickling fails part-way.
    threads = list()
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/threads.log', 'rb') as fp:
        while True:
            try:
                threads.append(pickle.load(fp))
            except EOFError:
                break
    print(f'{cat} - {len(threads)}')

    # Group comments by author. dict preserves first-seen author order, which
    # fixes the order in which the score entries are consumed below.
    authors = dict()
    for thread in threads:
        for v in thread.comments.values():
            authors.setdefault(v.author, list()).append(v)

    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # The i-th score entry is paired with the i-th comment in author-grouped
    # order; a length mismatch means the pairing cannot be trusted.
    n_comments = sum(len(group) for group in authors.values())
    if n_comments != len(cws):
        print(f'WARNING: {cat} has {n_comments} comments but {len(cws)} score entries')

    # Start from an empty list so re-running this cell does not duplicate records.
    comments[cat] = list()
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            comments[cat].append(foo)
            ctr += 1

In [None]:
# Earliest and latest posting date over all selected categories.
# Dates are compared as strings using the first 10 characters of comment['time']
# (presumably 'YYYY-MM-DD'); the two literals below are the starting sentinels.
min_date = '2022-01-13'
max_date = '2000-01-01'

for cat in categories_selected:
    known_dates = [c['time'][:10] for c in comments[cat] if c['time'] != 'Not Available']
    min_date = min([min_date] + known_dates)
    max_date = max([max_date] + known_dates)
print(min_date, max_date)

In [None]:
# number and percentage of ad hominem comments per category
# (a comment counts as ad hominem when its score is below 0.5)

for cat in categories_selected:
    n_total = len(comments[cat])
    ad_hominem_comment_count = sum(1 for c in comments[cat] if c['score'] < 0.5)
    print(f'{cat} - {ad_hominem_comment_count} - {ad_hominem_comment_count * 100 / n_total} - {n_total}')

In [None]:
# number and percentage of ad hominem users per category
# (an author counts as ad hominem once any of their comments scores below 0.5)

global_ah_authors = set()

for cat in categories_selected:
    all_authors = {c['author'] for c in comments[cat]}
    ah_authors = {c['author'] for c in comments[cat] if c['score'] < 0.5}
    global_ah_authors |= ah_authors
    print(f'{cat} - {len(ah_authors)} - {len(all_authors)} - {100 * len(ah_authors) / len(all_authors)}')
# distinct ad hominem authors across all selected categories
print(len(global_ah_authors))

In [None]:
# sampling some ad hominem comments
# (score, body) pairs of the 'technology' comments scoring below 0.5

ad_hominem_comments = [
    (c['score'], c['body'])
    for c in comments['technology']
    if c['score'] < 0.5
]

In [None]:
# Sort the (score, body) tuples in place in ascending order, so the lowest
# scores come first (ties are ordered by body text).
ad_hominem_comments.sort()

In [None]:
# Print the bodies of the first 100 sorted entries, each followed by a blank line.
# Slicing (instead of indexing 0..99) avoids an IndexError when fewer than 100
# entries exist.
for _score, body in ad_hominem_comments[:100]:
    print(body)
    print()

In [None]:
# Independent deep copy of the comment records of every selected category.
data = {cat: deepcopy(comments[cat]) for cat in categories_selected}

In [None]:
# Total number of comments scoring below 0.5 across all categories in `data`.
adHominemCount = sum(
    1
    for category_comments in data.values()
    for comment in category_comments
    if comment['score'] < 0.5
)
print(adHominemCount)

## Helper functions

In [None]:
# Tokenizer used by clean_text() to split the normalised text into tokens
tknz = TweetTokenizer()

# Word-cloud generator shared by the case-study cells:
# white background, 1920x1080 canvas
wc = WordCloud(background_color='white', width=1920, height=1080)

In [None]:
def moving_average(a, n=3):
    """Return the simple moving average of `a` over windows of `n` values.

    Uses running totals: the sum of each window is the difference between two
    cumulative sums. The result has len(a) - n + 1 entries (float array).
    """
    totals = np.cumsum(a, dtype=float)
    window_sums = totals.copy()
    # From index n onwards, subtract the total accumulated before the window.
    window_sums[n:] = totals[n:] - totals[:-n]
    return window_sums[n - 1:] / n

In [None]:
def delta(a):
    """Return the list of differences between consecutive elements of `a`.

    The result has len(a) - 1 entries; entry i is a[i + 1] - a[i].
    """
    return [a[pos] - a[pos - 1] for pos in range(1, len(a))]

In [None]:
def strip_array(a):
    """Return `a` without its leading run of zeros.

    Prints how many elements were removed. Interior and trailing zeros are kept.
    An empty or all-zero input yields an empty slice instead of raising
    IndexError.
    """
    idx = 0
    # The bounds check stops the scan when every element is zero.
    while idx < len(a) and a[idx] == 0:
        idx += 1
    print(f"stripped {idx} elements")
    return a[idx:]

In [None]:
def clean_text(text):
    """Normalise `text` and return it as a list of word tokens.

    Steps: lower-case, drop substrings starting with 'http' or 'www' up to the
    next whitespace, turn hyphens into spaces, collapse whitespace, tokenize
    with the module-level `tknz`, and strip every character that is neither
    alphanumeric nor an apostrophe from each token. Empty tokens and tokens
    that were a lone apostrophe are discarded.
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = re.sub("-", " ", text)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    # Apostrophes (curly and straight) are temporarily encoded as 'X'. The text
    # was lower-cased above, so no genuine upper-case 'X' can be present.
    for quote in ("\u2018", "\u2019", "'"):
        text = text.replace(quote, "X")
    wordTokens = list()
    for token in tknz.tokenize(text):
        token = ''.join([ch for ch in token if ch.isalnum() or ch == ' '])
        if len(token) > 0 and token != 'X':
            # Restore the apostrophes that survived inside the token.
            wordTokens.append(token.replace('X', '\''))
    return wordTokens

## Language Detection

In [None]:
# language code -> number of comments that langid assigns to that language
languages = dict()

# Iterates over every category; categories that were never loaded contribute
# nothing because their comment lists are empty.
for cat in tqdm(categories):
    for comment in comments[cat]:
        cleaned = ' '.join(clean_text(comment['body']))
        # langid.classify returns a (language, score) pair; only the language is used.
        language, _score = langid.classify(cleaned)
        # dict.get replaces the former bare `except`, which hid unrelated errors.
        languages[language] = languages.get(language, 0) + 1

In [None]:
# Number of distinct language codes that were detected
len(languages)

In [None]:
# Total number of classified comments (sum over all detected languages)
l_count = sum(languages.values())

In [None]:
# Comments classified as English; .get avoids a KeyError when none were detected.
eng_count = languages.get('en', 0)

# English vs. non-English comment counts
print(eng_count, l_count - eng_count)

In [None]:
# Fraction of comments classified as English, computed from the counts above
# rather than from numbers copied out of an earlier run.
eng_count / l_count

## Grouping comments by their time of posting

In [None]:
MONTH = ['Jan ', 'Feb ', 'Mar ', 'Apr ', 'May ', 'Jun ', 'Jul ', 'Aug ', 'Sep ', 'Oct ', 'Nov ', 'Dec ']

analysis = ['n_comments',     # number of comments
            'p_ah_comment',   # percentage of ad hominem comments
            'p_ah_user_med',  # percentage of users who posted more ad hominem comments wrt normal comments
            'p_ah_user_1'     # percentage of users who posted at least one ad hominem comment
            ]

# labels:  month keys '2008-01' .. '2021-12' in chronological order
# data_pm: month key -> category -> list of comments posted in that month
labels = list()
data_pm = dict()

for year in range(2008, 2022):
    for month in range(1, 13):
        label = f'{year}-{month:02d}'
        new_label = MONTH[month - 1] + str(year)[2:]
        labels.append(label)
        data_pm[label] = {cat: list() for cat in categories_selected}

In [None]:
# category -> comments for which posting time is not known
data_wot = {cat: list() for cat in categories_selected}

In [None]:
# Distribute every comment into its month bucket, keyed by the first 7
# characters of comment['time']; comments without a known time go to data_wot.
# A month key that is not present in data_pm raises KeyError.
for cat in categories_selected:
    for comment in data[cat]:
        posted = comment['time']
        if posted != 'Not Available':
            data_pm[posted[:7]][cat].append(comment)
        else:
            data_wot[cat].append(comment)

## Creating dataset for KLM Quarter-wise Language Modelling
**Note:** the dataset has already been generated. Do not re-run the cell below: it rewrites every output file.

In [None]:
# Quarter names '2008_Q1' .. '2021_Q4'; quarter i covers labels[3*i : 3*i + 3].
quarters = [f'{year}_Q{q}' for year in range(2008, 2022) for q in range(1, 5)]

prefix_addr = '/content/gdrive/MyDrive/DL/CreateDebate/temporal/'


def _write_comment_lines(path, month_labels, cat, keep):
    """Write the cleaned comments of `cat` posted in `month_labels` to `path`.

    One comment per line, tokens joined by single spaces. Comments rejected by
    the predicate `keep` or with fewer than 10 tokens after cleaning are skipped.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for month_label in month_labels:
            for comment in data_pm[month_label][cat]:
                if not keep(comment):
                    continue
                tokens = clean_text(comment['body'].strip())
                if len(tokens) < 10:
                    continue
                f.write(' '.join(tokens) + '\n')


# Per category and quarter: the first two months are split into an ad hominem
# file and a non-ad-hominem file, the third month is written unfiltered as the
# test file. The split uses the same rule as the rest of the notebook
# (ad hominem <=> score < 0.5), so a score of exactly 0.5 no longer ends up in
# both files.
for cat in categories_selected:
    for i in tqdm(range(0, len(labels) // 3), desc=f'{cat}'):
        first_month, second_month, third_month = labels[3 * i: 3 * i + 3]
        stem = prefix_addr + quarters[i] + f'_{cat}'
        _write_comment_lines(stem + '_ah.txt', (first_month, second_month), cat,
                             lambda c: c['score'] < 0.5)
        _write_comment_lines(stem + '_none.txt', (first_month, second_month), cat,
                             lambda c: c['score'] >= 0.5)
        _write_comment_lines(stem + '_test.txt', (third_month,), cat,
                             lambda c: True)

## Plots with Moving Averages

In [None]:
MONTH = ['Jan ', 'Feb ', 'Mar ', 'Apr ', 'May ', 'Jun ', 'Jul ', 'Aug ', 'Sep ', 'Oct ', 'Nov ', 'Dec ']

# Human-readable tick labels: 'Dec 2008' followed by every month of 2009-2021.
labels2 = ['Dec 2008'] + [
    MONTH[month_no - 1] + str(yr)
    for yr in range(2009, 2022)
    for month_no in range(1, 13)
]

### Number of comments posted per month

In [None]:
# category -> smoothed number of comments per month
data_tp = dict()
for cat in categories_selected:
    monthly_counts = [len(data_pm[label][cat]) for label in labels]
    # Leading months without comments are dropped, then a 3-month moving average
    # is applied (use strip_array(monthly_counts) alone for the raw counts).
    data_tp[cat] = moving_average(strip_array(monthly_counts))

In [None]:
# Single-axes line plot of the monthly comment counts in data_tp, one line per category.
fig, ax = plt.subplots(figsize=(15, 5))

ax.set_xlabel('month')
ax.set_ylabel('# comments')
# One tick per entry of labels2; only every 15th tick gets visible text.
# NOTE(review): the tick text comes from labels2 while the lines are plotted
# against labels[-len(series):] -- confirm both refer to the same months.
ax.set_xticks(ticks=np.arange(0, len(labels2), 1))
ax.set_xticklabels(labels=[labels2[i] if i in range(0, len(labels2), 15) else '' for i in range(len(labels2))], rotation=90)

for cat, label in zip(categories_selected, categories_labels):
    # Each series is paired with the last len(series) month keys.
    ax.plot(labels[-len(data_tp[cat]):], data_tp[cat], label=label)
ax.grid(axis='y')

# Figure-level legend built from the line labels.
_handles, _labels = ax.get_legend_handles_labels()
fig.legend(_handles, _labels, loc='upper right')
#plt.savefig('temporal_data_create_debate.pdf', format='pdf', bbox_inches="tight")
#plt.savefig('temporal_data_create_debate.eps', format='eps')


In [None]:
# plot for slides: monthly comment counts from data_tp, one line per category
plt.figure(figsize=(18, 5))
plt.xlabel('month')
plt.ylabel('#comments posted')
plt.xticks(rotation=90)
for cat, label in zip(categories_selected, categories_labels):
    # Each series is paired with the last len(series) month keys.
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], label=label)
plt.legend()
plt.grid()
plt.show()

In [None]:
# plot for paper: same series as the slide plot, saved as EPS
# plt.figure(figsize=(30, 12))
plt.xlabel('month')
plt.ylabel('#comments posted')
plt.xticks(rotation=90)
# One tick per month key; only every 10th tick gets visible text.
# NOTE(review): ticks are labelled from labels[0] onwards while each series is
# plotted against labels[-len(series):] -- confirm the tick text lines up with
# the plotted months.
plt.xticks(np.arange(0, len(labels), 1), [labels[i] if i in range(0, len(labels), 10) else '' for i in range(len(labels))])
for cat, label in zip(categories_selected, categories_labels):
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], label=label)
plt.legend()
plt.grid(axis='y')
plt.savefig('number_of_comments_posted_per_month_on_create_debate.eps', format='eps')

### Percentage of ad hominem comments per month

In [None]:
# category -> smoothed percentage (0-100) of ad hominem comments per month
data_tp = dict()
for cat in categories_selected:
    # Seeded with 0 so that months before the first comment are forward-filled
    # with 0 and then removed by strip_array.
    monthly = [0]
    for label in labels:
        month_comments = data_pm[label][cat]
        if month_comments:
            ah = sum(1 for comment in month_comments if comment['score'] < 0.5)
            monthly.append(ah * 100 / len(month_comments))
        else:
            monthly.append(None)  # no comments this month -> filled from the previous month
    # Forward-fill only the missing (None) months. The former `y or x` also
    # replaced genuine 0 % months with the previous month's value.
    monthly = list(accumulate(monthly, lambda prev, cur: prev if cur is None else cur))
    monthly = strip_array(monthly)
    data_tp[cat] = moving_average(monthly, n=12) # comment this line if moving averages are not required
# Snapshot for the combined figure, because data_tp is reused by later cells.
_comments = deepcopy(data_tp)

In [None]:
# plot for slides: percentage of ad hominem comments per month, one line per category
plt.figure(figsize=(36, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem comments')
plt.xticks(rotation=90)
for cat, label in zip(categories_selected, categories_labels):
    # Each series is paired with the last len(series) month keys.
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], 'o-', label=label) 
plt.legend()
plt.grid()
plt.show()

In [None]:
# plot for paper: percentage of ad hominem comments per month, saved as EPS
# plt.figure(figsize=(30, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem comments')
plt.xticks(rotation=90)
# One tick per month key; only every 10th tick gets visible text.
# NOTE(review): ticks are labelled from labels[0] onwards while each series is
# plotted against labels[-len(series):] -- confirm the tick text lines up with
# the plotted months.
plt.xticks(np.arange(0, len(labels), 1), [labels[i] if i in range(0, len(labels), 10) else '' for i in range(len(labels))])
for cat, label in zip(categories_selected, categories_labels):
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], label=label)
plt.legend()
plt.grid(axis='y')
plt.savefig('percent_ad_hominem_post_per_month_on_create_debate.eps', format='eps')

### Percentage of users using ad-hominem per month (using median)


In [None]:
# category -> smoothed FRACTION (0-1, not percent) of authors per month whose
# median comment score in that month is below 0.5
data_tp = dict()
for cat in categories_selected:
    # Seeded with 0 so that months before the first comment are forward-filled
    # with 0 and then removed by strip_array.
    monthly = [0]
    for label in labels:
        # author -> scores of the comments they posted this month
        authors = dict()
        for comment in data_pm[label][cat]:
            authors.setdefault(comment['author'], list()).append(comment['score'])
        val = None  # stays None for months without any author
        if len(authors):
            ah = sum(1 for scores in authors.values() if np.median(scores) < 0.5)
            val = ah / len(authors)
        monthly.append(val)
    # Forward-fill only the missing (None) months. The former `y or x` also
    # replaced genuine 0 months with the previous month's value.
    monthly = list(accumulate(monthly, lambda prev, cur: prev if cur is None else cur))
    monthly = strip_array(monthly)
    data_tp[cat] = moving_average(monthly, n=12) # comment this line if moving averages are not required
# Snapshot for the combined figure, because data_tp is reused by later cells.
_users = deepcopy(data_tp)

In [None]:
# plot for slides
plt.figure(figsize=(36, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem users')
plt.xticks(rotation=90)
for cat, label in zip(categories_selected, categories_labels):
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], 'o-', label=label) 
plt.legend()
plt.grid()
plt.show()

In [None]:
# plot for paper
# plt.figure(figsize=(30, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem users')
plt.xticks(rotation=90)
plt.xticks(np.arange(0, len(labels), 1), [labels[i] if i in range(0, len(labels), 10) else '' for i in range(len(labels))])
for cat, label in zip(categories_selected, categories_labels):
    plt.plot(labels[-len(data_tp[cat]):], data_tp[cat], label=label) # replace labels[2:] -> labels, if moving averages not required
plt.legend()
plt.grid(axis='y')
plt.savefig('percent_ad_hominem_users_by_median_per_month_on_create_debate.eps', format='eps')

In [None]:
MONTH = ['Jan ', 'Feb ', 'Mar ', 'Apr ', 'May ', 'Jun ', 'Jul ', 'Aug ', 'Sep ', 'Oct ', 'Nov ', 'Dec ']

# Human-readable tick labels: 'Dec 2008' followed by every month of 2009-2021.
labels2 = ['Dec 2008'] + [
    MONTH[month_no - 1] + str(yr)
    for yr in range(2009, 2022)
    for month_no in range(1, 13)
]

In [None]:
import matplotlib

In [None]:
# Set the default matplotlib font size to 18 for the figures drawn below.
matplotlib.rcParams.update({'font.size': 18})

In [None]:
# Combined figure, 2 subplots in 1 row and 2 columns:
# left = % ad hominem comments (_comments), right = % ad hominem users (_users).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 5))

ax1.set_xlabel('month')
ax1.set_ylabel('% ad hominem comments')
# One tick per entry of labels2; only every 15th tick gets visible text.
# NOTE(review): the tick text comes from labels2 while the lines are plotted
# against labels[-len(series):] -- confirm both refer to the same months.
ax1.set_xticks(ticks=np.arange(0, len(labels2), 1))
ax1.set_xticklabels(labels=[labels2[i] if i in range(0, len(labels2), 15) else '' for i in range(len(labels2))], rotation=90)

for cat, label in zip(categories_selected, categories_labels):
    ax1.plot(labels[-len(_comments[cat]):], _comments[cat], label=label)
ax1.grid(axis='y')

ax2.set_xlabel('month')
ax2.set_ylabel('% ad hominem users')
ax2.set_xticks(ticks=np.arange(0, len(labels2), 1))
ax2.set_xticklabels(labels=[labels2[i] if i in range(0, len(labels2), 15) else '' for i in range(len(labels2))], rotation=90)

for cat, label in zip(categories_selected, categories_labels):
    # _users holds fractions, hence the factor 100 to plot percentages.
    ax2.plot(labels[-len(_users[cat]):], np.array(_users[cat]) * 100, label=label)
ax2.grid(axis='y')

# One shared legend for both subplots, taken from the right-hand axes.
_handles, _labels = ax2.get_legend_handles_labels()
fig.legend(_handles, _labels, loc='upper right')
plt.savefig('temporal_data_create_debate.pdf', format='pdf', bbox_inches="tight")
#plt.savefig('temporal_data_create_debate.eps', format='eps')


### Percentage of users using ad-hominem per month (at least one ad hominem comment)


In [None]:
# category -> smoothed FRACTION (0-1, not percent) of authors per month who
# posted at least one comment scoring below 0.5 in that month
data_tp = dict()
for cat in categories_selected:
    # Seeded with 0 so that months before the first comment are forward-filled
    # with 0 and then removed by strip_array.
    monthly = [0]
    for label in labels:
        # author -> scores of the comments they posted this month
        authors = dict()
        for comment in data_pm[label][cat]:
            authors.setdefault(comment['author'], list()).append(comment['score'])
        val = None  # stays None for months without any author
        if len(authors):
            # An author qualifies when their lowest score is below 0.5.
            ah = sum(1 for scores in authors.values() if min(scores) < 0.5)
            val = ah / len(authors)
        monthly.append(val)
    # Forward-fill only the missing (None) months. The former `y or x` also
    # replaced genuine 0 months with the previous month's value.
    monthly = list(accumulate(monthly, lambda prev, cur: prev if cur is None else cur))
    monthly = strip_array(monthly)
    data_tp[cat] = moving_average(monthly, n=12) # comment this line if moving averages are not required

In [None]:
# Share of users with at least one ad hominem comment per month, one line per category
plt.figure(figsize=(36, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem users')
plt.xticks(rotation=90)
for cat, label in zip(categories_selected, categories_labels):
    # data_tp holds fractions (ah / number of authors) at this point; scale to
    # percent so the values match the axis label.
    plt.plot(labels[-len(data_tp[cat]):], np.asarray(data_tp[cat]) * 100, 'o-', label=label)
plt.legend()
plt.grid()
plt.show()

In [None]:
# plot for paper: share of users with at least one ad hominem comment, saved as EPS
plt.figure(figsize=(30, 12))
plt.xlabel('month')
plt.ylabel('% ad hominem users')
#plt.xticks(rotation=90)
# One tick per month key; only every 5th tick gets visible text.
plt.xticks(np.arange(0, len(labels), 1), [labels[i] if i in range(0, len(labels), 5) else '' for i in range(len(labels))])
for cat, label in zip(categories_selected, categories_labels):
    # data_tp holds fractions (ah / number of authors) at this point; scale to
    # percent so the values match the axis label.
    plt.plot(labels[-len(data_tp[cat]):], np.asarray(data_tp[cat]) * 100, 'o-', label=label)
plt.legend()
plt.grid()
plt.savefig('percent_ad_hominem_users_atleast_one_per_month_on_create_debate.eps', format='eps')

## Case Studies

In [None]:
# category -> unsmoothed percentage (0-100) of ad hominem comments per month
data_cs = dict()
for cat in categories_selected:
    # Seeded with 0 so that months before the first comment are forward-filled
    # with 0 and then removed by strip_array.
    monthly = [0]
    for label in labels:
        month_comments = data_pm[label][cat]
        if month_comments:
            ah = sum(1 for comment in month_comments if comment['score'] < 0.5)
            monthly.append(ah * 100 / len(month_comments))
        else:
            monthly.append(None)  # no comments this month -> filled from the previous month
    # Forward-fill only the missing (None) months. The former `y or x` also
    # replaced genuine 0 % months with the previous month's value.
    monthly = list(accumulate(monthly, lambda prev, cur: prev if cur is None else cur))
    data_cs[cat] = strip_array(monthly)

### Politics

In [None]:
# Monthly % ad hominem series for the 'politics2' category (no moving average)
comments_cs = data_cs['politics2']
# Month-over-month differences of that series
del_1 = delta(comments_cs)

In [None]:
# Unsmoothed monthly percentage of ad hominem comments in politics.
plt.figure(figsize=(36, 12))
plt.xlabel('month')
# The plotted series is comments_cs (the percentages), not the differences in
# del_1, so the axis is labelled accordingly instead of 'delta'.
plt.ylabel('% ad hominem comments')
plt.xticks(rotation=90)
plt.plot(labels[-len(comments_cs):], comments_cs, 'o-')
plt.grid()
plt.show()

In [None]:
# Month keys that split the timeline into three periods, and their positions in
# `labels` (-1 when a key is not present).
sep1 = '2017-03'
sep2 = '2019-10'
label_position = {month_key: pos for pos, month_key in enumerate(labels)}
idx1 = label_position.get(sep1, -1)
idx2 = label_position.get(sep2, -1)
print(idx1, idx2)

In [None]:
# Concatenate the ad hominem comment bodies of 'politics2' for the three periods
# delimited by idx1 and idx2: before idx1, idx1..idx2-1, and idx2 onwards.
# Ad hominem means score < 0.5, as everywhere else in this notebook; the former
# `score > 0.5: continue` filter also let through comments scoring exactly 0.5.
# str.join replaces repeated `+=`, which copies the growing string every time.
politics_periods = (range(idx1), range(idx1, idx2), range(idx2, len(labels)))
pre_trump, post_trump, covid = [
    ''.join(
        comment['body'].strip() + '\n'
        for i in period
        for comment in data_pm[labels[i]]['politics2']
        if comment['score'] < 0.5
    )
    for period in politics_periods
]

In [None]:
# Token lists of the three politics period texts
pre_trump_, post_trump_, covid_ = [
    clean_text(period_text)
    for period_text in (pre_trump, post_trump, covid)
]

In [None]:
# token -> frequency, one (initially empty) dict per period
pre_trump_dict, post_trump_dict, covid_dict = {}, {}, {}

In [None]:
# Add the token occurrences of each period to its frequency dict.
for period_tokens, period_counts in (
    (pre_trump_, pre_trump_dict),
    (post_trump_, post_trump_dict),
    (covid_, covid_dict),
):
    for token in period_tokens:
        period_counts[token] = period_counts.get(token, 0) + 1

In [None]:
# Jensen-Shannon divergence word shift between the token frequencies of the
# first period (pre_trump_dict) and the second period (post_trump_dict),
# with both sides weighted 0.5 and base 2.
jsd_shift_1 = sh.JSDivergenceShift(type2freq_1=pre_trump_dict,
                                   type2freq_2=post_trump_dict,
                                   weight_1=0.5,
                                   weight_2=0.5,
                                   base=2,
                                   alpha=1)

In [None]:
# Draw the word-shift graph for period 1 vs period 2.
# NOTE(review): filename/format are passed with the intent of saving an EPS
# file -- confirm how shifterator handles the `format` keyword.
jsd_shift_1.get_shift_graph(title='Jensen-Shannon Divergence Shifts', filename='word_shift_graph_pre_trump_vs_post_trump.eps', format='eps')

In [None]:
# Jensen-Shannon divergence word shift between the token frequencies of the
# second period (post_trump_dict) and the third period (covid_dict),
# with both sides weighted 0.5 and base 2.
jsd_shift_2 = sh.JSDivergenceShift(type2freq_1=post_trump_dict,
                                   type2freq_2=covid_dict,
                                   weight_1=0.5,
                                   weight_2=0.5,
                                   base=2,
                                   alpha=1)

In [None]:
# Draw the word-shift graph for period 2 vs period 3.
# NOTE(review): filename/format are passed with the intent of saving an EPS
# file -- confirm how shifterator handles the `format` keyword.
jsd_shift_2.get_shift_graph(title='Jensen-Shannon Divergence Shifts', filename='word_shift_graph_post_trump_vs_covid.eps', format='eps')

In [None]:
# Jensen-Shannon divergence word shift between the token frequencies of the
# first period (pre_trump_dict) and the third period (covid_dict),
# with both sides weighted 0.5 and base 2.
jsd_shift_3 = sh.JSDivergenceShift(type2freq_1=pre_trump_dict,
                                   type2freq_2=covid_dict,
                                   weight_1=0.5,
                                   weight_2=0.5,
                                   base=2,
                                   alpha=1)

In [None]:
# Draw the word-shift graph for period 1 vs period 3.
# NOTE(review): filename/format are passed with the intent of saving an EPS
# file -- confirm how shifterator handles the `format` keyword.
jsd_shift_3.get_shift_graph(title='Jensen-Shannon Divergence Shifts', filename='word_shift_graph_pre_trump_vs_covid.eps', format='eps')

## Why is there a rise in ad hominem comments in every category over time?


### Religion

In [None]:
# Text accumulators for the three periods of the 'religion' category
religion_pre_trump = religion_post_trump = religion_covid = ''

In [None]:
# Concatenate the ad hominem comment bodies of 'religion' for the three periods
# delimited by idx1 and idx2, each body followed by a blank line.
# Ad hominem means score < 0.5, as everywhere else in this notebook; the former
# `score > 0.5: continue` filter also let through comments scoring exactly 0.5.
# The texts are assigned (not appended with `+=`), so re-running this cell does
# not duplicate them.
religion_periods = (range(idx1), range(idx1, idx2), range(idx2, len(labels)))
religion_pre_trump, religion_post_trump, religion_covid = [
    ''.join(
        comment['body'].strip() + '\n\n'
        for i in period
        for comment in data_pm[labels[i]]['religion']
        if comment['score'] < 0.5
    )
    for period in religion_periods
]

In [None]:
# Cleaned, space-joined version of each religion period text
religion_pre_trump_, religion_post_trump_, religion_covid_ = [
    ' '.join(clean_text(period_text))
    for period_text in (religion_pre_trump, religion_post_trump, religion_covid)
]

In [None]:
# Build the word cloud from the cleaned text of the third religion period.
wc.generate_from_text(religion_covid_)

In [None]:
# Show the generated word cloud as the cell output.
wc.to_image()

In [None]:
# Save the raw ad hominem text of the first religion period.
# Explicit UTF-8 (as for the other text files written by this notebook) keeps
# the write independent of the platform's default encoding.
with open('religion_pre_trump.txt', 'w', encoding='utf-8') as f:
    f.write(religion_pre_trump)

In [None]:
# Save the raw ad hominem text of the second religion period.
# Explicit UTF-8 (as for the other text files written by this notebook) keeps
# the write independent of the platform's default encoding.
with open('religion_post_trump.txt', 'w', encoding='utf-8') as f:
    f.write(religion_post_trump)

In [None]:
# Save the raw ad hominem text of the third religion period.
# Explicit UTF-8 (as for the other text files written by this notebook) keeps
# the write independent of the platform's default encoding.
with open('religion_covid.txt', 'w', encoding='utf-8') as f:
    f.write(religion_covid)