In [None]:
# Mount Google Drive at /content/gdrive; the data paths used later in this
# notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Fetch the scraper repository; later cells change into its src/nested
# directory and import from `thread`. No commit or tag is pinned here.
!git clone https://github.com/utkarsh512/CreateDebate-Scraper.git

In [None]:
# Work from inside the cloned repository.
# NOTE(review): presumably this directory holds thread.py, which the import cell needs — confirm.
%cd CreateDebate-Scraper/src/nested/

In [None]:
from thread import Thread, Comment
import pickle as pkl
import numpy as np
from tqdm import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib.colors as colors

In [None]:
# Map each debate category to the log file its threads are loaded from.
# Note that the 'politics' category is read from the 'politics2' folder.
data_dir = {
    category: '/content/gdrive/MyDrive/DL/CreateDebate/' + folder + '/threads.log'
    for category, folder in (
        ('politics', 'politics2'),
        ('religion', 'religion'),
        ('world', 'world'),
        ('science', 'science'),
        ('law', 'law'),
        ('technology', 'technology'),
    )
}

In [None]:
# Load every pickled record from each category's log file into
# threads[category]. Each file holds consecutive pickle records, so pkl.load
# is called repeatedly until it raises EOFError at the end of the file.
# NOTE(review): unpickling can execute arbitrary code; only point data_dir
# at files you trust.
threads = {}

for category, log_path in data_dir.items():
    loaded = threads[category] = []
    with open(log_path, 'rb') as log_file:
        while True:
            try:
                record = pkl.load(log_file)
            except EOFError:
                break
            loaded.append(record)

In [None]:
# Count, for every category, how many comments each author wrote.
# Result: author_comment_count[category][author] -> number of comments.
author_comment_count = dict()

for category, category_threads in tqdm(threads.items()):
    counts = author_comment_count[category] = dict()

    # looping over threads
    for thread_obj in category_threads:

        # looping over comments
        for cid, cmnt in thread_obj.comments.items():
            # dict.get supplies 0 for a first-time author, so no exception
            # handling is needed and unrelated errors are not swallowed.
            counts[cmnt.author] = counts.get(cmnt.author, 0) + 1

In [None]:
def get_author_set(category, n):
    """Return the set of authors with at least ``n`` comments in ``category``.

    Reads the module-level ``author_comment_count`` mapping
    (category -> author -> comment count).
    """
    return {
        author
        for author, count in author_comment_count[category].items()
        if count >= n
    }

In [None]:
# Precompute, for every category and comment threshold, the set of authors
# with at least that many comments:
# author_set[category][threshold] -> set of authors.
author_set = {}

for category in tqdm(author_comment_count.keys()):
    author_set[category] = {
        threshold: get_author_set(category, threshold)
        for threshold in [1, 2, 5, 10, 20, 50, 100, 200, 500]
    }

In [None]:
def get_overlap(category, n):
    """Return the percentage of ``category``'s authors who are also politics authors.

    Both groups are read from the module-level ``author_set`` at threshold
    ``n``, i.e. ``author_set['politics'][n]`` and ``author_set[category][n]``.

    Raises:
        ZeroDivisionError: if ``author_set[category][n]`` is empty.
        KeyError: if ``category`` or ``n`` is missing from ``author_set``.
    """
    politics_authors = author_set['politics'][n]
    category_authors = author_set[category][n]
    # Set intersection builds a new set and leaves both operands untouched,
    # so no defensive copies are needed.
    shared = politics_authors & category_authors
    return 100 * len(shared) / len(category_authors)

In [None]:
# Comment-count thresholds; these become the columns of the overlap matrix.
valN = [1, 2, 5, 10, 20, 50, 100, 200]
N = len(valN)
# One row per loaded category except one.
# NOTE(review): presumably the excluded one is 'politics', the reference
# category in get_overlap — keep in sync with the category list used when
# filling and labelling the matrix.
C = len(threads) - 1

In [None]:
# C x N matrix of zeros; every row is its own list, so rows can be filled
# independently.
overlap = [[0] * N for _ in range(C)]

In [None]:
# Fill the overlap matrix: row i is a category, column j a comment threshold.
for i, category in enumerate(['religion', 'world', 'science', 'law', 'technology']):
    for j, n in enumerate(valN):
        try:
            overlap[i][j] = get_overlap(category, n)
        except ZeroDivisionError:
            # No author in this category reaches the threshold, so the
            # percentage is undefined; record it as 0. Any other error is a
            # real problem and is allowed to surface.
            overlap[i][j] = 0

In [None]:
# Heatmap of the overlap matrix: one row per category, one column per
# comment threshold, saved as an EPS file.
fig, ax = plt.subplots()
cax = ax.matshow(overlap, interpolation='nearest')
fig.colorbar(cax)

# x axis: comment thresholds, labels rotated to vertical.
ax.set_xticks(np.arange(N))
ax.set_xticklabels(valN)
ax.set_xlabel("# comments")
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

# y axis: categories.
ax.set_yticks(np.arange(C))
ax.set_yticklabels(['religion', 'world', 'science', 'law', 'tech'])

plt.savefig('overlap_with_politics.eps', format='eps')

In [None]:
# Group the politics threads by year, taking the year from the first four
# characters of the `time` of each thread's first listed comment.
# NOTE(review): assumes `time` is a string that starts with a four-digit
# year — confirm against the scraper's output.
threadsByYear = dict()
errorCount = 0  # number of threads that have no comments at all

for e in threads['politics']:
    # Peek at the first comment without materialising the whole mapping.
    first_entry = next(iter(e.comments.items()), None)
    if first_entry is None:
        # Thread without comments: it cannot be dated, so count and skip it.
        errorCount += 1
        continue

    threadTime = first_entry[1].time
    if threadTime == 'Not Available':
        continue

    threadsByYear.setdefault(threadTime[:4], list()).append(e)
print(errorCount)

In [None]:
# Containers for the politics threads of the two time periods being compared.
threadsBefore, threadsAfter = [], []

In [None]:
# Split the politics threads into two periods: 2008-2016 and 2017-2021.
# Start from empty lists so re-running this cell does not append duplicates.
threadsBefore.clear()
threadsAfter.clear()

for key in ['2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008']:
    # A year without any thread has no entry in threadsByYear; treat it as
    # empty instead of failing with KeyError.
    threadsBefore.extend(threadsByYear.get(key, []))

for key in ['2021', '2020', '2019', '2018', '2017']:
    threadsAfter.extend(threadsByYear.get(key, []))

In [None]:
# Per-author comment counts for each of the two periods (author -> count).
authorBefore, authorAfter = {}, {}

In [None]:
# Count comments per author over the threads in threadsBefore.
authorBefore.clear()  # reset so re-running this cell does not double count
for e in threadsBefore:
    for cid, cmnt in e.comments.items():
        # dict.get supplies 0 for a first-time author; no exception handling
        # is needed and unrelated errors are not swallowed.
        authorBefore[cmnt.author] = authorBefore.get(cmnt.author, 0) + 1

In [None]:
# Count comments per author over the threads in threadsAfter.
authorAfter.clear()  # reset so re-running this cell does not double count
for e in threadsAfter:
    for cid, cmnt in e.comments.items():
        # dict.get supplies 0 for a first-time author; no exception handling
        # is needed and unrelated errors are not swallowed.
        authorAfter[cmnt.author] = authorAfter.get(cmnt.author, 0) + 1

In [None]:
# Author buckets by comment count, one pair per period.
authorBefore200, authorBefore500 = set(), set()
authorAfter200, authorAfter500 = set(), set()

In [None]:
# Bucket the "before" authors by comment count. The buckets are exclusive:
# authorBefore500 receives authors with at least 500 comments, while
# authorBefore200 receives only those with 200 to 499.
authorBefore500.update(
    author for author, count in authorBefore.items() if count >= 500
)
authorBefore200.update(
    author for author, count in authorBefore.items() if 200 <= count < 500
)

In [None]:
# Bucket the "after" authors by comment count. The buckets are exclusive:
# authorAfter500 receives authors with at least 500 comments, while
# authorAfter200 receives only those with 200 to 499.
authorAfter500.update(
    author for author, count in authorAfter.items() if count >= 500
)
authorAfter200.update(
    author for author, count in authorAfter.items() if 200 <= count < 500
)

In [None]:
# Number of authors with 200-499 comments in the 2008-2016 threads.
len(authorBefore200)

In [None]:
# Number of authors with at least 500 comments in the 2008-2016 threads.
len(authorBefore500)

In [None]:
# Number of authors with 200-499 comments in the 2017-2021 threads.
len(authorAfter200)

In [None]:
# Number of authors with at least 500 comments in the 2017-2021 threads.
len(authorAfter500)

In [None]:
# Authors who are in the 500+ bucket in both periods.
a500 = authorBefore500 & authorAfter500

In [None]:
# How many authors are in the 500+ bucket in both periods.
len(a500)

In [None]:
# List the authors in the "before" 200-499 bucket, one per line
# (set iteration order is arbitrary).
for author in authorBefore200:
    print(author)

In [None]:
# List the authors in the "after" 200-499 bucket, one per line
# (set iteration order is arbitrary).
for author in authorAfter200:
    print(author)

In [None]:
# Number of authors who are in the 200-499 bucket in both periods.
len(authorBefore200 & authorAfter200)

In [None]:
# Number of authors who moved from the 200-499 bucket in the earlier period
# to the 500+ bucket in the later period.
len(authorBefore200 & authorAfter500)