__Objective__: Change point detection in CreateDebate 

__Runtime__: CPU

In [None]:
# Mount Google Drive at /content/gdrive; force_remount=True is passed so an
# existing mount is replaced. Later cells read their input files from
# /content/gdrive/MyDrive/DL/CreateDebate/.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested/

In [None]:
from thread import Thread, Comment
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm 
from copy import deepcopy

# Loading CreateDebate and generating signals

In [None]:
def exp_avg(a, alpha=0.5):
    """Exponentially smooth a sequence.

    The first output equals ``a[0]``; every later output is
    ``alpha * a[i] + (1 - alpha) * previous_output``.

    Args:
        a: Sequence of numbers to smooth.
        alpha: Weight given to the current sample (0.5 by default).

    Returns:
        A list with one smoothed value per input element, or an empty list
        when ``a`` is empty.
    """
    # len() rather than truthiness so array-like inputs work as well
    if len(a) == 0:
        return []
    smoothed = [a[0]]
    for value in a[1:]:
        smoothed.append(alpha * value + (1 - alpha) * smoothed[-1])
    return smoothed

In [None]:
# Forum categories to analyse (names as used in the data paths) and the
# display names used for plot legends, in the same order.
categories_selected = ['politics2', 'religion', 'world', 'science', 'law', 'technology']
categories_labels = ['politics', 'religion', 'world', 'science', 'law', 'technology']

# One (initially empty) list of comment records per selected category.
comments = {category: [] for category in categories_selected}

In [None]:
# Load the threads of every selected category, group their comments by author
# and attach the pre-computed score / validation values to each comment record.
#
# NOTE: pickle.load can execute arbitrary code stored in the file; only use it
# on logs that come from a trusted source.
base_dir = '/content/gdrive/MyDrive/DL/CreateDebate/'

for cat in tqdm(categories_selected):
    # threads.log holds objects pickled back to back, so keep loading until
    # EOF. The context manager closes the file even when loading fails with
    # something other than EOFError.
    threads = list()
    with open(base_dir + cat + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass
    print(f'{cat} - {len(threads)}')

    # Group comments by author; the dict keeps authors in first-seen order.
    authors = dict()
    for thread in threads:
        for v in thread.comments.values():
            authors.setdefault(v.author, []).append(v)

    with open(base_dir + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # cws is consumed positionally while walking the comments author by author.
    # NOTE(review): this assumes comments_with_score.log was written in the
    # same author-grouped order - confirm against the script that produced it.
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            comments[cat].append(foo)
            ctr += 1

In [None]:
# Find the earliest and latest posting dates over all categories.
# Dates are compared as strings on the first 10 characters of comment['time']
# (presumably an ISO-like 'YYYY-MM-DD...' timestamp - confirm), so the seeds
# below act as upper / lower sentinels.
min_date = '2022-01-13'
max_date = '2000-01-01'

for cat in categories_selected:
    for comment in comments[cat]:
        stamp = comment['time']
        if stamp != 'Not Available':
            day = stamp[:10]
            if day < min_date:
                min_date = day
            if day > max_date:
                max_date = day
print(min_date, max_date)

In [None]:
# Per category: count of ad hominem comments (score below 0.5), their
# percentage among all comments, and the total number of comments.

for cat in categories_selected:
    total = len(comments[cat])
    ah_count = sum(1 for c in comments[cat] if c['score'] < 0.5)
    print(f'{cat} - {ah_count} - {ah_count * 100 / total} - {total}')

In [None]:
# Per category: number of authors with at least one ad hominem comment
# (score below 0.5), number of distinct authors, and the percentage.
# The final line prints how many distinct such authors exist across categories.

global_ah_authors = set()

for cat in categories_selected:
    all_authors = {c['author'] for c in comments[cat]}
    ah_authors = {c['author'] for c in comments[cat] if c['score'] < 0.5}
    global_ah_authors |= ah_authors
    print(f'{cat} - {len(ah_authors)} - {len(all_authors)} - {100 * len(ah_authors) / len(all_authors)}')
print(len(global_ah_authors))

In [None]:
# Independent deep copy of the comment records, keyed by category.
data = {cat: deepcopy(comments[cat]) for cat in categories_selected}

In [None]:
MONTH = ['Jan ', 'Feb ', 'Mar ', 'Apr ', 'May ', 'Jun ', 'Jul ', 'Aug ', 'Sep ', 'Oct ', 'Nov ', 'Dec ']

analysis = ['n_comments',     # number of comments
            'p_ah_comment',   # percentage of ad hominem comments
            'p_ah_user_med',  # percentage of users who posted more ad hominem comments wrt normal comments
            'p_ah_user_1'     # percentage of users who posted at least one ad hominem comment
            ]

# data_pm maps 'YYYY-MM' -> category -> list of comments posted in that month.
# labels keeps the month keys in chronological order (Jan 2008 .. Dec 2021).
data_pm = dict()
labels = list()

for year in range(2008, 2022):
    for month in range(1, 13):
        label = f'{year}-{month:02d}'
        # Short display form (month abbreviation + two-digit year); it is
        # assigned here but not read anywhere in this cell.
        new_label = MONTH[month - 1] + str(year)[2:]
        labels.append(label)
        data_pm[label] = {cat: list() for cat in categories_selected}

In [None]:
# Per-category lists of comments whose posting time is not known.
data_wot = {cat: list() for cat in categories_selected}

In [None]:
# Sort every comment into its month bucket; comments without a timestamp go
# to data_wot instead. The month key is the first 7 characters of the time
# string. NOTE: data_pm only contains the month keys created when it was
# initialised, so a comment from any other month raises KeyError here.
for cat in categories_selected:
    for comment in data[cat]:
        stamp = comment['time']
        if stamp == 'Not Available':
            data_wot[cat].append(comment)
        else:
            data_pm[stamp[:7]][cat].append(comment)

In [None]:
# Number of comments per month, as one chronologically ordered list per category.
n_comments_pm = {
    cat: [len(data_pm[label][cat]) for label in labels]
    for cat in categories_selected
}

In [None]:
# Raw monthly comment counts, one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('#comments posted')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    counts = n_comments_pm[cat]
    plt.plot(labels, counts, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Monthly comment counts after exponential averaging (exp_avg default alpha),
# one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('#comments posted')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    smoothed = exp_avg(n_comments_pm[cat])
    plt.plot(labels, smoothed, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Percentage of ad hominem comments (score below 0.5) per month and category.
# Months without any comment contribute 0.
p_ahcomments_pm = dict()
for cat in categories_selected:
    monthly = list()
    for label in labels:
        bucket = data_pm[label][cat]
        if bucket:
            ah = sum(1 for c in bucket if c['score'] < 0.5)
            monthly.append(ah * 100 / len(bucket))
        else:
            monthly.append(0)
    p_ahcomments_pm[cat] = monthly

In [None]:
# Monthly percentage of ad hominem comments, one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('% ah comments posted')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    percentages = p_ahcomments_pm[cat]
    plt.plot(labels, percentages, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Monthly percentage of ad hominem comments after exponential averaging with
# alpha = 0.25, one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('% ah comments posted')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    smoothed = exp_avg(p_ahcomments_pm[cat], 0.25)
    plt.plot(labels, smoothed, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Per month and category: percentage of active authors whose median comment
# score in that month is below 0.5. Months without any author contribute 0.
p_ahusers_pm_1 = dict()
for cat in categories_selected:
    p_ahusers_pm_1[cat] = []
    for label in labels:
        # Scores of this month's comments, grouped by author. setdefault
        # replaces a bare `except:` that would also have caught unrelated
        # errors such as KeyboardInterrupt.
        authors = dict()
        for comment in data_pm[label][cat]:
            authors.setdefault(comment['author'], []).append(comment['score'])
        val = 0
        if authors:
            ah = sum(1 for scores in authors.values() if np.median(scores) < 0.5)
            val = ah * 100 / len(authors)
        p_ahusers_pm_1[cat].append(val)

In [None]:
# Monthly percentage of authors whose median score is below 0.5,
# one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('% ah users (> 50%)')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    percentages = p_ahusers_pm_1[cat]
    plt.plot(labels, percentages, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Per month and category: percentage of active authors who posted at least one
# comment with a score below 0.5. Months without any author contribute 0.
p_ahusers_pm_2 = dict()
for cat in categories_selected:
    p_ahusers_pm_2[cat] = list()
    for label in labels:
        # Scores of this month's comments, grouped by author. setdefault
        # replaces a bare `except:` that would also have caught unrelated
        # errors such as KeyboardInterrupt.
        authors = dict()
        for comment in data_pm[label][cat]:
            authors.setdefault(comment['author'], []).append(comment['score'])
        val = 0
        if authors:
            # "Lowest score below 0.5" is the same as "any score below 0.5";
            # no copy or sort of the score list is needed.
            ah = sum(
                1 for scores in authors.values()
                if any(score < 0.5 for score in scores)
            )
            val = ah * 100 / len(authors)
        p_ahusers_pm_2[cat].append(val)

In [None]:
# Monthly percentage of authors with at least one comment scored below 0.5,
# one line per category.
plt.figure(figsize=(36, 12))
plt.ylabel('% ah users (at least once)')
plt.xlabel('month')
plt.xticks(rotation=90)
for cat, legend_name in zip(categories_selected, categories_labels):
    percentages = p_ahusers_pm_2[cat]
    plt.plot(labels, percentages, label=legend_name)
plt.legend()
plt.show()

In [None]:
# Stack the four monthly series of every category into one matrix with one
# row per month and four consecutive columns per category (comment count,
# % ad hominem comments, % authors by median, % authors with at least one).
columns = []
for cat in categories_selected:
    for series in (n_comments_pm, p_ahcomments_pm, p_ahusers_pm_1, p_ahusers_pm_2):
        columns.append(np.array(series[cat]))

signal = np.array(columns).T
print(signal.shape)

# Change-point detection

In [None]:
!pip install ruptures

In [None]:
import ruptures as rpt

# bkps is passed to rpt.display as the reference change points next to the
# detected ones; it is left empty here.
# NOTE(review): the inline note below presumably names the expected change
# points (end of 2016 and end of 2019) - confirm.
bkps = [] # 2016-end, 2019-end

## Dynamic programming algorithm

### RBF kernel cost function

In [None]:
# Dynamic-programming segmentation of the multivariate signal with the "rbf"
# cost model, asking for exactly two change points.
algo = rpt.Dynp(model="rbf")
algo.fit(signal)
result = algo.predict(n_bkps=2)

# Plot the signal with the reference (bkps) and detected (result) change
# points, then print the detected indices.
rpt.display(signal, bkps, result)
plt.show()
print(result)

In [None]:
# Month labels at indices 110 and 140.
# NOTE(review): these indices are hard-coded, presumably copied from the
# detector output printed by the previous cell - update them if that changes.
print(*(labels[idx] for idx in (110, 140)))