# Grouping Create Debate Comments for Google Form Generation for Annotation Task on Prolific
* __Objective__: Group the comments of the CreateDebate corpus into balanced sets of 20 comments (10 ad-hominem + 10 none), with small, medium and large comments in a 50:30:20 ratio, respectively.
* __File Management__: Using Google Drive
* __Runtime Type__: CPU

## Mounting Google Drive and loading data

In [None]:
# Mount Google Drive under /content/gdrive; the data paths used in the cells
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Fetch the CreateDebate scraper repository (presumably the source of the
# `thread` module imported further down -- confirm against the repository).
!git clone https://github.com/utkarsh512/CreateDebate-Scraper.git

In [None]:
# Work from the repository's src/nested directory so that the later
# `from thread import ...` presumably resolves to the scraper's module.
%cd CreateDebate-Scraper/src/nested/

In [None]:
import numpy as np
import re
import pickle
from thread import Thread, Comment

In [None]:
# Path template for the pickled artefacts of the Politics corpus on Drive;
# the placeholder receives the artefact name.
dir = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/{}.log'
comments_with_score = []

# Later cells index each entry as entry[0] (score) and entry[1][0] (text).
with open(dir.format('comments_with_score'), mode='rb') as score_file:
    comments_with_score = pickle.load(score_file)

In [None]:
# Load every object pickled back-to-back in threads.log into `threads`.
reader_addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/threads.log'
threads = []

# The context manager closes the file on every exit path.
with open(reader_addr, 'rb') as reader:
    while True:
        try:
            e = pickle.load(reader)
        except EOFError:
            # pickle.load raises EOFError once the stream is exhausted, which
            # is the normal end of the loop.  Any other error (for example a
            # corrupt record) now propagates instead of silently leaving
            # `threads` truncated.
            break
        threads.append(e)

In [None]:
# Index every comment by its author:
#   authors[author] -> list of (comment body, thread url, thread index)
# where the thread index is the position of the thread inside `threads`.
authors = dict()

tot_comment_cnt = 0  # number of comments seen across all threads

idx = -1  # remains -1 when `threads` is empty, matching the previous behaviour

for idx, thread in enumerate(threads):
    for key in thread.comments.keys():
        tot_comment_cnt += 1
        comment = thread.comments[key]
        cur_text = comment.body
        cur_author = comment.author
        url = thread.url
        # setdefault creates the author's list on first sight; this replaces
        # the former bare `except:` that doubled as a "key missing" test.
        authors.setdefault(cur_author, []).append((cur_text, url, idx))

In [None]:
# Flatten the per-author index into one list of (url, [text], thread index)
# tuples, walking authors in the dictionary's iteration order.
cur_author_cnt = 0
cur_comment_cnt = 0
tot_author_cnt = len(authors)

comments_with_url = []

for author, records in authors.items():
    cur_author_cnt += 1
    for body, url, idx in records:
        cur_comment_cnt += 1
        # The comment body is wrapped in a one-element list.
        text = [body]
        comments_with_url.append((url, text, idx))

In [None]:
# Spot check: print the text stored at one random position in both lists so
# they can be compared by eye.
idx = np.random.randint(len(comments_with_score))
for source in (comments_with_score, comments_with_url):
    print(source[idx][1][0])

In [None]:
# Merge the two lists position by position into
# (score, text, url, thread index) tuples.
v = []
for i, scored in enumerate(comments_with_score):
    linked = comments_with_url[i]
    v.append((scored[0], scored[1], linked[0], linked[2]))

## Extracting top 1000 ad-hominem and none comments

In [None]:
# Sort ascending on the whole (score, text, url, idx) tuple, so ties on score
# are broken by the remaining fields.
v = sorted(v)

top_k = 1000  # number of comments kept per class

# With fewer than 2 * top_k entries the two selections would overlap (or, below
# top_k, be incomplete), putting the same comment in both classes.
if len(v) < 2 * top_k:
    raise ValueError(
        f'need at least {2 * top_k} scored comments to pick two disjoint '
        f'sets of {top_k}, got {len(v)}'
    )

# The lowest-scored entries form the ad-hominem set.
top_ah_comments = v[:top_k]
# The highest-scored entries form the none set, highest score first.
top_none_comments = v[-top_k:][::-1]

In [None]:
# Shuffle both top lists in place.  Each gets a fresh generator seeded with 42,
# so the result is reproducible across runs.
for bucket in (top_ah_comments, top_none_comments):
    np.random.RandomState(seed=42).shuffle(bucket)

## Dividing the comments into small, medium and large categories

In [None]:
# Empty length buckets, one triple per class; each name gets its own list.
small_ah_comments, medium_ah_comments, large_ah_comments = [], [], []

small_none_comments, medium_none_comments, large_none_comments = [], [], []

In [None]:
# Bucket the ad-hominem comments by whitespace-separated word count:
# fewer than 38 words -> small, 80 or more -> large, otherwise medium.
for entry in top_ah_comments:
    word_count = len(entry[1][0].strip().split())
    if word_count >= 80:
        large_ah_comments.append(entry)
    elif word_count >= 38:
        medium_ah_comments.append(entry)
    else:
        small_ah_comments.append(entry)

In [None]:
# Bucket the none comments by whitespace-separated word count:
# fewer than 34 words -> small, 72 or more -> large, otherwise medium.
for entry in top_none_comments:
    word_count = len(entry[1][0].strip().split())
    if word_count >= 72:
        large_none_comments.append(entry)
    elif word_count >= 34:
        medium_none_comments.append(entry)
    else:
        small_none_comments.append(entry)

In [None]:
# A group takes 5 small, 3 medium and 2 large comments from each class, so the
# scarcest bucket (relative to its per-group demand) bounds the group count.
per_group_demand = (
    (small_ah_comments, 5),
    (small_none_comments, 5),
    (large_ah_comments, 2),
    (large_none_comments, 2),
    (medium_ah_comments, 3),
    (medium_none_comments, 3),
)
total_groups_possible = min(len(bucket) // need for bucket, need in per_group_demand)
print(total_groups_possible)

## Making balanced groups with 20 comments each

In [None]:
# Output list of groups, plus one read cursor per (length, class) bucket.
groups = []

small_ah_count = small_none_count = 0
medium_ah_count = medium_none_count = 0
large_ah_count = large_none_count = 0

In [None]:
# Build groups of 20 comments: per class 5 small, 3 medium and 2 large, taken
# in order from the buckets via the running *_count cursors.
#
# At most 96 groups are built, and never more than the buckets can fill, so an
# under-filled bucket no longer raises IndexError part-way through.
num_groups = min(96, total_groups_possible)

for i in range(num_groups):
    group = []
    for j in range(5):
        group.append(small_ah_comments[small_ah_count])
        small_ah_count += 1
        group.append(small_none_comments[small_none_count])
        small_none_count += 1
    for j in range(3):
        group.append(medium_ah_comments[medium_ah_count])
        medium_ah_count += 1
        group.append(medium_none_comments[medium_none_count])
        medium_none_count += 1
    for j in range(2):
        group.append(large_ah_comments[large_ah_count])
        large_ah_count += 1
        group.append(large_none_comments[large_none_count])
        large_none_count += 1
    # NOTE(review): a fresh generator with seed 42 is created for every group,
    # and every group is assembled in the same class/length order, so each
    # group receives the same permutation.  The class and length at a given
    # position are therefore identical across groups.  Kept as is so that
    # previously generated groups stay reproducible -- confirm this is intended.
    np.random.RandomState(seed=42).shuffle(group)
    groups.append(group)

# Shuffle the order of the groups themselves (again reproducibly).
np.random.RandomState(seed=42).shuffle(groups)

In [None]:
delimiter = '@#@#@'      # placed between consecutive comment entries
delimiter2 = '##$$##@@'  # placed between a comment's text and its context URL

# Each group is balanced class-wise as well as length-wise and has 20 comments

low, high = 0, 1  # index of groups which will be used in the form

# URL template of the per-comment context page; the placeholder receives the
# running comment counter.
addr = 'https://utkarsh512.github.io/pages/staticPages/comment{}.txt'

ctr = 0

with open('/content/gdrive/MyDrive/DL/CreateDebate/Politics/CommentsForGoogleForm.txt', 'w', encoding='utf-8') as f:
    content = []
    for i in range(low, high):
        content_ = []
        for x in groups[i]:
            # One entry per comment: stripped text, delimiter2, context URL.
            content_.append(f'{x[1][0].strip()}{delimiter2}{addr.format(ctr)}')
            ctr += 1
        content.append(delimiter.join(content_))
    content = delimiter.join(content)
    # Collapse every whitespace run (including newlines inside comments) into
    # a single space so the output is one line.  The pattern is a raw string
    # so that `\s` is not treated as an (invalid) string escape.
    content = re.sub(r"\s+", " ", content)
    f.write(content)
    print(content)

In [None]:
# Constructing static webpages for comments for context
import os

addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/staticPages/comment{}.txt'

# Create the output directory up front; opening a file for writing does not
# create missing parent directories.
os.makedirs(os.path.dirname(addr), exist_ok=True)

# The counter restarts at 0 and the groups are walked in the same order as when
# the form file was written, so comment{ctr}.txt lines up with the URL emitted
# for the same comment there.
ctr = 0

for i in range(low, high):
    for x in groups[i]:
        with open(addr.format(ctr), 'w', encoding='utf-8') as f:
            # x[3] is the index of the comment's thread inside `threads`;
            # the page content is that thread's string form.
            f.write(str(threads[x[3]]))
        ctr += 1

## Uploading these comments on Google Form
* Download the generated `.txt` file (`CommentsForGoogleForm.txt`) and upload it to the [website repository](https://github.com/utkarsh512/utkarsh512.github.io/tree/master/docs/dataset).
* Then, visit the [autoform utility](https://github.com/utkarsh512/Hate-Speech/tree/main/utils/autoform).

In [None]:
punyajoysaha1998@gmail.com 
mithun.rcciit@gmail.com