# Comment Annotation on Prolific
* __Objective__: Create mini-batches (of 20) of comments to be used for annotation on Prolific
* __File Management__: Google Drive
* __Runtime Type__: GPU

## Training BERT Model in PyTorch

In [None]:
# Shell escape: list the GPUs visible to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; every dataset path used later lives below it.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
import os

def read_split(dir):
    """Read a two-column ``label,text`` file into parallel lists.

    The first line is treated as a header and skipped. Every other line is
    split on its *first* comma only, so commas inside the text column no
    longer make the row unreadable. Blank lines are ignored. Quoted CSV
    fields are not interpreted; the text is taken verbatim.

    Args:
        dir: Path of the file to read. (The name shadows the ``dir`` builtin
            but is kept so existing callers keep working.)

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists, where
        ``labels[i]`` is 1 if the label column equals ``'AH'`` and 0
        otherwise.

    Raises:
        ValueError: If a non-blank data line contains no comma.
    """
    texts = []
    labels = []
    with open(dir, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line_no, raw in enumerate(f, start=2):
            row = raw.strip()
            if not row:
                continue
            label, sep, text = row.partition(',')
            if not sep:
                raise ValueError(
                    f"{dir}:{line_no}: expected 'label,text', got {row!r}"
                )
            texts.append(text.strip())
            labels.append(1 if label.strip() == 'AH' else 0)
    return texts, labels

In [None]:
# Load both splits from Drive; each call yields a list of texts and a parallel list of 0/1 labels.
train_texts, train_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv')
test_texts, test_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv')

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Pretrained tokenizer matching the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize both splits: longer inputs are truncated and shorter ones padded,
# so every encoded sequence has exactly max_seq_length positions.
max_seq_length = 64
train_encodings = tokenizer(train_texts, truncation=True, max_length=max_seq_length, padding="max_length")
test_encodings = tokenizer(test_texts, truncation=True, max_length=max_seq_length, padding="max_length")

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` holds one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` tensor."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the encoded splits so a DataLoader can batch them.
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Load pretrained BERT with a sequence-classification head, move it to the
# selected device and put it in training mode.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.train()

In [None]:
# Shuffled mini-batches of 64 training examples.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# NOTE(review): `AdamW` is imported from `transformers` in this notebook; newer
# releases deprecate it in favour of `torch.optim.AdamW` — confirm against the
# installed version before re-running.
optim = AdamW(model.parameters(), lr=5e-5)

In [None]:
from tqdm import tqdm

In [None]:
# Fine-tune for 4 passes over the training data.
for epoch in range(4):
    for batch in tqdm(train_loader):
        optim.zero_grad()  # clear gradients left over from the previous step
        # Passing `labels` makes the model compute the loss itself; it is the
        # first element of the returned outputs.
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs[0]
        loss.backward()
        optim.step()

In [None]:
import numpy as np
!pip install datasets
from datasets import load_metric

In [None]:
# Accuracy of the fine-tuned classifier on the test split.
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
metric= load_metric("accuracy")
model.eval()  # evaluation mode for inference
eval_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
for batch in eval_loader:
    # Move every tensor in the batch (inputs and labels) to the model's device.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():  # gradients are not needed for evaluation
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class is the index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell: the notebook displays the aggregated metric.
metric.compute()

In [None]:
import os

# Saving best practice: if the default file names are used for the model, it can be reloaded with from_pretrained()

output_dir = './utkbert/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using `save_pretrained()`.
# They can then be reloaded from `output_dir` using `from_pretrained()`.
model_to_save = model.module if hasattr(model, 'module') else model  # Unwrap the model if a parallel wrapper exposes it as `.module`
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Extracting Attention Scores

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
# Reload the checkpoint saved under 'utkbert' as a plain BertModel that also
# returns attention weights (output_attentions=True).
# NOTE: this rebinds the notebook-level names `model` and `tokenizer`.
model_version = 'utkbert'
do_lower_case = True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [None]:
def attention_scores(text, layers=None, heads=None):
    """Average the attention the first token pays to every token of ``text``.

    The text is encoded with the notebook-level ``tokenizer`` (special tokens
    included) and run through the notebook-level ``model``, which must return
    attention weights. For each token ``j`` the weight at query position 0
    (the start tag added by the tokenizer) and key position ``j`` is averaged
    over the selected layers and heads.

    Changes compared with the earlier version:
      * the default layer/head ranges are read from the returned attention
        tensors instead of being hard-coded to 12;
      * the forward pass runs under ``torch.no_grad()``;
      * ``truncation=True`` is passed so inputs longer than the tokenizer's
        configured maximum are cut instead of overflowing the model;
      * weights are read one row at a time rather than one ``.item()`` call
        per element (same values, same summation order).

    Args:
        text: The sentence or comment to analyse.
        layers: Layer indices to average over; all layers when ``None``.
        heads: Head indices to average over; all heads when ``None``.

    Returns:
        ``(tokens, matrix)`` where ``tokens`` is the list of token strings
        (including the special tokens) and ``matrix[j]`` is the averaged
        attention weight for ``tokens[j]``.
    """
    inputs = tokenizer(text, return_tensors='pt', add_special_tokens=True, truncation=True)
    input_ids = inputs['input_ids']
    with torch.no_grad():
        # With output_attentions=True the attentions are the last output element.
        attention = model(input_ids)[-1]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())  # batch index 0

    if layers is None:
        layers = list(range(len(attention)))
    if heads is None:
        # attention[layer] is indexed as [batch, head, query, key].
        heads = list(range(attention[0].shape[1]))

    matrix = [0.0] * len(tokens)
    for layer in layers:
        first_token_rows = attention[layer][0, :, 0, :]
        for head in heads:
            for j, weight in enumerate(first_token_rows[head].tolist()):
                matrix[j] += weight

    denom = len(layers) * len(heads)
    matrix = [total / denom for total in matrix]
    return (tokens, matrix)

## Loading CreateDebate Comments

In [None]:
# Fetch the scraper repository; a later cell imports its `thread` module.
!git clone https://github.com/utkarsh512/CreateDebate-Scraper.git

In [None]:
# Change the working directory into the cloned repository so `thread` is importable.
%cd CreateDebate-Scraper/src/nested/

In [None]:
import re
import pickle
from thread import Thread, Comment
import numpy as np
import pandas as pd

In [None]:
# Path template for files of the Politics dump (NOTE: the name `dir` shadows the builtin).
dir = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/{}.log'
comments_with_score = list()  # placeholder, replaced by the unpickled object below

# NOTE(review): unpickling can execute arbitrary code — only load dumps you trust.
# Later cells index entries as entry[0] (score) and entry[1][0] (comment text).
with open(dir.format('comments_with_score'), 'rb') as f:
    comments_with_score = pickle.load(f)

In [None]:
reader_addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/threads.log'
threads = list()

# threads.log is read as a sequence of consecutively pickled thread objects;
# keep unpickling until the end of the file is reached. Only EOFError ends
# the loop, so a corrupt or unreadable record is reported instead of silently
# cutting the list short.
# NOTE(review): unpickling can execute arbitrary code — only load dumps you trust.
with open(reader_addr, 'rb') as reader:
    while True:
        try:
            threads.append(pickle.load(reader))
        except EOFError:
            break

# Group every comment by author as (text, thread url, thread index) tuples.
# A dict keeps insertion order, which fixes the order of `comments_with_url`.
authors = dict()
tot_comment_cnt = 0
idx = -1  # stays -1 when no thread was loaded

for idx, thread in enumerate(threads):
    for key in thread.comments.keys():
        tot_comment_cnt += 1
        comment = thread.comments[key]
        authors.setdefault(comment.author, []).append((comment.body, thread.url, idx))

# Flatten the per-author lists into (url, [text], thread index) records, in
# author order and then comment order. A later cell pairs this list with
# `comments_with_score` by position, so the order must not change.
cur_author_cnt = 0
cur_comment_cnt = 0
tot_author_cnt = len(authors)
comments_with_url = list()

for author_comments in authors.values():
    cur_author_cnt += 1
    for cur_text, url, idx in author_comments:
        cur_comment_cnt += 1
        comments_with_url.append((url, [cur_text], idx))

In [None]:
# Spot check: print the comment text stored at one random position in each list.
# The two outputs are expected to be identical if the lists are aligned.
idx = np.random.randint(len(comments_with_score))
print(comments_with_score[idx][1][0])
print(comments_with_url[idx][1][0])

In [None]:
# Pair the two lists position by position into (score, text, url, thread index) records.
v = [
    (scored[0], scored[1], comments_with_url[pos][0], comments_with_url[pos][2])
    for pos, scored in enumerate(comments_with_score)
]

In [None]:
# Sort ascending by score, then take 1000 records from each end: the lowest
# scores go to `top_ah_comments`, the highest (from the end backwards) to
# `top_none_comments`.
v = sorted(v)
top_ah_comments = [v[rank] for rank in range(1000)]
top_none_comments = [v[-(rank + 1)] for rank in range(1000)]

In [None]:
# Shuffle both selections in place. Each call builds a fresh RandomState with
# seed 42, so the result is reproducible across runs.
np.random.RandomState(seed=42).shuffle(top_ah_comments)
np.random.RandomState(seed=42).shuffle(top_none_comments)

In [None]:
# Split each class into small / medium / large buckets by whitespace word
# count. The cut-offs differ per class: 38 and 80 for the first, 34 and 72
# for the second.
small_ah_comments, medium_ah_comments, large_ah_comments = [], [], []
small_none_comments, medium_none_comments, large_none_comments = [], [], []

for x in top_ah_comments:
    c = len(x[1][0].strip().split())
    if c >= 80:
        large_ah_comments.append(x)
    elif c >= 38:
        medium_ah_comments.append(x)
    else:
        small_ah_comments.append(x)

for x in top_none_comments:
    c = len(x[1][0].strip().split())
    if c >= 72:
        large_none_comments.append(x)
    elif c >= 34:
        medium_none_comments.append(x)
    else:
        small_none_comments.append(x)

# A group takes 5 small, 3 medium and 2 large comments from each class, so
# the scarcest bucket limits how many groups can be formed.
total_groups_possible = min(
    len(small_ah_comments) // 5,
    len(small_none_comments) // 5,
    len(large_ah_comments) // 2,
    len(large_none_comments) // 2,
    len(medium_ah_comments) // 3,
    len(medium_none_comments) // 3,
)
print(total_groups_possible)

In [None]:
# Build the annotation groups. Each group draws, from each of the two classes,
# 5 small, 3 medium and 2 large comments (20 comments per group).
groups = []

# Read cursors into the six buckets; every comment is used at most once.
small_ah_count = 0
small_none_count = 0
medium_ah_count = 0
medium_none_count = 0
large_ah_count = 0
large_none_count = 0

# NOTE(review): 96 is hard-coded — presumably the `total_groups_possible` value
# printed by the previous cell; a larger number would run past a bucket's end.
for i in range(96):
    group = []
    for j in range(5):
        group.append(small_ah_comments[small_ah_count])
        small_ah_count += 1
        group.append(small_none_comments[small_none_count])
        small_none_count += 1
    for j in range(3):
        group.append(medium_ah_comments[medium_ah_count])
        medium_ah_count += 1
        group.append(medium_none_comments[medium_none_count])
        medium_none_count += 1
    for j in range(2):
        group.append(large_ah_comments[large_ah_count])
        large_ah_count += 1
        group.append(large_none_comments[large_none_count])
        large_none_count += 1  
    # A fresh RandomState(42) per group: every group is reordered by the same permutation.
    np.random.RandomState(seed=42).shuffle(group)
    groups.append(group)
    
# Reproducible shuffle of the group order; later cells address groups by index.
np.random.RandomState(seed=42).shuffle(groups)

In [None]:
# `delimiter` separates comment entries; `delimiter2` separates a comment's
# text from its context-page URL inside one entry.
delimiter = '@#@#@'
delimiter2 = '##$$##@@'

# Each group is balanced class-wise as well as length-wise and has 20 comments

low, high = 0, 1  # index range [low, high) of the groups used in the form
group_id = 5

# URL template, filled with (group_id, running comment number).
addr = 'https://utkarsh512.github.io/pages/createdebate_{}/comment_{}.txt'

ctr = 0

# Build the whole payload before opening the output file, so a failure while
# formatting does not leave a truncated file behind.
content = []
for i in range(low, high):
    content_ = []
    for x in groups[i]:
        ctr += 1  # comment numbers start at 1
        content_.append(f'{x[1][0].strip()}{delimiter2}{addr.format(group_id, ctr)}')
    content.append(delimiter.join(content_))
content = delimiter.join(content)
# Collapse every whitespace run (including newlines inside comments) to a
# single space. The pattern is a raw string so `\s` is not treated as an
# invalid string escape.
content = re.sub(r"\s+", " ", content)

with open('/content/gdrive/MyDrive/DL/CreateDebate/Politics/CommentsForGoogleForm.txt', 'w', encoding='utf-8') as f:
    f.write(content)
print(content)

In [None]:
# Write one context file per selected comment, containing the string form of
# the thread the comment belongs to. Files are numbered from 1 in iteration order.

addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/staticPages/comment_{}.txt'

ctr = 0

for i in range(low, high):
    for x in groups[i]:
        with open(addr.format(ctr + 1), 'w', encoding='utf-8') as f:
            # x[3] is the thread index stored in the comment record.
            f.write(str(threads[x[3]]))
            ctr += 1

In [None]:
def clean_array(w, a):
    """Merge ``##``-prefixed continuation pieces back into whole words.

    A piece starting with ``'##'`` is appended (without the marker) to the
    previous word, and that word's score becomes the mean of its current
    score and the piece's score. This is a running pairwise mean, so later
    pieces weigh more than earlier ones. Any other piece starts a new word.

    A continuation piece with no preceding word (for example when it is the
    very first element) used to raise ``IndexError``; it now starts a new
    word with the marker stripped.

    Args:
        w: Sequence of token strings.
        a: Sequence of scores, paired positionally with ``w``.

    Returns:
        ``(W, A)``: the merged words and their scores, as two lists.
    """
    W = []
    A = []
    for piece, score in zip(w, a):
        is_continuation = piece.startswith('##')
        if is_continuation and W:
            W[-1] += piece[2:]
            A[-1] = (A[-1] + score) / 2
        else:
            W.append(piece[2:] if is_continuation else piece)
            A.append(score)
    return (W, A)

In [None]:
def sanitize(x):
    """Lower-case ``x``, collapse whitespace and drop punctuation.

    Every run of whitespace becomes a single space, then every character
    that is neither alphanumeric nor a space is removed. Leading and trailing
    spaces are kept.

    The pattern is written as a raw string: ``"\\s+"`` in a normal string
    literal is an invalid escape sequence that newer Python versions warn
    about.

    Args:
        x: Input text.

    Returns:
        The normalised text.
    """
    x = re.sub(r"\s+", " ", x.lower())
    return ''.join(ch for ch in x if ch.isalnum() or ch == ' ')

In [None]:
# Separators for the trigram export: `delim3` joins the snippets of one comment,
# `delim4` joins the per-comment entries.
delim3 = '$#$#$#$#$#$#@@@@'
delim4 = '@#$$#@@#@@#'

def top_three_tokens(text):
    """Return up to three context snippets around the most-attended words.

    The text is sanitised, scored with ``attention_scores``, stripped of the
    start and end tags, and merged back into whole words with
    ``clean_array``. Words are then visited from highest to lowest score, and
    a word is kept only if its position is more than 2 away from every word
    already kept, so the snippets never overlap.

    Each snippet is the word with its left and right neighbours (where they
    exist), with ``'...'`` on a side where more text lies beyond the neighbour.

    The earlier version indexed past the end of the ranking when fewer than
    three sufficiently separated words existed (empty or short comments) and
    raised ``IndexError``. Now the function returns as many snippets as can
    be formed.

    Args:
        text: Raw comment text.

    Returns:
        A list of 0 to 3 snippet strings, highest-scoring word first.
    """
    text = sanitize(text)
    words, attentions = attention_scores(text)
    words = words[1:-1]  # Remove start and end tags
    attentions = attentions[1:-1]
    words, attentions = clean_array(words, attentions)

    # Highest score first; ties fall back to the larger position, as before.
    ranked = sorted(
        ((score, pos) for pos, score in enumerate(attentions)),
        reverse=True,
    )

    chosen = []
    for _, pos in ranked:
        if all(abs(pos - kept) > 2 for kept in chosen):
            chosen.append(pos)
            if len(chosen) == 3:
                break

    last = len(words) - 1
    top_three_tkns = list()
    for idx in chosen:
        cur = ''
        if idx > 1:
            cur += '...'
        if idx != 0:
            cur += words[idx - 1] + ' '
        cur += words[idx]
        if idx != last:
            cur += ' ' + words[idx + 1]
        if idx < last - 1:
            cur += '...'
        top_three_tkns.append(cur)
    return top_three_tkns

In [None]:
# Inspect the text of the comment at position 9 of group 0.
groups[0][9][1][0]

In [None]:
# Try the snippet extraction on the comment at position 4 of group 0.
top_three_tokens(groups[0][4][1][0])

In [None]:
# Export the attention snippets of every selected comment to a single text file.

addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/staticPages/trigrams.txt'

content = []
for i in range(low, high):
    for x in groups[i]:
        trigrams = top_three_tokens(x[1][0])
        # Unseeded shuffle: the snippet order differs from run to run.
        np.random.shuffle(trigrams)
        content_ = delim3.join(trigrams)
        print(content_)
        content.append(content_)
content = delim4.join(content)
with open(addr, 'w', encoding='utf-8') as f:
    f.write(content)

## Annotation Analysis (Accuracy Metrics)

In [None]:
# The 20 question columns of the annotation export: the bare question text
# followed by the same text with suffixes '.1' ... '.19'.
cols = ['Do you think this is an ad-hominem comment?']
cols.extend(f'{cols[0]}.{suffix}' for suffix in range(1, 20))

In [None]:
# Model-side labels for the comments of groups [low, high): a score below 0.5
# maps to label 1, anything else to 0.
low = 1
high = 6

scores = [x[0] for i in range(low, high) for x in groups[i]]
y_pred = np.array([1 if s < 0.5 else 0 for s in scores])

In [None]:
# Path template of the per-batch annotation CSV files; `{}` is the batch number.
addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/annotations/batch{}.csv'

In [None]:
# Annotator-side labels for batches 1-5: for every question column the most
# frequent answer is taken, and 'Yes' maps to 1, anything else to 0.
y = []
for batch_no in range(1, 6):
    df = pd.read_csv(addr.format(batch_no))
    y.extend(
        1 if df[question].value_counts().idxmax() == 'Yes' else 0
        for question in cols
    )
y = np.array(y)

In [None]:
# Sanity check: number of annotator-derived labels.
y.shape

In [None]:
# Sanity check: number of model-derived labels (must equal y.shape for the metrics below).
y_pred.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Accuracy of the model-derived labels, with the annotator majority as ground truth.
accuracy_score(y, y_pred)

In [None]:
# Precision for label 1, with the annotator majority as ground truth.
precision_score(y, y_pred)

In [None]:
# Recall for label 1, with the annotator majority as ground truth.
recall_score(y, y_pred)

In [None]:
# F1 score for label 1, with the annotator majority as ground truth.
f1_score(y, y_pred)

## Annotation Analysis (Annotator Agreement)

In [None]:
from sklearn.metrics import cohen_kappa_score

# Containers meant to hold the agreement (kappa) and accuracy results.
kappa = []
acc = []

In [None]:
# Agreement between the model-derived labels and each individual response,
# batch by batch (batches 0-5).
#
# The loop uses its own variable names for two reasons:
#   * the batch index must still be intact when `ann_count` is chosen
#     (reusing one index for the inner loops made it always truthy, so batch 0
#     was scored with 3 responses instead of 5);
#   * the notebook-level `low`, `high`, `scores` and `y_pred` used by other
#     cells are no longer overwritten.
# The per-batch results are appended to `kappa` and `acc`, which previously
# stayed empty.
for batch_idx in range(6):
    df = pd.read_csv(addr.format(batch_idx))
    # One row per response, one column per question; 'Yes' -> 1, anything else -> 0.
    ann = (df[cols] == 'Yes').astype(int).to_numpy()

    # Model-derived label per comment of this batch: score below 0.5 -> 1.
    batch_pred = np.array([1 if x[0] < 0.5 else 0 for x in groups[batch_idx]])

    # Batch 0 is scored with 5 responses, every other batch with 3; never
    # index past the rows that are actually present in the file.
    ann_count = min(3 if batch_idx else 5, len(ann))

    kappa_scores = []
    accuracy_scores = []
    for row in range(ann_count):
        kappa_scores.append(cohen_kappa_score(batch_pred, ann[row]))
        accuracy_scores.append(accuracy_score(batch_pred, ann[row]))

    kappa.append(kappa_scores)
    acc.append(accuracy_scores)

In [None]:
# Print every entry stored in `kappa`, one per line.
for z in kappa:
    print(z)

## Annotation Analysis (Common Annotators)

In [None]:
# One set of Prolific IDs per annotation batch (batches 1-4).
annotators = [
    set(
        pd.read_csv(addr.format(batch_no))['Enter your Prolific ID']
        .to_numpy()
        .squeeze()
        .tolist()
    )
    for batch_no in range(1, 5)
]

In [None]:
# Show the ID sets collected for each batch.
print(annotators)

In [None]:
# 4x4 matrix for the pairwise overlap between the four ID sets.
comm = np.zeros((4, 4))

In [None]:
# comm[i, j] = number of IDs present in both set i and set j
# (the diagonal therefore holds each set's size).
for i in range(4):
    for j in range(4):
        comm[i, j] = len(annotators[i].intersection(annotators[j]))

In [None]:
# Display the overlap matrix.
comm

In [None]:
# Print the overlap matrix row by row as plain integers.
for z in comm.astype(int).tolist():
    print(z)