# Facebook Comment Annotation on Prolific
* __Objective__: Create mini-batches (of 20) of comments to be used for annotation on Prolific
* __File Management__: Google Drive
* __Runtime Type__: GPU

In [None]:
# Mount Google Drive inside the Colab runtime so the /content/gdrive paths used below resolve.
from google.colab import drive
# force_remount=True is passed so the mount is redone even if one already exists.
drive.mount('/content/gdrive', force_remount=True)

## Training BERT Model in PyTorch

In [None]:
# Shell escape: print the status of the GPU attached to this runtime.
!nvidia-smi

In [None]:
import os

def read_split(dir):
    """Read a two-column ``label,text`` CSV split into parallel lists.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is split on its FIRST comma only, so commas that
    occur inside the text column no longer abort the load.

    Args:
        dir: Path of the CSV file. (The name shadows the ``dir`` builtin but
            is kept so existing callers are unaffected.)

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels[i]`` is 1
        when the label column equals ``'AH'`` and 0 for any other label.

    Raises:
        ValueError: If a non-blank data line contains no comma at all.
    """
    texts = []
    labels = []
    with open(dir, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row (harmless on an empty file)
        for line_no, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            label, sep, text = line.partition(",")
            if not sep:
                # Explicit error instead of `assert`, which is stripped under -O.
                raise ValueError(
                    f"{dir}: line {line_no} is not of the form 'label,text'"
                )
            texts.append(text.strip())
            labels.append(1 if label.strip() == 'AH' else 0)
    return texts, labels

In [None]:
# Load the training and test splits; each call returns a (texts, labels) pair of lists.
train_texts, train_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv')
test_texts, test_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv')

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Every text is truncated to, and padded up to, this many tokens.
max_seq_length = 64
# Tokenize both splits with identical settings so train and test inputs have the same length.
train_encodings = tokenizer(train_texts, truncation=True, max_length=max_seq_length, padding="max_length")
test_encodings = tokenizer(test_texts, truncation=True, max_length=max_seq_length, padding="max_length")

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label list defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels so a DataLoader can batch them.
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the pretrained 'bert-base-uncased' weights into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Move the parameters to the selected device and switch to training mode.
model.to(device)
model.train()

In [None]:
# Shuffled mini-batches of 64 training examples.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# AdamW over all model parameters with learning rate 5e-5.
optim = AdamW(model.parameters(), lr=5e-5)

In [None]:
from tqdm import tqdm

In [None]:
# Fine-tune for four passes over the training data.
for epoch in range(4):
    for batch in tqdm(train_loader):
        # Only these three tensors are passed to the model; move them to the device first.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        optim.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # The first element of the model output is used as the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

In [None]:
import numpy as np
!pip install datasets
from datasets import load_metric

In [None]:
# Accuracy of the fine-tuned model on the test split.
metric = load_metric("accuracy")
eval_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model.eval()
for batch in eval_loader:
    # Move every tensor of the batch (inputs and labels) to the model's device.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # The predicted class is the index of the largest logit.
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
import os

# Saving best practice: if you use the default file names for the model, you can
# reload it using from_pretrained().

output_dir = './utkbert/'

# Create the output directory if needed. exist_ok=True replaces the separate
# "does it exist?" check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# If the model is wrapped (distributed/parallel training) the wrapped module is saved instead.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Extracting attention scores

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
# Directory written by the save step above ('./utkbert/').
model_version = 'utkbert'
do_lower_case = True
# Reload the fine-tuned weights as a plain BertModel; output_attentions=True asks the model
# to return its attention weights, which attention_scores() reads.
# NOTE: this rebinds the global `model` and `tokenizer` used during training.
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [None]:
def attention_scores(text, layers=None, heads=None):
    """Average the attention that position 0 pays to every token of ``text``.

    ``text`` is encoded as a single sequence with special tokens added, so
    position 0 is presumably the ``[CLS]`` token. For every selected
    (layer, head) pair the attention row of position 0 is accumulated, and the
    sum is divided by the number of pairs.

    Uses the module-level ``model`` (which must return attentions as the last
    element of its output) and ``tokenizer``.

    Args:
        text: The string to score.
        layers: Iterable of layer indices to average over. Defaults to every
            layer returned by the model (previously hard-coded to 12).
        heads: Iterable of head indices to average over. Defaults to every
            head of the first returned layer (previously hard-coded to 12).

    Returns:
        A ``(tokens, matrix)`` tuple: ``tokens`` is the list of token strings
        (special tokens included) and ``matrix[j]`` is the mean attention from
        position 0 to token ``j``.
    """
    import torch  # local import so the function works even if the cell importing torch was not run

    inputs = tokenizer.encode_plus(text, None, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        attention = model(input_ids)[-1]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())  # batch index 0

    # Derive the defaults from the model output instead of assuming 12 x 12.
    layers = list(range(len(attention))) if layers is None else list(layers)
    heads = list(range(attention[0].shape[1])) if heads is None else list(heads)

    matrix = [0.0] * len(tokens)
    for layer in layers:
        for head in heads:
            # Attention row of position 0 for this layer/head, fetched in one call
            # rather than one .item() per element.
            row = attention[layer][0, head, 0].tolist()
            for j, weight in enumerate(row):
                matrix[j] += weight

    pair_count = len(layers) * len(heads)
    matrix = [total / pair_count for total in matrix]
    return (tokens, matrix)

## Loading Facebook comments

In [None]:
import pickle
import matplotlib.pyplot as plt
import re

In [None]:
# Load the classified comments. The file is opened in a `with` block so the handle is closed
# after loading (the original left it open).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/content/gdrive/MyDrive/DL/Facebook/dataset/classified_comments.pkl', 'rb') as f:
    comments = pickle.load(f)

In [None]:
# Display the first record to inspect its fields (later cells read 'score', 'text', 'page' and 'link').
comments[0]

In [None]:
# Order the comments by their 'score' field, lowest first.
comments = sorted(comments, key = lambda x: x['score'])

In [None]:
top_ah_comments = []
top_none_comments = []

# Take the first 1000 entries of `comments` and the last 1000 (walking backwards from the end).
# With the list sorted ascending by 'score', those are the lowest- and highest-scoring comments.
# NOTE(review): the names imply that a low score means 'AH' — confirm against the classifier's score convention.
# Raises IndexError if `comments` holds fewer than 1000 entries.
for i in range(1000):
    top_ah_comments.append(comments[i])
    top_none_comments.append(comments[-(i + 1)])

In [None]:
# Shuffle both lists in place with a fixed seed so the order is reproducible.
# Each call builds a fresh generator with the same seed (42).
np.random.RandomState(seed=42).shuffle(top_ah_comments)
np.random.RandomState(seed=42).shuffle(top_none_comments)

In [None]:
# Keep only comments whose whitespace-separated word count lies in [20, 80].
X = [x for x in top_ah_comments if 20 <= len(x['text'].strip().split()) <= 80]

Y = [x for x in top_none_comments if 20 <= len(x['text'].strip().split()) <= 80]

In [None]:
# How many comments of each list survived the length filter.
print(len(X), len(Y))

In [None]:
# Group the filtered comments by their 'page' field: page -> list of comments,
# preserving the order in which they appear in X / Y.
pageX = dict()
pageY = dict()

# setdefault creates the per-page list on first sight; this replaces the bare
# `except:` that previously doubled as the "key missing" branch and would also
# have masked unrelated errors.
for x in X:
    pageX.setdefault(x['page'], []).append(x)

for y in Y:
    pageY.setdefault(y['page'], []).append(y)

In [None]:
# Per-page comment counts, first for pageX and then for pageY.
for k, v in pageX.items():
    print(f'{k}: {len(v)}')
for k, v in pageY.items():
    print(f'{k}: {len(v)}')

In [None]:
# Pages from which comments are drawn when the annotation groups are built.
pages = ['DonaldTrump', 'FoxNews', 'Breitbart', 'joebiden', 'barackobama']

In [None]:
# Build 5 groups. Each group takes, for every page, two comments from pageX and
# two from pageY (5 pages x 4 = 20 comments), never reusing a comment.
groups = []

# Next unused index into pageX[k] / pageY[k] for every page k.
cnt = dict()

for k in pages:
    cnt[k] = 0

for i in range(5):
    group = []
    # Two rounds over the pages; each round adds one pageX and one pageY comment per page.
    for pass_no in range(2):
        for k in pages:
            group.append(pageX[k][cnt[k]])
            group.append(pageY[k][cnt[k]])
            cnt[k] += 1
    # Deterministic, group-specific shuffle of the comments inside the group.
    np.random.RandomState(seed=i+40).shuffle(group)
    groups.append(group)

# Deterministic shuffle of the group order.
np.random.RandomState(seed=42).shuffle(groups)

In [None]:
delimiter = '@#@#@'        # separates consecutive comments in the output file
delimiter2 = '##$$##@@'    # separates a comment's text from its link

# Each group holds 20 comments (per page: two taken from pageX and two from pageY).

low, high = 4, 5  # index range [low, high) of the groups which will be used in the form

ctr = 0  # number of comments written

with open('/content/gdrive/MyDrive/DL/Facebook/dataset/CommentsForGoogleForm.txt', 'w', encoding='utf-8') as f:
    content = []
    for i in range(low, high):
        content_ = []
        for x in groups[i]:
            content_.append(f'{x["text"]}{delimiter2}{x["link"]}')
            ctr += 1
        content_ = delimiter.join(content_)
        content.append(content_)
    content = delimiter.join(content)
    # Collapse every whitespace run (newlines included) to one space so the file is a single line.
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    content = re.sub(r"\s+", " ", content)
    f.write(content)
    print(content)

In [None]:
def clean_array(w, a):
    """Merge ``##``-prefixed continuation pieces back into whole words.

    A token starting with ``'##'`` is appended (without the prefix) to the
    previous word, and that word's score becomes the mean of its current score
    and the piece's score. Any other token starts a new word.

    A continuation piece with no preceding word (i.e. at the very start) used
    to raise ``IndexError``; it now starts a new word with the prefix removed.

    Args:
        w: List of token strings.
        a: List of scores, one per token in ``w``.

    Returns:
        A ``(W, A)`` tuple of merged words and their scores.
    """
    W = []
    A = []
    for i, piece in enumerate(w):
        is_continuation = piece.startswith('##')
        if is_continuation and W:
            W[-1] += piece[2:]
            # Running pairwise mean: later pieces weigh more than earlier ones.
            A[-1] = (A[-1] + a[i]) / 2
        else:
            W.append(piece[2:] if is_continuation else piece)
            A.append(a[i])
    return (W, A)

In [None]:
def sanitize(x):
    """Lower-case ``x`` and keep only alphanumeric characters and spaces.

    Whitespace runs are first collapsed to a single space; every character
    that is neither alphanumeric nor a space is then dropped. Removing a
    character that stood between two spaces can therefore leave two
    consecutive spaces in the result.

    Args:
        x: Input string.

    Returns:
        The cleaned string.
    """
    x = x.lower()
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    x = re.sub(r"\s+", " ", x)  # convert any whitespace run to a single space
    return ''.join(ch for ch in x if ch.isalnum() or ch == ' ')

In [None]:
delim3 = '$#$#$#$#$#$#@@@@'   # joins the snippets of one comment (see the export cell)
delim4 = '@#$$#@@#@@#'        # joins the snippet strings of different comments

def top_three_tokens(text):
    """Return context snippets around the (up to) three most-attended words.

    The text is sanitized, scored with ``attention_scores`` and merged back to
    whole words with ``clean_array``. Words are ranked by score, highest
    first, and picked greedily so that any two picked words are more than two
    positions apart. Each picked word is rendered with its left and right
    neighbour, with ``'...'`` marking text cut off on either side.

    Args:
        text: The raw comment text.

    Returns:
        A list of snippet strings, best-ranked first. It has three entries
        when three sufficiently separated words exist; otherwise it is
        shorter (possibly empty) instead of raising ``IndexError`` as before.
    """
    text = sanitize(text)
    words, attentions = attention_scores(text)
    words = words[1:-1]  # remove start and end tags
    attentions = attentions[1:-1]
    words, attentions = clean_array(words, attentions)

    # (score, position) pairs, highest score first; ties go to the later position.
    top_tokens = sorted(
        ((attentions[i], i) for i in range(len(words))), reverse=True
    )

    # Greedy pick, bounded by the ranking length so short texts cannot overrun it.
    ind = []
    for cur in range(len(top_tokens)):
        if len(ind) == 3:
            break
        position = top_tokens[cur][1]
        if all(abs(top_tokens[chosen][1] - position) > 2 for chosen in ind):
            ind.append(cur)

    top_three_tkns = list()
    last = len(words) - 1
    for rank in ind:
        idx = top_tokens[rank][1]
        snippet = ''
        if idx > 1:
            snippet += '...'  # more text exists before the left neighbour
        if idx != 0:
            snippet += words[idx - 1] + ' '
        snippet += words[idx]
        if idx != last:
            snippet += ' ' + words[idx + 1]
        if idx < last - 1:
            snippet += '...'  # more text exists after the right neighbour
        top_three_tkns.append(snippet)
    return top_three_tkns

In [None]:
# For every comment of the selected groups (indices low..high-1), compute its snippets,
# shuffle their order, and join them with delim3; the per-comment strings are then
# joined with delim4 and written to trigrams.txt.
content = []
for i in range(low, high):
    for x in groups[i]:
        print(x['text'])
        trigrams = top_three_tokens(x['text'])
        # NOTE(review): this shuffle uses the global NumPy RNG without a seed, unlike the
        # seeded RandomState shuffles elsewhere, so the snippet order differs between runs.
        np.random.shuffle(trigrams)
        content_ = delim3.join(trigrams)
        print(content_)
        content.append(content_)
content = delim4.join(content)
with open('/content/gdrive/MyDrive/DL/Facebook/dataset/trigrams.txt', 'w', encoding='utf-8') as f:
    f.write(content)