In [None]:
# Mount Google Drive at /content/gdrive; the dataset paths used by later cells
# live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

## Training PyTorch BERT model for heat map generation

In [None]:
import os

def read_split(dir):
    """Read a two-column ``label,text`` CSV split into parallel lists.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on its *first* comma only, so commas that
    occur inside the text column are kept as part of the text instead of
    aborting the whole load.

    Args:
        dir: Path of the CSV file to read (a file path, despite the name; the
            parameter name is kept for backward compatibility).

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels[i]`` is 1
        when the row's label column equals ``'AH'`` and 0 otherwise.

    Raises:
        ValueError: If a non-blank data line contains no comma at all.
    """
    texts = []
    labels = []
    with open(dir, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row; an empty file yields empty lists
        for line_no, raw_line in enumerate(f, start=2):
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            label, separator, text = stripped.partition(",")
            if not separator:
                raise ValueError(
                    "%s:%d: expected 'label,text' but found no comma" % (dir, line_no)
                )
            texts.append(text.strip())
            labels.append(1 if label.strip() == 'AH' else 0)
    return texts, labels

In [None]:
# Load the train and test splits as parallel (texts, labels) lists.
train_texts, train_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv')
test_texts, test_labels = read_split('/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv')

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Tokenise both splits with the pretrained uncased BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Every example is truncated or padded to exactly this many tokens.
max_seq_length = 64
train_encodings = tokenizer(train_texts, truncation=True, max_length=max_seq_length, padding="max_length")
test_encodings = tokenizer(test_texts, truncation=True, max_length=max_seq_length, padding="max_length")

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the tokenised splits so a DataLoader can batch them.
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the pretrained checkpoint into BertForSequenceClassification, move it
# to the selected device and put it in training mode for fine-tuning.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.train()

In [None]:
# Reshuffle the training set every epoch and feed it in mini-batches of 64.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Use PyTorch's own AdamW: the copy exported by `transformers` is deprecated
# and missing from recent releases. `eps` and `weight_decay` are spelled out
# to keep the defaults of the transformers implementation it replaces
# (torch's defaults are eps=1e-8, weight_decay=1e-2).
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from tqdm import tqdm

# Fine-tune for three passes over the training data.
for epoch in range(3):
    for batch in tqdm(train_loader):
        # Move exactly the tensors the forward pass consumes onto the device.
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )
        optim.zero_grad()
        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

In [None]:
import numpy as np
!pip install datasets
from datasets import load_metric

In [None]:
# Evaluate on the held-out split. Accuracy is accumulated directly from the
# predictions, so this cell no longer needs `datasets.load_metric`, which is
# deprecated and has been removed from recent `datasets` releases.
model.eval()
eval_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
n_correct = 0
n_seen = 0
for batch in eval_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    n_correct += (predictions == batch["labels"]).sum().item()
    n_seen += batch["labels"].numel()

# Displayed in the same {'accuracy': value} form the metric object returned;
# an empty test set yields NaN instead of a ZeroDivisionError.
{"accuracy": n_correct / n_seen if n_seen else float("nan")}

In [None]:
# Saving best practice: when the default file names are used, the model can be
# reloaded later with `from_pretrained(output_dir)`.

output_dir = './utkbert/'

# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step and is a no-op when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# If the model has a `.module` attribute (distributed/parallel wrapper), save the wrapped model.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Loading Facebook comment data

In [None]:
import pandas as pd 

In [None]:
# Read the CSV of already-classified comments; later cells use its
# 'score' and 'processedText' columns.
df = pd.read_csv('/content/gdrive/MyDrive/DL/Facebook/fbscraper/nytimes/2016/2016c.csv')

In [None]:
# Display the loaded frame for a quick visual check.
df

In [None]:
# Keep only the comments whose confidence score lies strictly between 0.4 and 0.6.
reqdf = df[(df['score'] > 0.4) & (df['score'] < 0.6)]

In [None]:
# Display the filtered (borderline-score) comments.
reqdf

In [None]:
# Percentage of all comments whose score falls in the 0.4-0.6 range.

len(reqdf) / len(df) * 100

In [None]:
# Pull the text and score columns of the filtered frame out as parallel lists.
texts = list(reqdf['processedText'])
scores = list(reqdf['score'])

In [None]:
# Split the borderline comments into two buckets by score: scores below 0.5
# go to `adHominem`, everything else goes to `none`.
adHominem = []
none = []

for text, score in zip(texts, scores):
    (adHominem if score < 0.5 else none).append(text)

In [None]:
# Size of each bucket: (ad hominem, not ad hominem).
len(adHominem), len(none)

In [None]:
from transformers import BertModel, BertTokenizer
import re

In [None]:
# Load the model and tokenizer from the local 'utkbert' directory (the
# directory the fine-tuned model was saved to earlier in this notebook).
# NOTE: this rebinds `model` and `tokenizer`; `model` is now a plain BertModel
# configured to also return its attention weights.
model_version = 'utkbert'
do_lower_case = True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [None]:
# Colour intensity given to highlighted words; it ends up as the percentage in
# the LaTeX colour expression `color!INTENSITY` written by heatmap().
INTENSITY = 70


def attention_scores(text, layers=None, heads=None):
    """Average, per token, the attention read from row 0 of the attention maps.

    The text is encoded with special tokens, run through the module-level
    ``model`` (which must return attentions as its last output), and for each
    token position ``j`` the value ``attention[layer][0, head, 0, j]`` is
    averaged over the selected layers and heads. Row 0 corresponds to the
    first token of the encoded sequence (the start tag added by the tokenizer).

    Args:
        text: The sentence to analyse.
        layers: Iterable of layer indices to include. Defaults to every layer
            the model returned attentions for.
        heads: Iterable of head indices to include. Defaults to every head of
            the returned attention tensors.

    Returns:
        ``(tokens, matrix)``: the token strings (including the special tokens)
        and a list with one averaged attention value per token.
    """
    inputs = tokenizer.encode_plus(text, None, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    # Inference only: no_grad avoids building an autograd graph per comment.
    with torch.no_grad():
        attention = model(input_ids)[-1]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())  # batch index 0
    # Derive the defaults from the model output instead of assuming 12 x 12.
    if layers is None:
        layers = list(range(len(attention)))
    if heads is None:
        heads = list(range(attention[0].shape[1]))
    matrix = [0] * len(tokens)
    for layer in layers:
        for head in heads:
            # One tolist() per (layer, head) instead of one .item() per token.
            row = attention[layer][0, head, 0].tolist()
            for j, value in enumerate(row):
                matrix[j] += value
    n_maps = len(layers) * len(heads)
    matrix = [total / n_maps for total in matrix]
    return (tokens, matrix)

In [None]:
def clean_array(w, a):
    """Merge WordPiece continuation pieces (``##xx``) back into whole words.

    Args:
        w: List of token strings.
        a: List of scores, one per token in ``w``.

    Returns:
        ``(words, scores)`` after merging ``##`` pieces and then gluing
        apostrophe-separated pieces together via :func:`clean_apos`. When a
        piece is merged, the word's score becomes the mean of its current
        score and the piece's score (a running pairwise average).
    """
    W = []
    A = []
    for i, token in enumerate(w):
        if token.startswith('##'):
            if W:
                W[-1] += token[2:]
                A[-1] = (A[-1] + a[i]) / 2
            else:
                # A continuation piece with nothing before it: start a new
                # word from it instead of failing with an IndexError.
                W.append(token[2:])
                A.append(a[i])
        else:
            W.append(token)
            A.append(a[i])
    return clean_apos(W, A)


def clean_apos(w, a):
    """Glue ``word ' suffix`` token triples into a single token.

    An apostrophe token is appended to the previous word together with the
    token that follows it; their scores are summed and capped at
    ``INTENSITY``. An apostrophe with no preceding word is kept as its own
    token, and a trailing apostrophe is attached to the previous word alone.

    Args:
        w: List of token strings.
        a: List of scores, one per token in ``w``.

    Returns:
        ``(words, scores)`` with the apostrophe pieces merged.
    """
    W = []
    A = []
    ctr = 0
    # `<` (not `!=`) so that consuming two tokens at the end cannot overshoot.
    while ctr < len(w):
        if w[ctr] == '\'' and W:
            has_next = ctr + 1 < len(w)
            next_word = w[ctr + 1] if has_next else ''
            next_score = a[ctr + 1] if has_next else 0
            W[-1] += w[ctr] + next_word
            A[-1] = min(INTENSITY, A[-1] + a[ctr] + next_score)
            ctr += 2
        else:
            W.append(w[ctr])
            A.append(a[ctr])
            ctr += 1
    return W, A

In [None]:
def top_three_tokens(text):
    """Pick up to three well-separated high-attention words and mark them.

    The attention scores of ``text`` are computed, the first and last tokens
    (start and end tags) are dropped, and sub-word pieces are merged. Words
    are then visited from highest to lowest score; a word is picked when it
    is more than two positions away from every word already picked, until
    three are chosen or the candidates run out. Short or empty inputs
    therefore yield fewer than three picks instead of raising an IndexError.

    Args:
        text: The (already sanitised) sentence to analyse.

    Returns:
        ``(words, scores)`` where ``scores[j]`` is ``INTENSITY`` for every
        picked word and its immediate left/right neighbour, and 0 elsewhere.
    """
    words, attentions = attention_scores(text)
    words = words[1:-1]  # Remove start and end tags
    attentions = attentions[1:-1]
    assert len(words) == len(attentions)
    words, attentions = clean_array(words, attentions)
    assert len(words) == len(attentions)

    # Rank positions by (score, position), highest first.
    ranked = sorted(((score, pos) for pos, score in enumerate(attentions)), reverse=True)

    picked = []
    for _, pos in ranked:
        if all(abs(pos - chosen) > 2 for chosen in picked):
            picked.append(pos)
            if len(picked) == 3:
                break

    scores = [0] * len(words)
    for pos in picked:
        # Highlight the picked word together with its two neighbours.
        for j in (pos - 1, pos, pos + 1):
            if 0 <= j < len(words):
                scores[j] = INTENSITY
    return words, scores

In [None]:
def clean_word(word_list):
    """Escape LaTeX special characters in every word of ``word_list``.

    All characters are replaced in a single pass, so the backslashes that the
    escapes themselves introduce are never escaped a second time. Backslash,
    caret and tilde use their text-mode commands, because ``\\\\`` is a line
    break in LaTeX and ``\\^`` / ``\\~`` are accent commands that would
    swallow the following letter.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list with the escaped strings, in the same order.
    """
    latex_escapes = str.maketrans({
        '\\': r'\textbackslash{}',
        '%': r'\%',
        '&': r'\&',
        '^': r'\textasciicircum{}',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '$': r'\$',
        '~': r'\textasciitilde{}',
    })
    return [word.translate(latex_escapes) for word in word_list]

In [None]:
import string

def sanitize(text):
    """Normalise a raw comment into space-separated, LaTeX-safe words.

    Steps: lower-case; collapse whitespace runs into one space; map the
    curly quotes U+2018 / U+2019 to a plain apostrophe; start a new token
    before every punctuation character except the apostrophe; drop tokens
    without any alphanumeric character; strip punctuation from both ends of
    the remaining tokens; finally escape LaTeX specials with ``clean_word``.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned words joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r"\s+", " ", text)  # any run of whitespace -> one space
    # Curly single quotes -> plain ASCII apostrophe.
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    # Put a space before each punctuation mark (apostrophes stay in-word) so
    # that split() separates it from the preceding word.
    spaced = ''.join(
        ' ' + ch if ch in string.punctuation and ch != "'" else ch
        for ch in text
    )
    new_text = []
    for token in spaced.split():
        if any(ch.isalnum() for ch in token):
            # Strip the whole punctuation set at once; stripping one character
            # at a time is order-dependent and could leave a leading apostrophe.
            new_text.append(token.strip(string.punctuation))
    return ' '.join(clean_word(new_text))

In [None]:
# Quick check of sanitize() on a sample with surrounding quotes, an in-word
# apostrophe, a comma and repeated question marks.
sanitize("'Lol, who ain't you bro??'")

In [None]:
# LaTeX preamble written at the top of every generated heatmap document.
header = r'''\documentclass[10pt,a4paper]{article}
\usepackage[left=1.00cm, right=1.00cm, top=1.00cm, bottom=2.00cm]{geometry}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}''' + '\n\n'

# Closes the environments opened by `header`.
footer = r'''\end{CJK*}
\end{document}'''

def heatmap(word_list, attention_list, label_list, latex_file, title, batch_size=20, color='blue'):
    '''Routine to generate attention heatmaps for given texts
    ---------------------------------------------------------
    Writes a LaTeX document in which every word of every text sits in a
    ``\\colorbox`` whose intensity is that word's score. Texts are grouped
    into batches of ``batch_size``; a final, smaller batch is written when the
    number of texts is not a multiple of ``batch_size``, so no text is dropped.

    Input:
    :param word_list: array of texts, each given as a list of words
    :param attention_list: array of attention scores for each text (one score per word)
    :param label_list: label for each text
    :param latex_file: name of the latex file
    :param title: title of latex file
    :param batch_size: Number of comments in each batch
    :param color: base colour name used in the ``color!score`` expression
    '''
    n_examples = len(word_list)
    with open(latex_file, 'w', encoding='utf-8') as f:
        f.write(header)
        f.write('\\section{%s}\n\n' % title)

        for batch_no, start in enumerate(range(0, n_examples, batch_size), start=1):
            # The last batch may be shorter than batch_size.
            stop = min(start + batch_size, n_examples)
            f.write('\\subsection{Batch %d}\n\n' % batch_no)
            for j in range(start, stop):
                # Comments are numbered from 1 within their batch.
                f.write('\\subsubsection{Comment %d - %s}\n\n' % (j - start + 1, label_list[j]))
                sentence = word_list[j]
                score = attention_list[j]
                assert len(sentence) == len(score)
                f.write('\\noindent')
                for word, value in zip(sentence, score):
                    f.write('\\colorbox{%s!%s}{' % (color, value) + '\\strut ' + word + '} ')
                f.write('\n\n')

        f.write(footer)

In [None]:
# NOTE(review): despite "random" in the names, these are simply the first 100
# entries of each list, not a random sample.
random100AdHominem = adHominem[:100]
random100None = none[:100]

In [None]:
# Build the per-comment (words, scores) pairs for the heatmap.
vTexts = list()
vScores = list()
skipped = []  # (comment, error) pairs for comments that could not be processed

for text in tqdm(random100None):
    try:
        sent = sanitize(text)
        texts_, scores_ = top_three_tokens(sent)
    except Exception as err:
        # Best effort: skip the comment but remember why. `Exception` (not a
        # bare except) keeps Ctrl-C / kernel interrupts working.
        skipped.append((text, repr(err)))
        continue
    vTexts.append(texts_)
    vScores.append(scores_)

if skipped:
    print("Skipped %d of %d comments that could not be processed" % (len(skipped), len(random100None)))

In [None]:
# Keep at most the first 50 processed comments for the heatmap.
vTexts = vTexts[:50]
vScores = vScores[:50]

In [None]:
# Write the heatmap document to 'none.tex', labelling every comment
# 'Not ad hominem' and using orange as the highlight colour.
heatmap(vTexts, vScores, ['Not ad hominem'] * len(vTexts), 'none.tex', 'Non ad hominem comments having low confidence score', color='orange')