# Extracting probabilities of classification for LOGIC dataset labels on CreateDebate Politics comments

- In this notebook, PyTorch BERT (base, uncased) is fine-tuned over LOGIC dataset ([paper](https://arxiv.org/abs/2202.13758), [github](https://github.com/causalNLP/logical-fallacy)). 
- We then use the fine-tuned model to classify the comments in CreateDebate. At the same time, we also extract probabilities of various classes for each comment.

**Device**: GPU

# Environment setup

In [1]:
# Mount Google drive to Colab
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
# Clone `CreateDebateScraper` library from github
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested

Cloning into 'CreateDebateScraper'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 176 (delta 5), reused 4 (delta 4), pack-reused 170[K
Receiving objects: 100% (176/176), 207.95 KiB | 29.71 MiB/s, done.
Resolving deltas: 100% (61/61), done.
/content/CreateDebateScraper/src/nested


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [33]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns

from thread import (
    Comment, 
    Thread
)

from tqdm import tqdm
import pickle
from pprint import pprint
from copy import deepcopy

import torch
from torch.utils.data import DataLoader
from transformers import (
    BertTokenizer as Tokenizer,
    BertForSequenceClassification as Model,
    pipeline,
    AdamW
) 

In [5]:
# Setup plot size and font size
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)

# **LOGIC** dataset

In [6]:
# Dataset used in Logical Fallacy Detection (Zhijing Jin et al.)
# We will use this dataset to train BERT for detecting different
# categories of logical fallacy.
!curl https://raw.githubusercontent.com/causalNLP/logical-fallacy/main/data/edu_all.csv -o fallacies.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  750k  100  750k    0     0  4717k      0 --:--:-- --:--:-- --:--:-- 4717k


In [7]:
# Analyze LOGIC dataset
df = pd.read_csv('fallacies.csv')

In [8]:
df

Unnamed: 0,updated_label,original_url,old_label,source_article,explanations,rationale
0,faulty generalization,https://quizizz.com/admin/quiz/5f948dcbedafcd0...,hasty generalization,"""Annie must like Starbucks because all white g...",,
1,faulty generalization,https://quizizz.com/admin/quiz/601ab1436b68c30...,hasty generalization,It is warmer this year in Las Vegas as compare...,,
2,faulty generalization,https://quizizz.com/admin/quiz/5f886ce55bec100...,hasty generalization,"""The two courses I took at UF were not very in...",,
3,faulty generalization,https://quizizz.com/admin/quiz/601d66c86088e40...,hasty generalization,A driver with a New York license plate cuts yo...,,
4,faulty generalization,https://quizizz.com/admin/quiz/5e7d3bd586911a0...,hasty generalization,"""Four out of five dentists recommend Happy Glo...",,
...,...,...,...,...,...,...
2447,fallacy of credibility,https://examples.yourdictionary.com/examples-o...,"My dad went to school, so he knows everything ...",attaching an argument to a person of false or ...,,
2448,ad populum,https://examples.yourdictionary.com/examples-o...,"Lots of people bought this album, so it must b...",claiming that an idea or belief is true simply...,,
2449,fallacy of relevance,https://examples.yourdictionary.com/examples-o...,You're complaining because you have to walk to...,uses irrelevant information or other technique...,,
2450,fallacy of logic,https://examples.yourdictionary.com/examples-o...,You let my sister pierce her ears when she was...,employs analogies between things that are not ...,,


In [12]:
# Number of classes
df['updated_label'].unique()

array(['faulty generalization', 'false causality', 'circular reasoning',
       'ad populum', 'ad hominem', 'fallacy of logic',
       'appeal to emotion', 'false dilemma', 'equivocation',
       'fallacy of extension', 'fallacy of relevance',
       'fallacy of credibility', 'intentional', 'miscellaneous'],
      dtype=object)

In [13]:
# Remove texts with class label 'miscellaneous'
df = df[df['updated_label'] != 'miscellaneous']

In [20]:
# Distribution of class labels in the dataset
labels = list(df['updated_label'].unique())
label_stats = list()
for label in labels:
    frac = len(df[df["updated_label"] == label]) / len(df)
    label_stats.append((label, frac))
for label, frac in sorted(label_stats, reverse=True, key=lambda z: z[1]):
    print(f'Label: {label:>24}, {frac * 100:.2f}%')

Label:    faulty generalization, 18.01%
Label:               ad hominem, 12.33%
Label:               ad populum, 9.47%
Label:          false causality, 8.82%
Label:       circular reasoning, 6.98%
Label:        appeal to emotion, 6.82%
Label:     fallacy of relevance, 6.61%
Label:         fallacy of logic, 6.21%
Label:              intentional, 5.84%
Label:            false dilemma, 5.76%
Label:     fallacy of extension, 5.76%
Label:   fallacy of credibility, 5.39%
Label:             equivocation, 2.00%


In [21]:
# Encoding labels as integer, required for transformers model
label_map = {
    'faulty generalization': 0,
    'false causality': 1,
    'circular reasoning': 2, 
    'ad populum': 3,
    'ad hominem': 4,
    'fallacy of logic': 5,
    'appeal to emotion': 6,
    'false dilemma': 7,
    'equivocation': 8,
    'fallacy of extension': 9,
    'fallacy of relevance': 10,
    'fallacy of credibility': 11,
    'intentional': 12,
}

inverse_label_map = dict()
for k, v in label_map.items():
    inverse_label_map[v] = k

In [22]:
# Create training set, here the class labels are encoded using ``label_map``
texts, labels = list(df['source_article']), \
 [z for z in map(lambda x: label_map[x], list(df['updated_label']))]

In [23]:
texts[0], labels[0]

('"Annie must like Starbucks because all white girls like Starbucks."', 0)

# Training Bert model

In [24]:
tokenizer = Tokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
encodings = tokenizer(texts, max_length=64, truncation=True, padding="max_length")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [25]:
class TrainingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = TrainingDataset(encodings, labels)

In [26]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
assert(str(device) == 'cuda')

In [27]:
model = Model.from_pretrained('bert-base-uncased',
                              num_labels=len(label_map))
model.to(device)
model.train()

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [28]:
loader = DataLoader(dataset, batch_size=64, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(5):
    for batch in tqdm(loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_ = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels_)
        loss = outputs[0]
        loss.backward()
        optim.step()

100%|██████████| 39/39 [00:26<00:00,  1.49it/s]
100%|██████████| 39/39 [00:24<00:00,  1.59it/s]
100%|██████████| 39/39 [00:25<00:00,  1.55it/s]
100%|██████████| 39/39 [00:24<00:00,  1.61it/s]
100%|██████████| 39/39 [00:24<00:00,  1.61it/s]


# Model inference on CreateDebate dataset

In [30]:
# Creating inference pipeline
pipe = pipeline(task='text-classification',
                model=model,
                tokenizer=tokenizer,
                top_k=None, # Need probability for each class label
                device=0)

In [34]:
comments = dict()
# key: category
# value: list of comments (each comment represented as a ``dict``)

# Topical forums on CreateDebate. We have scraped comments for all of the
# following forurm.
categories = ('business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world')

# However, we will be analyzing comments from selected forum only!
# These forum have at least 10k comments each.
categories_selected = ('politics2', 'religion', 'world', 
                       'science', 'law', 'technology')

for x in categories_selected:
    comments[x] = list()

In [35]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    fp = open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/threads.log', 'rb')

    # Get all the `Thread` objects pickled while scraping.
    threads = list()
    try:
        while True:
            e = pickle.load(fp)
            threads.append(e)
    except EOFError:
        fp.close()

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    # 
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            try:
                authors[v.author].append((v, k))
            except:
                authors[v.author] = list()
                authors[v.author].append((v, k))

    ctr = 0
    # Load the classification score of the comments.
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)
    # Attach classification score with the comments.
    for author in authors.keys():
        for i in range(len(authors[author])):
            comment, cid = authors[author][i]
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            foo['id'] = int(cid[3:])
            comments[cat].append(foo)
            ctr += 1

100%|██████████| 6/6 [00:12<00:00,  2.10s/it]


In [36]:
for_against_debates = dict()
perspective_debates = dict()

for cat in categories_selected:
    for_against_debates[cat] = list()
    perspective_debates[cat] = list()

    for comment in comments[cat]:
        if comment['polarity'] == 'Not Available':
            perspective_debates[cat].append(deepcopy(comment))
        else:
            for_against_debates[cat].append(deepcopy(comment))

In [37]:
for_against_politics_texts = [comment['body'] for comment in \
                              for_against_debates['politics2']]
perspective_politics_texts = [comment['body'] for comment in \
                              perspective_debates['politics2']]

In [38]:
class InferenceDataset(torch.utils.data.Dataset):
    def __init__(self, text_list):
        self._list = text_list

    def __len__(self):
        return len(self._list)

    def __getitem__(self, i):
        return self._list[i]

In [39]:
for_against_dataset = InferenceDataset(for_against_politics_texts)
perspective_dataset = InferenceDataset(perspective_politics_texts)

In [41]:
def get_labels_and_scores(inference_dataset):
    labels_and_scores = list()
    for out in tqdm(pipe(inference_dataset, batch_size=64, max_length=64, \
                         truncation=True), total=len(inference_dataset)):
        labels_and_scores.append(out)
    return labels_and_scores

In [46]:
def save_obj(obj, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)

def load_obj(obj, file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

In [42]:
for_against_labels_and_scores = get_labels_and_scores(for_against_dataset)

100%|██████████| 88829/88829 [07:42<00:00, 192.00it/s]


In [47]:
save_obj(for_against_labels_and_scores,
         '/content/gdrive/MyDrive/Temp/63-for_against_labels_and_scores.pkl')

In [48]:
perspective_labels_and_scores = get_labels_and_scores(perspective_dataset)

100%|██████████| 31021/31021 [02:39<00:00, 194.93it/s]


In [49]:
save_obj(perspective_labels_and_scores,
         '/content/gdrive/MyDrive/Temp/63-perspective_labels_and_scores.pkl')