In [None]:
# Mount Google Drive at /content/gdrive; later cells read the scraped
# CreateDebate logs from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone the `CreateDebateScraper` library from GitHub and change into its
# `src/nested` directory (presumably so the `thread` module imported in a
# later cell can be resolved from the working directory).
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested

In [None]:
# Install Hugging Face Transformers (unpinned: whatever release is current).
!pip install transformers

In [None]:
# Dataset used in Logical Fallacy Detection (Zhijing Jin et al.)
# We will use this dataset to train BERT for detecting different
# categories of logical fallacy.
!curl https://raw.githubusercontent.com/causalNLP/logical-fallacy/main/data/edu_all.csv -o fallacies.csv

In [None]:
from   collections              import namedtuple
from   copy                     import deepcopy
# import cpnet
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
# import spacy
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm

# Fetch the NLTK resources this notebook relies on.
for _resource in (
    'punkt',      # For tokenizers
    'stopwords',
    'wordnet',    # For lemmatizers
    'omw-1.4',
):
    nltk.download(_resource)

import matplotlib
from   nltk.stem                import WordNetLemmatizer
from   nltk.tokenize            import TweetTokenizer
from   nltk.corpus              import stopwords
from   pprint                   import pprint

# import shifterator as sh
# import wordcloud
# import skbio

import torch
from transformers import (
    BertTokenizer as Tokenizer,
    BertForSequenceClassification as Model,
    pipeline
) 

from torch.utils.data import DataLoader
from transformers import AdamW

import seaborn as sns

# Global plotting defaults and the English stop-word list.
matplotlib.rcParams.update({
    'font.size': 18,
    'figure.figsize': (12, 5),
})
STOP_WORDS = [word for word in stopwords.words('english')]

In [None]:
# Load the downloaded fallacy dataset (fallacies.csv) into a DataFrame.

df = pd.read_csv('fallacies.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Distinct fallacy classes present in the dataset, one per line.

labels = df['updated_label'].unique().tolist()
for label in labels:
    print(label)

In [None]:
# Share of the dataset taken by each class, stored as (label, fraction) pairs.
labels = list(df['updated_label'].unique())
n_rows = len(df)
label_stats = [
    (label, len(df[df["updated_label"] == label]) / n_rows)
    for label in labels
]

In [None]:
# Print the classes from most to least frequent.
ranked_stats = sorted(label_stats, key=lambda pair: pair[1], reverse=True)
for label, frac in ranked_stats:
    print(f'Label: {label:>24}, {frac * 100:.2f}%')

In [None]:
# Integer id assigned to each fallacy class; used as the classification target.
label_map = {
    'faulty generalization': 0,
    'false causality': 1,
    'circular reasoning': 2,
    'ad populum': 3,
    'ad hominem': 4,
    'fallacy of logic': 5,
    'appeal to emotion': 6,
    'false dilemma': 7,
    'equivocation': 8,
    'fallacy of extension': 9,
    'fallacy of relevance': 10,
    'fallacy of credibility': 11,
    'intentional': 12,
}

# Reverse lookup: integer id -> class name.
inverse_label_map = {idx: name for name, idx in label_map.items()}

In [None]:
# Create training set: the raw texts and the integer id of each text's class.
texts, labels = (
    list(df['source_article']),
    [label_map[name] for name in list(df['updated_label'])],
)

In [None]:
# Peek at the first training text and its encoded label.
texts[0], labels[0]

In [None]:
# Load the pretrained tokenizer for the `bert-base-uncased` checkpoint.
tokenizer = Tokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Tokenize every training text, truncating and padding each one to 64 tokens.
encodings = tokenizer(texts, max_length=64, truncation=True, padding="max_length")

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their target labels.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is a sequence with one target per example. Indexing returns a
    dict holding every encoding field plus a 'labels' entry, all as tensors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Training dataset: tokenized texts paired with their integer class ids.
dataset = CustomDataset(encodings=encodings, labels=labels)

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load BERT with a classification head sized to the number of fallacy classes,
# move it to the selected device, and put it in training mode.
model = Model.from_pretrained('bert-base-uncased', num_labels=len(label_map))
model = model.to(device)
model.train()

In [None]:
# Shuffled mini-batches of 64 examples for fine-tuning.
loader = DataLoader(dataset, batch_size=64, shuffle=True)
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. `eps` and `weight_decay` are passed explicitly with the values
# the transformers implementation used by default, so the optimiser settings
# stay the same as before.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Fine-tune the classifier for 5 passes over the training data.
for epoch in range(5):
    for batch in tqdm(loader):
        # Clear gradients left over from the previous step.
        optim.zero_grad()

        # Move the batch onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_ = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels_)
        loss = outputs[0]  # first element of the model output is used as the loss

        loss.backward()
        optim.step()

In [None]:
# Creating inference pipeline
# The model was put in training mode before fine-tuning; switch it to
# evaluation mode explicitly so dropout is disabled while predicting.
model.eval()
pipe = pipeline(task='text-classification',
                model=model,
                tokenizer=tokenizer,
                # First GPU when one is available, otherwise CPU (-1), so the
                # cell also works on a CPU-only runtime.
                device=0 if torch.cuda.is_available() else -1)

In [None]:
# Topical forums on CreateDebate. We have scraped comments for all of the
# following forums.
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# However, we will be analyzing comments from selected forums only!
# These forums have at least 10k comments each.
categories_selected = ['politics2', 'religion', 'world',
                       'science', 'law', 'technology']

# One (initially empty) list of comment records per selected forum.
comments = {x: list() for x in categories_selected}

In [None]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    forum_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # Get all the `Thread` objects pickled while scraping. The log holds
    # back-to-back pickles, so keep loading until the file is exhausted.
    # NOTE: unpickling can execute arbitrary code -- only load trusted logs.
    threads = list()
    with open(forum_dir + '/threads.log', 'rb') as fp:
        while True:
            try:
                threads.append(pickle.load(fp))
            except EOFError:
                break

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    #
    # Group (comment, comment-id) pairs by author, keeping first-seen order.
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            authors.setdefault(v.author, list()).append((v, k))

    # Load the classification score of the comments.
    with open(forum_dir + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `ctr` walks `cws` in the
    # same author-by-author order in which the comments are visited here.
    ctr = 0
    for author_comments in authors.values():
        for comment, cid in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            foo['id'] = int(cid[3:])  # drop the 3-character prefix of the comment id
            comments[cat].append(foo)
            ctr += 1

In [None]:
# Split each forum's comments by debate type: comments whose polarity is
# 'Not Available' go to the perspective debates, all others to for-against.
for_against_debates = dict()
perspective_debates = dict()

for cat in categories_selected:
    for_against_debates[cat] = list()
    perspective_debates[cat] = list()

    for comment in comments[cat]:
        has_polarity = comment['polarity'] != 'Not Available'
        bucket = for_against_debates if has_polarity else perspective_debates
        bucket[cat].append(deepcopy(comment))

In [None]:
# Comment bodies from the politics forum, one list per debate type.
for_against_politics_texts = [
    comment['body'] for comment in for_against_debates['politics2']
]
perspective_politics_texts = [
    comment['body'] for comment in perspective_debates['politics2']
]

In [None]:
class ListDataset(torch.utils.data.Dataset):
    """Expose an indexable sequence through the torch `Dataset` interface."""

    def __init__(self, text_list):
        self._list = text_list

    def __getitem__(self, i):
        items = self._list
        return items[i]

    def __len__(self):
        items = self._list
        return len(items)

In [None]:
# Wrap the politics comment bodies as datasets for the inference pipeline.
for_against_dataset = ListDataset(for_against_politics_texts)
perspective_dataset = ListDataset(perspective_politics_texts)

In [None]:
def get_labels_and_scores(list_dataset):
    """Run the global `pipe` over `list_dataset` and collect its outputs.

    The pipeline is called with `batch_size=64, max_length=64,
    truncation=True`, and a tqdm progress bar sized to the dataset tracks the
    results as they are produced. Returns the outputs as a list, in the order
    the pipeline yields them.
    """
    predictions = pipe(list_dataset, batch_size=64, max_length=64, truncation=True)
    return list(tqdm(predictions, total=len(list_dataset)))

In [None]:
# Classify the for-against politics comments.
for_against_labels_and_scores = get_labels_and_scores(for_against_dataset)

In [None]:
# Classify the perspective politics comments.
perspective_labels_and_scores = get_labels_and_scores(perspective_dataset)

In [None]:
# Parallel lists of texts, predicted class names and prediction scores,
# kept separately for each debate type.
for_against_labeled = {'texts': [], 'labels': [], 'scores': []}

perspective_labeled = {'texts': [], 'labels': [], 'scores': []}

In [None]:
# Record each for-against comment with its predicted class name and score.
for text, prediction in zip(for_against_politics_texts, for_against_labels_and_scores):
    class_id = int(prediction['label'][len('LABEL_'):])  # Remove LABEL_ prefix
    label = inverse_label_map[class_id]
    score = prediction['score']
    for_against_labeled['texts'].append(text)
    for_against_labeled['labels'].append(label)
    for_against_labeled['scores'].append(score)

In [None]:
# Record each perspective comment with its predicted class name and score.
for text, prediction in zip(perspective_politics_texts, perspective_labels_and_scores):
    class_id = int(prediction['label'][len('LABEL_'):])  # Remove LABEL_ prefix
    label = inverse_label_map[class_id]
    score = prediction['score']
    perspective_labeled['texts'].append(text)
    perspective_labeled['labels'].append(label)
    perspective_labeled['scores'].append(score)

In [None]:
# Stacked histogram of prediction scores for perspective comments, coloured by
# predicted class.
sns.histplot(data=perspective_labeled, x="scores", hue='labels', multiple='stack')

In [None]:
def plot_class_dist_number(data1, data2):
    """Draw a grouped bar chart of per-class comment counts.

    `data1` is drawn as 'for-against' and `data2` as 'perspective'. Each must
    hold its predicted class names under the 'labels' key, and every name must
    be a key of the global `label_map`, whose order fixes the x-axis order.
    """
    class_names = list(label_map.keys())

    def count_per_class(data):
        # Start every class at zero so classes without comments still get a bar.
        counts = dict.fromkeys(class_names, 0)
        for name in data['labels']:
            counts[name] += 1
        return counts

    counts1 = count_per_class(data1)
    counts2 = count_per_class(data2)
    heights1 = [counts1[name] for name in label_map]
    heights2 = [counts2[name] for name in label_map]

    positions = np.arange(len(class_names))
    bar_width = 0.30

    fig, ax = plt.subplots()
    # The two series sit side by side around each tick position.
    subplot1 = ax.bar(positions - bar_width / 2, heights1, bar_width,
                      label='for-against', tick_label=class_names)
    subplot2 = ax.bar(positions + bar_width / 2, heights2, bar_width,
                      label='perspective', tick_label=class_names)

    ax.set_ylabel('# Comment')
    ax.set_xticks(positions)
    ax.set_xticklabels(class_names, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
def plot_class_dist_frac(data1, data2):
    """Draw a grouped bar chart of the percentage of comments in each class.

    `data1` is drawn as 'for-against' and `data2` as 'perspective'. Each must
    hold its predicted class names under the 'labels' key, and every name must
    be a key of the global `label_map`, whose order fixes the x-axis order.

    Bar heights are percentages (0-100) of that dataset's comments, matching
    the '% Comment' axis label. A dataset with no labels is drawn as all-zero
    bars instead of raising ZeroDivisionError.
    """
    class_names = list(label_map.keys())

    def get_class_percent(data):
        # Count the comments per class, starting every class at zero.
        counts = dict.fromkeys(class_names, 0)
        for name in data['labels']:
            counts[name] += 1

        total = sum(counts.values())
        if total == 0:
            # Nothing to normalise by: report 0% for every class.
            return {name: 0.0 for name in class_names}
        return {name: 100.0 * counts[name] / total for name in class_names}

    percent1 = get_class_percent(data1)
    percent2 = get_class_percent(data2)

    y1 = [percent1[name] for name in class_names]
    y2 = [percent2[name] for name in class_names]

    ticks = np.arange(len(class_names))
    width = 0.30

    fig, ax = plt.subplots()
    # The two series sit side by side around each tick position; tick labels
    # are set once below rather than on every bar call.
    ax.bar(ticks - width / 2, y1, width, label='for-against')
    ax.bar(ticks + width / 2, y2, width, label='perspective')

    ax.set_ylabel('% Comment')
    ax.set_xticks(ticks)
    ax.set_xticklabels(class_names, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
# Compare the class distribution of for-against and perspective comments.
plot_class_dist_frac(for_against_labeled, perspective_labeled)

In [None]:
# One (initially empty) bucket of (text, score) pairs per fallacy class,
# kept separately for each debate type.
for_against_ordered = {label: list() for label in label_map.keys()}
perspective_ordered = {label: list() for label in label_map.keys()}

In [None]:
# Put every for-against comment into the bucket of its predicted class.
for_against_rows = zip(
    for_against_labeled['texts'],
    for_against_labeled['labels'],
    for_against_labeled['scores'],
)
for text, label, score in for_against_rows:
    for_against_ordered[label].append((text, score))

In [None]:
# Put every perspective comment into the bucket of its predicted class.
perspective_rows = zip(
    perspective_labeled['texts'],
    perspective_labeled['labels'],
    perspective_labeled['scores'],
)
for text, label, score in perspective_rows:
    perspective_ordered[label].append((text, score))

In [None]:
# Within every class, order the comments from highest to lowest score.
for label in label_map.keys():
    for_against_ordered[label] = sorted(
        for_against_ordered[label], key=lambda pair: pair[1], reverse=True
    )
    perspective_ordered[label] = sorted(
        perspective_ordered[label], key=lambda pair: pair[1], reverse=True
    )

In [None]:
# Re-declares the class-name -> integer-id mapping with the same contents as
# the definition earlier in the notebook; ids follow the position in the list.
label_map = {
    name: idx
    for idx, name in enumerate([
        'faulty generalization',
        'false causality',
        'circular reasoning',
        'ad populum',
        'ad hominem',
        'fallacy of logic',
        'appeal to emotion',
        'false dilemma',
        'equivocation',
        'fallacy of extension',
        'fallacy of relevance',
        'fallacy of credibility',
        'intentional',
    ])
}

In [None]:
# Class whose top comments are inspected below.
current = 'equivocation'

# Print the 20 highest-scoring for-against comments of that class.
top_for_against = for_against_ordered[current][:20]
for text, score in top_for_against:
    print(f'[{score:.2f}]: {text}', end='\n\n')

In [None]:
# Print the 20 highest-scoring perspective comments of the `current` class.
top_perspective = perspective_ordered[current][:20]
for text, score in top_perspective:
    print(f'[{score:.2f}]: {text}', end='\n\n')