In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from copy import deepcopy
import pickle
import json
from tqdm import tqdm
from pprint import pprint

import torch
from transformers import (
    BertTokenizer as Tokenizer,
    BertModel as Model,
    pipeline
) 

from torch.utils.data import DataLoader

from sklearn.metrics import (f1_score, precision_score, recall_score, 
                             accuracy_score)

In [2]:
# Plot styling: apply the seaborn dark-grid theme first, then override a few
# matplotlib defaults on top of it (resolution, font size, figure size)
sns.set(style='darkgrid')
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.size': 18,
    'figure.figsize': (10, 5),
})

In [3]:
# For caching objects

def load_obj(file_path):
    """Read back an object that was previously pickled to disk

    :param file_path: Location of the pickle file to read
    :type file_path: string
    :return: The unpickled object
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(file_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def save_obj(obj, file_path):
    """Pickle an object and write it to the given location

    :param obj: Object to serialise
    :param file_path: Destination of the pickle file (overwritten if present)
    :type file_path: string
    :return: ``None`` (the result of ``pickle.dump``)
    """
    with open(file_path, 'wb') as handle:
        outcome = pickle.dump(obj, handle)
    return outcome

In [4]:
# Change value of `N`
# `N` is interpolated into the name of the cached auxiliary-sentence file loaded
# below, and it also scales the node2vec feature length (`5 * N * D`) used later.

N = 20
# The pickle is unpacked into three splits, in this order: train, test, dev.
train_aux, test_aux, dev_aux = load_obj(f'./dataset/aux-sentences-n-{N}-simple.pkl')

In [5]:
# Load the node2vec data: the lookup table used to turn a relation into a node id,
# and the embedding vector stored for every node id
node_encoding = load_obj('./dataset/node-encoding.pkl')
node_embedding = {}

D = -1  # embedding dimension; placeholder until the header line has been read

with open('./dataset/node2vec.emb', 'r', encoding='utf-8') as emb_file:
    # Header line: two integers, read as (number of nodes, embedding dimension)
    header = emb_file.readline().strip().split()
    node_count, D = (int(token) for token in header)
    assert node_count == len(node_encoding)
    # Every following line: a node id followed by the components of its vector
    for _row in range(node_count):
        fields = emb_file.readline().strip().split()
        node_embedding[int(fields[0])] = np.array([float(token) for token in fields[1:]])

assert len(node_embedding) == len(node_encoding)


In [6]:
def get_embeddings_from_path(path, *, n=None, d=None, encoding=None, embedding=None):
    """Returns a numpy vector associated with the path.

    Every non-empty, comma-separated relation in `path` is mapped to a node id
    through `encoding` and then to a vector through `embedding`. The vectors are
    laid end to end, and the result is trimmed / padded with zeros such that its
    length is always `5 * n * d`.

    The keyword-only arguments default to the notebook-level `N`, `D`,
    `node_encoding` and `node_embedding`, so existing one-argument calls behave
    as before; pass them explicitly to use the function outside the notebook.

    :param path: dependency path (comma-separated relations); may be empty
    :type path: str
    :param n: override for `N` in the output length `5 * n * d`
    :type n: int
    :param d: override for `D` in the output length `5 * n * d`
    :type d: int
    :param encoding: override for `node_encoding` (relation -> node id)
    :type encoding: dict
    :param embedding: override for `node_embedding` (node id -> vector)
    :type embedding: dict
    :return: 1-D array of length `5 * n * d`
    :raises KeyError: if a relation is missing from `encoding`, or its node id
        is missing from `embedding`
    """
    n = N if n is None else n
    d = D if d is None else d
    limit = 5 * n * d

    # An empty path, or one made only of separators (e.g. ","), has no relations
    # to embed: return the all-zero vector instead of failing on an empty matrix.
    relations = [rel for rel in path.split(',') if rel] if path else []
    if not relations:
        return np.zeros(limit)

    encoding = node_encoding if encoding is None else encoding
    embedding = node_embedding if embedding is None else embedding

    # Stack the node embeddings into a matrix, one row per relation
    matrix = np.array([embedding[encoding[rel]] for rel in relations])

    # Flatten row by row. `reshape(-1)` always yields a 1-D array, whereas
    # `squeeze()` would collapse a single-value matrix to a 0-d scalar array.
    flat = matrix.reshape(-1)[:limit]

    # After trimming the vector is never longer than `limit`; pad the remainder
    return np.pad(flat, (0, limit - flat.shape[0]), 'constant', constant_values=(0, 0))


In [7]:
# Load the LOGIC dataset: one CSV file per split, read in train / dev / test order

train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/dev.csv', './dataset/test.csv')
)

In [8]:
# Pull the text, label and masked-text columns of every split out into plain lists
train_texts        = train_df['text'].tolist()
train_labels       = train_df['label'].tolist()
train_masked_texts = train_df['masked_text'].tolist()

dev_texts        = dev_df['text'].tolist()
dev_labels       = dev_df['label'].tolist()
dev_masked_texts = dev_df['masked_text'].tolist()

test_texts        = test_df['text'].tolist()
test_labels       = test_df['label'].tolist()
test_masked_texts = test_df['masked_text'].tolist()

In [9]:
# Encoding labels as integer: the id of a label is its position in this tuple,
# so the order below must not be changed
label_names = (
    'faulty generalization',
    'false causality',
    'circular reasoning',
    'ad populum',
    'ad hominem',
    'fallacy of logic',
    'appeal to emotion',
    'false dilemma',
    'equivocation',
    'fallacy of extension',
    'fallacy of relevance',
    'fallacy of credibility',
    'intentional',
)
label_map = {name: idx for idx, name in enumerate(label_names)}

# Reverse lookup: integer id -> label name
inverse_label_map = {idx: name for name, idx in label_map.items()}

In [10]:
# Translate the string labels of each split into their integer ids
# (a label missing from `label_map` raises KeyError)
train_labels_encoded, test_labels_encoded, dev_labels_encoded = (
    [label_map[label] for label in split_labels]
    for split_labels in (train_labels, test_labels, dev_labels)
)

In [11]:
# Load the pretrained 'bert-base-uncased' tokenizer (`Tokenizer` is this notebook's
# import alias for `BertTokenizer`), asking it to lower-case its input.
tokenizer = Tokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [12]:
class TrainingDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenized texts with node2vec path features and labels."""

    def __init__(self, texts, aux_texts, labels, max_length=128):
        """Embed every auxiliary path and tokenize every text up front.

        :param texts: texts passed to the notebook-level `tokenizer`
        :param aux_texts: paths passed to `get_embeddings_from_path`, one per text
        :param labels: labels, indexed in parallel with `texts`
        :param max_length: length every text is truncated / padded to
        """
        # node2vec embeddings, one flattened vector per auxiliary path
        self.n2v = list(map(get_embeddings_from_path, aux_texts))
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: tokenizer fields, `labels`, `n2v`."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample.update(
            labels=torch.tensor(self.labels[idx]),
            n2v=torch.tensor(self.n2v[idx]),
        )
        return sample

    def __len__(self):
        """Number of samples, taken from the number of labels."""
        return len(self.labels)

In [13]:
# Split the (main text, auxiliary text) pairs of the training split into two
# parallel lists
train_main_texts = [main_txt for main_txt, _aux in train_aux]
train_aux_texts = [aux_txt for _main, aux_txt in train_aux]

In [14]:
# Split the (main text, auxiliary text) pairs of the test split into two
# parallel lists
test_main_texts = [main_txt for main_txt, _aux in test_aux]
test_aux_texts = [aux_txt for _main, aux_txt in test_aux]

In [15]:
# Wrap the train and test splits; both are tokenized to the same length (128)
train_dataset = TrainingDataset(
    train_main_texts,
    train_aux_texts,
    train_labels_encoded,
    max_length=128,
)

test_dataset = TrainingDataset(
    test_main_texts,
    test_aux_texts,
    test_labels_encoded,
    max_length=128,
)

In [16]:
class StructureAwareBertModel(torch.nn.Module):
    """BERT encoder whose first-token vector is joined with node2vec features
    and fed to a single linear classification layer."""

    def __init__(self):
        """Load the pretrained encoder and create the classification layer."""
        super().__init__()
        self.bert = Model.from_pretrained('bert-base-uncased')
        # Classifier input: 768 encoder values + `5 * N * D` node2vec values;
        # one output per entry of `label_map`
        n_inputs = 768 + 5 * N * D
        n_classes = len(label_map)
        self.linear = torch.nn.Linear(n_inputs, n_classes)

    def forward(self, ids, mask, n2v):
        """Return one row of class scores per sample.

        :param ids: token ids passed to the encoder
        :param mask: attention mask passed to the encoder
        :param n2v: node2vec features, concatenated after the encoder vector
        """
        hidden_states = self.bert(ids, attention_mask=mask).last_hidden_state
        # Vector at the first token position (the CLS token), as 768-wide rows
        cls_vector = hidden_states[:, 0, :].view(-1, 768)
        # Join it with the node2vec features along the feature axis
        features = torch.cat((cls_vector, n2v), dim=1)
        # Cast to float32 before the linear layer
        return self.linear(features.float())

In [17]:
# Pick the compute device. The second GPU ('cuda:1') is still preferred, but only
# when it actually exists: on a single-GPU machine 'cuda:1' is an invalid device
# ordinal, so fall back to the first GPU, and to the CPU when CUDA is unavailable.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:1


In [18]:
# Build the model and move its parameters to the selected device
model = StructureAwareBertModel().to(device)
# Switch to training mode; as the last expression of the cell this also
# displays the module summary
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


StructureAwareBertModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [19]:
# Optimisation setup: loss function, AdamW over every model parameter, and a
# loader that serves the training set in shuffled batches of 64
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

In [20]:
# Fine-tune the whole model for 10 epochs
for epoch in range(10):
    # Re-enter training mode at the start of every epoch: if this cell is re-run
    # after an evaluation left the model in eval mode, training would otherwise
    # silently continue with dropout disabled.
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        n2v = batch['n2v'].to(device)
        outputs = model(ids=input_ids, mask=attention_mask, n2v=n2v)
        # The raw scores go straight into `criterion`: CrossEntropyLoss applies
        # log-softmax itself, so applying it here as well was redundant.
        loss = criterion(outputs, labels)
        loss.backward()
        optim.step()

100%|██████████| 29/29 [00:21<00:00,  1.38it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]
100%|██████████| 29/29 [00:20<00:00,  1.39it/s]


In [21]:
def evaluate(dataset, batch_size=64):
    """Compute the macro-averaged F1 score of the notebook-level `model` on `dataset`.

    The model is switched to eval mode (and left in it), run batch by batch on
    `device` without gradient tracking, and the highest-scoring class of every
    sample is compared with its label.

    :param dataset: dataset whose batches provide `input_ids`, `attention_mask`,
        `n2v` and `labels`
    :param batch_size: number of samples per forward pass
    :type batch_size: int
    :return: macro-averaged F1 score over all samples
    """
    model.eval()
    eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    pred_list = []
    true_list = []

    # No gradients are needed anywhere in the evaluation loop
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            n2v = batch['n2v'].to(device)
            logits = model(ids=input_ids, mask=attention_mask, n2v=n2v)
            # Arg-max is taken on the raw scores: log-softmax only shifts each
            # row by a constant, so it cannot change which class wins.
            pred = torch.argmax(logits, dim=-1).cpu()
            pred_list.extend(pred.tolist())
            # Labels never left the CPU
            true_list.extend(batch['labels'].tolist())

    return f1_score(true_list, pred_list, average='macro')

In [22]:
# Macro-F1 on the training split (the data the model was fitted on)
evaluate(train_dataset)

0.9965579316474241

In [23]:
# Macro-F1 on the test split, which is not used in the training loop
evaluate(test_dataset)

0.5683840601206132