In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [6]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
train = '/content/drive/MyDrive/NLP-final-project/implementation/data/train.csv'
dev = '/content/drive/MyDrive/NLP-final-project/implementation/data/dev.csv'

In [15]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from ast import literal_eval
import json
import numpy as np


In [9]:
class FactCheckDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        tokens = literal_eval(row['tokens'])
        span_start = json.loads(row['span_start_index'])
        span_end = json.loads(row['span_end_index'])
        labels = ['O'] * len(tokens)
        for i in range(len(tokens)):
            for j in range(len(span_start)):
                if span_start[j] < i <= span_end[j]:
                    labels[i] = 'I'
                elif i in span_start:
                    labels[i] = 'B'
        encoding = self.tokenizer.encode_plus(
            ' '.join(tokens),
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        label_dict = {'O': 0, 'B': 1, 'I': 2}
        labels = [label_dict[label] for label in labels]
        labels += [0] * (self.max_len - len(labels))
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }


In [10]:
class FactCheckModel(nn.Module):
    def __init__(self, n_classes):
        super(FactCheckModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(256*2, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output, (hidden, _) = self.lstm(outputs.last_hidden_state)
        output = self.classifier(output)
        return output


In [11]:
def train_model(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=2)
        loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)


In [12]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=2)
            loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [16]:
# Load data
df_train = pd.read_csv(train)
df_val = pd.read_csv(dev)

# Create a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create instances of the dataset
train_dataset = FactCheckDataset(df_train, tokenizer, max_len=128)
val_dataset = FactCheckDataset(df_val, tokenizer, max_len=128)

# Create data loaders
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Create the model
model = FactCheckModel(n_classes=3)
model = model.to(device)

# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Train the model
for epoch in range(3):
    print(f'Epoch {epoch + 1}/3')
    print('-' * 10)
    train_acc, train_loss = train_model(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/3
----------
Train loss 0.17785227074036522 accuracy 118.42935142289875
Val   loss 0.13129891036078334 accuracy 120.59920634920634

Epoch 2/3
----------
Train loss 0.12061681157894551 accuracy 121.32445400397089
Val   loss 0.12303353908161323 accuracy 121.14021164021163

Epoch 3/3
----------
Train loss 0.09600228604382624 accuracy 122.98709463931172
Val   loss 0.12218256465469797 accuracy 121.33597883597884



In [17]:
def evaluate_model(model, data_loader, device):
    model = model.eval()
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=2)
            predicted_labels.extend(preds.view(-1).tolist())
            true_labels.extend(labels.view(-1).tolist())
    return predicted_labels, true_labels

predicted_labels, true_labels = evaluate_model(model, val_data_loader, device)

# Remove padding (label 0)
predicted_labels_no_pad = [pred for pred, true in zip(predicted_labels, true_labels) if true != 0]
true_labels_no_pad = [true for true in true_labels if true != 0]

# Compute metrics
report = classification_report(true_labels_no_pad, predicted_labels_no_pad, digits=4)
f1 = f1_score(true_labels_no_pad, predicted_labels_no_pad, average='micro')
precision = precision_score(true_labels_no_pad, predicted_labels_no_pad, average='micro')
recall = recall_score(true_labels_no_pad, predicted_labels_no_pad, average='micro')

# Print metrics
print(report)
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     0.8571    0.3789    0.5255       966
           2     0.9611    0.8005    0.8735      9384

    accuracy                         0.7612     10350
   macro avg     0.6061    0.3931    0.4663     10350
weighted avg     0.9514    0.7612    0.8410     10350

F1 Score: 0.7612
Precision: 0.7612
Recall: 0.7612


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
