# Installing Libraries

In [1]:
%pip install transformers
%pip install datasets
%pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# Step 1: Load the dataset

Libraries

In [2]:
# Step 1: Load the dataset
import torch
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


Load Data

In [3]:
# Load the Circa dataset
dataset = load_dataset("circa")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['context', 'question-X', 'canquestion-X', 'answer-Y', 'judgements', 'goldstandard1', 'goldstandard2'],
        num_rows: 34268
    })
})


# Step 2: Preprocess the data

Preprocess

In [4]:
train_data = dataset["train"].to_pandas()
train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)

Load Model

In [5]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
num_classes = 9
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing and Encoding

In [7]:
# Tokenize and encode dataset
def encode_dataset(data):
    questions = data["question-X"].tolist()
    answers = data["answer-Y"].tolist()
    encoded_data = tokenizer(questions, answers, truncation=True, padding=True)
    encoded_data["labels"] = data["goldstandard1"].tolist()
    return encoded_data

train_encoded_data = encode_dataset(train_data)

Filtering

In [8]:
# Filter out examples with the invalid target label -1
filtered_train_encoded_data = {
    key: [value[i] for i, label in enumerate(train_encoded_data["labels"]) if label != -1]
    if key != "labels"  # Exclude labels from filtering
    else [label for label in train_encoded_data["labels"] if label != -1]
    for key, value in train_encoded_data.items()
}
train_encoded_data["labels"] = [label if label in [0, 1, 2, 3, 4, 5, 6, 7] else -1 for label in train_encoded_data["labels"]]

# Step 3: Create DataLoader for training data

In [9]:
train_inputs = torch.tensor(filtered_train_encoded_data["input_ids"])
train_token_types = torch.tensor(filtered_train_encoded_data["token_type_ids"])
train_masks = torch.tensor(filtered_train_encoded_data["attention_mask"])
train_labels = torch.tensor(filtered_train_encoded_data["labels"])
train_dataset = TensorDataset(train_inputs, train_token_types, train_masks, train_labels)
batch_size = 16
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Step 4: Define optimizer and scheduler

In [None]:
#optimizer = AdamW(model.parameters(), lr=2e-5)
#num_epochs = 3
#total_steps = len(train_dataloader) * num_epochs
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [10]:
# Define hyperparameters
learning_rate = 2e-5  # Specified learning rate
num_epochs = 3
batch_size = 32  # Specified training batch size

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Step 5: Define loss function and device

In [11]:
criterion = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 6: Train the model

In [12]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "token_type_ids": batch[1], "attention_mask": batch[2]}
        targets = batch[3]
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

Epoch 1/3, Average Training Loss: 0.7320
Epoch 2/3, Average Training Loss: 0.4269
Epoch 3/3, Average Training Loss: 0.2867


# Step 7: Evaluate the model

In [13]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "token_type_ids": batch[1], "attention_mask": batch[2]}
        targets = batch[3]
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(targets.tolist())
    return classification_report(true_labels, predictions, output_dict=True)

# Encode the validation dataset
dev_encoded_data = encode_dataset(dev_data)

# Create DataLoader for validation data
dev_inputs = torch.tensor(dev_encoded_data["input_ids"])
dev_token_types = torch.tensor(dev_encoded_data["token_type_ids"])
dev_masks = torch.tensor(dev_encoded_data["attention_mask"])
dev_labels = torch.tensor(dev_encoded_data["labels"])
dev_dataset = TensorDataset(dev_inputs, dev_token_types, dev_masks, dev_labels)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

Evaluate the model

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate the model
report = evaluate_model(model, dev_dataloader, device)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Print the evaluation report

In [27]:

# Print the evaluation report in tabular form
print("Evaluation Report:")
for metric, value in report.items():
    print(f"{metric}: {value}")

Evaluation Report:
-1: {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 523.0}
0: {'precision': 0.8114457831325301, 'recall': 0.9147707979626486, 'f1-score': 0.8600159616919394, 'support': 2945.0}
1: {'precision': 0.7712283594394065, 'recall': 0.8610216290842154, 'f1-score': 0.8136551424222657, 'support': 2173.0}
2: {'precision': 0.5462962962962963, 'recall': 0.42142857142857143, 'f1-score': 0.47580645161290325, 'support': 140.0}
3: {'precision': 0.5306122448979592, 'recall': 0.4425531914893617, 'f1-score': 0.48259860788863107, 'support': 235.0}
4: {'precision': 0.35664335664335667, 'recall': 0.2361111111111111, 'f1-score': 0.2841225626740947, 'support': 216.0}
5: {'precision': 0.8558558558558559, 'recall': 0.9259259259259259, 'f1-score': 0.8895131086142322, 'support': 513.0}
6: {'precision': 0.839622641509434, 'recall': 0.967391304347826, 'f1-score': 0.898989898989899, 'support': 92.0}
7: {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 17.0}
accuracy: 0.7795

Extract accuracy from the report

In [28]:
accuracy = report["accuracy"] * 100
print(f"Test Accuracy: {accuracy:.4f} %")

Test Accuracy: 77.9545 %
