In [3]:
from dataclasses import dataclass
from typing import List, Tuple

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from datasets import load_dataset
from transformers import RobertaTokenizer
import evaluate
from torch.optim import AdamW
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Global variables
# Number of rows taken from the END of the training split to form the validation set.
VAL_DATASET_LENGTH = 200
# When True, the train/val/test splits are each cut down to their first 100 rows
# so the rest of the notebook can be smoke-tested quickly.
USE_SMALL_DATASET = True
# Per-device batch size handed to TrainingArguments for both training and evaluation.
BATCH_SIZE = 32

In [5]:
# Load the StrategyQA train/test JSON files into a DatasetDict with "train" and "test" splits.
dataset = load_dataset(
    "json",
    data_files={
        "train": "../datasets/strategyqa_train_filtered.json",
        "test": "../datasets/strategyqa_test.json",
    },
)
print(dataset)

# Carve the validation split off the END of the training data: the last
# VAL_DATASET_LENGTH rows become validation, everything before them stays in train.
num_train_rows = len(dataset["train"])
split_index = num_train_rows - VAL_DATASET_LENGTH
train_dataset = dataset["train"].select(indices=range(split_index))
val_dataset = dataset["train"].select(indices=range(split_index, num_train_rows))
test_dataset = dataset["test"]

if USE_SMALL_DATASET:
    # Smoke-test mode: keep only the first SMALL_DATASET_SIZE rows of each split.
    # The size is clamped to the split length so that a split shorter than the
    # limit (e.g. a small VAL_DATASET_LENGTH) is kept whole instead of making
    # `select` fail on out-of-range indices.
    SMALL_DATASET_SIZE = 100
    train_dataset = train_dataset.select(range(min(SMALL_DATASET_SIZE, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(SMALL_DATASET_SIZE, len(val_dataset))))
    test_dataset = test_dataset.select(range(min(SMALL_DATASET_SIZE, len(test_dataset))))

# Peek at one raw example from each split (the test split carries no answer).
print(dataset['train'][0])
print(dataset['train'][0]['question'])
print(dataset['train'][0]['answer'])
print(dataset['test'][0])
print(dataset['test'][0]['question'])



DatasetDict({
    train: Dataset({
        features: ['qid', 'term', 'description', 'question', 'answer'],
        num_rows: 2821
    })
    test: Dataset({
        features: ['qid', 'term', 'description', 'question', 'answer'],
        num_rows: 490
    })
})
{'qid': '872', 'term': 'Swastika', 'description': 'a geometrical figure and an ancient religious icon in the cultures of Eurasia and 20th-century symbol of Nazism', 'question': 'Did the Hopi Indians use a symbol that was similar to the swastika?', 'answer': True}
Did the Hopi Indians use a symbol that was similar to the swastika?
True
{'qid': '564959490dd0b8316a88', 'term': None, 'description': None, 'question': 'can you use Microsoft Office without internet?', 'answer': None}
can you use Microsoft Office without internet?


In [6]:
# load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


# tokenize the dataset
def tokenize_function(batch, tokenizer=tokenizer, field_name="question", max_length=None):
    """Tokenize one text column of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping of column name to a list of values for the batch.
        tokenizer: Tokenizer used to encode the text; defaults to the
            module-level ``roberta-base`` tokenizer loaded above.
        field_name: Name of the text column to encode.
        max_length: Optional length to pad/truncate every sequence to. The
            default ``None`` keeps the previous behaviour, where the tokenizer
            falls back to its own predefined maximum length. Pass a smaller
            value to avoid padding short questions out to that maximum.

    Returns:
        Whatever the tokenizer returns for the batch (the encoded inputs that
        ``Dataset.map`` merges back into the dataset as new columns).
    """
    return tokenizer(
        batch[field_name],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    
def add_labels(tokenized_dataset):
    """Attach an integer ``labels`` field derived from the ``answer`` field.

    Despite the parameter name, this is called once per example (it is used
    with a non-batched ``map``), so ``tokenized_dataset`` is a single row.
    The row is modified in place and also returned.

    A truthy ``answer`` becomes label 1; anything falsy (including ``None``)
    becomes label 0. The ``answer`` key must be present.
    """
    is_positive = bool(tokenized_dataset["answer"])
    tokenized_dataset["labels"] = int(is_positive)
    return tokenized_dataset
# load training dataset


# Tokenize every split and attach integer labels.
# Each split goes through the same two passes, in train -> val -> test order:
# a batched tokenization pass, then a per-example labelling pass.
tokenized_datasets = {
    split_name: split.map(tokenize_function, batched=True).map(add_labels)
    for split_name, split in (
        ("train", train_dataset),
        ("val", val_dataset),
        ("test", test_dataset),
    )
}
# Sanity check: show the label assigned to the first training example.
print(tokenized_datasets["train"][0]["labels"])

Map: 100%|██████████| 100/100 [00:00<00:00, 4111.66 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 5262.81 examples/s]

1





In [7]:
# %pip install evaluate

In [8]:
# Report the PyTorch build and the CUDA devices visible to this kernel.
# ! nvidia-smi
# ! nvcc --version
import torch

print(torch.__version__)
print(torch.version.cuda)
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    # These two queries raise when no CUDA device is usable, so only run them
    # once availability has been confirmed.
    print(torch.cuda.get_device_name(0))
    print(torch.cuda.current_device())
else:
    print("CUDA is not available; skipping device name / current device queries.")

2.6.0+cu126
12.6
True
1
NVIDIA GeForce RTX 3060 Laptop GPU
0


In [9]:
! pip install torch torchvision torchaudio accelerate>=0.26.0




In [11]:
# def train_one_epoch(model: nn.Module, dataloader: DataLoader, optimizer: Optimizer, epoch: int):
#     model.train()

#     with tqdm(dataloader, desc=f"Train Ep {epoch}", total=len(dataloader)) as tq:
#         for batch in tq:
#             # TODO: retrieve the data from your batch and send it to the same device as your model (i.e., model.device).
#             # Hint: model.device should point to 'cuda' as you set it as such in the main function below.
#             #       However, please use `model.device` and don't hard code it to 'cuda' as the auto-grader will put the model on CPU.
#             # text_encoding = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}
#             input_ids = batch["text_encoding"]["input_ids"].to(model.device)
#             attention_mask = batch["text_encoding"]["attention_mask"].to(model.device)
#             label_encoding = batch["label_encoding"].to(model.device)

#             # TODO: Compute loss by running model with text_encoding and label_encoding.
#             output = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_encoding)
#             loss = output.loss

#             # TODO: compute gradients and update parameters using optimizer.
#             # Hint: you need three lines of code here!
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#             tq.set_postfix({"loss": loss.detach().item()}) # for printing better-looking progress bar

def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``. The
            logits are reduced with ``argmax`` over their last axis to obtain
            one predicted class per example.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the mean of the
        element-wise comparison between predictions and labels.
    """
    scores, gold = eval_pred
    # Highest-scoring class along the last axis is the predicted label.
    predicted = scores.argmax(axis=-1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}

In [12]:

# training
# Hyperparameters for the fine-tuning run; both are passed to TrainingArguments
# below so that editing them here actually changes the run.
learning_rate = 5e-5
num_train_epochs = 3


# Binary classifier (label 0 / label 1) on top of the pretrained roberta-base encoder.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Use the GPU when one is present, but fall back to CPU instead of crashing.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Built after the model has been moved so it references the parameters on the
# final device. It is handed to the Trainer below.
optimizer = AdamW(model.parameters(), lr=learning_rate)

training_args = TrainingArguments(
    "results",  # output directory for checkpoints
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_dir="logs",
    logging_steps=10,
)

# Kept available for ad-hoc use; the Trainer itself uses `compute_metrics`.
metric = evaluate.load("accuracy")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    # Must be a callable taking the evaluation predictions and returning a dict;
    # the `evaluate` metric object itself does not satisfy that contract.
    compute_metrics=compute_metrics,
    # Use the optimizer built above; `None` lets the Trainer create its
    # default learning-rate scheduler.
    optimizers=(optimizer, None),
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run this cell when the GPU runs out of memory.
# NOTE(review): as written, the only thing that executes is Python's garbage
# collector. The `torch.cuda.empty_cache()` call below is commented out, so no
# explicit CUDA cache release happens here — confirm whether that is intended.

# from numba import cuda
import gc
gc.collect()  # last expression of the cell, so its integer return value is displayed
# torch.cuda.empty_cache()

475

In [13]:

# NOTE(review): BATCH_SIZE is already set to 32 in the global-variables cell, and
# `training_args` read it when it was constructed. Reassigning it here does not
# affect the existing `training_args` / `trainer`; re-run the training-setup
# cell after changing this value.
BATCH_SIZE = 32

In [None]:
# Train the model: runs the training loop of the `trainer` object built in the
# training-setup cell (which must have been executed first).
trainer.train()

Step,Training Loss
