In [1]:
pip install transformers datasets tqdm torch


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
print(torch.__version__)

2.5.1+cu124


In [3]:
import datasets
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import json


In [4]:
import json

# Open and read the JSON file
# file available here:
# https://www.dropbox.com/scl/fi/neput5su6btwxbsrm7dd1/qa_dataset_fromabstracts.json?rlkey=y7y2c3gg389rwadl2r1466vt9&dl=0

with open('qa_dataset_fromabstracts.json', 'r') as file:
    data = json.load(file)


from sklearn.model_selection import train_test_split

# Split the data into train, test, and validation sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)  # 80% train
test_data, val_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 10% test, 10% validation


In [5]:
from sklearn.model_selection import train_test_split

# Split the data into train, test, and validation sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)  # 80% train
test_data, val_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 10% test, 10% validation

# Save the splits back to JSON files (optional)
#with open("train_data.json", "w") as file:
#    json.dump(train_data, file, indent=4)
#
#with open("test_data.json", "w") as file:
#    json.dump(test_data, file, indent=4)
#
#with open("val_data.json", "w") as file:
#    json.dump(val_data, file, indent=4)


In [6]:
len(train_data)

233901

In [7]:
# Convert the list of dictionaries into a dictionary of lists
formatted_train_data = {
    "context": [item["context"] for item in train_data],
    "question": [item["question"] for item in train_data],
    "answers": [item["answer"] for item in train_data],
    "start_positions": [item["start_positions"] for item in train_data],
    "end_positions": [item["end_positions"] for item in train_data],
}

# Create a Hugging Face dataset
train_dataset = Dataset.from_dict(formatted_train_data)

In [8]:
# same for test dataset
formatted_test_data = {
    "context": [item["context"] for item in test_data],
    "question": [item["question"] for item in test_data],
    "answers": [item["answer"] for item in test_data],
    "start_positions": [item["start_positions"] for item in test_data],
    "end_positions": [item["end_positions"] for item in test_data],
}

# Create a Hugging Face dataset
test_dataset = Dataset.from_dict(formatted_test_data)

In [9]:
import torch
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


# Load a pre-trained model
model_name = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to GPU (optional, Trainer handles this automatically if GPU is available)
model = model.to(device)


cuda


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer

def compute_loss(model, inputs, return_outputs=False):
    # Forward pass
    outputs = model(**inputs)
    start_positions = inputs.get('start_positions')
    end_positions = inputs.get('end_positions')
    
    # Compute loss
    start_loss = outputs.loss[0]  # Start token loss
    end_loss = outputs.loss[1]    # End token loss
    
    # You can log both losses if needed
    total_loss = start_loss + end_loss
    
    if return_outputs:
        return total_loss, outputs
    return total_loss


In [11]:
# Tokenize the dataset
def preprocess_function(examples):
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
    )
    tokenized_inputs["start_positions"] = examples["start_positions"]
    tokenized_inputs["end_positions"] = examples["end_positions"]
    return tokenized_inputs

tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True)
tokenized_test_datasets = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/233901 [00:00<?, ? examples/s]

Map:   0%|          | 0/29238 [00:00<?, ? examples/s]

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/storage/homefs/alibert/CAS_NLP/project_week4/results_fine_tuning/",          # Output directory
    eval_strategy="steps",    # Evaluate every epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=32, # Batch size per GPU
    per_device_eval_batch_size=32,  # Evaluation batch size
    num_train_epochs=1,             # Number of epochs
    weight_decay=0.01,              # Weight decay for optimizer
    logging_dir="/storage/homefs/alibert/CAS_NLP/project_week4/logs",           # Logging directory
    logging_steps=1000,               # Log every 10 steps
    save_steps=1000,                    # Save weights every 50 steps
    save_total_limit=100,               # Keep only the last 2 checkpoints
    save_strategy="steps",          # Save checkpoint every epoch
    load_best_model_at_end=True,    # Load the best model after training
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
    disable_tqdm=False,                      # Ensure tqdm is enabled
    report_to='none'                # no link to wandb
)


In [13]:
from transformers import TrainerCallback

class LossCallback(TrainerCallback):
    def __init__(self):
        self.training_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.training_losses.append(logs["loss"])

loss_callback = LossCallback()


In [14]:
#import logging
#from transformers import TrainingArguments

# Set logging level to INFO to display logs in Jupyter
#logging.basicConfig(level=logging.INFO)


# Define training arguments
training_args = TrainingArguments(
    output_dir="/storage/homefs/alibert/CAS_NLP/project_week4/results_fine_tuning/",          # Output directory
    eval_strategy="steps",    # Evaluate every epoch
    eval_steps = 1000,
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=64, # Batch size per GPU
    per_device_eval_batch_size=64,  # Evaluation batch size
    num_train_epochs=20,             # Number of epochs
    weight_decay=0.01,              # Weight decay for optimizer
    logging_dir="/storage/homefs/alibert/CAS_NLP/project_week4/logs",           # Logging directory
    logging_steps=1,                # Log every step
    #log_level="info",               # Ensure Trainer logs are visible
    save_steps=1000,                    # Save weights every 50 steps
    save_total_limit=100,               # Keep only the last 2 checkpoints
    save_strategy="steps",          # Save checkpoint every epoch
    load_best_model_at_end=True,    # Load the best model after training
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
    disable_tqdm=False,                      # Ensure tqdm is enabled
    report_to='none'                # no link to wandb
)



In [15]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset = tokenized_test_datasets,
    callbacks=[loss_callback]
)



In [16]:
# Fine-tune the model
trainer.train()

training_losses = loss_callback.training_losses

Step,Training Loss,Validation Loss
1000,4.7762,4.821261
2000,4.6165,4.392002
3000,4.3789,4.152595
4000,3.8391,3.995791
5000,3.9836,3.821716
6000,3.7855,3.722317
7000,4.1585,3.656851
8000,3.5229,3.62869
9000,3.4287,3.540486
10000,3.4032,3.512425


In [17]:
import numpy as np
np.save("training_losses.npy", np.array(loss_callback.training_losses))


In [18]:
# Save the best model in a special directory
output_dir_best="/storage/homefs/alibert/CAS_NLP/project_week4/results_fine_tuning_best/"
trainer.model.save_pretrained(output_dir_best)
trainer.tokenizer.save_pretrained(output_dir_best)  # Optional: Save tokenizer
print(f"Best model saved to {output_dir_best}")

AttributeError: 'NoneType' object has no attribute 'save_pretrained'