In [1]:
!pip install transformers datasets peft accelerate torch

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m968.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_

In [3]:

import torch
from transformers import BertTokenizerFast

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def preprocess_data(data):
    tokenized_data = []
    for item in data:
        # Tokenize the question and context together with offset mapping
        inputs = tokenizer(
            item['question'],
            item['context'],
            max_length=512,
            truncation=True,
            padding='max_length',
            return_offsets_mapping=True,  # This is crucial
            return_tensors='pt'
        )

        offset_mapping = inputs.pop('offset_mapping')  # Extract offset mapping
        input_ids = inputs['input_ids'].squeeze()  # Remove batch dimension

        # Convert character indices to token indices for the answer
        start_char = item['answer_start_index']
        end_char = item['answer_end_index']

        start_token_idx, end_token_idx = None, None

        for i, (start, end) in enumerate(offset_mapping.squeeze().tolist()):
            if start_char >= start and start_char < end:
                start_token_idx = i
            if end_char > start and end_char <= end:
                end_token_idx = i
                break  # Stop once the end position is found

        # Ensure valid token indices
        if start_token_idx is None or end_token_idx is None:
            continue  # Skip this example if indices are not found

        tokenized_data.append({
            'input_ids': input_ids,
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs['token_type_ids'].squeeze(),
            'start_positions': torch.tensor([start_token_idx]),
            'end_positions': torch.tensor([end_token_idx])
        })
    
    return tokenized_data

# Example usage:
import json

# Load dataset
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Load your dataset
data = load_dataset('qa_london_data2.json')

# Preprocess the data
tokenized_datasets = preprocess_data(data)
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

# Prepare model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training arguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Only training dataset
    tokenizer=tokenizer,
)

trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.3093


TrainOutput(global_step=585, training_loss=0.26500008636050754, metrics={'train_runtime': 120.4137, 'train_samples_per_second': 77.632, 'train_steps_per_second': 4.858, 'total_flos': 2442602081968128.0, 'train_loss': 0.26500008636050754, 'epoch': 3.0})

In [4]:
model.save_pretrained("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")


('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json',
 './fine_tuned_bert/tokenizer.json')

In [5]:
from transformers import pipeline

# Load the fine-tuned model
qa_pipeline = pipeline("question-answering", model="./fine_tuned_bert", tokenizer="./fine_tuned_bert")

# Test on a sample question
result = qa_pipeline({
    "question": "To which category does the Christmas Lights by Night Open-Top Bus Tour belong?",
    "context": "Christmas Lights by Night Open-Top Bus Tour is an activity of type guided tour. It lasts 1.5 hours..."
})

print(result)


Device set to use cuda:0


{'score': 0.9995142817497253, 'start': 67, 'end': 78, 'answer': 'guided tour'}


