In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
import re
import os

  from .autonotebook import tqdm as notebook_tqdm


### Fine Tuning with Full Text

In [2]:
# Location of the raw training text.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable data directory so the notebook runs on other machines.
traing_path = '/Users/aaroncui/Desktop/UCL/NLP/NLP_project/data/hp1.txt'

# Decode explicitly as UTF-8: the text contains curly quotes and soft hyphens,
# which a non-UTF-8 platform default encoding would mis-decode.
with open(traing_path, 'r', encoding='utf-8') as f:
    training_text = f.read()

# Collapse every run of newlines into a single newline and trim the ends.
text_data = re.sub(r'\n+', '\n', training_text).strip()
print(text_data[:1000])

Chapter 1
The Boy Who Lived
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be in­volved in anything strange or mysterious, because they just didn’t hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, al­though he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy any­where.
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but th

In [3]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer object, forwarded to ``TextDataset``.
        block_size: Block size forwarded to ``TextDataset`` (default 128).

    Returns:
        The newly constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer object, forwarded to the collator.
        mlm: Masked-language-modeling flag forwarded to the collator
            (default ``False``).

    Returns:
        The newly constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language model on a plain-text file.

    Args:
        train_file_path: Path of the text file used to build the training dataset.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            ``Trainer`` output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. (Previously this
            argument was accepted but silently ignored.)

    Returns:
        None. The function trains the model and then calls
        ``trainer.save_model()``.
    """
    # NOTE(review): `load_dataset` / `load_data_collator` are looked up by name
    # at call time. A later `from datasets import load_dataset` in this
    # notebook rebinds `load_dataset`, so call this function before that
    # import runs (or rename one of the two).
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir holds both.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # This writes the *pre-trained* weights to output_dir before any training.
    # trainer.save_model() runs after training -- presumably writing to the
    # same directory; confirm before relying on a partially finished run.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # fix: honour the caller's checkpoint interval
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

**Note:** Fine-tuning on the full text did not improve performance, so the training call below is commented out. Do not run this block.

In [1]:
# Intentionally disabled: the note above this cell says fine-tuning on the full
# text did not improve performance. `pass` keeps the cell a valid no-op.
# To reproduce the experiment, uncomment everything below; it needs
# `traing_path` and `train` from the earlier cells.
pass
# train_file_path = traing_path
# model_name = 'gpt2'
# output_dir = './models'
# overwrite_output_dir = False
# per_device_train_batch_size = 8
# num_train_epochs = 50.0
# save_steps = 50000



# # Train
# train(
#     train_file_path=train_file_path,
#     model_name=model_name,
#     output_dir=output_dir,
#     overwrite_output_dir=overwrite_output_dir,
#     per_device_train_batch_size=per_device_train_batch_size,
#     num_train_epochs=num_train_epochs,
#     save_steps=save_steps
# )

### Fine-Tuning with a QA Set: Start with a Small Number of Question–Answer Pairs

1. Build the dataset

In [None]:
import json
import torch

# Location of the source question/answer data.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
qa_data_path = '/Users/aaroncui/Desktop/UCL/NLP/NLP_project/data/Harry_Potter_Data_updated.json'

# Read the JSON as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(qa_data_path, encoding='utf-8') as f:
    qa_data = json.load(f)

In [28]:
# Sampling configuration for the small QA fine-tuning set.
QA_SEED = 46          # fixed seed so the same pairs are drawn on every run
QA_POOL_SIZE = 120    # pairs are drawn from the first 120 entries of qa_data
QA_TRAIN_PAIRS = 20   # number of question/answer pairs to draw

torch.manual_seed(QA_SEED)

# 2. Randomly sample question/answer pairs.
# Clamp the pool to the dataset size so a dataset with fewer than
# QA_POOL_SIZE entries cannot cause an IndexError below. With at least
# QA_POOL_SIZE entries the sampled indices are unchanged.
qa_pool_size = min(QA_POOL_SIZE, len(qa_data))
question_number = torch.randperm(qa_pool_size)[:QA_TRAIN_PAIRS].tolist()


def _build_qa_example(entry):
    """Turn one raw QA entry into a {'question', 'answer'} training record.

    Reads the 'content', 'question', 'correct_answer_label' and
    'correct_answer' keys of ``entry``.
    """
    question = (
        "You are an assistant with expert knowledge of the Harry Potter series. "
        "Based on the following passage, answer the question concisely in one sentence.\n\n"
        "Passage:\n" + entry['content'] + "\n\n"
        "Question: " + entry['question'] + "\n"
        "Answer:"
    )
    answer = (
        'Correct option: ' + entry['correct_answer_label'] + ' ' + entry['correct_answer'] + '. '
        + 'Reference text: ' + entry['content']
    )
    return {'question': question, 'answer': answer}


# NOTE(review): each 'question' already ends with "Answer:" and embeds
# "Question: ..."; the fine-tuning cell later wraps it again in
# "Question: {q}\nAnswer: {a}". Confirm the doubled markers are intended.
train_qa = {'data': [_build_qa_example(qa_data[i]) for i in question_number]}

output_file = '/Users/aaroncui/Desktop/UCL/NLP/NLP_project/data/Harry_Potter_qa_fine_tuning.json'

with open(output_file, "w", encoding='utf-8') as out_f:
    json.dump(train_qa, out_f, indent=4)




In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline

# Tokenizer for the base 'gpt2' checkpoint; the end-of-sequence token doubles
# as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Load the records stored under the top-level "data" field of the JSON file.
# NOTE(review): `load_dataset` here is the one imported from `datasets` in this
# cell; it replaces the `load_dataset` helper defined earlier in the notebook.
qa_json_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Harry_Potter_qa_fine_tuning.json'
dataset = load_dataset('json', data_files=qa_json_path, field='data')

def format_and_tokenize(batch):
    """Format a batch of QA records as prompt text and tokenize it.

    Args:
        batch: Mapping with parallel sequences under the "question" and
            "answer" keys.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the formatted
        texts, called with truncation and padding to a fixed length of 512.
    """
    # NOTE(review): the "question" values written earlier in this notebook
    # already contain "Question: ..." and end with "Answer:", so these markers
    # appear twice in the training text -- confirm that is intended.
    texts = [
        f"Question: {q}\nAnswer: {a}"
        for q, a in zip(batch["question"], batch["answer"])
    ]
    # The debug print of every batch was removed: it flooded the cell output
    # with the whole dataset.
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)
    
import torch  # local to this cell: needed for the CUDA availability check below

# Single source of truth for where checkpoints and the final model are written.
MODEL_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/models/custom_q_n_a2'

tokenized_dataset = dataset.map(format_and_tokenize, batched=True)

# ---------------------------
# 3. Fine-Tune GPT-2
# ---------------------------
# Load the GPT-2 model with a language modeling head.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Resize the model embeddings in case new tokens have been added.
model.resize_token_embeddings(len(tokenizer))

# Set up training arguments.
# - No evaluation argument is passed: evaluation is off by default, and the
#   keyword for it was renamed across transformers releases, so omitting it
#   keeps this cell working on both old and new versions.
# - Mixed precision is enabled only when a CUDA device is present, because
#   requesting fp16 on a CPU-only machine raises an error.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    num_train_epochs=3,                    # Adjust the number of epochs as needed
    per_device_train_batch_size=2,         # Adjust based on your GPU memory
    logging_steps=100,
    save_steps=500,
    fp16=torch.cuda.is_available(),
)

# Collator for causal language modeling (mlm=False). The inputs are already
# padded to a fixed length by format_and_tokenize, so no dynamic padding
# happens here.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],  # Adjust if your dataset is structured differently
    data_collator=data_collator
)

# Start fine-tuning.
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be reloaded
# together from MODEL_DIR.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# ---------------------------
# 4. Generate Answers with the Fine-Tuned Model
# ---------------------------
# Directory holding the saved fine-tuned model and tokenizer.
MODEL_DIR = '/content/drive/MyDrive/Colab Notebooks/NLP/models/custom_q_n_a2'

# Create a text generation pipeline using the saved model and tokenizer.
text_generator = pipeline('text-generation', model=MODEL_DIR, tokenizer=MODEL_DIR)

# Define a sample question prompt using the "Question: ...\nAnswer:" frame.
# NOTE(review): the training records also embed an instruction and a passage
# inside the question, which this prompt does not -- confirm the mismatch is
# acceptable.
sample_question = "Who is the director of Grunnings?"
prompt = f"Question: {sample_question}\nAnswer:"

# Generate the answer (sampling options are left disabled).
# `early_stopping` was removed: it only affects beam search, and no beams are
# requested here, so it had no effect on the output.
output = text_generator(
    prompt,
    max_length=200,
    #do_sample=True,         # Enable sampling for more diverse output.
    #top_k=50,               # Consider the top 50 tokens at each step.
    #top_p=0.90,             # Nucleus sampling: consider tokens with cumulative probability of 90%.
    repetition_penalty=1.0,  # 1.0 is neutral (no penalty); raise above 1.0 to discourage repeats.
    num_return_sequences=1
)

print(output[0]['generated_text'])