In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt
import time

In [2]:
# Step 1: Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
print(f"dataset shape : {dataset.shape}")

dataset shape : {'test': (4358, 1), 'train': (36718, 1), 'validation': (3760, 1)}


In [3]:
# Step 2: Load pre-trained GPT-2 model and tokenizer 
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenizer padding
tokenizer.pad_token = tokenizer.eos_token



In [4]:
# Test generation before fine-tuning
def generate_text_before(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
prompt = "A PhD student is excited to join Huawei research team"
print("Before fine-tuning:")
print(generate_text_before(prompt))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Before fine-tuning:
A PhD student is excited to join Huawei research team and work with them on a new project.

"We are excited to be working with Huawei on a new project that will allow us to develop a new wireless technology that will enable us to deliver


In [6]:
# Step 3: Tokenize the text
def tokenize_function(examples):
    #1024 is set as the max_length to utilizes the full context window of GPT-2,
    #  which is better for understanding long sequences of text, however it uses much memory comparing to 512
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    #Use input_ids as labels for training
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [7]:
#For effeciency and simplicity the text columns is removed
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [8]:
# Custom callback to log losses
class LossLogger(TrainerCallback):
    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # Log training loss
            if 'loss' in logs:
                self.train_losses.append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is not None:
            # Log evaluation loss
            if 'eval_loss' in metrics:
                self.eval_losses.append(metrics['eval_loss'])

# Initialize the loss logger
loss_logger = LossLogger()

In [9]:
# Step 5: Training arguments for fine-tuning
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,#7
    logging_dir="./logs",
    logging_steps=50,
    eval_steps=50,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Enables mixed precision if GPU supports it
    save_steps=500
)



In [10]:
# Step 6: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [11]:
def print_gpu_memory_usage():
    if torch.cuda.is_available():
        print(f"Allocated: {torch.cuda.memory_allocated() / (1024 ** 2):.2f} MB")
        print(f"Cached: {torch.cuda.memory_reserved() / (1024 ** 2):.2f} MB")

In [None]:
# Step 7: Fine-tune the model

# Start timing
start_time = time.perf_counter()
print_gpu_memory_usage()
trainer.train()
print_gpu_memory_usage()
# End timing
end_time = time.perf_counter()

# Calculate training time
training_time = end_time - start_time
print(f"Training Time: {training_time:.2f} seconds")

In [None]:
# Access logged losses
train_losses = loss_logger.train_losses
eval_losses = loss_logger.eval_losses

# Plot the losses
plt.figure(figsize=(12, 6))
plt.plot(train_losses, label='Training Loss', color='blue')
plt.plot(range(len(eval_losses)), eval_losses, label='Evaluation Loss', color='orange', marker='o')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Evaluation Losses')
plt.legend()
plt.show()

In [None]:
# Save the fine-tuned model
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

In [17]:
# Load the fine-tuned GPT-2 model and tokenizer
fine_tuned_model_path = "./gpt2-finetuned"  # Path to your fine-tuned model

tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path)

In [18]:
# Test generation after fine-tuning
def generate_text_after(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=max_length, num_return_sequences=1, attention_mask=inputs["attention_mask"], eos_token_id=None)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print("After fine-tuning:")
prompt = "A PhD student is excited to join Huawei research team"
print(generate_text_after(prompt))