In [1]:
# importing necessary libraries
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load the dataset.
# For this exacise we will be using yoda sentences dataset from Huggingface.
dataset = load_dataset("dvgodoy/yoda_sentences")

# Viewing the dataset
dataset['train'][:10]

{'sentence': ['The birch canoe slid on the smooth planks.',
  'Glue the sheet to the dark blue background.',
  "It's easy to tell the depth of a well.",
  'These days a chicken leg is a rare dish.',
  'Rice is often served in round bowls.',
  'The juice of lemons makes fine punch.',
  'The box was thrown beside the parked truck.',
  'The hogs were fed chopped corn and garbage.',
  'Four hours of steady work faced us.',
  'Large size in stockings is hard to sell.'],
 'translation': ['On the smooth planks, the birch canoe slid.',
  'Glue the sheet to the dark blue background, you must.',
  'Easy it is, to tell the depth of a well.',
  'These days, a rare dish, a chicken leg is.',
  'In round bowls, rice often served is.',
  'Fine punch, the juice of lemons makes.',
  'Beside the parked truck, the box was thrown.',
  'Chopped corn and garbage, the hogs were fed.',
  'Faced us, four hours of steady work did.',
  'Hard to sell, large size in stockings is.'],
 'translation_extra': ['On the s

In [3]:
# We can not directly use this dataset for training. We need to format the datasets.
# we will process the dataset in this form "Sentence: [English Sentence] Translation: [Yoda Sentence]"
def format_yoda(example):
    return {"text": f"Sentence: {example['sentence']} Translation: {example['translation_extra']}"}

dataset = dataset.map(format_yoda)

dataset['train'][0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
 'text': 'Sentence: The birch canoe slid on the smooth planks. Translation: On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [4]:
# Splitting the dataset to train and evaluation set
train_eval_split = dataset["train"].train_test_split(test_size=0.1)
dataset = DatasetDict({
    'train': train_eval_split['train'],
    'eval': train_eval_split['test']
})


In [15]:
# Step 2: Load a small pre-trained LLM model and tokenizer
# Using 'distilgpt2' as a small model example
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [16]:
# # Let's try to translate the sentence using original model.
prompt = "Sentence: The Sky is clear, it's time to fly.\nTranslation:" # Include the start of the target sequence
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate text
# Adjust generation parameters as needed to control output style
output_sequences = model.generate(
    input_ids,
    max_length=input_ids.shape[1] + 30, # Generate up to 30 new tokens
    num_return_sequences=1,
    no_repeat_ngram_size=2,             # To avoid immediate repetition
    do_sample=True,                     # Use sampling for more diverse output
    top_k=50,                           # Consider top 50 tokens
    top_p=0.95,                         # Nucleus sampling
    temperature=0.8,                    # Controls randomness (slightly higher for more variation)
    pad_token_id=tokenizer.eos_token_id # Ensure generation stops at EOS token
)

generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
print("--- Generated Text ---")
print(generated_text)
print("----------------------")

--- Generated Text ---
Sentence: The Sky is clear, it's time to fly.
Translation: If you've already been in the game and can't get to the cockpit and get in touch with me, I'm trying to contact you. If
----------------------


In [6]:
# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [7]:
# Step 3: Preparing the dataset for training (Tokenization)
def tokenize_function(examples):
    # Tokenize the text, padding and truncation will be handled by the data collator
    return tokenizer(examples["text"])

# Applying tokenize_function and removing other collumns
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence", "translation", "translation_extra", "text"]
)

# Dataset after tokenization
tokenized_datasets['train'][0]

Map: 100%|██████████| 648/648 [00:00<00:00, 16368.11 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 9206.45 examples/s]


{'input_ids': [31837,
  594,
  25,
  5994,
  503,
  1115,
  8341,
  286,
  6266,
  13,
  33322,
  25,
  5994,
  503,
  1115,
  8341,
  286,
  6266,
  11,
  345,
  1276,
  13],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [8]:
# Data collator for causal language modeling
# This will handle padding and creating the labels (which are the input ids shifted)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
# Step 4: Set up the training arguments
output_dir = "./yoda_finetuned_model"
training_args = TrainingArguments(
    output_dir=output_dir,              # Location of Output Directory
    overwrite_output_dir=True,          # If directory already exists then overwrite
    num_train_epochs=5,                 # Number of time the model will be trained on entire data
    per_device_train_batch_size=4,      # Number of example will be trained at a time
    per_device_eval_batch_size=4,       # Number of example will be evaluated at a time
    eval_strategy="epoch",              # Changed from evaluation_strategy
    save_strategy="epoch",              # Changed from save_strategy
    logging_dir=f"{output_dir}/logs",   # Loggs will be saved here
    logging_steps=50,                   # Log saving frequently
    learning_rate=2e-5,                 
    weight_decay=0.01,
    load_best_model_at_end=True,        # Load the best model based on evaluation loss
    metric_for_best_model="eval_loss",
)


In [10]:
# Step 5: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
)

In [11]:
# Step 6: Start the training process
print("Starting training...")
trainer.train()
print("Training finished.")


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,2.1366,1.926351
2,1.8665,1.851621
3,1.6987,1.8397
4,1.6167,1.838783
5,1.6001,1.836944


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Training finished.


In [13]:
# Step 7: Evaluate the fine-tuned model
print("Evaluating model...")
eval_results = trainer.evaluate(eval_dataset=tokenized_datasets["eval"]) # Explicitly pass eval_dataset
print(f"Evaluation results: {eval_results}")
if 'eval_loss' in eval_results:
    perplexity = math.exp(eval_results["eval_loss"])
    print(f"Perplexity: {perplexity}")


Evaluating model...


Evaluation results: {'eval_loss': 1.8369437456130981, 'eval_runtime': 0.6711, 'eval_samples_per_second': 107.291, 'eval_steps_per_second': 26.823, 'epoch': 5.0}
Perplexity: 6.277323815417289


In [14]:
# Example of generating text with the fine-tuned model
print("Generating example text...")
model.eval() # Set model to evaluation mode. This is a standard practice in pytorch. Here certain layers of the model freezes

# Testing for given sentence
prompt = "Sentence: The Sky is clear, it's time to fly.\nTranslation:" # Include the start of the target sequence
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate text
# Adjust generation parameters as needed to control output style
output_sequences = model.generate(
    input_ids,
    max_length=input_ids.shape[1] + 30, # Generate up to 30 new tokens
    num_return_sequences=1,
    no_repeat_ngram_size=2,             # To avoid immediate repetition
    do_sample=True,                     # Use sampling for more diverse output
    top_k=50,                           # Consider top 50 tokens
    top_p=0.95,                         # Nucleus sampling
    temperature=0.8,                    # Controls randomness (slightly higher for more variation)
    pad_token_id=tokenizer.eos_token_id # Ensure generation stops at EOS token
)

generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
print("--- Generated Text ---")
print(generated_text)
print("----------------------")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating example text...
--- Generated Text ---
Sentence: The Sky is clear, it's time to fly.
Translation: To fly, the Sky must. Yes, hrrmmm. Yrsssss. Time to soar, we must, and there is. H
----------------------


In [None]:
# Let's try to translate the sentence using original model.