## LLM Finetuning

In [1]:
pip install transformers datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
!export WANDB_MODE=disabled


In [3]:
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import os
import wandb
os.environ["WANDB_MODE"] = "disabled"


# Load the GPT-2 tokenizer and model (small version: 'gpt2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:

from datasets import Dataset
# Sample travel-related data
data = [
    {"text": "Paris is a beautiful city known for its iconic Eiffel Tower."},
    {"text": "A trip to Japan offers a perfect mix of culture, technology, and food."},
    {"text": "Bali is an island paradise in Indonesia, famous for its beaches and temples."},
    {"text": "Rome offers rich history, with ancient landmarks like the Colosseum."},
    {"text": "New York City is a bustling metropolis known for its skyscrapers and diverse culture."}
]

# Load dataset into Hugging Face's Dataset format
train_dataset = Dataset.from_dict({"text": [item["text"] for item in data]})

# Tokenize the dataset
def tokenize_function(examples):
    # Add the labels as the input_ids (shifted for autoregressive language modeling)
    # GPT-2 expects labels as input_ids for loss calculation
    encodings = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=64)
    encodings['labels'] = encodings['input_ids']  # The labels should be the same as input_ids
    return encodings

tokenized_datasets = train_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [5]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",            # output directory for model predictions and checkpoints
    evaluation_strategy="no",          # Disable evaluation (no eval dataset)
    logging_strategy="no",             # Disable logging
    learning_rate=5e-5,                # set learning rate
    per_device_train_batch_size=2,     # batch size per device during training
    num_train_epochs=15,                # number of training epochs
    weight_decay=0.01,                 # strength of weight decay
    save_steps=100,                    # save model every 100 steps
    save_total_limit=2,                # limit the number of saved checkpoints
    fp16=True,                        # don't use fp16 (since we don't have a GPU)
    logging_dir=None,                 # No logging directory will be created
    report_to=None,                   # Disable W&B logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)



In [6]:
# Train the model
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=45, training_loss=2.063592698838976, metrics={'train_runtime': 15.1988, 'train_samples_per_second': 4.935, 'train_steps_per_second': 2.961, 'total_flos': 2449612800000.0, 'train_loss': 2.063592698838976, 'epoch': 15.0})

In [7]:
# Save the fine-tuned model
model.save_pretrained("./model/fine_tuned_travel_gpt2")
tokenizer.save_pretrained("./model/fine_tuned_travel_gpt2")

print("Fine-tuning completed and model saved!")


Fine-tuning completed and model saved!


In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./model/fine_tuned_travel_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./model/fine_tuned_travel_gpt2")

# Set the model to evaluation mode
model.eval()

# Example prompt (You can change this to any travel-related prompt)
prompt = "The best places to visit in Europe are"

# Encode the prompt text
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = model.generate(
    input_ids,
    max_length=100,       # maximum length of the generated text
    num_beams=5,          # number of beams for beam search (higher = more diverse)
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
    temperature=0.7,      # Controls randomness in predictions (lower = more focused)
    top_k=50,             # Limits the number of highest probability tokens to consider
    top_p=0.95,           # Uses nucleus sampling (top-p sampling)
    do_sample=True,       # Whether to use sampling or greedy decoding
    early_stopping=True   # Stop early when an end token is reached
)

# Decode the generated output back to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Prompt:", prompt)
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: The best places to visit in Europe are
Generated Text: The best places to visit in Europe are cities like Barcelona, Milan, and Rome.


In [9]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 model and tokenizer (before fine-tuning)
pretrained_model = GPT2LMHeadModel.from_pretrained("gpt2")
pretrained_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the fine-tuned GPT-2 model and tokenizer (after fine-tuning)
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./model/fine_tuned_travel_gpt2")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./model/fine_tuned_travel_gpt2")

# Set the models to evaluation mode
pretrained_model.eval()
fine_tuned_model.eval()

# Example prompt (You can change this to any travel-related prompt)
prompt = "The best places to visit in Europe are"

# Function to generate text from a model
def generate_text(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=100,       # maximum length of the generated text
        num_beams=5,          # number of beams for beam search (higher = more diverse)
        no_repeat_ngram_size=2,  # Prevent repeating n-grams
        temperature=0.7,      # Controls randomness in predictions (lower = more focused)
        top_k=50,             # Limits the number of highest probability tokens to consider
        top_p=0.95,           # Uses nucleus sampling (top-p sampling)
        do_sample=True,       # Whether to use sampling or greedy decoding
        early_stopping=True   # Stop early when an end token is reached
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate text using the pre-trained model (before fine-tuning)
pretrained_generated_text = generate_text(pretrained_model, pretrained_tokenizer, prompt)

# Generate text using the fine-tuned model (after fine-tuning)
fine_tuned_generated_text = generate_text(fine_tuned_model, fine_tuned_tokenizer, prompt)

# Print the comparison
print("Prompt:", prompt)
print("\nPre-trained GPT-2 Model (Before Fine-tuning):")
print(pretrained_generated_text)

print("\nFine-tuned GPT-2 Model (After Fine-tuning):")
print(fine_tuned_generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: The best places to visit in Europe are

Pre-trained GPT-2 Model (Before Fine-tuning):
The best places to visit in Europe are in the United States, Canada, Australia, New Zealand, and South Africa.

If you're looking for a place to stay, check out our list of the best European vacation destinations.

Fine-tuned GPT-2 Model (After Fine-tuning):
The best places to visit in Europe are cities like Paris, Milan, and Rome.
