In [1]:
# Fine-tuning LLM

In [2]:
! pip install transformers
! pip install dataset
! pip install sentencepiece
! pip install accelerate

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.52-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: banal, sqlalchemy, Mako, alembic, dataset

In [3]:
# Load a LLaMA2 model

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# preprare sample data

In [6]:
sample_data = [
    {"input": "The capital of France is", "output": "Pairs"},
    {"input": "The tallest mountain in the world is", "output": "Mount Everest"}
]

In [7]:
# Generate more samples to total 100

In [11]:
for i in range(98):
  sample_data.append({"input": f"Sample input text {i}", "output": f"Sample output text {i}" })

In [13]:
# Convert sample data to a format suitable for fine-tuning
train_texts = [sample["input"] + " " + sample["output"] for sample in sample_data]

In [None]:
train_texts

In [15]:
# Tokenize Data

In [26]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model.resize_token_embeddings(len(tokenizer))

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)

In [27]:
# Prepare Dataset for Fine-Tuning

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = TextDataset(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


In [29]:
# Fine-Tune the Model

In [36]:
from transformers import AdamW

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set the model in training mode
model.train()

# Training loop
for epoch in range(10):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 2.367539405822754
Epoch: 0, Loss: 1.7704092264175415
Epoch: 0, Loss: 0.8058674335479736
Epoch: 0, Loss: 1.0801846981048584
Epoch: 0, Loss: 0.8340173363685608
Epoch: 0, Loss: 1.0729953050613403
Epoch: 0, Loss: 0.9648277759552002
Epoch: 0, Loss: 1.4708751440048218
Epoch: 0, Loss: 0.851088285446167
Epoch: 0, Loss: 1.0633578300476074
Epoch: 0, Loss: 0.6558752655982971
Epoch: 0, Loss: 0.781147301197052
Epoch: 0, Loss: 0.609009861946106
Epoch: 1, Loss: 1.4753180742263794
Epoch: 1, Loss: 0.7105396389961243
Epoch: 1, Loss: 1.1886485815048218
Epoch: 1, Loss: 0.7345221638679504
Epoch: 1, Loss: 1.0464305877685547
Epoch: 1, Loss: 0.8424046039581299
Epoch: 1, Loss: 0.7358371615409851
Epoch: 1, Loss: 0.7263699173927307
Epoch: 1, Loss: 0.7732903957366943
Epoch: 1, Loss: 1.0020477771759033
Epoch: 1, Loss: 0.6041343808174133
Epoch: 1, Loss: 1.169452428817749
Epoch: 1, Loss: 0.6309430599212646
Epoch: 2, Loss: 1.4377204179763794
Epoch: 2, Loss: 0.9142189025878906
Epoch: 2, Loss: 0.8869530

In [37]:
# Evaluate the Fine-Tuned Model
model.eval()

# Example input
input_text = "The capital of France is"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

In [38]:
# Generate output
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=20)

output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The capital of France is Pairs 39 Sample output 39
