<a href="https://colab.research.google.com/github/swati-git/FineTuneLLM/blob/main/FineTuning_a_LLM_LIMA_CPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers==4.57.3

In [2]:
!pip install -q torch==2.9.0

In [3]:
from transformers import AutoConfig

# Load the configuration of facebook/opt-1.3b and print the dtype it declares.
config = AutoConfig.from_pretrained("facebook/opt-1.3b")
print(config.dtype)

torch.float16


In [4]:
# Print the key architecture settings stored on `config`.
# The trailing comments record the values observed for facebook/opt-1.3b.
print(f"Model name: {config.model_type}")
print(f"Hidden size: {config.hidden_size}")           # 2048
print(f"Number of layers: {config.num_hidden_layers}") # 24
print(f"Vocabulary size: {config.vocab_size}")        # 50272
print(f"Max sequence length: {config.max_position_embeddings}") # 2048

Model name: opt
Hidden size: 2048
Number of layers: 24
Vocabulary size: 50272
Max sequence length: 2048


In [5]:
from transformers import AutoModelForCausalLM
import torch


# Load the weights in bfloat16. `torch_dtype` is deprecated in this
# transformers release; `dtype` is the supported keyword.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype=torch.bfloat16)

`torch_dtype` is deprecated! Use `dtype` instead!


In [6]:
def get_model_size(model):
    """
    Print parameter counts, the in-memory size of the weights and the model dtype.

    The size is computed from each parameter's own element size, so it is
    correct for any dtype (float16, bfloat16, float32, int8, ...) and for
    models whose parameters do not all share a single dtype.

    Args:
        model: object exposing ``parameters()`` (yielding tensors) and a
            ``dtype`` attribute.

    Returns:
        None. The summary is written to stdout.
    """
    # Materialise once so the parameters are not re-iterated for every total.
    params = list(model.parameters())
    total_params = sum(p.numel() for p in params)
    trainable_params = sum(p.numel() for p in params if p.requires_grad)

    # Calculate memory (in GB, where 1 GB = 1024**3 bytes).
    # element_size() is the number of bytes one element of that tensor occupies.
    total_bytes = sum(p.numel() * p.element_size() for p in params)
    memory_gb = total_bytes / (1024**3)

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size in memory: {memory_gb:.2f} GB")
    print(f"Data type: {model.dtype}")



In [7]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Prints the trainable count, the total count and the trainable share
    (as a percentage) on a single line.
    """
    # One (element count, requires_grad) pair per named parameter.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
# Count trainable vs. total parameters of the loaded model.
print_trainable_parameters(model)

trainable params: 1315758080 || all params: 1315758080 || trainable%: 100.0


In [9]:
# Summarise parameter counts, weight memory and dtype of the loaded model.
get_model_size(model)

Total parameters: 1,315,758,080
Trainable parameters: 1,315,758,080
Model size in memory: 2.45 GB
Data type: torch.bfloat16


In [10]:
# Rule of thumb: training needs roughly 3-4x the model's weight memory (gradients, optimizer states, etc.)
# ~2.45 GiB of bf16 weights (≈2.6 GB in decimal units) → need ~8-10 GB of GPU memory for training

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

# ===== CHECK THESE =====
print(f"Vocab size (tokenizer): {len(tokenizer)}")
print(f"Vocab size (model): {model.config.vocab_size}")

# The two sizes are not expected to be equal for this checkpoint: the
# tokenizer reports 50265 entries while the model config declares 50272.
# What must hold is that the tokenizer is not larger than the model's
# declared vocabulary, so every token id has a slot in the model.
assert len(tokenizer) <= model.config.vocab_size, (
    f"Tokenizer has {len(tokenizer)} tokens but the model vocabulary "
    f"only has {model.config.vocab_size} entries"
)

# Check special tokens
print(f"Padding token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"BOS token: {tokenizer.bos_token}")

# Test tokenization
sample = "Write a product description for headphones"
tokens = tokenizer.encode(sample)
print(f"Sample tokenization: {tokens}")
print(f"Number of tokens: {len(tokens)}")

Vocab size (tokenizer): 50265
Vocab size (model): 50272
Padding token: <pad>
EOS token: </s>
BOS token: </s>
Sample tokenization: [2, 45714, 10, 1152, 8194, 13, 15684]
Number of tokens: 7


In [12]:
!pip install -q deeplake==3.7.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.0.0 requires dill<0.3.9,>=0.3.0, but you have dill 0.4.0 which is incompatible.
datasets 4.0.0 requires multiprocess<0.70.17, but you have multiprocess 0.70.18 which is incompatible.[0m[31m
[0m

In [13]:
import deeplake

# Connect to the training and testing datasets.
# Both are addressed by their hub:// path; the recorded output shows they
# open in read-only mode.
ds = deeplake.load('hub://genai360/GAIR-lima-train-set')
ds_test = deeplake.load('hub://genai360/GAIR-lima-test-set')

/

Opening dataset in read-only mode as you don't have write permissions.



-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/genai360/GAIR-lima-train-set



-

hub://genai360/GAIR-lima-train-set loaded successfully.



/

Opening dataset in read-only mode as you don't have write permissions.


|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/genai360/GAIR-lima-test-set



\

hub://genai360/GAIR-lima-test-set loaded successfully.



 

In [14]:
# Display the training dataset summary (path, read-only flag and tensor names).
ds

Dataset(path='hub://genai360/GAIR-lima-train-set', read_only=True, tensors=['answer', 'embeddings', 'question', 'source'])

In [15]:
!pip install pipdeptree



In [16]:
# Diagnostic: show the dependency tree for the transformers package.
!pipdeptree -p transformers

"/usr/lib/python3/dist-packages"
  cryptography                     3.4.8            (using 43.0.3, "/usr/local/lib/python3.12/dist-packages")
  keyring                          23.5.0           (using 25.7.0, "/usr/local/lib/python3.12/dist-packages")
  more-itertools                   8.10.0           (using 10.8.0, "/usr/local/lib/python3.12/dist-packages")
  importlib-metadata               4.6.4            (using 8.7.0, "/usr/local/lib/python3.12/dist-packages")
  oauthlib                         3.2.0            (using 3.3.1, "/usr/local/lib/python3.12/dist-packages")
  PyGObject                        3.42.1           (using 3.48.2, "/usr/local/lib/python3.12/dist-packages")
  python-apt                       2.4.0+ubuntu4.1  (using 0.0.0, "/usr/local/lib/python3.12/dist-packages")
  pyparsing                        2.4.7            (using 3.2.5, "/usr/local/lib/python3.12/dist-packages")
  SecretStorage                    3.3.1            (using 3.5.0, "/usr/local/lib/python3.1

In [17]:
# Diagnostic: list the packages installed in this environment with their versions.
!pip list

Package                                  Version
---------------------------------------- --------------------
absl-py                                  1.4.0
accelerate                               1.12.0
access                                   1.1.10.post3
affine                                   2.4.0
aioboto3                                 15.5.0
aiobotocore                              2.25.1
aiofiles                                 24.1.0
aiohappyeyeballs                         2.6.1
aiohttp                                  3.13.2
aioitertools                             0.13.0
aiosignal                                1.4.0
aiosqlite                                0.22.0
alabaster                                1.0.0
albucore                                 0.0.24
albumentations                           2.0.8
ale-py                                   0.11.2
alembic                                  1.17.2
altair                                   5.5.0
annotated-doc             

In [18]:
def prepare_sample_text(example):
    """
    Format one dataset sample as a single prompt string.

    Reads the ``question`` and ``answer`` entries of ``example`` (each via
    its ``.text()`` method) and returns them as
    ``"Question: ...\\n\\nAnswer: ..."``.
    """
    question = example['question'].text()
    answer = example['answer'].text()
    return f"Question: {question}\n\nAnswer: {answer}"

In [19]:
!pip install trl==0.19.1

Collecting trl==0.19.1
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl==0.19.1)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess<0.70.17 (from datasets>=3.0.0->trl==0.19.1)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading trl-0.19.1-py3-none-any.whl (376 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py312-none-any.whl (146 kB)
Installing collected packages: dill, multiprocess, trl
[2K  Attempting uninstall: dill
[2K    Found existing installation: dill 0.4.0
[2K    Uninstalling dill-0.4.0:
[2K      Successfully uninstalled dill-0.4.0
[2K  Attempting uninstall: multiprocess
[2K    Found existing installation: multiprocess 0.70.18
[2K    Uninstalling multiprocess-0.70.18:
[2K      Successfully uninstalled multiprocess-0.70.18
[2K  Attempting uninstall: trl
[2K    Found existing installation: trl 0.26

In [1]:
# The model's maximum sequence length is config.max_position_embeddings
# (2048 tokens for facebook/opt-1.3b), so samples are packed to match it.
# NOTE: this cell relies on `tokenizer`, `model`, `ds` and `prepare_sample_text`
# from the cells above; re-run them first if the kernel has been restarted.

from trl.trainer import ConstantLengthDataset

train_dataset = ConstantLengthDataset(
    tokenizer,
    ds,
    formatting_func=prepare_sample_text,
    infinite=True,
    # Read the length from the loaded model's config rather than hard-coding 2048.
    seq_length=model.config.max_position_embeddings,
)


NameError: name 'tokenizer' is not defined