# (b) LoRA (PEFT) — SmolLM2‑135M
**Created:** 2025-11-10 02:42 UTC

This notebook applies **LoRA** (parameter‑efficient fine‑tuning) to the same SmolLM2‑135M model. We keep the dataset tiny to finish fast.

In [1]:
# Install the fine-tuning stack into the environment of the *running kernel*.
# `%pip` is used instead of `!pip`: the shell form can resolve to a different
# interpreter than the one backing this notebook.
# NOTE(review): `trl` and `bitsandbytes` are unpinned and `unsloth` is not imported
# anywhere in the visible cells -- consider pinning exact versions / trimming the
# list so the run is reproducible and the resolver conflicts shown below go away.
%pip install -q --upgrade pip
%pip install -q "transformers>=4.44.2" "datasets>=2.19.0" "accelerate>=0.33.0" "peft>=0.12.0" "trl" "bitsandbytes" "unsloth>=2024.11.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
torchaudio 2.6.0+cu124 requires torch==2.6.0, but you have torch 2.8.0 which is incompatible.
cudf-polars-cu1

In [2]:
import torch, platform
# Record the interpreter, framework and (when present) GPU used for this run,
# so the outputs further down can be tied to a concrete environment.
python_version = platform.python_version()
torch_version = torch.__version__
print("Python:", python_version)
print("Torch:", torch_version)

# The GPU line is only printed when CUDA is usable; device 0 is reported.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU:", gpu_name)

Python: 3.11.13
Torch: 2.8.0+cu128
GPU: Tesla T4


In [3]:
from datasets import Dataset
pairs = [
    {"instruction":"Summarize: `merge sort` algorithm.","response":"Split array, sort halves recursively, and merge. O(n log n)."},
    {"instruction":"Write a unit test for add(a,b).","response":"def test_add():\n    assert add(2,3)==5"},
    {"instruction":"Give 3 bullet points about hashing.","response":"- Maps keys to indices\n- Collisions need handling\n- O(1) average lookup"},
    {"instruction":"Why use gradient accumulation?","response":"It simulates larger batch sizes when memory is limited."},
]
def simple_template(example):
    """Render one instruction/response record as a single training string.

    Args:
        example: Mapping with ``"instruction"`` and ``"response"`` entries.

    Returns:
        A dict with one key, ``"text"``, holding the instruction under a
        ``### Instruction`` header, a blank line, then the response under a
        ``### Response`` header.
    """
    instruction = example["instruction"]
    response = example["response"]
    rendered = f"### Instruction\n{instruction}\n\n### Response\n{response}"
    return {"text": rendered}
# Wrap the raw records in a Dataset, replace the original columns with the
# templated `text` column, then hold out 25% of the rows for evaluation.
raw_ds = Dataset.from_list(pairs)
templated_ds = raw_ds.map(simple_template, remove_columns=raw_ds.column_names)
# Fixed seed so the train/test assignment is the same on every run.
ds = templated_ds.train_test_split(test_size=0.25, seed=7)
ds

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint shared by the tokenizer and the base model.
model_name = "HuggingFaceTB/SmolLM2-135M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tok(b):
    """Tokenize the ``text`` column of a batch, truncating at 512 tokens."""
    return tokenizer(b["text"], truncation=True, max_length=512)


# Tokenize both splits and drop the raw `text` column afterwards.
tokenized = ds.map(tok, batched=True, remove_columns=["text"])

# bfloat16 weights whenever CUDA is available, float32 on CPU.
if torch.cuda.is_available():
    load_dtype = torch.bfloat16
else:
    load_dtype = torch.float32

# NOTE(review): the recorded run warns "`torch_dtype` is deprecated! Use `dtype`
# instead!". The old keyword is kept because the install cell allows transformers
# versions that may not accept `dtype` -- revisit once the minimum pin is raised.
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=load_dtype)
# NOTE(review): no tokens are added above (pad reuses EOS), so this resize is
# presumably a no-op for this checkpoint -- confirm before relying on it.
base_model.resize_token_embeddings(len(tokenizer))
# Turn the KV cache off on the model config for training.
base_model.config.use_cache = False

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
2025-11-10 03:19:09.488065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762744749.686655      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762744749.741282      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

## Attach LoRA adapters
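We target all attention projections (`q_proj`, `k_proj`, `v_proj`, `o_proj`) plus the MLP projections (`gate_proj`, `up_proj`, `down_proj`) for a simple, robust default.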

In [5]:
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
# Linear layers that receive LoRA adapters: the four attention projections
# plus the three MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Rank-16 adapters with alpha 32 and 5% adapter dropout; bias terms are not trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=attention_projections + mlp_projections,
)

# Wrap the base model with the adapters and report the trainable-parameter share.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 4,884,480 || all params: 139,399,488 || trainable%: 3.5039


In [7]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import transformers, torch
# Log the library version next to the training output.
print("Transformers:", transformers.__version__)  # for the recording

# Mixed-precision bf16 is requested whenever CUDA is available.
# NOTE(review): this does not check that the GPU natively supports bf16
# (the recorded run used a Tesla T4) -- confirm this is intended.
use_bf16 = torch.cuda.is_available()

# NOTE(review): with 3 training rows and a batch size of 8, every optimizer step
# is a full epoch (the recorded run ends at epoch 120.0). In that run validation
# loss climbs from ~6.2 to ~9.0 while training loss drops to ~0.04, i.e. the
# adapter is memorising the three examples. Fine for a smoke test; lower
# `max_steps` or add data for anything else.
args = TrainingArguments(
    output_dir="/kaggle/working/smollm2_lora",
    report_to="none",
    # batch / precision / optimizer
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    bf16=use_bf16,
    learning_rate=2e-4,
    # schedule: log every 5 steps, evaluate every 20, checkpoint every 50
    max_steps=120,
    logging_strategy="steps",
    logging_steps=5,
    # `eval_strategy` is the newer name for `evaluation_strategy`
    # (presumably available from transformers 4.41 -- verify against the pin).
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=50,
)

# Causal-LM collation (mlm=False).
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,  # the PEFT LoRA-wrapped model
    args=args,
    data_collator=collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

train_result = trainer.train()
train_result


Transformers: 4.57.1


Step,Training Loss,Validation Loss
20,1.0815,6.21671
40,0.0565,8.933578
60,0.0404,9.076246
80,0.0385,8.980349
100,0.0411,9.026912
120,0.0367,9.065476


TrainOutput(global_step=120, training_loss=0.40854275872310003, metrics={'train_runtime': 64.602, 'train_samples_per_second': 29.72, 'train_steps_per_second': 1.858, 'total_flos': 8638197903360.0, 'train_loss': 0.40854275872310003, 'epoch': 120.0})

In [8]:
# Write the PEFT-wrapped model and the tokenizer to the same directory so the
# folder can be reloaded on its own.
for artifact in (model, tokenizer):
    artifact.save_pretrained("/kaggle/working/smollm2_lora")
print("Saved LoRA to /kaggle/working/smollm2_lora")

Saved LoRA to /kaggle/working/smollm2_lora


### Merge LoRA (optional) and test generation

In [None]:
# Optional: merge weights for export/inference without PEFT
# Generation below uses the merged model when merging succeeds and falls back
# to the adapter-wrapped model otherwise.
gen_model = model
try:
    # merge_and_unload() folds the LoRA weights into the base layers and
    # returns the plain (non-PEFT) model.
    merged = model.merge_and_unload()
    merged.save_pretrained("/kaggle/working/smollm2_lora_merged")
    tokenizer.save_pretrained("/kaggle/working/smollm2_lora_merged")
    print("Merged model saved.")
    gen_model = merged
except Exception as e:
    # Best-effort step: report the reason and keep going with the PEFT model.
    print("Merge skipped:", e)

# The Trainer can leave the model in training mode; switch to eval so dropout
# (including LoRA dropout on the fallback path) is disabled while sampling.
gen_model.eval()

# Use exactly the layout the model was fine-tuned on
# ("### Instruction\n...\n\n### Response\n..."); a differently formatted prompt
# would not exercise what the adapter learned.
prompt = "### Instruction\nWrite a haiku about coding.\n\n### Response\n"
inputs = tokenizer(prompt, return_tensors="pt").to(gen_model.device)
with torch.no_grad():
    out = gen_model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.8,
        # The KV cache was turned off on the config for training; re-enable it
        # for decoding so each new token does not recompute the whole prefix.
        use_cache=True,
        # Pass the pad id explicitly instead of relying on generate()'s fallback.
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))