# Colab 1: Full Finetuning with SmolLM2-135M

Run this notebook in Colab with a GPU runtime. It performs full finetuning (`full_finetuning=True`) of the SmolLM2-135M model on a small dataset using Unsloth.

In [None]:
# Install Unsloth and deps (Colab)
!pip install -q unsloth datasets transformers accelerate peft

In [None]:
# Select model and training config

# --- Model checkpoint and output location ---
MODEL_NAME = "unsloth/smollm2-135m"  # id handed to FastLanguageModel.from_pretrained
OUTPUT_DIR = "/content/unsloth-smollm2-finetune"  # Trainer output dir / saved model path

# --- Optimisation hyperparameters ---
# NOTE(review): 2e-4 is a learning rate commonly used for adapter training;
# full finetuning often uses a smaller value — confirm before a real run.
LR = 2e-4
BATCH_SIZE = 4  # used for both the train and eval per-device batch size
EPOCHS = 1

# --- Finetuning mode ---
FULL_FINETUNE = True  # must stay True: this notebook is about full finetuning


In [None]:
# Build a tiny in-memory sample dataset (swap in your own data here)
from datasets import load_dataset, Dataset

# Raw training strings; each one becomes a row of the "text" column.
train_texts = [
    "Hello, how can I help you?",
    "What is your favorite book?",
    "Tell me a joke about cats",
    "Give me a short poem about the sea",
]

# Wrap the strings in a single-column Dataset and immediately split off 20%
# as a held-out set. The fixed seed keeps the split identical across runs.
# Despite its name, `train_dataset` holds BOTH splits ("train" and "test").
train_dataset = Dataset.from_dict({"text": train_texts}).train_test_split(
    test_size=0.2,
    seed=42,
)

# Last expression: display the resulting splits as the cell output.
train_dataset


In [None]:
# Prepare tokenizer/model
from unsloth import FastLanguageModel

# Load the base checkpoint without 4-bit quantisation and forward the
# FULL_FINETUNE flag from the config cell. Without `full_finetuning=...`
# the flag has no effect and the loader falls back to its default mode,
# so the notebook would not actually request full finetuning.
model, tokenizer = FastLanguageModel.from_pretrained(
    MODEL_NAME,
    load_in_4bit=False,
    full_finetuning=FULL_FINETUNE,
)

# Reuse EOS as the padding token so that fixed-length padding in the
# tokenization step has a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Simple formatting function
MAX_LEN = 256  # every example is padded/truncated to exactly this many tokens


def preprocess(batch):
    """Tokenize the "text" column of a batch to fixed-length sequences.

    Args:
        batch: a batched mapping from `Dataset.map(..., batched=True)` that
            contains a "text" entry.

    Returns:
        The tokenizer output for the batch, padded to and truncated at
        MAX_LEN tokens.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Apply the same tokenization to both splits: "train" first, then "test".
train_tokenized, val_tokenized = (
    train_dataset[split_name].map(preprocess, batched=True)
    for split_name in ("train", "test")
)


In [None]:
# Data collator
import torch


def collate_fn(batch):
    """Stack tokenized examples into model inputs with causal-LM labels.

    Args:
        batch: list of examples, each a mapping with equal-length integer
            sequences under "input_ids" and "attention_mask".

    Returns:
        dict with three tensors of identical shape:
        "input_ids", "attention_mask", and "labels". The labels are a copy
        of input_ids with -100 written at every padded position
        (attention_mask == 0) so those positions are excluded from the loss.
    """
    input_ids = torch.tensor([example["input_ids"] for example in batch])
    attention_mask = torch.tensor([example["attention_mask"] for example in batch])

    labels = input_ids.clone()
    # Mask by attention_mask rather than by token id: the pad token is
    # aliased to EOS in this notebook, so comparing against pad_token_id
    # would also hide genuine EOS tokens and the model would never be
    # trained to end a sequence.
    labels[attention_mask == 0] = -100

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [None]:
# Training loop via Hugging Face Trainer
from transformers import Trainer, TrainingArguments

# Evaluate and checkpoint once per epoch. `eval_strategy` is the current
# name of this option; the older `evaluation_strategy` spelling is rejected
# by recent transformers releases, which an unpinned install will pull in.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Disable external experiment trackers so "Run All" does not stop at an
    # interactive login prompt; set to e.g. "wandb" to opt back in.
    report_to="none",
)

# `processing_class` replaces the deprecated `tokenizer=` argument; passing
# the tokenizer here also makes the Trainer save it alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    data_collator=collate_fn,
)

trainer.train()


In [None]:
# Save and test generation
from transformers import pipeline

# Write the finetuned model to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)

# Reload the saved checkpoint from disk into a text-generation pipeline on
# GPU 0 (this notebook assumes a GPU runtime).
pipe = pipeline("text-generation", model=OUTPUT_DIR, tokenizer=tokenizer, device=0)

# `max_new_tokens` bounds only the generated continuation; `max_length`
# would count the prompt tokens too, so the amount of generated text would
# shrink as the prompt grows.
print(pipe("Write a limerick about AI", max_new_tokens=50))


## Instructions
- Replace the toy dataset with your task-specific data (e.g., a Kaggle mental-health sample).
- Ensure GPU runtime is enabled in Colab.
- Record a video walkthrough: install, data load, training, evaluation, and a sample generation.
- Upload the executed notebook (with outputs) and link your video in the top markdown cell.
