In [None]:
!pip -q install "transformers>=4.44.0" "datasets>=2.19.0" "accelerate>=0.34.0" "evaluate>=0.4.2" "peft>=0.12.0" "bitsandbytes>=0.43.0" "trl>=0.9.6" "rouge-score>=0.1.2"

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never embed the access token in the notebook source: read it from the
# HF_TOKEN environment variable, and fall back to a hidden interactive prompt
# so the value is neither echoed nor saved with the notebook.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
import torch, platform
import transformers, datasets, peft, bitsandbytes as bnb
from transformers import __version__ as tr_version
from peft import __version__ as peft_version
from datasets import __version__ as ds_version

# Report the interpreter and library versions, one per line.
print(
    f"Python: {platform.python_version()}",
    f"PyTorch: {torch.__version__}",
    f"Transformers: {tr_version}",
    f"PEFT: {peft_version}",
    f"Datasets: {ds_version}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# Describe GPU 0 (name and total memory in GiB) when CUDA is usable.
if not torch.cuda.is_available():
    print("GPU is not available!")
else:
    dev = torch.cuda.get_device_name(0)
    mem_total = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    print(f"GPU: {dev} | VRAM: {mem_total:.2f} GB")


Python: 3.12.12
PyTorch: 2.8.0+cu126
Transformers: 4.57.1
PEFT: 0.17.1
Datasets: 4.0.0
CUDA available: True
GPU: Tesla T4 | VRAM: 14.74 GB


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit NF4 quantisation with double quantisation. The compute dtype is
# bfloat16 when torch reports bf16 support on the GPU, otherwise float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Padding needs a pad token; reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `trust_remote_code` is intentionally not set: the checkpoint is a
# LlamaForCausalLM, which transformers implements natively, so there is no
# reason to allow code from the model repository to execute.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

# The training cell enables gradient checkpointing; the generation KV cache is
# switched off here for training.
model.config.use_cache = False
model.config.pretraining_tp = 1
print("Model and tokenizer loaded in 4bit.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model and tokenizer loaded in 4bit.


In [None]:
from datasets import load_dataset

# Load every split of the CNN/DailyMail corpus (configuration "3.0.0") and
# print the resulting DatasetDict.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")
print(dataset)

def build_prompt(article, max_points=3):
    """Render the summarisation prompt for one article.

    Args:
        article: Article text; leading and trailing whitespace is stripped.
        max_points: Maximum number of bullet highlights requested in the
            instruction line.

    Returns:
        The prompt string. It ends with ``"Highlights:\\n-"`` so that the
        continuation starts directly as the first bullet.
    """
    body = article.strip()
    return (
        "You are a concise news summarizer.\n"
        f"Task: Read the news article and produce up to {max_points} short bullet highlights (each 10-20 words).\n"
        f"Article:\n{body}\n\n"
        "Highlights:\n-"
    )

def filter_fn(example):
    """Decide whether a dataset row is kept.

    A row is kept when both ``article`` and ``highlights`` are present and
    non-empty, the article has more than 200 and fewer than 5000 characters,
    and the highlights have more than 20 and fewer than 600 characters.

    Args:
        example: Mapping that may contain ``"article"`` and ``"highlights"``.

    Returns:
        ``True`` to keep the row, ``False`` to drop it.
    """
    article = example.get("article", None)
    highlights = example.get("highlights", None)

    # Missing or empty fields are rejected before any length check.
    if not (article and highlights):
        return False

    article_ok = 200 < len(article) < 5000
    highlights_ok = 20 < len(highlights) < 600
    return article_ok and highlights_ok

# Drop rows rejected by filter_fn, using four worker processes.
filtered = dataset.filter(filter_fn, num_proc=4)
print(filtered)

# Shuffle each split with a fixed seed and keep its first N rows
# (train 5000, validation 500, test 200).
train_small, eval_small, test_small = (
    filtered[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in (("train", 5000), ("validation", 500), ("test", 200))
)
len(train_small), len(eval_small), len(test_small)


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


Filter (num_proc=4):   0%|          | 0/287113 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/13368 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 208686
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 9778
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 8398
    })
})


(5000, 500, 200)

In [None]:
from functools import partial

MAX_SEQ_LEN = 512

def tokenize_fn(example, max_points=3):
    """Tokenize one example as ``prompt + " " + highlights`` for causal-LM training.

    The sequence is capped at ``MAX_SEQ_LEN`` tokens. Truncating the joined
    text directly would cut from the end (assuming the tokenizer's default
    right-side truncation), i.e. it would remove the highlights first, leaving
    long articles with no target at all. To avoid that, the *article* is
    shortened beforehand so that the template and the highlights still fit.

    Args:
        example: Row with ``"article"`` and ``"highlights"`` strings.
        max_points: Forwarded to ``build_prompt``.

    Returns:
        The tokenizer output (padded to ``MAX_SEQ_LEN``) with an added
        ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    article = example["article"]
    target = example["highlights"].strip()

    # Tokens needed by everything except the article: the prompt template,
    # the separating space, the target and any special tokens.
    scaffold = build_prompt("", max_points=max_points) + " " + target
    scaffold_len = len(tokenizer(scaffold)["input_ids"])

    # Re-tokenizing the decoded article inside the template can shift token
    # boundaries slightly, so keep a few tokens of headroom.
    margin = 8
    article_budget = max(1, MAX_SEQ_LEN - scaffold_len - margin)

    article_ids = tokenizer(article, add_special_tokens=False)["input_ids"]
    if len(article_ids) > article_budget:
        # Only long articles are rewritten; short ones are used verbatim.
        article = tokenizer.decode(article_ids[:article_budget], skip_special_tokens=True)

    prompt = build_prompt(article, max_points=max_points)
    text = prompt + " " + target
    tokenized = tokenizer(
        text,
        truncation=True,  # final safety net if the target alone is too long
        max_length=MAX_SEQ_LEN,
        padding="max_length"
    )
    # NOTE(review): the collator used for training may rebuild the labels from
    # input_ids; this copy keeps the column present either way.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize the train and validation subsets row by row, replacing the raw text
# columns with the tokenizer output.
train_tok = train_small.map(
    tokenize_fn,
    batched=False,
    remove_columns=train_small.column_names,
    desc="Tokenizing train",
)
eval_tok = eval_small.map(
    tokenize_fn,
    batched=False,
    remove_columns=eval_small.column_names,
    desc="Tokenizing eval",
)

# The test subset stays as raw text: only article/highlights are re-emitted.
test_tok = test_small.map(
    lambda e: {field: e[field] for field in ("article", "highlights")},
    batched=False,
)
print(train_tok[0].keys())


Tokenizing train:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing eval:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantised base model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# Adapters are attached to the seven "*_proj" linear layers named here.
TARGET_MODULES = [
    f"{prefix}_proj" for prefix in ("q", "k", "v", "o", "gate", "up", "down")
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the LoRA adapters and report how many weights will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338


In [None]:
from google.colab import drive

# Mount Google Drive and point the checkpoint directory at a folder inside it.
drive.mount("/content/drive")
OUTPUT_DIR = (
    "/content/drive/MyDrive/"
    "tinyllama-cnn-highlights-qlora_checkpoints"
)

Mounted at /content/drive


In [None]:
import math
import os

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint


# --- Training hyper-parameters -------------------------------------------
OUTPUT_DIR = "/content/drive/MyDrive/tinyllama-cnn-highlights-qlora_checkpoints"
BATCH_SIZE = 4
GR_ACCUM = 4
LR = 2e-4
EPOCHS = 2
WARMUP = 50

# Mixed precision: bf16 when torch reports support, otherwise fp16. Both stay
# off without CUDA so TrainingArguments does not reject the configuration.
bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16 = torch.cuda.is_available() and not bf16

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=min(8, BATCH_SIZE*2),
    gradient_accumulation_steps=GR_ACCUM,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,
    warmup_steps=WARMUP,
    lr_scheduler_type="cosine",
    fp16=fp16,
    bf16=bf16,
    gradient_checkpointing=True,
    report_to="none",
    optim="paged_adamw_8bit",
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    args=args,
    data_collator=collator,
)

# Resume from the newest checkpoint in OUTPUT_DIR when one exists; otherwise
# train from scratch. A hard-coded checkpoint path would fail on a fresh run
# and, once it names the final step, turns training into a no-op.
last_checkpoint = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

metrics = trainer.evaluate()
print(metrics)


# Perplexity is exp(mean cross-entropy loss) on the evaluation set.
if "eval_loss" in metrics:
    ppl = math.exp(metrics["eval_loss"])
    print(f"Perplexity: {ppl:.2f}")


Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
	save_steps: 20 (from args) != 50 (from trainer_state.json)


Step,Training Loss,Validation Loss


{'eval_loss': 1.8444181680679321, 'eval_runtime': 278.8992, 'eval_samples_per_second': 1.793, 'eval_steps_per_second': 0.226, 'eval_entropy': 1.8129675539713057, 'eval_num_tokens': 0.0, 'eval_mean_token_accuracy': 0.5887966241155352, 'epoch': 2.0}
Perplexity: 6.32


In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch


CHECKPOINT_PATH = OUTPUT_DIR + "/checkpoint-626"
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Fail early with a clear message; otherwise a missing directory would be
# interpreted by from_pretrained as a Hub repository id.
if not os.path.isdir(CHECKPOINT_PATH):
    raise FileNotFoundError(f"Checkpoint directory not found: {CHECKPOINT_PATH}")


# Same 4-bit NF4 configuration that was used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
)


tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `trust_remote_code` is not needed for a natively supported Llama checkpoint,
# so code from the repository is not allowed to run.
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)

# Trailing semicolon keeps the notebook from printing the full module tree.
model.eval();


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False)
            (lora_dropout): ModuleDict(
        

In [None]:
from transformers import GenerationConfig
import textwrap, random

# Put the model in evaluation mode before generating.
model.eval()

# Sampling settings shared by every generate_highlights call: up to 120 new
# tokens, nucleus sampling (top_p 0.9) at temperature 0.7, with a mild
# repetition penalty.
gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    max_new_tokens=120,
)

def generate_highlights(article, max_points=3):
    """Generate bullet highlights for one article with the current model.

    Args:
        article: Article text to summarise.
        max_points: Maximum number of bullets requested in the prompt.

    Returns:
        The generated highlights as a string starting with ``"-"``. The
        leading dash is restored because the prompt itself ends with it.
    """
    prompt = build_prompt(article, max_points=max_points)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=gen_config)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # position rather than splitting the decoded text on "Highlights:", which
    # returns article text whenever the article itself contains that string.
    continuation = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return ("- " + continuation.strip()).strip()


# Show reference vs. generated highlights for three randomly chosen test rows.
for i in random.sample(range(len(test_small)), k=3):
    a = test_small[i]["article"]
    ref = test_small[i]["highlights"]
    pred = generate_highlights(a, max_points=3)
    print("=" * 100)
    print(f"Reference:\n {textwrap.fill(ref, width=100)}")
    print(f"\nPrediction:\n {pred}")


Reference:
 Crown Princess Mary was in Aabenraa in southern Denmark on Thursday . Event marked the 75th
anniversary of the invasion by Germany in 1940 . Tasmanian-born royal wore chic black ensemble with
grey accessories . Busy week for royals with birthday celebrations for Queen Margarethe II .

Prediction:
 - The Danish royal married Crown Prince Frederik in 2004 .
The couple have two children, Princess Isabella and Prince Christian .
Mary attended a memorial event in Aabenraa, where she laid a wreath at a war memorial .
She will attend another event today where she is expected to wear a cream dress .
The couple have a total of seven children between them .
It was the shortest German military campaign of World War Two .
Princess Mary attended a memorial service to commemorate the invasion of Denmark
Reference:
 Fareed Zakaria: ISIS has thrived because of a local Sunni cause in Syria and Iraq . Leaders of ISIS
have recognized they are a messaging machine, he says .

Prediction:
 - ISI

In [None]:
import evaluate
rouge = evaluate.load("rouge")

# Generation uses sampling, so fix the torch RNG before the evaluation loop;
# otherwise every run of this cell scores a different set of predictions.
torch.manual_seed(42)

# Score at most 200 test examples.
N = min(200, len(test_small))
preds, refs = [], []
for i in range(N):
    a = test_small[i]["article"]
    r = test_small[i]["highlights"]
    p = generate_highlights(a, max_points=3)

    # Flatten multi-line highlights into a single line before scoring.
    preds.append(p.replace("\n", " "))
    refs.append(r.replace("\n", " "))

rouge_scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
rouge_scores


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.32576324543628005),
 'rouge2': np.float64(0.1041380394915192),
 'rougeL': np.float64(0.19627394855170127),
 'rougeLsum': np.float64(0.19645854470459248)}

In [None]:
# Save the LoRA adapter weights and the tokenizer next to the checkpoints.
ADAPTER_DIR = OUTPUT_DIR + "-lora"
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print("Saved LoRA adapters ->", ADAPTER_DIR)

# Round-trip check: load a fresh quantised base model and attach the adapter.
from peft import PeftModel
base_reload = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=bnb_config, device_map="auto"
)
# prepare_model_for_kbit_training is deliberately not applied here: it is a
# training-time helper and this reload is only put into eval mode.
lora_reload = PeftModel.from_pretrained(base_reload, ADAPTER_DIR)
lora_reload.eval()
print("Reloaded base + LoRA.")


Saved LoRA adapters -> /content/drive/MyDrive/tinyllama-cnn-highlights-qlora_checkpoints-lora
Reloaded base + LoRA.


In [None]:
import pandas as pd

# Tabulate the trainer's log history and display its last ten rows.
logs = pd.DataFrame(data=trainer.state.log_history)
logs.tail(n=10)


Unnamed: 0,entropy,epoch,grad_norm,learning_rate,loss,mean_token_accuracy,num_tokens,step,eval_entropy,eval_loss,eval_mean_token_accuracy,eval_num_tokens,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
26,1.790317,1.5984,0.328349,2.304611e-05,1.7618,0.600911,2453504.0,500,,,,,,,,,,,,
27,1.782203,1.6624,0.297626,1.655126e-05,1.7587,0.603044,2617344.0,520,,,,,,,,,,,,
28,1.798495,1.7264,0.314396,1.104839e-05,1.7611,0.599807,2781184.0,540,,,,,,,,,,,,
29,1.787983,1.7904,0.312662,6.602921e-06,1.7656,0.60172,2945024.0,560,,,,,,,,,,,,
30,1.804013,1.8544,0.316098,3.267691e-06,1.7782,0.599319,3108864.0,580,,,,,,,,,,,,
31,1.796321,1.9184,0.326383,1.082349e-06,1.7683,0.601733,3272704.0,600,,,,,,,,,,,,
32,,1.9184,,,,,,600,1.812993,1.844343,0.588747,3272704.0,284.9125,1.755,0.221,,,,,
33,1.800904,1.9824,0.318076,7.287324e-08,1.7689,0.600989,3436544.0,620,,,,,,,,,,,,
34,,2.0,,,,,,626,,,,,,,,0.0099,1010091.513,63231.729,3.216777e+16,0.0
35,,2.0,,,,,,626,1.812968,1.844418,0.588797,0.0,278.8992,1.793,0.226,,,,,
