In [1]:
!pip3 install -q datasets transformers accelerate peft evaluate rouge_score bitsandbytes

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.4 MB/s[0m et

In [2]:
import random
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
import evaluate
from transformers import (
    GPT2TokenizerFast,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer, BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import tqdm

# ── 1) Reproducibility & Device ───────────────────────────────────────────────
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
device = (
    "mps" if torch.backends.mps.is_available() else
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(f"Using device: {device}")

# ── 2) Tokenizer ───────────────────────────────────────────────────────────────
tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<sum>"]})
tok.pad_token = tok.eos_token
tok.padding_side = "left"

2025-05-26 02:07:41.811223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748225261.993234      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748225262.048581      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [3]:
# ── 3) Load & 4-bit Quantize Base Model ───────────────────────────────────────
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
base = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=bnb_config,
    device_map="auto",
)
base.resize_token_embeddings(len(tok))
base = prepare_model_for_kbit_training(base)

# ── 4) Apply LoRA adapters ────────────────────────────────────────────────────
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_attn"],
    task_type=TaskType.CAUSAL_LM,
)
peft_model = get_peft_model(base, lora_cfg)
peft_model.print_trainable_parameters()
peft_model.to(device)

# ── 5) Wrap in your explicit head ─────────────────────────────────────────────
class GPT2Summarizer(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base = base_model
        self.lm_head = base_model.lm_head

    def forward(self, input_ids, attention_mask=None, labels=None):
        out = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )
        return {"loss": out.loss, "logits": out.logits}

    def generate(self, *args, **kwargs):
        return self.base.generate(*args, **kwargs)

model = GPT2Summarizer(peft_model)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 294,912 || all params: 124,735,488 || trainable%: 0.2364


In [4]:
train_raw = load_dataset("cnn_dailymail", "3.0.0", split="train[:50000]")
val_raw   = load_dataset("cnn_dailymail", "3.0.0", split="validation[:5000]")
rouge     = evaluate.load("rouge")

def preprocess(ex):
    inp = tok(
        "<sum> " + ex["article"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    lbl = tok(
        ex["highlights"],
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    return {
        "input_ids":      inp["input_ids"],
        "attention_mask": inp["attention_mask"],
        "labels":         lbl["input_ids"],
    }

train_ds = train_raw.map(preprocess, batched=False, remove_columns=train_raw.column_names)
val_ds   = val_raw.map(preprocess,   batched=False, remove_columns=val_raw.column_names)

# ── 7) Data collator ──────────────────────────────────────────────────────────
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
# ── 8) Training arguments ─────────────────────────────────────────────────────
args = TrainingArguments(
    output_dir="sum-qlora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=(device=="cuda"),
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    logging_steps=20,
    save_strategy="no",
    report_to=[],
)

# ── 9) Trainer setup ─────────────────────────────────────────────────────────
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    tokenizer=tok,
)

# ── 10) Train the QLoRA adapter ───────────────────────────────────────────────
trainer.train()
model.base.save_pretrained("sum-qlora")  
tok.save_pretrained("sum-qlora")

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.1399,3.307704
2,3.2064,3.306313
3,3.1068,3.309342




('sum-qlora/tokenizer_config.json',
 'sum-qlora/special_tokens_map.json',
 'sum-qlora/vocab.json',
 'sum-qlora/merges.txt',
 'sum-qlora/added_tokens.json',
 'sum-qlora/tokenizer.json')

In [6]:
preds, refs = [], []
for ex in val_raw.select(range(5000)):
    inputs = tok(
        "<sum> " + ex["article"],
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    ).input_ids.to(device)

    output = peft_model.generate(
        inputs,
        max_new_tokens=80,
        num_beams=4,
        length_penalty=1.2,
        early_stopping=True,
        pad_token_id=tok.eos_token_id,
    )[0]

    summary = tok.decode(output[inputs.shape[1]:], skip_special_tokens=True).strip()
    preds.append(summary)
    refs.append(ex["highlights"])

result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print("Final ROUGE:", result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Final ROUGE: {'rouge1': 0.15965791130390994, 'rouge2': 0.03633858453087035, 'rougeL': 0.12117849845437922, 'rougeLsum': 0.14481825298967765}
