In [1]:
!pip3 install -q datasets transformers accelerate peft evaluate rouge_score

In [2]:
import random, numpy as np, torch, torch.nn as nn
from datasets import load_dataset
import evaluate
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    AutoModelForCausalLM
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training


device = (
    "mps" if torch.backends.mps.is_available() else
    "cuda" if torch.cuda.is_available() else "cpu"
)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<sum>"]})
tok.pad_token = tok.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(f"Device: {device}")

Device: mps


In [4]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tok))

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_attn"],
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
model.to(device)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 294,912 || all params: 124,735,488 || trainable%: 0.2364


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
          

In [5]:
class GPT2Summarizer(nn.Module):
    def __init__(self, base):
        super().__init__()
        self.base = base
        self.lm_head = base.lm_head  # reuse LM head from base

    def forward(self, input_ids, attention_mask=None, labels=None):
        output = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )
        return {"loss": output.loss, "logits": output.logits}

    def generate(self, *args, **kwargs):
        return self.base.generate(*args, **kwargs)

In [6]:
train_raw = load_dataset("cnn_dailymail", "3.0.0", split="train[:100]")
val_raw   = load_dataset("cnn_dailymail", "3.0.0", split="validation[:10]")
rouge     = evaluate.load("rouge")

def preprocess(example):
    prefix = "<sum> " + example["article"]
    input_enc = tok(prefix, truncation=True, max_length=512, padding="max_length")
    label_enc = tok(example["highlights"], truncation=True, max_length=128, padding="max_length")
    return {
        "input_ids": input_enc["input_ids"],
        "attention_mask": input_enc["attention_mask"],
        "labels": label_enc["input_ids"]
    }

train_ds = train_raw.map(preprocess, remove_columns=train_raw.column_names)
val_ds = val_raw.map(preprocess, remove_columns=val_raw.column_names)

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

In [7]:
args = TrainingArguments(
    output_dir="sum-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=(device == "cuda"),
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False, 
    logging_steps=20,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    tokenizer=tok
)

trainer.train()
model.save_pretrained("sum-lora")        
tok.save_pretrained("sum-lora") 

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.2839,3.05014
2,3.2858,3.050298




('sum-lora/tokenizer_config.json',
 'sum-lora/special_tokens_map.json',
 'sum-lora/vocab.json',
 'sum-lora/merges.txt',
 'sum-lora/added_tokens.json',
 'sum-lora/tokenizer.json')

In [10]:
preds, refs = [], []
for ex in val_raw.select(range(10)):
    inputs = tok(
        "<sum> " + ex["article"],
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True
    ).input_ids.to(device)

    output = model.generate(
        inputs,
        max_new_tokens=80,
        num_beams=4,
        length_penalty=1.2,
        early_stopping=True,
        pad_token_id=tok.eos_token_id
    )[0]

    summary = tok.decode(output[inputs.shape[1]:], skip_special_tokens=True).strip()
    preds.append(summary)
    refs.append(ex["highlights"])

result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print("Final ROUGE:", result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Final ROUGE: {'rouge1': np.float64(0.06670099119008854), 'rouge2': np.float64(0.002127659574468085), 'rougeL': np.float64(0.05706609682830343), 'rougeLsum': np.float64(0.05952875691176184)}
