In [8]:
!pip install transformers datasets accelerate peft bitsandbytes torch evaluate tqdm numpy sentencepiece nltk --quiet

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAZER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
!pip install rouge_score --quiet

In [11]:
!pip install -U datasets --quiet

In [12]:
import random
import numpy as np
import torch
import nltk
import time

from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BitsAndBytesConfig # Important for QLoRA
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

In [13]:
# --- 1. Initial Configuration & Reproducibility ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [14]:
MODEL_NAME = "t5-small"
OUTPUT_DIR = "./t5_small_qlora_summarization_results" # Changed directory name to differentiate from LoRA
SAVED_MODEL_PATH = "./saved_models/t5_small_qlora_summarization" # Changed directory name

In [None]:
# --- 3. Load Tokenizer and T5-small Model with Quantization ---
print(f"\n--- Loading tokenizer and base model {MODEL_NAME} ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto"
)


print("Number of trainable parameters for the SFT model:")
# Count total parameters in the base model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

total_params = count_parameters(model)
trainable_params = count_trainable_parameters(model)

print(f"Trainable parameters: {trainable_params:,} || Total parameters: {total_params:,} || Trainable %: {100 * trainable_params / total_params:.2f}% ")


--- Loading tokenizer and base model t5-small ---
Number of trainable parameters for the SFT model:
Trainable parameters: 60,506,624 || Total parameters: 60,506,624 || Trainable %: 100.00% 


In [16]:
# --- 5. Prepare Data (No prefix used) ---
print("\n--- Loading and preprocessing CNN/DailyMail data ---")
train_raw = load_dataset("cnn_dailymail", "3.0.0", split="train")
val_raw = load_dataset("cnn_dailymail", "3.0.0", split="validation")
test_raw = load_dataset("cnn_dailymail", "3.0.0", split="test")

max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = examples["article"] # No "summarize: " prefix added
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=examples["highlights"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_ds = train_raw.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])
tokenized_val_ds = val_raw.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])
tokenized_test_ds = test_raw.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])

print("Training set size:", len(tokenized_train_ds))
print("Validation set size:", len(tokenized_val_ds))
print("Test set size:", len(tokenized_test_ds))



--- Loading and preprocessing CNN/DailyMail data ---


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Training set size: 287113
Validation set size: 13368
Test set size: 11490


In [19]:

print(f"Model device: {next(model.parameters()).device}")

Model device: cuda:0


In [20]:
# --- 6. Data Collator and Metrics ---
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) # Use the quantized/LoRA model

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# --- 7. Training Configuration (`Seq2SeqTrainingArguments`) ---
print("\n--- Configuring training parameters ---")
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=1000,
    eval_strategy="epoch", 
    save_strategy="epoch",  
    save_total_limit=2,
    bf16=(device == "cuda"),
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    predict_with_generate=True,
    label_names=["labels"],
)

# --- 8. Initialize Trainer and Train ---
print("\n--- Starting SFT training process ---")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

start_time_train = time.time()
trainer.train()
end_time_train = time.time()
train_duration = end_time_train - start_time_train
print(f"\nSFT training completed in: {train_duration:.2f} seconds")




--- Configuring training parameters ---

--- Starting SFT training process ---


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.9738,1.79946,25.4165,12.057,20.8717,23.9459,20.0


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\RAZER/nltk_data'
    - 'c:\\Users\\RAZER\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\RAZER\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\RAZER\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\RAZER\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [23]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\RAZER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [24]:
# --- 9. Final Evaluation on Validation Set (best model) ---
print("\n--- Evaluating the best model on the Validation set ---")
final_metrics_sft = trainer.evaluate(tokenized_val_ds) # Store QLoRA results in a separate variable
print("Final evaluation results on Validation (SFT):", final_metrics_sft)

print("\n--- Evaluating the best model on the Test set ---")
final_metrics_sft = trainer.evaluate(tokenized_test_ds) # Store QLoRA results in a separate variable
print("Final evaluation results on Test (SFT):", final_metrics_sft)




--- Evaluating the best model on the Validation set ---
Final evaluation results on Validation (SFT): {'eval_loss': 1.7818782329559326, 'eval_rouge1': 25.2014, 'eval_rouge2': 12.0655, 'eval_rougeL': 20.6984, 'eval_rougeLsum': 23.7537, 'eval_gen_len': 20.0}

--- Evaluating the best model on the Test set ---
Final evaluation results on Test (SFT): {'eval_loss': 1.7994601726531982, 'eval_rouge1': 25.4165, 'eval_rouge2': 12.057, 'eval_rougeL': 20.8717, 'eval_rougeLsum': 23.9459, 'eval_gen_len': 20.0}


In [22]:
# --- 10. Save QLoRA adapter and Tokenizer ---
print(f"\n--- Saving SFT model and tokenizer at: {SAVED_MODEL_PATH} ---")
trainer.save_model(SAVED_MODEL_PATH)
tokenizer.save_pretrained(SAVED_MODEL_PATH)
print("Model saving complete.")


--- Saving SFT model and tokenizer at: ./saved_models/t5_small_qlora_summarization ---
Model saving complete.
