In [None]:
!pip install transformers datasets accelerate peft bitsandbytes bert-score rouge-score

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metada

In [None]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
from datasets import load_dataset


# Keep only the first 3000 training examples so fine-tuning stays quick.
train_split = load_dataset("cnn_dailymail", "3.0.0", split="train")
train_subset = train_split.select(range(3000))

# Rename the columns to the names the tokenization step reads
# ("text" for the article, "summary" for the highlights).
column_renames = {"article": "text", "highlights": "summary"}
dataset = train_subset.rename_columns(column_renames)

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Hugging Face checkpoint id; the same name is used later to load the model.
model_name = "microsoft/phi-1_5"
# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token, since `tokenize`
# pads every example with padding="max_length".
tokenizer.pad_token = tokenizer.eos_token  # important for padding

def tokenize(batch, max_length=640, max_summary_tokens=128):
    """Turn a batch of (article, summary) pairs into causal-LM training examples.

    Each example is laid out as::

        "Summarize with high factual consistency:\\n" + article + "\\nSummary:\\n" + summary + EOS

    Compared with tokenizing the whole string and truncating on the right,
    this version:

    * truncates the *article* (never the prompt markers or the summary), so
      every example still contains the "Summary:" marker and the reference
      summary even when the article alone would exceed ``max_length``;
    * appends an explicit end-of-sequence token after the summary so the
      model is trained to stop;
    * sets the labels of padding positions to -100 so they are ignored by the
      loss. This matters because the pad token is the EOS token here, so a
      plain copy of ``input_ids`` would train on every padding position.

    The loss is still computed on all non-padding tokens (prompt, article and
    summary), as in the original implementation.

    Args:
        batch: Mapping with parallel lists under the keys "text" (articles)
            and "summary" (reference summaries), as provided by
            ``Dataset.map(..., batched=True)``.
        max_length: Fixed length of every returned sequence (after padding).
        max_summary_tokens: Upper bound on summary tokens kept per example
            (the EOS token is added on top of this).

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length``-long integer lists.
    """
    prefix_ids = tokenizer(
        "Summarize with high factual consistency:\n", add_special_tokens=False
    )["input_ids"]
    suffix_ids = tokenizer("\nSummary:\n", add_special_tokens=False)["input_ids"]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    # Pre-truncate to bound the work; the exact per-example article budget is
    # applied below once the summary length is known.
    article_ids_batch = tokenizer(
        batch["text"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
    )["input_ids"]
    summary_ids_batch = tokenizer(
        batch["summary"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_summary_tokens,
    )["input_ids"]

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for article_ids, summary_ids in zip(article_ids_batch, summary_ids_batch):
        target_ids = list(summary_ids) + [eos_id]

        # Whatever room is left after the fixed prompt pieces and the summary
        # goes to the article.
        article_budget = max_length - len(prefix_ids) - len(suffix_ids) - len(target_ids)
        kept_article_ids = list(article_ids)[: max(article_budget, 0)]

        # Final slice is a safety net for very small max_length values.
        ids = (prefix_ids + kept_article_ids + suffix_ids + target_ids)[:max_length]
        pad_count = max_length - len(ids)

        model_inputs["input_ids"].append(ids + [pad_id] * pad_count)
        model_inputs["attention_mask"].append([1] * len(ids) + [0] * pad_count)
        # -100 is the index ignored by the cross-entropy loss.
        model_inputs["labels"].append(ids + [-100] * pad_count)

    return model_inputs



# Tokenize in batches and drop the raw string columns that `tokenize` consumed.
raw_text_columns = ["text", "summary"]
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=raw_text_columns,
)

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: adapters are attached to the q_proj and v_proj
# modules only, with no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,           # rank of the low-rank matrices
    lora_alpha=32,  # scaling parameter
    bias="none",
)

# Quantization settings: 4-bit NF4 weights with double quantization and
# float16 compute. Also reused later when loading the comparison model.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantized base checkpoint, then wrap it with the LoRA adapters.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
model = get_peft_model(base_model, lora_config)

# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()


config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

trainable params: 3,145,728 || all params: 1,421,416,448 || trainable%: 0.2213


In [None]:
import os
# Set the WANDB_DISABLED flag for this process before training starts.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration.
# - Batch size 2 with 4 accumulation steps gives 8 examples per optimizer step
#   on each device.
# - label_names is set explicitly: the PEFT wrapper hides the base model's
#   forward signature, so Trainer cannot infer it and otherwise warns and
#   falls back to an empty list.
training_args = TrainingArguments(
    output_dir="./fine_tuned_phi",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,  # epochs
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=50,
    fp16=True,
    report_to="none",
    label_names=["labels"],
)

# NOTE(review): recent transformers releases emit a deprecation warning for
# `tokenizer=` and point to `processing_class=` instead; confirm against the
# installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

train_output = trainer.train()

# Checkpoints are only written every `save_steps` steps, so the weights after
# the very last step are not guaranteed to be on disk. Save them explicitly
# into `output_dir` so the result survives a kernel restart.
trainer.save_model()

# Keep the training summary as the cell's displayed value.
train_output


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.1264
20,2.7771
30,2.4906
40,2.4114
50,2.4495
60,2.3936
70,2.4235
80,2.4202
90,2.407
100,2.4423


TrainOutput(global_step=1875, training_loss=2.3122796498616536, metrics={'train_runtime': 5800.298, 'train_samples_per_second': 2.586, 'train_steps_per_second': 0.323, 'total_flos': 7.58337896448e+16, 'train_loss': 2.3122796498616536, 'epoch': 5.0})

In [None]:
# Keep a handle on the trained (LoRA-adapted) model for evaluation.
# Training leaves the module in train mode; `.eval()` (which returns the
# module itself) switches layers such as dropout to inference behaviour so
# that generation is deterministic.
fine_tuned_model = trainer.model.eval()


In [None]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from bert_score import score as bertscore
from rouge_score import rouge_scorer
import numpy as np
import torch
from tqdm import tqdm

# Baseline for the comparison: the same checkpoint with the same quantization
# settings, loaded fresh so it carries no fine-tuned adapters.
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Evaluate on the first 100 examples of the test split.
test_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:100]")

# Tokenizer for generation, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# fine_tuned_model is already defined earlier as the LoRA-adapted model

def generate_summary(model, tokenizer, text):
    input_text = "Summarize with high factual consistency:\n" + text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False
        )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text.replace(input_text, "").strip()

# Initialize scorers.
# The metric list is kept in a variable so the per-sample score rows are built
# in this explicit order rather than relying on dict iteration order.
rouge_metrics = ["rouge1", "rouge2", "rougeL"]
rouge = rouge_scorer.RougeScorer(rouge_metrics)
original_rouge_scores, fine_tuned_rouge_scores = [], []

# Generated texts are collected so BERTScore can be computed in one batched
# call per system after the loop, instead of calling it on a single pair for
# every sample.
reference_summaries, original_summaries, fine_tuned_summaries = [], [], []

print("Evaluating model performance...")

for sample in tqdm(test_dataset, desc="Processing"):
    text, ref_summary = sample["article"], sample["highlights"]

    # Generate summaries
    orig_summary = generate_summary(original_model, tokenizer, text)
    tuned_summary = generate_summary(fine_tuned_model, tokenizer, text)

    reference_summaries.append(ref_summary)
    original_summaries.append(orig_summary)
    fine_tuned_summaries.append(tuned_summary)

    # Compute ROUGE (reference first, prediction second)
    original_rouge = rouge.score(ref_summary, orig_summary)
    fine_tuned_rouge = rouge.score(ref_summary, tuned_summary)
    original_rouge_scores.append([original_rouge[m].fmeasure for m in rouge_metrics])
    fine_tuned_rouge_scores.append([fine_tuned_rouge[m].fmeasure for m in rouge_metrics])


def _bert_f1(candidates, references):
    """Return the per-sample BERTScore F1 values for candidates vs. references."""
    if not candidates:
        return []
    _, _, f1 = bertscore(candidates, references, lang='en', model_type='distilbert-base-uncased')
    return f1.tolist()


# Compute BERTScore: one list of per-sample F1 values per system.
original_bert_scores = _bert_f1(original_summaries, reference_summaries)
fine_tuned_bert_scores = _bert_f1(fine_tuned_summaries, reference_summaries)

# Average each metric over the evaluated samples before reporting.
# ROUGE rows hold one value per metric, so the mean is taken per column.
original_rouge_mean = np.mean(original_rouge_scores, axis=0)
fine_tuned_rouge_mean = np.mean(fine_tuned_rouge_scores, axis=0)
original_bert_mean = np.mean(original_bert_scores)
fine_tuned_bert_mean = np.mean(fine_tuned_bert_scores)

# ROUGE report
print("\n Average ROUGE scores (Original vs. Fine-Tuned):")
print("Original ROUGE-1, ROUGE-2, ROUGE-L:", original_rouge_mean)
print("Fine-Tuned ROUGE-1, ROUGE-2, ROUGE-L:", fine_tuned_rouge_mean)

# BERTScore report
print("\n Average BERTScore (Factual consistency):")
print("Original model BERTScore:", original_bert_mean)
print("Fine-Tuned model BERTScore:", fine_tuned_bert_mean)

Evaluating model performance...


Processing:   0%|          | 0/100 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Processing:  39%|███▉      | 39/100 [07:00<10:45, 10.59s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Processing: 100%|██████████| 100/100 [17:27<00:00, 10.47s/it]


✅ Average ROUGE scores (Original vs. Fine-Tuned):
Original ROUGE-1, ROUGE-2, ROUGE-L: [0.11252887 0.0321406  0.08426926]
Fine-Tuned ROUGE-1, ROUGE-2, ROUGE-L: [0.16036073 0.05816786 0.11920624]

✅ Average BERTScore (Factual consistency):
Original model BERTScore: 0.7195869427919388
Fine-Tuned model BERTScore: 0.7531965786218643



