## Libraries

In [1]:
#  Install required packages
!pip install -U -q transformers accelerate peft bitsandbytes
!pip install -U -q git+https://github.com/huggingface/trl.git
!pip install -q evaluate
!pip install -q sacrebleu


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
#  Imports
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from trl import SFTTrainer
import torch
import math
from tqdm import tqdm

##Dataset

In [4]:
# ✅ Load QnA dataset from plain .txt files
def load_qna(path):
    with open(path, "r", encoding="utf-8") as f:
        blocks = f.read().strip().split("\n\n")  # Two newlines = one QnA pair
    return [{"text": block.strip()} for block in blocks]

train_dataset = Dataset.from_list(load_qna("Train.txt"))
val_dataset   = Dataset.from_list(load_qna("Validation.txt"))

print("Train samples:", len(train_dataset))
print("Val samples:", len(val_dataset))
print("Example:", train_dataset[0]["text"])

Train samples: 800
Val samples: 100
Example: Q: How do I get in touch with the company?
A: You can email us directly at unthealthandfood@gmail.com for any help or questions.


In [5]:
# ✅ Load GPT-2-Large with 4-bit quantization
model_name = "gpt2-large"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

## Model building

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# avoiding padding issues
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
#  Apply LoRA for QLoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [8]:
#  Training setup
training_args = TrainingArguments(
    output_dir="./gpt2-qlora-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_total_limit=2,
    save_steps=100,
    report_to="none"
)

In [9]:
#  Fine-tune using SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    formatting_func=lambda x: x["text"],
    peft_config=lora_config
)



Applying formatting function to train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
#Train model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.5266
20,2.8259
30,2.3829
40,1.8648
50,1.532
60,1.1963
70,0.9863
80,0.8285
90,0.7266
100,0.6065


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=300, training_loss=0.7549880544344584, metrics={'train_runtime': 484.977, 'train_samples_per_second': 4.949, 'train_steps_per_second': 0.619, 'total_flos': 444583053250560.0, 'train_loss': 0.7549880544344584})

## Model Evaluation

In [14]:
# Evaluate on validation set
eval_results = trainer.evaluate()

#  Validation Loss
eval_loss = eval_results["eval_loss"]
print(f"Validation Loss: {eval_loss:.4f}")

#  Validation Perplexity
perplexity = math.exp(eval_loss)
print(f"📉 Validation Perplexity: {perplexity:.2f}")

#  Exact Match Accuracy on Validation Set
def compute_exact_match(model, tokenizer, dataset):
    model.eval()
    correct = 0
    total = 0

    for example in tqdm(dataset, desc="Evaluating EM"):
        input_text = example["text"]
        if "A:" not in input_text:
            continue
        question, true_answer = input_text.split("A:", 1)
        prompt = question.strip() + "A:"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=50)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract model's answer
        pred_answer = generated.split("A:")[-1].strip().split("\n")[0]
        true_answer = true_answer.strip().split("\n")[0]

        if pred_answer == true_answer:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0.0

em_score = compute_exact_match(model, tokenizer, val_dataset)
print(f"✅ Exact Match Accuracy on Validation Set: {em_score * 100:.2f}%")

Validation Loss: 0.1686
📉 Validation Perplexity: 1.18


Evaluating EM:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   1%|          | 1/100 [00:01<03:00,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   2%|▏         | 2/100 [00:03<02:29,  1.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   3%|▎         | 3/100 [00:04<02:30,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   4%|▍         | 4/100 [00:06<02:19,  1.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   5%|▌         | 5/100 [00:07<02:19,  1.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   6%|▌         | 6/100 [00:09<02:23,  1.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating EM:   7%|▋         | 7/100 [00:10<02:15,  1.45s/it]Setting 

✅ Exact Match Accuracy on Validation Set: 96.00%





In [15]:
print(f"Validation Loss: {eval_loss:.4f}")
print(f"📉 Validation Perplexity: {perplexity:.2f}")
print(f"✅ Exact Match Accuracy on Validation Set: {em_score * 100:.2f}%")

Validation Loss: 0.1686
📉 Validation Perplexity: 1.18
✅ Exact Match Accuracy on Validation Set: 96.00%
