# Fine-Tuning BERT & GPT (HuggingFace)

In [15]:
!nvidia-smi
import torch
print(torch.cuda.is_available())

Thu Nov 27 09:18:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   60C    P0             30W /   70W |     102MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
#!pip install evaluate

In [16]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
    AutoModelForCausalLM
)
import evaluate

### Fine-Tune BERT (Text Classification)

In [17]:
# We fine-tune BERT on the SST2 sentiment dataset.
dataset = load_dataset("sst2")
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [18]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def tokenize(batch):
    return tokenizer(batch["sentence"], truncation=True)

tokenized = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [20]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
help(DataCollatorWithPadding)

Help on class DataCollatorWithPadding in module transformers.data.data_collator:

class DataCollatorWithPadding(builtins.object)
 |  DataCollatorWithPadding(tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = True, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, return_tensors: str = 'pt') -> None
 |
 |  Data collator that will dynamically pad the inputs received.
 |
 |  Args:
 |      tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
 |          The tokenizer used for encoding the data.
 |      padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
 |          Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
 |          among:
 |
 |          - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
 |            

In [21]:
import numpy
numpy.array([1,10,2,4]).argmax()

np.int64(1)

In [22]:
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

In [23]:
args = TrainingArguments(
    output_dir="bert-finetuned-sst2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)


In [31]:
#!pip install peft bitsandbytes accelerate -q

In [34]:
from peft import LoraConfig, get_peft_model

In [42]:
peft_config = LoraConfig(r=16,lora_alpha=32, target_modules=["query","value"], lora_dropout=0.05, bias = "none", task_type='SEQ_CLS')

In [None]:
help(LoraConfig)

In [43]:
model = get_peft_model(model,peft_config)

In [44]:
print(model.print_trainable_parameters)

<bound method PeftModel.print_trainable_parameters of PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                 

In [47]:
model.print_trainable_parameters()

trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372


In [45]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [46]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0618,0.283976,0.925459
2,0.0908,0.276515,0.925459


TrainOutput(global_step=8420, training_loss=0.06799452276524343, metrics={'train_runtime': 642.9222, 'train_samples_per_second': 209.509, 'train_steps_per_second': 13.096, 'total_flos': 2453736217768344.0, 'train_loss': 0.06799452276524343, 'epoch': 2.0})

In [25]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmadhuri-madhuris[0m ([33mmadhuri-madhuris-abc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.175,0.256713,0.916284
2,0.1203,0.297938,0.926606


TrainOutput(global_step=8420, training_loss=0.16934690362200885, metrics={'train_runtime': 1306.7244, 'train_samples_per_second': 103.081, 'train_steps_per_second': 6.444, 'total_flos': 2436910441971660.0, 'train_loss': 0.16934690362200885, 'epoch': 2.0})

In [None]:
# QAT + LoRA on BERT (4-bit During Train)
from transformers import BitsAndBytesConfig

# QAT Config (4-bit sim: model "sees" noise during train)
qat_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Reload model with QAT (apply LoRA on top)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, quantization_config=qat_config, device_map="auto"
)
model = get_peft_model(model, peft_config)  # Your r=16/alpha=32

# Train with QAT sim (1 epoch subset)
qat_args = TrainingArguments(
    output_dir="qat-lora-bert", num_train_epochs=1, per_device_train_batch_size=8,  # Smaller batch for sim
    learning_rate=2e-5, gradient_checkpointing=True  # Memory for QAT
)
qat_trainer = Trainer(model=model, args=qat_args, train_dataset=tokenized["train"].select(range(2000)),
                      eval_dataset=tokenized["validation"], tokenizer=tokenizer, data_collator=collator,
                      compute_metrics=compute_metrics)
qat_trainer.train()

# Post-QAT: Quantize & Eval (should hold acc better)
qat_model = qat_trainer.model  # Already sim'd
qat_acc = qat_trainer.evaluate()["eval_accuracy"]
print(f"QAT Train Acc: {qat_acc:.3f} (Holds better post-quant)")

# Save
qat_trainer.save_model("qat-lora-bert-sst2")

In [26]:
#trainer.save_model("lora-bert-sst2")  # Save LoRA-merged

##### Evaluate & Test BERT

In [28]:
clf = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
clf("do you expect me to say it was a good movie though it was so boring")

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.995940089225769}]

##### Fine-Tune GPT2 (Text Generation)

In [None]:
gpt_name = "gpt2"
gpt_tok = AutoTokenizer.from_pretrained(gpt_name)
gpt_tok.pad_token = gpt_tok.eos_token

gpt_model = AutoModelForCausalLM.from_pretrained(gpt_name)

In [None]:
text_ds = load_dataset("wikitext", "wikitext-2-raw-v1")

In [None]:
def tokenize_gpt(batch):
    return gpt_tok(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_text = text_ds.map(tokenize_gpt, batched=True, remove_columns=["text"])
tokenized_text

In [None]:
gpt_args = TrainingArguments(
    output_dir="gpt2-finetuned",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
)

In [None]:
gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["validation"],
    tokenizer=gpt_tok,
)

In [None]:
gpt_trainer.train()

In [None]:
gpt_trainer.save_model("gpt2-finetuned")

##### Test GPT2 Fine-Tuned

In [None]:
# GPT2 Generation Test
gen_pipe = pipeline("text-generation", model=gpt_model, tokenizer=gpt_tok, max_length=60)
gen_pipe("Deep learning is a revolutionary field because")[0]["generated_text"]

In [None]:
# Setup (run once)
# !pip install -q transformers datasets peft bitsandbytes accelerate evaluate wandb
# Imports (add to your notebook if needed)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, pipeline, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import time
import wandb
wandb.init(project="llm-workshop", name="ptq-bert-demo")  # Log to your project

# Load your fine-tuned BERT from notebook
model_dir = "bert-finetuned-sst2"  # From your trainer.save_model()
tokenizer = AutoTokenizer.from_pretrained(model_dir)
full_model = AutoModelForSequenceClassification.from_pretrained(model_dir, torch_dtype=torch.float16)

# Calibration data (SST-2 val subset)
dataset = load_dataset("sst2", split="validation[:128]")
def tokenize(batch): return tokenizer(batch["sentence"], truncation=True, padding=True)
calib_data = dataset.map(tokenize, batched=True)

# PTQ: Load in 8-bit (calibrates automatically)
quant_config = BitsAndBytesConfig(load_in_8bit=True)
quant_model = AutoModelForSequenceClassification.from_pretrained(
    model_dir, quantization_config=quant_config, device_map="auto", torch_dtype=torch.float16
)

# Eval accuracy (full vs PTQ)
accuracy = evaluate.load("accuracy")
def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

args = TrainingArguments(output_dir="ptq-temp", per_device_eval_batch_size=16, no_cuda=False)  # Eval only
trainer_full = Trainer(model=full_model, args=args, eval_dataset=calib_data, tokenizer=tokenizer, compute_metrics=compute_metrics)
full_acc = trainer_full.evaluate()["eval_accuracy"]

trainer_ptq = Trainer(model=quant_model, args=args, eval_dataset=calib_data, tokenizer=tokenizer, compute_metrics=compute_metrics)
ptq_acc = trainer_ptq.evaluate()["eval_accuracy"]

print(f"Full Acc: {full_acc:.3f} | PTQ 8-bit Acc: {ptq_acc:.3f} | Drop: {full_acc - ptq_acc:.3f}")

# Speed test (your pipeline style)
full_clf = pipeline("sentiment-analysis", model=full_model, tokenizer=tokenizer)
quant_clf = pipeline("sentiment-analysis", model=quant_model, tokenizer=tokenizer)

test_text = "This movie is surprisingly good!"
start = time.time(); full_out = full_clf(test_text); full_time = time.time() - start
start = time.time(); ptq_out = quant_clf(test_text); ptq_time = time.time() - start

print(f"Full Time: {full_time:.4f}s | PTQ Time: {ptq_time:.4f}s | Speedup: {full_time / ptq_time:.1f}x")
print("Sample PTQ Output:", ptq_out)

# Log to wandb
wandb.log({"full_acc": full_acc, "ptq_acc": ptq_acc, "speedup": full_time / ptq_time})
wandb.finish()

# Optional: Save PTQ model
quant_model.save_pretrained("ptq-bert-sst2")

In [None]:
from peft import AdaLoraConfig
adalora_config = AdaLoraConfig(
    init_r=64, target_modules=["q_proj", "v_proj"], lora_alpha=64, lora_dropout=0.05, task_type=TaskType.CAUSAL_LM,
    use_rslora=True
)
gpt_model = get_peft_model(gpt_model, adalora_config)  # Reload if needed: from_pretrained("gpt2-finetuned")
gpt_model.print_trainable_parameters()  # ~0.1% (pruned)

gpt_args.num_train_epochs = 1  # Reuse args
gpt_trainer = Trainer(model=gpt_model, args=gpt_args, train_dataset=tokenized_text["train"].select(range(500)),
                      eval_dataset=tokenized_text["validation"].select(range(100)), tokenizer=gpt_tok)
gpt_trainer.train()
gpt_trainer.save_model("adalora-gpt2")

gen_pipe = pipeline("text-generation", model=gpt_model, tokenizer=gpt_tok)
print(gen_pipe("AI is", max_new_tokens=20)[0]["generated_text"])  # More fluent

Paid A100 GPU Variant (Colab Pro+/AWS p4d – ~1 min, Batch 32):

In [None]:
# Same as above, but:
optimizer = torch.optim.AdamW(student.parameters(), lr=2e-5)
# In loop:
for batch in torch.utils.data.DataLoader(train_tokenized, batch_size=32, collate_fn=lambda x: tokenizer.pad(x, return_tensors="pt")):  # 2x batch
    # ... rest unchanged
args = TrainingArguments(..., per_device_eval_batch_size=32, bf16=True)  # A100 fast BF16
trainer_student = Trainer(model=student, args=args, ...)  # Eval on full val