# Fine-Tuning BERT & GPT (HuggingFace)

In [None]:
#!pip install evaluate

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
    AutoModelForCausalLM
)
import evaluate

### Fine-Tune BERT (Text Classification)

In [None]:
# Load the SST2 sentiment dataset; BERT is fine-tuned on it in the cells below.
dataset = load_dataset(path="sst2")
dataset

In [None]:
# Checkpoint that both the tokenizer and the classifier are initialised from.
model_name = "bert-base-uncased"

# Human-readable names for the two SST2 classes (0 = negative, 1 = positive).
# Storing them in the model config lets `pipeline(...)` report these names
# instead of the generic "LABEL_0" / "LABEL_1".
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def tokenize(batch):
    """Tokenize the ``sentence`` column of a batch, with truncation enabled.

    Padding is not requested here.
    """
    sentences = batch["sentence"]
    return tokenizer(sentences, truncation=True)


# Run the tokenizer over the dataset in batched mode.
tokenized = dataset.map(function=tokenize, batched=True)

In [None]:
# Batch collator that pads examples, configured from the BERT tokenizer.
collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Return the accuracy for one evaluation result.

    ``pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max of the logits over the last axis.
    """
    scores, gold = pred
    return accuracy.compute(predictions=scores.argmax(-1), references=gold)

In [None]:
# Hyper-parameters for the BERT fine-tuning run.
args = TrainingArguments(
    output_dir="bert-finetuned-sst2",
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
)


In [None]:
# Wire the model, tokenized splits, collator, and metric into one Trainer.
# NOTE(review): recent transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

In [None]:
# Launch the BERT fine-tuning run with the Trainer configured above.
trainer.train()



Epoch,Training Loss,Validation Loss


##### Evaluate & Test BERT

In [None]:
# Wrap the fine-tuned model in a classification pipeline and try one sentence.
clf = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)
clf("This movie was surprisingly good!")

### Fine-Tune GPT2 (Text Generation)

In [None]:
gpt_name = "gpt2"

# Load the causal language model and its tokenizer from the same checkpoint.
gpt_model = AutoModelForCausalLM.from_pretrained(gpt_name)
gpt_tok = AutoTokenizer.from_pretrained(gpt_name)

# Reuse the end-of-sequence token as the padding token; the tokenization
# cell below pads every example to a fixed length.
gpt_tok.pad_token = gpt_tok.eos_token

In [None]:
# WikiText-2 (raw variant) is the text corpus used for GPT2 fine-tuning.
text_ds = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [None]:
def tokenize_gpt(batch):
    """Tokenize the ``text`` column into fixed-length sequences.

    Every example is truncated or padded to exactly 128 tokens.
    """
    return gpt_tok(batch["text"], truncation=True, padding="max_length", max_length=128)


# Drop blank rows before tokenizing: with `padding="max_length"` a
# whitespace-only line would become a sequence made solely of padding tokens,
# which carries no training signal.
non_empty_text = text_ds.filter(lambda row: bool(row["text"].strip()))

tokenized_text = non_empty_text.map(tokenize_gpt, batched=True, remove_columns=["text"])
tokenized_text

In [None]:
# Hyper-parameters for the GPT2 fine-tuning run (a single epoch).
gpt_args = TrainingArguments(
    output_dir="gpt2-finetuned",
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-5,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch
    eval_strategy="epoch",
)

In [None]:
# Imported here so this cell is self-contained.
from transformers import DataCollatorForLanguageModeling

# The tokenized dataset only holds `input_ids` / `attention_mask`; a causal LM
# needs `labels` to produce a loss. With `mlm=False` this collator copies
# `input_ids` into `labels` and masks padding positions out of the loss.
gpt_collator = DataCollatorForLanguageModeling(tokenizer=gpt_tok, mlm=False)

gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["validation"],
    tokenizer=gpt_tok,
    data_collator=gpt_collator,
)

In [None]:
# Launch the GPT2 fine-tuning run with the Trainer configured above.
gpt_trainer.train()

##### Test GPT2 Fine-Tuned

In [None]:
# Generation smoke test: build a text-generation pipeline around the
# fine-tuned GPT2 model and show the first continuation of a fixed prompt.
gen_pipe = pipeline(
    task="text-generation",
    model=gpt_model,
    tokenizer=gpt_tok,
    max_length=60,
)
gen_pipe("Deep learning is a revolutionary field because")[0]["generated_text"]

#### When to Fine-Tune vs. Use Embeddings or PEFT