In [7]:
# -----------------------------------------------------------
# 🟢 Cell 1 – Setup & Imports
# -----------------------------------------------------------
import os, time, numpy as np, torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

# Let cuDNN benchmark candidate kernels and reuse the fastest one per input shape.
# NOTE(review): autotuning may select different kernels from run to run, which can
# work against the seeding below — confirm this trade-off is intended.
torch.backends.cudnn.benchmark = True

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Mixed-precision gradient scaler.
# Loss scaling is only meaningful for CUDA float16 autocast, so the scaler is
# switched off on CPU (its scale/step/update calls then become pass-throughs)
# instead of emitting a "CUDA is not available" warning.
_amp_enabled = DEVICE.type == "cuda"
if hasattr(torch.amp, "GradScaler"):
    # Device-agnostic constructor; avoids the deprecation warning raised by
    # torch.cuda.amp.GradScaler() on recent PyTorch releases.
    scaler = torch.amp.GradScaler("cuda", enabled=_amp_enabled)
else:
    # Older PyTorch releases only provide the CUDA-namespaced class.
    scaler = GradScaler(enabled=_amp_enabled)

# Configuration
MODEL_NAME    = 'bert-base-uncased'
MAX_LEN       = 64
BATCH_SIZE    = 8           # try bumping to fill the GPU
NUM_WORKERS   = 4           # tune to your CPU cores
EPOCHS        = 3
LR            = 2e-5
WARMUP_STEPS  = 100
SEED          = 42

# Reproducibility: seed NumPy and PyTorch (CPU, plus every visible GPU)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

Using device: cuda


  scaler = GradScaler()


In [10]:
# Toy sentiment corpus.
# Every entry keeps a sentence next to its label (1 = positive, 0 = negative);
# zip() transposes the pairs into the two parallel lists used downstream.
sentences, labels = map(list, zip(
    ("This movie is great and amazing!", 1),
    ("This movie is terrible and awful.", 0),
    ("You are a wonderful person.", 1),
    ("You are a horrible person.", 0),
    ("I do not like this movie at all.", 0),
    ("Despite the rain, the picnic was fun.", 1),
    # Mixed example: counted as positive because of "good".
    ("The service was slow, but the food was good.", 1),
))

In [11]:
# -----------------------------------------------------------
# 🟢 Cell 2 – Data Preparation
# -----------------------------------------------------------
# Report which checkpoint is being used, then load its matching tokenizer.
print("\nLoading tokenizer ({})...".format(MODEL_NAME))
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class ToyDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item, on demand.

    Args:
        texts: Sequence of raw sentences.
        labels: Sequence of integer class ids, aligned index-by-index with
            ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` and returns a mapping containing
            ``input_ids`` and ``attention_mask`` with a leading batch axis.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail at construction time rather than with an IndexError (or with
        # silently ignored labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis
            removed) and ``labels`` (scalar ``torch.long`` tensor).
        """
        # Calling the tokenizer directly is the supported entry point;
        # ``encode_plus`` is the legacy spelling of the same operation.
        enc = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops the batch axis added by return_tensors='pt';
            # the DataLoader re-adds it when collating.
            'input_ids':     enc['input_ids'].squeeze(0),
            'attention_mask':enc['attention_mask'].squeeze(0),
            'labels':        torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap the toy corpus in a dataset and expose it as shuffled mini-batches.
dataset = ToyDataset(
    texts=sentences,
    labels=labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Page-locked host buffers are requested only when batches are headed for a GPU.
use_pinned_memory = DEVICE.type == "cuda"
data_loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=use_pinned_memory,
)


Loading tokenizer (bert-base-uncased)...


In [14]:
# -----------------------------------------------------------
# 🟢 Cell 3 – Model & Optimizer
# -----------------------------------------------------------
print(f"\nLoading model ({MODEL_NAME})...")

# Release whatever a previous run of this cell left behind so its GPU memory can
# be reclaimed before the new model is allocated. The optimizer and scheduler are
# dropped too because they keep the old parameters alive. pop() with a default
# (unlike a bare `del model`) is a no-op on a fresh kernel, so Restart & Run All
# works.
for _stale_name in ("model", "optimizer", "scheduler"):
    globals().pop(_stale_name, None)
torch.cuda.empty_cache()

# Load BERT with full-precision (fp32) master weights.
# Mixed precision is applied at run time by autocast + GradScaler in the training
# loop; loading the weights themselves as float16 makes GradScaler fail with
# "Attempting to unscale FP16 gradients".
# NOTE(review): output_attentions=True makes every forward pass return all
# attention maps, which costs memory during training — kept because later cells
# may rely on `outputs.attentions`; confirm.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    output_attentions=True,
    low_cpu_mem_usage=True
)
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LR)

total_steps = len(data_loader) * EPOCHS
# Cap warmup at 10% of the run. With a tiny dataset WARMUP_STEPS can exceed
# total_steps, in which case the learning rate would never leave the warmup ramp
# and the model would barely be updated.
warmup_steps = min(WARMUP_STEPS, total_steps // 10)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print(f"Total optimization steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")



Loading model (bert-base-uncased)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# -----------------------------------------------------------
# 🟢 Cell 4 – Training Loop (AMP + LR Scheduler)
# -----------------------------------------------------------
print(f"\nStarting fine-tuning for {EPOCHS} epochs...\n")

# Autocast is only enabled on CUDA; on CPU the forward pass runs in full
# precision and no "CUDA is not available" warning is emitted.
amp_enabled = DEVICE.type == "cuda"

model.train()
for epoch in range(1, EPOCHS + 1):
    epoch_loss = 0.0
    t0 = time.time()

    for batch in data_loader:
        # Non-blocking transfer
        input_ids      = batch['input_ids'].to(DEVICE,      non_blocking=True)
        attention_mask = batch['attention_mask'].to(DEVICE, non_blocking=True)
        # Deliberately not named `labels`: that would overwrite the notebook-level
        # list of labels and break a later re-run of the data-preparation cell.
        batch_labels   = batch['labels'].to(DEVICE,         non_blocking=True)

        optimizer.zero_grad()

        # AMP forward pass (the loss is computed inside the model)
        with torch.autocast(device_type=DEVICE.type, enabled=amp_enabled):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels
            )
            loss = outputs.loss

        scaler.scale(loss).backward()

        # scaler.step() skips the optimizer update when it finds inf/NaN
        # gradients, and update() then lowers the scale. Use that as the signal
        # to keep the LR schedule in lockstep with the updates actually applied.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()

        # Step scheduler once per applied optimizer update
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        epoch_loss += loss.item()

    # max(..., 1) guards against an empty loader
    avg_loss = epoch_loss / max(len(data_loader), 1)
    elapsed = time.time() - t0
    print(f"Epoch {epoch}/{EPOCHS} — loss: {avg_loss:.4f} — time: {elapsed:.1f}s")

print("\nFine-tuning complete.")