# In this project, I have perfomed: Fine-tuning BERT for text classification — the foundation for all modern NLP pipelines. I have used Hugging Face Transformers with Pytorch.

In [None]:
# -----------------------
# Required Dependencies
# -----------------------
!pip install transformers
!pip install --upgrade transformers

In [1]:
# -----------------------
# Required Imports 
# -----------------------
import torch
from transformers import BertTokenizer, BertForSequenceClassification # This 'transformers' s a hugging face library. 'BertForSequenceClassification' comes from Pytorch.
from datasets import load_dataset, load_from_disk
import pandas as pd
from torch.optim import AdamW
import re
import numpy as np
from torch.utils.data import DataLoader
from torchsummary import summary

2025-11-17 04:51:44.904757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763355105.103125     125 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763355105.156957     125 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
dataset = load_dataset("imdb") # Loading IMDB dataset for sentiment analysis.
print(dataset)

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# -----------------------
# Cleaning the dataset 
# -----------------------
#  There are lot of HTML tags in the text (reviews). Note, the BertTokenizer automatically handles and removes the rare 'symbols', preserving only the common ones.
def clean_text(text):
    text = re.sub(r'<.*?>', ' ', text) # Removes all the HTML tags since there are many.
    text = re.sub(r'\s+', ' ', text) # Removes extra spaces
    
    return text.strip()

# Applying mapping the current text and saving the clean version in the same dictionary keys (train & test)
dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})

In [3]:
# ---------------------------------------------
# Checking if dataset is balanced or not
# ---------------------------------------------
labels = dataset["train"]["label"]
neg_label = labels.count(0)
pos_label = labels.count(1)

print(f"Negative Label: {neg_label}")
print(f"Positive Label: {pos_label}")

Negative Label: 12500
Positive Label: 12500


### The dataset is balanced.

In [None]:
# ---------------------
# Tokenization
# --------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # bert-base-uncased → standard English BERT. Tokenizer converts words → token IDs. The 'BertTokenizer' is also a pretrained model just like the 'BERT' itself (Here, 'bert-base-uncased' is a pretrained-english-tokenizer).
# Performing Tokenization in batches which is recommended for faster computing
# Also, the 'BertTokenizer' is already lower-cased. It also pads the sequences.
def tokenize(batch):
    return tokenizer(batch['text'],  padding = 'max_length', max_length = 128, truncation = True) # By default, the padding is 'post' that's why not written padding = 'post', instead written padding = 'max_len' which will pad each sequence to the maximum length which 128. 128 is generally sufficient for small texts(reviews, emails, etc). truncate = True will truncate sentences longer than 128 length.

dataset = dataset.map(tokenize, batched = True)

print(dataset)  # The tokenizer returns a dictionary with multiple keys, like: 'input_ids' → token IDs for each word/sub word, 'token_type_ids' → segment IDs (used by BERT for sentence pairs), 'attention_mask' → 1 for real tokens, 0 for padding. map() automatically adds these new columns to your dataset while keeping the original ones ('text' and 'label').
# Also, note that all the features in this dataset-dictionary (even 'text'-feature) are used by the model internally for validation and loss calculation. "But, how 'text' will be used for loss_calculation and validation?" - The model will compute the 'token_ids' for the actual input sentences each time when it wants in order to calculate the loss and compare it with the predicted 'input_ids'.


dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label']) # Keep only the necessary columns for model training. 'torch' is the specifier to make the dataset as 'torch' tensors.

# Splitting the test data into test and validation data
dataset = load_from_disk("/kaggle/input/train-test-validation-tokenizer-of-imdb-dataset/IMDB_train_test_validation_tokenizer") ## Optional loading.
test_split = dataset["test"].train_test_split(test_size = 0.2, seed = 42)
test_dataset = test_split["train"]
valid_dataset = test_split["test"]

train_loader = DataLoader(dataset['train'], batch_size = 16, shuffle = True) # Creating DataLoaders.
test_loader = DataLoader(test_dataset, batch_size = 16)
valid_loader = DataLoader(valid_dataset, batch_size = 16) # Validation dataset is required for temperature scaling.

In [4]:
# # This block is executable when tokenized dataset is to be loaded from disk.
# # Splitting the test data into test and validation data
# dataset = load_from_disk("/kaggle/input/train-test-validation-tokenizer-of-imdb-dataset/IMDB_train_test_validation_tokenizer") ## Optional loading.
# test_split = dataset["test"].train_test_split(test_size = 0.2, seed = 42, load_from_cache_file=False, test_indices_cache_file_name="/kaggle/working/test.idx",train_indices_cache_file_name="/kaggle/working/valid.idx")

# test_dataset = test_split["train"]
# valid_dataset = test_split["test"]

# train_loader = DataLoader(dataset['train'], batch_size = 16, shuffle = True) # Creating DataLoaders.
# test_loader = DataLoader(test_dataset, batch_size = 16)
# valid_loader = DataLoader(valid_dataset, batch_size = 16) # Validation dataset is required for temperature scaling.

In [None]:
dataset.save_to_disk("IMDB_train_test_validation_tokenizer")
!zip -r IMDB_train_test_validation_tokenizer.zip IMDB_train_test_validation_tokenizer


In [5]:
# -----------------
# Model Training 
# -----------------
# The workflow and methods to train and compile is different from Tensorflow in Pytorch
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2) # bert-base-uncased is: A pretrained English-language BERT model (base size, lowercase text) trained on the BooksCorpus and English Wikipedia datasets
optimizer = AdamW(model.parameters(), lr = 5e-5) # Using the AdamW optimizer because: AdamW --> Adam + correct weight decay → better generalization, stable training, less overfitting. Use the AdamW from the hugging face itself. This much Learning rate is a sweet-spot for BERT and is recommended.
criterion = torch.nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --------- Training Loop ---------
model.train() # This: Puts the model in training mode and Activates dropout and batch normalization (training-specific layers). Without 'model.train()' these layers will behave as in evaluation mode, which will make the training inaccurate.
for epoch in range(1): # Training with only 1 epoch is sufficient for checking if whether everything is working fine or not.
    total_loss = 0 # This will capture the loss at each batch.
    for batch in train_loader: # Iterating through each batch.
        optimizer.zero_grad() # This will reset the gradients of the previous batch for the new batch.

        outputs = model(**{k: v.to(device) for k, v in batch.items() if k != 'label'}, labels = batch['label'].to(device)) # outputs is a SequenceClassifierOutput object.
        
        loss = outputs.loss # outputs is a SequenceClassifierOutput object with: outputs.logits → the raw predictions and outputs.loss → the loss computed automatically
        total_loss += loss.item() # loss.item() → converts the PyTorch tensor to a Python float so we can accumulate it. total_loss += loss.item() → sum the losses across all batches.

        loss.backward() # This is a Back propagation which calculates gradients for all model parameters.
        optimizer.step() # This Updates the model weights based on the gradients.

    print(f"Epoch: {epoch + 1} | Training Loss: {total_loss / len(train_loader):.4f}") # len(train_loader) will give the total no. of batches. Dividing it with the accumulated total_loss gives an average loss of each batch.

print(model.config)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1 | Training Loss: 0.3485
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [6]:
# --------------------------
# Temperature Scaling
# -------------------------
# Taking the output logits of validation dataset for teaching the temperature and later, I will use test ataset for evaluation on scaled logits. We need to teach the temperature on the validation data and then use the temperature to scale the logits for test data.
model.eval() # Puts the model into evaluation mode. Disables dropout and other training-only layers for consistent results.

# Creating the list of labels of the entire test dataset + the list of all the probabilites outputs.
labels_list = [] # It stores all the labels
logits_list = []

with torch.no_grad(): # torch.no_grad() → disables gradient calculation (saves memory and speeds up evaluation).
    for batch in valid_loader:
        outputs = model(**{k: v.to(device) for k, v in batch.items() if k != 'label'})
        logits_list.append(outputs.logits.cpu()) #'cpu' is needed for numpy during ECE calculation.
        labels = batch["label"].to(device)
        labels_list.append(labels.cpu())

labels_ts = torch.cat(labels_list)
logits_ts = torch.cat(logits_list)

In [7]:
# Optimizing Temperature
# The aim is to get the actual and correct probability values of test data on the basis of scaled logits. Since we have got the value of ECE, it's the time to get the real probability values of test data and then compute the ECE again on those probability values to get the actual ECE value for correct inference. To do this; we will perform temperature scaling to scale the output logits.
temperature = torch.tensor(1.0, dtype = torch.float32, requires_grad = True) 
optimizer_ts = torch.optim.LBFGS([temperature], lr = 0.01, max_iter = 50) # Updates temperature value after each iteration inside eval_fn.

criterion_ts = torch.nn.CrossEntropyLoss()

def eval_fn():
    optimizer_ts.zero_grad()
    scaled_logits = logits_ts / temperature # Scaled Logits
    loss = criterion_ts(scaled_logits, labels_ts)
    loss.backward()
    return loss

optimizer_ts.step(eval_fn) # Runs the eval_fn repeatdly until max_iter.
print(f"Learned Temperature: {temperature.item():.4f}") # 'temperature' will be automatically updated. Now, this temperature has been successfully learnt throught the outputs.logits, now this temperature will be used to scale the same outputs.logits to calculate the real probabilities and then to calculate to real ECE.

Learned Temperature: 0.8900


In [8]:
# ----------------------------
# Temperature function
# ----------------------
def apply_temperature(logits):
    return logits / temperature.item()

In [None]:
# -------------------------------------------------
# Evaluation and Probability calculation loop
# -------------------------------------------------
total, correct = 0.0 ,0.0 # correct → counts correct predictions. total → counts total samples.
model.eval()
probs1 = []
labels1 = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(**{k: v.to(device) for k, v in batch.items() if k!='label'})
        labels = batch['label'].to(device)
        labels1.append(labels.cpu()) # Numpy expects cpu tensors.
        scaled_logits = apply_temperature(outputs.logits) # Now, I am using scaled_logits as output logits to get softmax probabilities.
        probs = torch.softmax(scaled_logits, dim = 1)[:, 1]
        probs1.append(probs.cpu())
        preds = torch.argmax(scaled_logits, dim = 1) # torch.argmax(..., dim=1) → takes the index of the highest logit, i.e., predicted class (0 or 1). Basically, 'outputs.logits' is are the values that the softmax takes to make them between 0-1. We can use the same logic of 'torch.argamx' because, we get the same output label if we use '.argmax(softmax)' or '.argmax(outputs.logits)'. Note: 'outputs.logits' not the probability values.
        correct += (preds == labels).sum().item() # 'This compares predicted labels with true labels element-wise. .sum() Counts how many True values are there. In PyTorch, True is treated as 1 and False as 0. .item() Converts a single-element tensor to a Python number, why? because PyTorch tensors are not plain numbers, and for arithmetic or printing, we usually want a float/int.
        total += labels.size(0) # batch['label'] - This is a tensor containing the true labels for the current batch. '.size(0)' will give the total no. of values in each batch and give the final total value as a value which is the sum of all the values in all batches.

print(f"Test Accuracy: {100 * correct / total:.4f}%") # Total samples / Correct samples = Test accuracy. Multiplying with 100 to get the accuracy in percentage.        
probs1_numpy = torch.cat(probs1).numpy() # '.cat' concatenates all the list of pytorch-tensors into a single pytorch-tensor, then numpy converts them into numpy array.
labels1_numpy = torch.cat(labels1).numpy()

Test Accuracy: 88.3000%


In [10]:
# ---------------------------------------
# Expected Calibration Error (ECE)
# ---------------------------------------
def expected_calibration_error(probs, labels, n_bins = 15):
    bins = np.linspace(0, 1, n_bins + 1) # Gives 16 values between 0 and 1 separated by equal space. E.g., (0, 0.2, 0.4,..)
    ece = 0.0 # Stores the Expected Calibration Error
 
    for i in range(n_bins):
        bin_lower, bin_upper = bins[i], bins[i+1]
        in_bin = (probs > bin_lower) & (probs <= bin_upper)
        prop_in_bin = np.mean(in_bin) # This gives proportion of probs lying in the bin.
        if prop_in_bin > 0:
            acc = np.mean(labels[in_bin] == (probs[in_bin] > 0.5)) # Calculating accuracy.
            conf = np.mean(probs[in_bin])
            ece += np.abs(acc - conf) * prop_in_bin
    return ece

ece = expected_calibration_error(probs1_numpy, labels1_numpy)
print(f"ECE: {ece:.4f}")

ECE: 0.3757


# Before TEMPERATURE SCALING, ECE score was 0.4186 and Test Accuracy was 85.0150 - The model's accurate 85% of the time; but the confidence of the model is higher from the accuracy which means the model is overconfident and we can't trust its predictions.

# After TEMPERATURE SCALING, ECE SCORE is 0.3986 and Test Accuracy is 88.4500% - There is a 2 percent increase in the ECE even after temperature scaling because I have trained the model for 1 epoch only due to resource shortage. Yet accuracy is a little higher.

In [11]:
# ------------------------------------------
# Brier Score Computation
# ------------------------------------------
def compute_brier_score(probs, labels):
    brier_score = np.mean((probs - labels) ** 2)
    return f"Brier Score: {brier_score:.4f}"

compute_brier_score(probs1_numpy, labels1_numpy)

'Brier Score: 0.0847'

## Brier Score is 0.0847. This means the model's probabilities are closer to the true labels, but the model is not reliable because it is overconfident as observed in the ECE.

In [12]:
# ----------------------
# Testing 
# ----------------------
sample_texts = ["I loved this movie!", "This movie was terrible."]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(sample_texts, max_length = 128, padding = 'max_length', truncation = True, return_tensors="pt") # This will return the output as a tensorflow tensors.
outputs = model(**{k: v.to(device) for k, v in inputs.items()})

predictions = torch.argmax(outputs.logits, dim = 1) # 'outputs.logits' will return the shape of (batch_size, num_labels — 0&1 here). Since the labels are at the axis 1 so, I am using that axis for fetching the max value.
print(predictions.cpu().numpy()) # Numpy expects a CPU computation only.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[1 0]


# Model has predicted the sample texts correctly even if it was trained with only 1 epoch!

In [13]:
model.cpu()
model.save_pretrained("./bert_sentiment_model")

In [14]:
model.save_pretrained("/kaggle/working/bert_sentiment_model")
tokenizer.save_pretrained("/kaggle/working/bert_sentiment_model")

('/kaggle/working/bert_sentiment_model/tokenizer_config.json',
 '/kaggle/working/bert_sentiment_model/special_tokens_map.json',
 '/kaggle/working/bert_sentiment_model/vocab.txt',
 '/kaggle/working/bert_sentiment_model/added_tokens.json')

In [15]:
tokenizer.save_pretrained("./kaggle/working/bert_sentiment_mode")

('./kaggle/working/bert_sentiment_mode/tokenizer_config.json',
 './kaggle/working/bert_sentiment_mode/special_tokens_map.json',
 './kaggle/working/bert_sentiment_mode/vocab.txt',
 './kaggle/working/bert_sentiment_mode/added_tokens.json')

In [16]:
!zip -r bert_sentiment_model.zip bert_sentiment_model


  adding: bert_sentiment_model/ (stored 0%)
  adding: bert_sentiment_model/model.safetensors (deflated 7%)
  adding: bert_sentiment_model/tokenizer_config.json (deflated 75%)
  adding: bert_sentiment_model/vocab.txt (deflated 53%)
  adding: bert_sentiment_model/config.json (deflated 49%)
  adding: bert_sentiment_model/special_tokens_map.json (deflated 42%)
