# In this project, I have perfomed: Fine-tuning BERT for text classification — the foundation for all modern NLP pipelines. I have used Hugging Face Transformers with Pytorch.

In [None]:
# -----------------------
# Required Dependencies
# -----------------------
!pip install transformers
!pip install --upgrade transformers

In [1]:
# -----------------------
# Required Imports 
# -----------------------
import torch
from transformers import BertTokenizer, BertForSequenceClassification # This 'transformers' s a hugging face library. 'BertForSequenceClassification' comes from Pytorch.
from datasets import load_dataset
import pandas as pd
from torch.optim import AdamW
import re
from torch.utils.data import DataLoader
from torchsummary import summary

2025-11-11 04:03:57.589408: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762833837.796833      98 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762833837.853500      98 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
dataset = load_dataset("imdb") # Loading IMDB dataset for sentiment analysis.
print(dataset)

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# -----------------------
# Cleaning the dataset 
# -----------------------
#  There are lot of HTML tags in the text (reviews). Note, the BertTokenizer automatically handles and removes the rare 'symbols', preserving only the common ones.
def clean_text(text):
    text = re.sub(r'<.*?>', ' ', text) # Removes all the HTML tags since there are many.
    text = re.sub(r'\s+', ' ', text) # Removes extra spaces
    
    return text.strip()

# Applying mapping the current text and saving the clean version in the same dictionary keys (train & test)
dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# ---------------------
# Tokenization
# --------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # bert-base-uncased → standard English BERT. Tokenizer converts words → token IDs. The 'BertTokenizer' is also a pretrained model just like the 'BERT' itself (Here, 'bert-base-uncased' is a pretrained-english-tokenizer).
# Performing Tokenization in batches which is recommended for faster computing
# Also, the 'BertTokenizer' is already lower-cased. It also pads the sequences.
def tokenize(batch):
    return tokenizer(batch['text'],  padding = 'max_length', max_length = 128, truncation = True) # By default, the padding is 'post' that's why not written padding = 'post', instead written padding = 'max_len' which will pad each sequence to the maximum length which 128. 128 is generally sufficient for small texts(reviews, emails, etc). truncate = True will truncate sentences longer than 128 length.

dataset = dataset.map(tokenize, batched = True)

print(dataset)  # The tokenizer returns a dictionary with multiple keys, like: 'input_ids' → token IDs for each word/sub word, 'token_type_ids' → segment IDs (used by BERT for sentence pairs), 'attention_mask' → 1 for real tokens, 0 for padding. map() automatically adds these new columns to your dataset while keeping the original ones ('text' and 'label').
# Also, note that all the features in this dataset-dictionary (even 'text'-feature) are used by the model internally for validation and loss calculation. "But, how 'text' will be used for loss_calculation and validation?" - The model will compute the 'token_ids' for the actual input sentences each time when it wants in order to calculate the loss and compare it with the predicted 'input_ids'.


dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label']) # Keep only the necessary columns for model training. 'torch' is the specifier to make the dataset as 'torch' tensors.


train_loader = DataLoader(dataset['train'], batch_size = 16, shuffle = True) # Creating DataLoaders.
test_loader = DataLoader(dataset['train'], batch_size = 16)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [5]:
# -----------------
# Model Training 
# -----------------
# The workflow and methods to train and compile is different from Tensorflow in Pytorch
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2) 
optimizer = AdamW(model.parameters(), lr = 5e-5) # Using the AdamW optimizer because: AdamW --> Adam + correct weight decay → better generalization, stable training, less overfitting. Use the AdamW from the hugging face itself. This much Learning rate is a sweet-spot for BERT and is recommended.
criterion = torch.nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --------- Training Loop ---------
model.train() # This: Puts the model in training mode and Activates dropout and batch normalization (training-specific layers). Without 'model.train()' these layers will behave as in evaluation mode, which will make the training inaccurate.
for epoch in range(1): # Training with only 1 epoch is sufficient for checking if whether everything is working fine or not.
    total_loss = 0 # This will capture the loss at each batch.
    for batch in train_loader: # Iterating through each batch.
        optimizer.zero_grad() # This will reset the gradients of the previous batch for the new batch.

        outputs = model(**{k: v.to(device) for k, v in batch.items() if k != 'label'}, labels = batch['label'].to(device)) # outputs is a SequenceClassifierOutput object.
        
        loss = outputs.loss # outputs is a SequenceClassifierOutput object with: outputs.logits → the raw predictions and outputs.loss → the loss computed automatically
        total_loss += loss.item() # loss.item() → converts the PyTorch tensor to a Python float so we can accumulate it. total_loss += loss.item() → sum the losses across all batches.

        loss.backward() # This is a Back propagation which calculates gradients for all model parameters.
        optimizer.step() # This Updates the model weights based on the gradients.

    print(f"Epoch: {epoch + 1} | Training Loss: {total_loss / len(train_loader):.4f}") # len(train_loader) will give the total no. of batches. Dividing it with the accumulated total_loss gives an average loss of each batch.

print(model.config)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1 | Training Loss: 0.3421
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [15]:
# ----------------
# Evaluation
# ---------------
model.eval() # Puts the model into evaluation mode. Disables dropout and other training-only layers for consistent results.

correct, total = 0, 0 # correct → counts correct predictions. total → counts total examples

with torch.no_grad(): # torch.no_grad() → disables gradient calculation (saves memory and speeds up evaluation).
    for batch in test_loader:
        outputs = model(**{k: v.to(device) for k, v in batch.items() if k != 'label'})
        preds = torch.argmax(outputs.logits, dim = 1) # torch.argmax(..., dim=1) → takes the index of the highest logit, i.e., predicted class (0 or 1)
        labels = batch["label"].to(device)
        correct += (preds == labels).sum().item() # 'preds == batch['label']' This compares predicted labels with true labels element-wise. .sum() Counts how many True values are there. In PyTorch, True is treated as 1 and False as 0. .item() Converts a single-element tensor to a Python number, why? because PyTorch tensors are not plain numbers, and for arithmetic or printing, we usually want a float/int.
        total += labels.size(0) # batch['label'] - This is a tensor containing the true labels for the current batch. '.size(0)' will give the total no. of values in each batch and give the final total value as a value which is the sum of all the values in all batches.
print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 94.42%


In [26]:
# ----------------------
# Testing 
# ----------------------
sample_texts = ["I loved this movie!", "This movie was terrible."]

inputs = tokenizer(sample_texts, max_length = 128, padding = 'max_length', truncation = True, return_tensors="pt") # This will return the output as a tensorflow tensors.
outputs = model(**{k: v.to(device) for k, v in inputs.items()})

predictions = torch.argmax(outputs.logits, dim = 1) # 'outputs.logits' will return the shape of (batch_size, num_labels). Since the labels are at the axis 1 so, I am using that axis for fetching the max value.
print(predictions.cpu().numpy()) # Numpy expects a CPU computation only.

[1 0]


In [None]:
model.save_pretrained("/kaggle/working/bert_sentiment_model")
tokenizer.save_pretrained("/kaggle/working/bert_sentiment_model")

In [None]:
!zip -r bert_sentiment_model.zip bert_sentiment_model

# Model has predicted the sample texts correctly even if it was trained with only 1 epoch!