In [None]:
# Notebook: Hugging Face Fine-Tuning Model with GLUE SST2
# Author: Thomas Purk
# Date: 2025-04-02
# Reference: https://huggingface.co/docs/tokenizers/index
# Reference: https://huggingface.co/docs/datasets/en/index
# Reference: https://huggingface.co/datasets/nyu-mll/glue
# Reference: https://huggingface.co/google-bert/bert-base-cased
# Reference: https://huggingface.co/docs/evaluate/index

# Hugging Face Fine-Tuning Model with GLUE SST2

This notebook demonstrates fine-tuning of the 'bert-base-cased' model checkpoint using the GLUE SST2 dataset. The original model was trained for a mask-filling task. These steps replace the mask-filling head with a sentiment analysis head. This is accomplished using the Hugging Face Trainer API and the GLUE SST2 dataset, which has "positive" and "negative" labels.

**Model**

> Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model is case-sensitive: it makes a difference between english and English.

https://huggingface.co/google-bert/bert-base-cased


**Dataset**

>The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. It uses the two-way (positive/negative) class split, with only sentence-level labels.


>Wang, Alex, Amanpreet, Singh, Julian, Michael, Felix, Hill, Omer, Levy, Samuel R., Bowman. "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding." In the Proceedings of ICLR. 2019.


https://huggingface.co/datasets/nyu-mll/glue

## Notebook Setup

In [None]:
# Package Installs

# Uncomment the next line to install the required packages (e.g. on a fresh runtime)
#!pip install transformers evaluate datasets
# List the installed versions of the packages this notebook relies on
!pip list | grep "transformers*\|datasets*\|evaluate*"

In [None]:
# Setup the Notebook

# General
import os
import json
import logging
logging.getLogger("transformers").setLevel(logging.WARNING) # Suppress unnecessary logging

# Visualization
import pprint
from IPython.core.display import display, HTML

# Data, Science, & Math
import numpy as np
import pandas as pd

# NLP
import transformers
import evaluate
from datasets import load_dataset
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import pipeline
from transformers import TrainingArguments
from transformers import Trainer
import torch

# Debugging aid: make CUDA kernel launches synchronous so that device-side
# errors are reported at the call that caused them.
# CUDA reads this setting from the process environment; assigning a plain
# Python variable has no effect. It is set here, before the notebook makes
# its first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Report which device PyTorch will run on
if not torch.cuda.is_available():
    print("CUDA is not available. Running on CPU.")
else:
    # Look up the name of the currently selected CUDA device
    device_id = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(device_id)
    print(f"GPU Name: {gpu_name}")

In [None]:
# Notebook functions

def tokenize_function(data):
    """ Runs the notebook's tokenizer over the "sentence" field of a dataset item

        Args:
            data (dictionary): An item (or batch of items) from the dataset
                that has a "sentence" key

        Returns:
            The tokenizer's output for the sentence(s), padded to the
            maximum length and truncated
    """

    # Text to encode
    sentences = data["sentence"]

    # Encode with the module-level tokenizer; every sample is padded to
    # the maximum length and over-long samples are cut off
    encoded = tokenizer(
        sentences,
        truncation=True,
        padding="max_length"
    )
    return encoded

def compute_metrics(evals_preds):
    """ Scores the model's predictions against the reference labels using accuracy

        Args:
            evals_preds (EvalPrediction): unpacks into a (logits, labels) pair

        Returns:
            dict: The result of the accuracy metric's compute() call
    """

    # Split the evaluation output into the raw model scores and the true labels
    raw_scores, gold_labels = evals_preds

    # Load the accuracy metric through the evaluate library
    accuracy_metric = evaluate.load("accuracy")

    # The index of the largest score along the last axis is the predicted class id
    predicted_ids = np.argmax(raw_scores, axis=-1)

    # Compare the predicted class ids with the reference labels
    scores = accuracy_metric.compute(
        predictions=predicted_ids,
        references=gold_labels
    )
    return scores

## Data Setup

In [None]:
# Load the SST2 configuration of the GLUE benchmark https://huggingface.co/datasets/nyu-mll/glue
ds_sst2 = load_dataset("glue", "sst2")
# Alternative dataset tried during development (left disabled):
#ds_sst2 = load_dataset("yelp_review_full")
# Display the type of the loaded object as the cell output
type(ds_sst2)

In [None]:
# Show Dataset Info: column names first, then the row count per split
display(ds_sst2.column_names)
print('')
split_sizes = ds_sst2.num_rows  # indexed by split name below
print(f'Total Training Rows: {split_sizes["train"]}')
print(f'Total Test Rows: {split_sizes["test"]}')

In [None]:
# Example Records: the first item of the train and test splits
first_train_record = ds_sst2["train"][0]
first_test_record = ds_sst2["test"][0]
print(f'Example Train Record: {first_train_record}')
print(f'Example Test Record: {first_test_record}')

In [None]:
# Validate Labels

# Debug Possible Errors
# Error on GPU: "RuntimeError: CUDA error: device-side assert triggered"
# Error on CPU: "IndexError: Target -1 is out of bounds."
# Both point at label values the classifier head cannot index, so check for any -1 labels

# Collect the distinct label values of each split
train_label_values = set(ds_sst2["train"]["label"])
test_label_values = set(ds_sst2["test"]["label"])

print(f'Unique Training Labels: {train_label_values}')
print('')
print(f'Unique Test Labels: {test_label_values}')

In [None]:
# The sst2 test dataset's labels are hidden (-1), so they cannot be used for validation during training.
# The train dataset is much larger than needed for this demonstration,
# so a held-out test set is carved out of it instead.

# 80% train / 20% test split of the original train set
split_dataset = ds_sst2["train"].train_test_split(seed=42, test_size=0.2)

# Shuffle each part, then keep a small fixed-size sample of it
shuffled_train = split_dataset["train"].shuffle(seed=42)
shuffled_test = split_dataset["test"].shuffle(seed=42)
ds_train_split = shuffled_train.select(range(4000))
ds_test_split = shuffled_test.select(range(800))

print(f'Training Rows: {ds_train_split.num_rows}')
print(f'Test Rows: {ds_test_split.num_rows}')
print('')

In [None]:
# Setup the Tokenizer & inspect the labels based on the checkpoint name

# Model checkpoint to fine-tune (alternative tried: "distilbert/distilbert-base-uncased")
checkpoint = "google-bert/bert-base-cased"

# Label feature of the dataset and the stock configuration of the checkpoint
ds_class_labels = ds_train_split.features["label"]
config = AutoConfig.from_pretrained(checkpoint)

# Tokenizer selected automatically from the checkpoint name
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Compare the dataset's labels with the labels the checkpoint ships with
print(f'The dataset has labels: {ds_class_labels.names}')
print('')
print(f'The model checkpoint has labels: {config.id2label}')

In [None]:
# Create custom labels for the new model "head"
# The model will be trained (fine-tuned) to predict the dataset's class names.
# Comprehensions are used so that no loop variables are left behind at
# notebook scope (in particular, the builtin `id` is not shadowed).

# Map each dataset label id to its name, in the order the dataset lists the names
custom_id2label = {
    ds_class_labels.str2int(label_name): label_name
    for label_name in ds_class_labels.names
}

# Inverse mapping: label name to dataset label id
custom_label2id = {
    label_name: label_id
    for label_id, label_name in custom_id2label.items()
}

print(f'custom_id2label: {custom_id2label}')
print(f'custom_label2id: {custom_label2id}')

In [None]:
# Automatically select a matching sequence classification model based on the checkpoint name
# NOTE: "AutoModelForSequenceClassification" is the same class used by the "text-classification" default pipeline

# One output class per custom label
num_classes = len(custom_id2label)

# Create the model with a classification head of that size
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_classes)

# Attach the custom label mappings to the model configuration
model.config.id2label = custom_id2label
model.config.label2id = custom_label2id

In [None]:
# Tokenize
# Build tokenized copies of the train and test samples, processing the rows in batches

ds_train_tokenize = ds_train_split.map(function=tokenize_function, batched=True)
ds_test_tokenize = ds_test_split.map(function=tokenize_function, batched=True)

In [None]:
# Configure the Fine-Tuning

# Create a data collator to assemble individual samples into training batches
# DataCollatorWithPadding - dynamically pads the inputs it receives
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define a set of hyperparameters for the Trainer to use
training_args = TrainingArguments(
    output_dir="test-trainer",  # directory passed to the Trainer for its outputs
    eval_strategy="epoch",      # evaluate on eval_dataset once per epoch
    report_to="none" # disables the weights and biases (wandb) callback in the TrainingArguments
)


# Create a new trainer by passing in the objects created above
# This trainer will fine-tune the model for sentiment analysis using the GLUE SST2 dataset
# The data collator is passed explicitly; previously it was created but never used
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_train_tokenize,
    eval_dataset=ds_test_tokenize,
    compute_metrics=compute_metrics
)



In [None]:
# Execute the training process
# Runs the fine-tuning configured on `trainer`; eval_strategy="epoch" requests an evaluation after each epoch
trainer.train()

In [None]:
# Execute the model on some sample data
# Some code from ChatGPT

# NOTE: Colab's free GPU has been activated
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the fine-tuned model to the device and switch it to inference mode
model = model.to(device)
model.eval()

sentences = [
    "I absolutely love this new phone! It's fast and the camera is amazing.",
    "The service at the restaurant was terrible, and the food was cold when it arrived.",
    "I'm feeling pretty neutral about the movie—it had some good moments but was mostly forgettable.",
    "Winning the competition was the best experience of my life!",
    "I can't believe how frustrating this software update is; nothing works properly now."
]

# Create a pipeline for sentiment classification
# The device is passed explicitly so the pipeline places its inputs on the
# same device as the model instead of relying on an inferred default
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)

# Test the model on new data
results = classifier(sentences)

# Print results
for sentence, result in zip(sentences, results):
    print(f"Sentence: {sentence}\nPredicted Sentiment: {result}\n")
