In [None]:
# Install Hugging Face Transformers, datasets, and torch libraries
# Uncomment the line below to install if not already installed
# !pip install transformers torch datasets==3.3.2


In [3]:
# Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

In [4]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## Data Preperation and Tokenization

### Data Preperation & Tokenization

In [5]:
imdb_dataset = load_dataset('imdb')

# view dataset structure
print(imdb_dataset)

# view a sample data point
print(imdb_dataset['train'][10])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
{'text': 'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minde

This dataset is a collection of movie reviews on IMDB. Each record has the review under `text` and sentiment under `label` (0 is negative, 1 is positive).

### Preprocessing
We apply some basic preprocessing steps here:
- `lower()` treats same words with different capitalization the same
- `strip()` removes extra whitespace that might mess up tokenization process
- Split dataset into train and test

In [6]:
# lowercases all text and removes extra whitespace
def preprocess(text):
    return text.lower().strip()

# applies preprocessing to dataset
imdb_dataset = imdb_dataset.map(lambda dict_item: {'text': preprocess(dict_item['text'])})

train_data, test_data = imdb_dataset['train'], imdb_dataset['test']

# checks a sample after preprocessing
print(train_data[10])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': 'it was great to see some of my favorite stars of 30 years ago including john ritter, ben gazarra and audrey hepburn. they looked quite wonderful. but that was it. they were not given any characters or good lines to work with. i neither understood or cared what the characters were doing.<br /><br />some of the smaller female roles were fine, patty henson and colleen camp were quite competent and confident in their small sidekick parts. they showed some talent and it is sad they didn\'t go on to star in more and better films. sadly, i didn\'t think dorothy stratten got a chance to act in this her only important film role.<br /><br />the film appears to have some fans, and i was very open-minded when i started watching it. i am a big peter bogdanovich fan and i enjoyed his last movie, "cat\'s meow" and all his early ones from "targets" to "nickleodeon". so, it really surprised me that i was barely able to keep awake watching this one.<br /><br />it is ironic that this movie is a

### Tokenization
**Tokenization** is the process of breaking down a piece of text into smaller units called **tokens** (typically a word). Here, we use HuggingFace's `AutoTokenizer`. Note that we want to ensure all sequences are the same length (128) since the model expects fixed-length input sequences.

In [7]:
# initialize tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(dict_items):
    return tokenizer(
        dict_items['text'],
        padding='max_length',   # adds extra padding to reach max_length
        truncation=True,        # truncates sequence if too long
        max_length=128          # all sequences are exactly 128 chars
    )

# tokenize both train and test datasets
tokenized_train_data = train_data.map(tokenize, batched=True)
tokenized_test_data = test_data.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

### Padding & Truncation
This demonstrates how padding and truncation work.

In [8]:
sample_text = imdb_dataset['train'][10]['text']

tokenized_sample = tokenizer(
    sample_text,
    truncation=True,
    padding='max_length',
    max_length=512
)

print(tokenized_sample)
print(len(tokenized_sample['input_ids']))

{'input_ids': [101, 2009, 2001, 2307, 2000, 2156, 2070, 1997, 2026, 5440, 3340, 1997, 2382, 2086, 3283, 2164, 2198, 23168, 1010, 3841, 14474, 11335, 1998, 14166, 22004, 1012, 2027, 2246, 3243, 6919, 1012, 2021, 2008, 2001, 2009, 1012, 2027, 2020, 2025, 2445, 2151, 3494, 2030, 2204, 3210, 2000, 2147, 2007, 1012, 1045, 4445, 5319, 2030, 8725, 2054, 1996, 3494, 2020, 2725, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2070, 1997, 1996, 3760, 2931, 4395, 2020, 2986, 1010, 17798, 27227, 1998, 28385, 3409, 2020, 3243, 17824, 1998, 9657, 1999, 2037, 2235, 29240, 3033, 1012, 2027, 3662, 2070, 5848, 1998, 2009, 2003, 6517, 2027, 2134, 1005, 1056, 2175, 2006, 2000, 2732, 1999, 2062, 1998, 2488, 3152, 1012, 13718, 1010, 1045, 2134, 1005, 1056, 2228, 9984, 2358, 8609, 6528, 2288, 1037, 3382, 2000, 2552, 1999, 2023, 2014, 2069, 2590, 2143, 2535, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2143, 3544, 2000, 2031, 2070, 4599, 1010, 1998, 1045, 2001, 2200, 2330, 1011, 13128, 20

## Fine-Tuning DistilBERT for Sentiment Classification

### Setting Up the Model
We can use the pre-trained model and set it up for binary classification with `num_labels=2`

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

# print model architecture
print(model)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Configuring Training Parameters
*HuggingFace* provides lots of useful configuration options through `TrainingArguments` that we can later pass into the model trainer. Comes with a lot of defaults, we can change certain settings to our liking.

In [10]:
training_args = TrainingArguments(
    output_dir='./results',         # directory for model checkpoints
    eval_strategy='epoch',          # evaluate model at end of each epoch
    learning_rate=5e-5,             # 5e-5 is common for pre-trained models
    per_device_train_batch_size=16, # trainng batch size
    per_device_eval_batch_size=16,  # eval batch size for
    num_train_epochs=3,             # num training epochs
    weight_decay=0.01,              # regularization to prevent overfitting
    logging_dir='./logs',           # directory for logs
    logging_steps=10,               # log every 10 steps
    save_strategy='epoch',          # save model after every epoch
    load_best_model_at_end=True     # load the best model at the end of training
)

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

### Training the Model

In [11]:
# define custom performance metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

# train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    compute_metrics=compute_metrics,
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtikkikkit21[0m ([33mtikkikkit21-virginia-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2956,0.36748,0.85232,0.917283,0.77448,0.839854
2,0.2359,0.350259,0.87072,0.885396,0.85168,0.868211
3,0.2135,0.537011,0.87712,0.872413,0.88344,0.877892


TrainOutput(global_step=4689, training_loss=0.21983361573590732, metrics={'train_runtime': 1145.9814, 'train_samples_per_second': 65.446, 'train_steps_per_second': 4.092, 'total_flos': 2483763724800000.0, 'train_loss': 0.21983361573590732, 'epoch': 3.0})

After training, we can evaluate the results.

In [12]:
eval_result = trainer.evaluate()
print('Evaluation results:', eval_result)

Evaluation results: {'eval_loss': 0.35025909543037415, 'eval_accuracy': 0.87072, 'eval_precision': 0.885395874916833, 'eval_recall': 0.85168, 'eval_f1': 0.8682107323438265, 'eval_runtime': 86.3676, 'eval_samples_per_second': 289.46, 'eval_steps_per_second': 18.097, 'epoch': 3.0}


We can see that the general accuracy is pretty high, which is supported by high precision, recall, and F1. Furthermore, `eval_runtime` and `samples_per_second` demonstrate efficiency when processing the movie reviews, which is also good.

## Making Predictions

### Build a Prediction Function
We can create a function that feeds a new review to the model which calculates scores for each classification., We use `torch` to get the highest score to determine the sentiment classification.

In [13]:
def predict_sentiment(text):
    # tokenize input text
    tokens = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    tokens = {key: val.to(device) for key, val in tokens.items()}

    # get model prediction
    output = model(**tokens)
    prediction = torch.argmax(output.logits, dim=1).item()
    sentiment = 'Positive' if prediction == 1 else 'Negative'

    return sentiment

### Test Prediction Function
Now we can test our prediction function on a sample review.

In [14]:
new_text = 'This movie was amazing! I loved every minute of it.'
print(f"Text: '{new_text}'")
print('Predicted Sentiment:', predict_sentiment(new_text))

Text: 'This movie was amazing! I loved every minute of it.'
Predicted Sentiment: Positive
