In [None]:
# Install Hugging Face Transformers, datasets, and torch libraries
# Uncomment the line below to install if not already installe
!pip install transformers datasets torch



In [None]:
# Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load the "imdb" dataset with the Hugging Face `datasets` loader
imdb_dataset = load_dataset("imdb")
# Print the dataset structure (its splits, features, and row counts)
print(imdb_dataset)
# Print a single example (index 10) from the training split
print(imdb_dataset['train'][10])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
{'text': 'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minde

In [5]:
# Text-cleaning helper applied to every review
def preprocess(text):
    """Return `text` lowercased, with leading and trailing whitespace removed.

    Whitespace inside the text is left untouched.
    """
    lowered = text.lower()
    return lowered.strip()

# Run `preprocess` over the 'text' field of every example in the dataset
imdb_dataset = imdb_dataset.map(
    lambda example: {'text': preprocess(example['text'])}
)

# Keep handles to the two splits used for training and evaluation
train_data = imdb_dataset["train"]
test_data = imdb_dataset["test"]

# Inspect one training example after preprocessing
print(train_data[10])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': 'it was great to see some of my favorite stars of 30 years ago including john ritter, ben gazarra and audrey hepburn. they looked quite wonderful. but that was it. they were not given any characters or good lines to work with. i neither understood or cared what the characters were doing.<br /><br />some of the smaller female roles were fine, patty henson and colleen camp were quite competent and confident in their small sidekick parts. they showed some talent and it is sad they didn\'t go on to star in more and better films. sadly, i didn\'t think dorothy stratten got a chance to act in this her only important film role.<br /><br />the film appears to have some fans, and i was very open-minded when i started watching it. i am a big peter bogdanovich fan and i enjoyed his last movie, "cat\'s meow" and all his early ones from "targets" to "nickleodeon". so, it really surprised me that i was barely able to keep awake watching this one.<br /><br />it is ironic that this movie is a

In [6]:
# Initialize tokenizer
# Checkpoint identifier; the same name is later passed to the model loader
model_name = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define tokenization function
def tokenize(dict_items, max_length=128):
    """Tokenize the "text" entry of `dict_items` to a fixed sequence length.

    Args:
        dict_items: Mapping with a "text" entry holding the text(s) to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The output of the notebook-level `tokenizer` for the given text(s).
    """
    return tokenizer(
        dict_items["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Apply tokenization to the training and test splits
# batched=True passes batches of examples to `tokenize` instead of one example per call
tokenized_train_data = train_data.map(tokenize, batched=True)
tokenized_test_data = test_data.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
# For illustration purposes only
sample_text = imdb_dataset["train"][10]["text"]
# Tokenize one review with explicit padding and truncation settings
tokenized_sample = tokenizer(
    sample_text,
    # Target length: 512 tokens, the maximum input size for DistilBERT
    max_length=512,
    # Shorter sequences are padded up to max_length
    padding="max_length",
    # Longer sequences are cut off at max_length
    truncation=True,
)

# Show the padded/truncated encoding and confirm its length
print(tokenized_sample)
print(len(tokenized_sample["input_ids"]))

{'input_ids': [101, 2009, 2001, 2307, 2000, 2156, 2070, 1997, 2026, 5440, 3340, 1997, 2382, 2086, 3283, 2164, 2198, 23168, 1010, 3841, 14474, 11335, 1998, 14166, 22004, 1012, 2027, 2246, 3243, 6919, 1012, 2021, 2008, 2001, 2009, 1012, 2027, 2020, 2025, 2445, 2151, 3494, 2030, 2204, 3210, 2000, 2147, 2007, 1012, 1045, 4445, 5319, 2030, 8725, 2054, 1996, 3494, 2020, 2725, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2070, 1997, 1996, 3760, 2931, 4395, 2020, 2986, 1010, 17798, 27227, 1998, 28385, 3409, 2020, 3243, 17824, 1998, 9657, 1999, 2037, 2235, 29240, 3033, 1012, 2027, 3662, 2070, 5848, 1998, 2009, 2003, 6517, 2027, 2134, 1005, 1056, 2175, 2006, 2000, 2732, 1999, 2062, 1998, 2488, 3152, 1012, 13718, 1010, 1045, 2134, 1005, 1056, 2228, 9984, 2358, 8609, 6528, 2288, 1037, 3382, 2000, 2552, 1999, 2023, 2014, 2069, 2590, 2143, 2535, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2143, 3544, 2000, 2031, 2070, 4599, 1010, 1998, 1045, 2001, 2200, 2330, 1011, 13128, 20

In [9]:
# Load the pretrained checkpoint for sequence classification
# num_labels=2 -> binary classification: Positive and Negative
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Show the layer-by-layer structure of the model
print(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
# Training configuration for the Trainer
# NOTE(review): `report_to` is not set; the recorded run shows a Weights & Biases
# login prompt when training starts — consider setting it explicitly (confirm intent).
training_args = TrainingArguments(
    # Where model checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings: epochs, learning rate, weight decay for regularization
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log after every 10 steps
    logging_steps=10,
    # Evaluate and save once per epoch, then load the best model when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,

In [11]:
# Metric function handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy, precision, recall, and F1 for a set of predictions.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the highest-scoring class along axis 1 is
            taken as the predicted label).

    Returns:
        Dict with the keys "accuracy", "precision", "recall", and "f1".
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Bundle the model, training configuration, datasets, and metric function
# NOTE(review): the test split is used as the per-epoch evaluation set, and
# load_best_model_at_end selects a checkpoint based on it — consider holding out
# a separate validation split so the test metrics stay unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
)

# Start fine-tuning
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvarungowda2007[0m ([33mvarungowda2007-iedc-dsce[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3035,0.359829,0.85176,0.916146,0.7744,0.839331
2,0.2593,0.345798,0.87344,0.880316,0.8644,0.872285
3,0.1667,0.534076,0.87712,0.871884,0.88416,0.877979


TrainOutput(global_step=4689, training_loss=0.22122974327522493, metrics={'train_runtime': 1274.3862, 'train_samples_per_second': 58.852, 'train_steps_per_second': 3.679, 'total_flos': 2483763724800000.0, 'train_loss': 0.22122974327522493, 'epoch': 3.0})

In [12]:
# Evaluate the model with the trainer and keep the returned metrics
eval_result = trainer.evaluate()
# Print the metrics returned by the evaluation run
print("Evaluation results:", eval_result)

Evaluation results: {'eval_loss': 0.345798134803772, 'eval_accuracy': 0.87344, 'eval_precision': 0.8803161153658139, 'eval_recall': 0.8644, 'eval_f1': 0.872285460563494, 'eval_runtime': 92.2178, 'eval_samples_per_second': 271.097, 'eval_steps_per_second': 16.949, 'epoch': 3.0}


In [13]:
# Build a prediction function
def predict_sentiment(text):
    """Classify `text` as "Positive" or "Negative" with the notebook-level model.

    Relies on the notebook-level `tokenizer` and `model`.

    Args:
        text: The text to classify (a single string).

    Returns:
        "Positive" when the predicted class index is 1, otherwise "Negative".
    """
    # Tokenize the input text (max_length=128 matches the tokenization of the training data)
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move the inputs to the device the model is actually on, so this works
    # whether or not the Trainer has moved the model to the GPU
    target_device = model.device
    tokens = {key: val.to(target_device) for key, val in tokens.items()}
    # Switch to eval mode (disables dropout) and skip gradient tracking for inference
    model.eval()
    with torch.no_grad():
        output = model(**tokens)
    # Highest-scoring class index for the single input
    prediction = torch.argmax(output.logits, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"

In [15]:
# Example: classify a new sentence with the fine-tuned model
new_text = "This movie was amazing! I loved every minute of it."
# Echo the input text, then print the predicted label
print(f"Text: '{new_text}'")
print("Predict Sentiment:", predict_sentiment(new_text))

Text: 'This movie was amazing! I loved every minute of it.'
Predict Sentiment: Positive
