In [None]:
!pip install transformers datasets evaluate

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Fetch the IMDB movie-review dataset through the Hugging Face `datasets` library
imdb = load_dataset(path="imdb")

# Show the first record of the test split as a quick sanity check of the data layout
imdb["test"][0]

In [None]:
# Load the pre-trained tokenizer for the uncased DistilBERT checkpoint from the Hugging Face Hub.
# It must be the same checkpoint as the model that is fine-tuned later, so the vocabulary matches.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization step: runs the tokenizer over the "text" field with truncation enabled
def preprocess_function(examples):
    """Tokenize the review text contained in ``examples``.

    Args:
        examples: Mapping with a "text" entry holding the raw text to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text when
        called with ``truncation=True``.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True)

# Tokenize the whole IMDB dataset. With `batched=True`, `map` hands the
# preprocessing function batches of examples rather than one example per call.
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

# Collator that pads the sequences of each batch to a common length at batch-assembly
# time, using the tokenizer's padding settings, so batches are uniform for training/evaluation.
data_collator = DataCollatorWithPadding(tokenizer)

# Class index -> human-readable sentiment label: 0 is "NEGATIVE", 1 is "POSITIVE"
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))

# Reverse lookup (sentiment label -> class index), derived from the mapping above
label2id = {label: index for index, label in id2label.items()}

In [None]:
# Fetch the "accuracy" metric through the Hugging Face `evaluate` library
accuracy = evaluate.load(path="accuracy")

# Metric callback: converts raw model outputs into class predictions and scores them
def compute_metrics(eval_pred):
    """Compute accuracy for one set of evaluation predictions.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The logits
            are reduced with an argmax over axis 1, so they are assumed to be
            laid out as one row of class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the predicted class indices
        against ``labels``.
    """
    logits, labels = eval_pred

    # The index of the largest score along axis 1 is taken as the predicted class
    predicted_classes = np.argmax(logits, axis=1)

    return accuracy.compute(predictions=predicted_classes, references=labels)

In [None]:
# Load the pre-trained uncased DistilBERT checkpoint with a sequence-classification head.
# num_labels=2 sets up a binary classification task, and the id2label / label2id mappings
# associate the class indices with their sentiment labels.
# The checkpoint name must match the one used to load the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Define the training arguments for the Trainer API.
# NOTE: `eval_strategy` is the current name of the former `evaluation_strategy` argument;
# recent transformers releases no longer accept the old keyword. It must match
# `save_strategy` because `load_best_model_at_end=True` is set.
training_args = TrainingArguments(
    output_dir="my_awesome_model",            # Directory where the model checkpoints will be saved
    learning_rate=2e-5,                       # Learning rate for the optimizer
    per_device_train_batch_size=16,           # Batch size per device for training
    per_device_eval_batch_size=16,            # Batch size per device for evaluation
    num_train_epochs=2,                       # Number of training epochs
    weight_decay=0.01,                        # Weight decay for regularization
    eval_strategy="epoch",                    # Evaluate the model at the end of each epoch
    save_strategy="epoch",                    # Save the model at the end of each epoch
    load_best_model_at_end=True,              # Load the best model based on evaluation at the end of training
    push_to_hub=False,                        # Whether to push the model to the Hugging Face Model Hub (disabled)
)

# Initialize the Trainer class with the model, training arguments, datasets, and other components.
# NOTE: the tokenizer is passed as `processing_class`, the current name of the Trainer
# argument formerly called `tokenizer` (deprecated in recent transformers releases).
trainer = Trainer(
    model=model,                              # Pre-trained DistilBERT model for sequence classification
    args=training_args,                       # Training arguments specified above
    train_dataset=tokenized_imdb["train"],    # Tokenized IMDB training dataset
    eval_dataset=tokenized_imdb["test"],      # Tokenized IMDB test dataset for evaluation
    processing_class=tokenizer,               # Tokenizer used to process the input text
    data_collator=data_collator,              # Data collator for padding sequences in batches
    compute_metrics=compute_metrics,          # Function to compute evaluation metrics (accuracy)
)

# Start training the model using the specified trainer and training arguments
trainer.train()