In [8]:
!pip install -q datasets transformers torch

In [1]:
from datasets import load_dataset

# Pull the Emotion dataset from the Hugging Face Hub
emotion_dataset_id = "dair-ai/emotion"
dataset = load_dataset(emotion_dataset_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
from transformers import AutoTokenizer

# Load the BERT (uncased) tokenizer.
# NOTE(review): the classifier fine-tuned later in this notebook is loaded from
# a TinyBERT checkpoint; this assumes both share the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated but deliberately NOT padded here. Padding is left
    to the data collator used by the data loaders, so each batch is padded only
    to its own longest sequence rather than to the tokenizer's maximum length.

    Args:
        example: A batch handed over by ``dataset.map(..., batched=True)``;
            its ``"text"`` entry is passed straight to the tokenizer.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(example["text"], truncation=True)


# Apply the tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Return PyTorch tensors and expose only the columns listed below
model_columns = ["input_ids", "attention_mask", "label"]
tokenized_datasets.set_format("torch", columns=model_columns)

# Collator that pads each batch dynamically with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training loader is shuffled; validation loader keeps the dataset order
batch_size = 16
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [4]:
from transformers import AutoModelForSequenceClassification

# TinyBERT is used as a smaller stand-in for BERT; the classification head is
# created with 6 output labels.
checkpoint = "huawei-noah/TinyBERT_General_4L_312D"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.optim import AdamW
from transformers import get_scheduler

# Define optimizer.
# `transformers.AdamW` is deprecated and no longer available in recent
# transformers releases, so the PyTorch implementation is used instead.
# `eps` and `weight_decay` are passed explicitly to keep the values the
# transformers optimizer used by default.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Define learning rate scheduler: linear schedule with no warmup, stepped once
# per training batch for `num_epochs` passes over the training loader.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)



In [6]:
import torch
from tqdm.auto import tqdm

# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One progress-bar tick per optimizer step
progress_bar = tqdm(total=num_training_steps)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the model output carries the loss
        loss = model(**batch).loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)


  0%|          | 0/3000 [00:00<?, ?it/s]

In [7]:
model.eval()
accuracy = 0  # running count of correct predictions; divided by `total` below
total = 0

# Gradients are not needed while scoring the validation set
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        labels = batch["labels"]

        # Predicted class = index of the largest logit
        predictions = model(**batch).logits.argmax(dim=-1)

        accuracy += (predictions == labels).sum().item()
        total += len(labels)

print(f"Validation Accuracy: {accuracy / total:.4f}")

Validation Accuracy: 0.9225


In [9]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [10]:
# Destination folder on Google Drive for the fine-tuned artifacts
output_dir = "/content/drive/MyDrive/fine-tuned-tinybert"

# Write the model first, then the tokenizer, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Model saved to /content/drive/MyDrive/fine-tuned-tinybert
