# Fine-tuning RoBERTa for 10-Class Classification

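This notebook fine-tunes a RoBERTa model to classify recipes into 10 classes based on their nutritional value (0-100). Each class is a 10-point band obtained by integer-dividing the value by 10, with a value of exactly 100 folded into the last class. The classes are:
- Class 0: 0 to <10
- Class 1: 10 to <20
- Class 2: 20 to <30
- Class 3: 30 to <40
- Class 4: 40 to <50
- Class 5: 50 to <60
- Class 6: 60 to <70
- Class 7: 70 to <80
- Class 8: 80 to <90
- Class 9: 90 to 100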

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import numpy as np
from tqdm import tqdm
import wandb
from huggingface_hub import login
import os
from google.colab import userdata

In [None]:
# Number of full passes over the training data; also recorded in the W&B run config.
NUM_EPOCHS = 9

In [None]:
# Access API keys from Colab secrets
HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN')
WANDB_API_KEY = userdata.get('WANDB_API_KEY')

# Authenticate with the Hugging Face Hub explicitly. The token was previously
# fetched but never used, so the final push_to_hub depended on implicit
# credential discovery; logging in here surfaces a bad token before training.
login(token=HUGGINGFACE_TOKEN)

# Authenticate with Weights & Biases
wandb.login(key=WANDB_API_KEY)

# Start the W&B run and record the hyperparameters used by this notebook
wandb.init(
    project="nutrivision-roberta-classification",
    config={
        "architecture": "RoBERTa",
        "dataset": "recipe-classification",
        "epochs": NUM_EPOCHS,
        "batch_size": 8,
        "learning_rate": 2e-5
    }
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Load and preprocess data
def load_data(file_path, num_classes=10, bin_width=10):
    """Load recipe texts and bucket their numeric labels into class ids.

    The file must contain a JSON list of objects, each with a ``'text'`` entry
    and a numeric ``'label'`` entry. A label ``v`` is mapped to class
    ``v // bin_width`` and then clamped into ``[0, num_classes - 1]``.

    With the defaults this gives bands 0 to <10, 10 to <20, ..., 90 to 100
    (a label of exactly 100 lands in class 9).

    Args:
        file_path: Path to the JSON file.
        num_classes: Number of classes; the highest class id is
            ``num_classes - 1``.
        bin_width: Width of each class band, in label units.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists; ``labels`` holds
        integer class ids.
    """
    # Explicit encoding: JSON is UTF-8 by convention and the platform default
    # encoding is not guaranteed to be.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts = []
    labels = []
    top_class = num_classes - 1

    for item in data:
        texts.append(item['text'])
        # Convert regression value to a class index
        raw_class = int(item['label'] // bin_width)
        # Clamp both ends: a label at the top of the range (e.g. 100) would
        # otherwise overflow into a non-existent class, and a negative label
        # would yield a negative class id, which is not a valid target.
        labels.append(min(max(raw_class, 0), top_class))

    return texts, labels

# Read the whole corpus: recipe texts plus their bucketed class labels
DATA_FILE = 'roberta_regression_data.json'
texts, labels = load_data(DATA_FILE)

# Hold out 20% for validation; the fixed random_state keeps the split repeatable
split_parts = train_test_split(texts, labels, random_state=42, test_size=0.2)
train_texts, val_texts, train_labels, val_labels = split_parts

In [None]:
# Create dataset class
class RecipeDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text and the keyword arguments
            ``max_length``, ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``; its result is indexed by ``'input_ids'``
            and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer output carries a leading batch axis; flatten it away
        # so each field is a 1-D tensor for the DataLoader to stack.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
# Seed PyTorch's global RNG before anything random happens: the classification
# head is newly initialised and the training DataLoader shuffles, so without a
# seed successive runs are not comparable. 42 matches the random_state used for
# the train/validation split. (GPU kernels may still add some nondeterminism.)
torch.manual_seed(42)

# Initialize tokenizer and model (10 output classes)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=10)

# Create datasets
train_dataset = RecipeDataset(train_texts, train_labels, tokenizer)
val_dataset = RecipeDataset(val_texts, val_labels, tokenizer)

# Create dataloaders; only the training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer over all model parameters, created after the model was moved
num_epochs = NUM_EPOCHS
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Training loop
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Relies on the module-level ``model``, ``optimizer`` and ``device``.

    Returns:
        The mean of the per-batch losses for this pass.
    """
    model.train()
    loss_sum = 0

    for batch in tqdm(train_dataloader):
        # Move the three tensors the model consumes onto the training device.
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        batch_loss = model(**inputs).loss
        loss_sum += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return loss_sum / len(train_dataloader)

# Validation loop
def evaluate():
    """Score the model on ``val_dataloader`` without updating weights.

    Relies on the module-level ``model`` and ``device``.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the mean of the per-batch losses
        and the fraction of samples whose arg-max logit equals the label.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }

            outputs = model(**inputs)
            loss_sum += outputs.loss.item()

            # Predicted class = index of the largest logit per sample.
            predicted = outputs.logits.argmax(dim=-1)
            targets = inputs['labels']
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    mean_loss = loss_sum / len(val_dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

In [None]:
# Training
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_loss = train()
    # evaluate() returns (mean validation loss, accuracy). Keep the accuracy
    # instead of discarding it: for a classifier it is the metric that shows
    # whether the model is actually getting predictions right.
    val_loss, val_accuracy = evaluate()

    # Log metrics to Weights & Biases
    wandb.log({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
    })

    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}')
    print(f'Val Accuracy: {val_accuracy:.4f}')

Epoch 1/9


100%|██████████| 10/10 [00:01<00:00,  6.28it/s]
100%|██████████| 3/3 [00:00<00:00, 12.44it/s]


Train Loss: 2.0008
Val Loss: 1.9432
Epoch 2/9


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12.57it/s]


Train Loss: 1.6330
Val Loss: 1.9006
Epoch 3/9


100%|██████████| 10/10 [00:01<00:00,  6.41it/s]
100%|██████████| 3/3 [00:00<00:00, 12.51it/s]


Train Loss: 1.5391
Val Loss: 1.7897
Epoch 4/9


100%|██████████| 10/10 [00:01<00:00,  6.45it/s]
100%|██████████| 3/3 [00:00<00:00, 12.63it/s]


Train Loss: 1.4185
Val Loss: 1.7708
Epoch 5/9


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12.57it/s]


Train Loss: 1.4223
Val Loss: 1.7434
Epoch 6/9


100%|██████████| 10/10 [00:01<00:00,  6.45it/s]
100%|██████████| 3/3 [00:00<00:00, 12.63it/s]


Train Loss: 1.3951
Val Loss: 1.6740
Epoch 7/9


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12.57it/s]


Train Loss: 1.3248
Val Loss: 1.7039
Epoch 8/9


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12.56it/s]


Train Loss: 1.2616
Val Loss: 1.7668
Epoch 9/9


100%|██████████| 10/10 [00:01<00:00,  6.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12.48it/s]

Train Loss: 1.1045
Val Loss: 1.7577





In [None]:
# Where the artifacts go: a local directory and a Hugging Face Hub repository
LOCAL_MODEL_DIR = 'roberta_classification_model'
HUB_REPO_ID = "zoya-hammadk/nutrivision-roberta-classification"

# Write model weights/config and tokenizer files side by side on disk
for artifact in (model, tokenizer):
    artifact.save_pretrained(LOCAL_MODEL_DIR)

# Upload the same two artifacts to the Hub, model first
for artifact in (model, tokenizer):
    artifact.push_to_hub(HUB_REPO_ID)

# End the Weights & Biases run so its summary is flushed
wandb.finish()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
epoch,▁▂▃▄▅▅▆▇█
train_loss,█▅▄▃▃▃▃▂▁
val_loss,█▇▄▄▃▁▂▃▃

0,1
epoch,9.0
train_loss,1.10445
val_loss,1.75769
