# Fine-tuning RoBERTa for 10-Class Classification

This notebook fine-tunes a RoBERTa model to classify recipes into 10 classes based on their nutritional value (0-100). The class index is the value integer-divided by 10, with a value of exactly 100 folded into the top class. The classes are:
- Class 0: 0 to <10
- Class 1: 10 to <20
- Class 2: 20 to <30
- Class 3: 30 to <40
- Class 4: 40 to <50
- Class 5: 50 to <60
- Class 6: 60 to <70
- Class 7: 70 to <80
- Class 8: 80 to <90
- Class 9: 90 to 100

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import wandb
from huggingface_hub import login
import os
from dotenv import load_dotenv

In [None]:
# Load environment variables (load_dotenv is called with its defaults)
load_dotenv()

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Start a Weights & Biases run and attach the run's hyperparameters to it
wandb.init(
    config=dict(
        architecture="RoBERTa",
        dataset="recipe-classification",
        epochs=3,
        batch_size=8,
        learning_rate=2e-5,
    ),
    project="nutrivision-roberta-classification",
)

In [None]:
# Load and preprocess data
def load_data(file_path):
    """Read recipe records from a JSON file and bucket their scores into classes.

    The file must hold a JSON array of objects, each with a ``'text'`` entry
    and a numeric ``'label'`` entry. Each score is turned into a class index
    by integer division by 10 and then clamped to the range 0..9, so a score
    of exactly 100 lands in class 9 and a negative score lands in class 0
    rather than producing a negative class index.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``(texts, labels)`` tuple of two lists of equal length: the raw
        ``'text'`` values and the corresponding integer class indices.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks ``'text'`` or ``'label'``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts = [item['text'] for item in data]
    # Floor-divide into bands of ten, then clamp both ends into 0..9.
    labels = [min(max(int(item['label'] // 10), 0), 9) for item in data]

    return texts, labels

# Read every text/class pair from the preprocessed JSON file
texts, labels = load_data('processed_data/roberta_regression_data.json')

# Hold out 20% of the samples for validation, using a fixed seed
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Create dataset class
class RecipeDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its class.

    Tokenization is done lazily in ``__getitem__``; the tokenizer is asked to
    pad and truncate every sample to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Store the parallel text/label sequences and the tokenizer settings."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``."""
        text, label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # flatten() collapses whatever the tokenizer returned into 1-D tensors
        sample = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
# Load the pretrained tokenizer and a 10-label sequence classification model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=10,
)

# Wrap the train / validation splits in datasets
train_dataset = RecipeDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer
)
val_dataset = RecipeDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer
)

# Batch both splits; only the training loader is shuffled
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8)

In [None]:
# Training setup: use CUDA when it is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

num_epochs = 3
# NOTE(review): transformers' AdamW is reported as deprecated upstream in
# favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Training loop
def train():
    """Run one pass over ``train_dataloader`` and update the model.

    Switches the module-level ``model`` to training mode, then for each batch
    clears the gradients, computes the loss, back-propagates it and steps the
    module-level ``optimizer``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(train_dataloader):
        # Move exactly the three tensors the model consumes onto the device
        inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss = model(**inputs).loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    return running_loss / len(train_dataloader)

# Validation loop
def evaluate():
    """Score the model on ``val_dataloader`` without tracking gradients.

    Switches the module-level ``model`` to eval mode and, for every batch,
    accumulates the loss and counts how many argmax predictions equal the
    labels.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the summed batch losses divided by
        the number of batches, and correct predictions divided by the number
        of samples seen.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            inputs = {
                name: batch[name].to(device)
                for name in ('input_ids', 'attention_mask', 'labels')
            }
            targets = inputs['labels']

            outputs = model(**inputs)
            loss_sum += outputs.loss.item()

            # The predicted class is the index of the largest logit
            predicted = outputs.logits.argmax(dim=-1)
            hits += predicted.eq(targets).sum().item()
            seen += targets.shape[0]

    mean_loss = loss_sum / len(val_dataloader)
    accuracy = hits / seen
    return mean_loss, accuracy

In [None]:
# Training: one train pass and one validation pass per epoch
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_loss = train()
    val_loss, val_acc = evaluate()

    # Send this epoch's metrics to Weights & Biases (epoch is reported 1-based)
    wandb.log(
        dict(
            epoch=epoch + 1,
            train_loss=train_loss,
            val_loss=val_loss,
            val_accuracy=val_acc,
        )
    )

    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}')

In [None]:
# Write the model and then the tokenizer into the same local directory
for _artifact in (model, tokenizer):
    _artifact.save_pretrained('roberta_classification_model')

# Push the model and then the tokenizer to the same Hugging Face Hub repo
for _artifact in (model, tokenizer):
    _artifact.push_to_hub("zoya-hammadk/nutrivision-roberta-classification")

# Close the Weights & Biases run
wandb.finish()