# Fine-tuning RoBERTa for 25-Class Classification

This notebook fine-tunes a RoBERTa model to classify recipes into 25 classes based on their nutritional value (0-100). Each class covers a band of width 4 (class = floor(value / 4)), and a value of exactly 100 is folded into the last class. The classes are:
- Class 0: 0-3
- Class 1: 4-7
- Class 2: 8-11
- Class 3: 12-15
- Class 4: 16-19
- Class 5: 20-23
- Class 6: 24-27
- Class 7: 28-31
- Class 8: 32-35
- Class 9: 36-39
- Class 10: 40-43
- Class 11: 44-47
- Class 12: 48-51
- Class 13: 52-55
- Class 14: 56-59
- Class 15: 60-63
- Class 16: 64-67
- Class 17: 68-71
- Class 18: 72-75
- Class 19: 76-79
- Class 20: 80-83
- Class 21: 84-87
- Class 22: 88-91
- Class 23: 92-95
- Class 24: 96-100

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import numpy as np
from tqdm import tqdm
import wandb
from huggingface_hub import login
import os
from google.colab import userdata

In [None]:
# Number of full passes over the training set; the same value is recorded
# in the W&B run config and drives the training loop.
NUM_EPOCHS = 9

In [None]:
# Read the service credentials stored as Colab secrets
HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN')
WANDB_API_KEY = userdata.get('WANDB_API_KEY')

# Authenticate with Weights & Biases
wandb.login(key=WANDB_API_KEY)

# Start the tracked run and record the hyperparameters used by this notebook
wandb.init(
    project="nutrivision-roberta-classification-25",
    config=dict(
        architecture="RoBERTa",
        dataset="recipe-classification-25",
        epochs=NUM_EPOCHS,
        batch_size=8,
        learning_rate=2e-5,
        num_classes=25,
    ),
)

In [None]:
# Load and preprocess data
def load_data(file_path, *, bin_width=4, num_classes=25):
    """Read recipe records from a JSON file and bucket their scores into classes.

    The file must contain a JSON array of objects, each with a ``'text'``
    entry and a numeric ``'label'`` entry.

    Args:
        file_path: Path to the JSON file.
        bin_width: Width of one class band; the class index is
            ``floor(label / bin_width)``.
        num_classes: Total number of classes. Indices are clamped to
            ``[0, num_classes - 1]``.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: the raw text
        of every record and its integer class index.

    Raises:
        KeyError: If a record lacks ``'text'`` or ``'label'``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which would garble non-ASCII recipe text on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    top_class = num_classes - 1
    texts = []
    labels = []

    for item in data:
        texts.append(item['text'])
        # Convert regression value to class (0 .. num_classes - 1)
        bucket = int(item['label'] // bin_width)
        # Clamp both ends: the maximum score (e.g. 100 with the defaults)
        # would land one past the last class, and a negative score would
        # yield a negative index, which is not a valid classification target.
        labels.append(min(max(bucket, 0), top_class))

    return texts, labels

# Read every (text, class) pair from disk
texts, labels = load_data('roberta_regression_data.json')

# Hold out 20% of the samples for validation; the fixed seed keeps the
# split reproducible across runs
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create dataset class
class RecipeDataset(Dataset):
    """Map-style dataset pairing recipe texts with integer class labels.

    Each item is tokenized on access, padded/truncated to ``max_length``,
    and returned as a dict of 1-D tensors under the keys ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer is asked for PyTorch tensors; collapse each one to
        # 1-D so the DataLoader can stack samples into a batch.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Load the pretrained tokenizer and a RoBERTa classifier with a 25-way head
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=25,
)

# Wrap each split in a dataset that tokenizes on access
train_dataset = RecipeDataset(train_texts, train_labels, tokenizer)
val_dataset = RecipeDataset(val_texts, val_labels, tokenizer)

# Batch the data; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, batch_size=8)

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [None]:
# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop: one training pass and one validation pass per epoch
for epoch in range(NUM_EPOCHS):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately not named `labels`: that would overwrite the
        # module-level `labels` list returned by load_data() with a tensor,
        # breaking the split cell if it is re-run after training.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses
    avg_loss = total_loss / len(train_dataloader)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels,
            )
            val_loss += outputs.loss.item()

            # Predicted class = index of the largest logit
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch_labels).sum().item()
            total += batch_labels.size(0)

    val_accuracy = correct / total
    avg_val_loss = val_loss / len(val_dataloader)

    # Log all metrics of the epoch in a single call so they share one W&B
    # step; separate calls would place train and validation metrics on
    # different steps.
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_loss,
        "val_loss": avg_val_loss,
        "val_accuracy": val_accuracy
    })

    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    print(f'Average Training Loss: {avg_loss:.4f}')
    print(f'Validation Loss: {avg_val_loss:.4f}')
    print(f'Validation Accuracy: {val_accuracy:.4f}')

In [None]:
# Write the fine-tuned weights and the tokenizer files to a local directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('nutrivision-roberta-25')

# Authenticate, then upload both artifacts to the same Hugging Face Hub repo
login(token=HUGGINGFACE_TOKEN)
for artifact in (model, tokenizer):
    artifact.push_to_hub('zoya-hammadk/nutrivision-roberta-25')