# Fine-tuning RoBERTa for Glycemic Load Regression

This notebook fine-tunes a RoBERTa model for glycemic load regression using the processed dataset.

In [None]:
# Install required packages
!pip install transformers datasets wandb tqdm huggingface_hub

In [None]:
# Import all required libraries
import os
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer,RobertaForSequenceClassification,RobertaConfig,AdamW,get_linear_schedule_with_warmup,pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import wandb
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from google.colab import userdata
from huggingface_hub import HfApi, create_repo

# Fix the PyTorch and NumPy seeds so repeated runs draw the same random numbers
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    # Seed every visible GPU as well
    torch.cuda.manual_seed_all(42)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

In [None]:
# Read the API credentials from the Colab secret store
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')
WANDB_API_KEY = userdata.get('WANDB_API_KEY')

# Hyperparameters recorded with the run; they are read back through wandb.config
run_config = dict(
    model_name='roberta-base',
    task='glycemic_load_regression',
    max_length=512,
    batch_size=16,
    learning_rate=2e-5,
    num_epochs=3,
    warmup_steps=0,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
)

# Authenticate with Weights & Biases, then open the tracking run
wandb.login(key=WANDB_API_KEY)
wandb.init(project='nutrivision', name='roberta-regression', config=run_config)

## Load and Prepare Data

In [None]:
# Load the processed dataset. The encoding is explicit so decoding does not
# depend on the platform default.
with open('roberta_regression_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Fail early, with a clear message, if the columns used for training are absent
missing_columns = {'text', 'glycemic_load'} - set(df.columns)
if missing_columns:
    raise ValueError(f'Dataset is missing required columns: {sorted(missing_columns)}')

print(f'Dataset shape: {df.shape}')
df.head()

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)
print(f'Training set size: {len(train_df)}')
print(f'Validation set size: {len(val_df)}')

## Create Dataset Class

In [None]:
class GlycemicLoadDataset(Dataset):
    """Map-style dataset pairing tokenized texts with float regression targets.

    Args:
        texts: Sequence of input texts (list, NumPy array or pandas Series).
        targets: Sequence of numeric targets aligned one-to-one with ``texts``.
        tokenizer: Tokenizer invoked as ``tokenizer(text, **kwargs)``; it must
            return a mapping holding ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_length=512):
        # Materialise as lists so that indexing is always positional; a pandas
        # Series with a shuffled index would otherwise be looked up by label.
        self.texts = list(texts)
        self.targets = list(targets)
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f'texts and targets must have the same length, '
                f'got {len(self.texts)} and {len(self.targets)}'
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` flattened to 1-D
            tensors, and ``target`` as a scalar float tensor.
        """
        text = str(self.texts[idx])
        target = self.targets[idx]

        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # the tokenizer's __call__.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.float)
        }

## Initialize Model and Training Components

In [None]:
# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create datasets; the sequence length comes from the run config so that the
# logged max_length is the one actually used
train_dataset = GlycemicLoadDataset(
    train_df['text'].values,
    train_df['glycemic_load'].values,
    tokenizer,
    max_length=wandb.config.max_length
)

val_dataset = GlycemicLoadDataset(
    val_df['text'].values,
    val_df['glycemic_load'].values,
    tokenizer,
    max_length=wandb.config.max_length
)

# Create dataloaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=wandb.config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=wandb.config.batch_size)

# Initialize model: one output unit, with the regression problem type stated
# explicitly instead of being left for the model to infer from the labels
config = RobertaConfig.from_pretrained('roberta-base', num_labels=1, problem_type='regression')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)
model.to(device)

# Initialize optimizer and scheduler.
# torch.optim.AdamW is used because the AdamW class shipped with transformers
# is deprecated in favour of the PyTorch implementation.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=wandb.config.learning_rate,
    weight_decay=wandb.config.weight_decay
)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
num_training_steps = len(train_loader) * wandb.config.num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=wandb.config.warmup_steps,
    num_training_steps=num_training_steps
)

## Training Loop

In [None]:
# Lowest average validation loss seen so far; decides when a checkpoint is saved
best_val_loss = float('inf')

# Training loop: one optimisation pass followed by one validation pass per epoch
for epoch in range(wandb.config.num_epochs):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{wandb.config.num_epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        model.zero_grad()
        # Passing labels makes the model compute the loss itself
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss

        loss.backward()
        # Clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), wandb.config.max_grad_norm)
        optimizer.step()
        scheduler.step()

        total_train_loss += loss.item()

        # Log batch metrics
        wandb.log({
            'batch_loss': loss.item(),
            'learning_rate': scheduler.get_last_lr()[0]
        })

    # Calculate average training loss
    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Drop only the trailing output axis. A bare squeeze() would also
            # remove the batch axis when the final batch holds one example,
            # leaving a 0-d array that list.extend cannot iterate over.
            preds = outputs.logits.squeeze(-1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.view(-1).cpu().numpy())

    # Calculate validation metrics
    avg_val_loss = total_val_loss / len(val_loader)
    val_mse = mean_squared_error(all_targets, all_preds)
    val_r2 = r2_score(all_targets, all_preds)

    # Log epoch metrics
    wandb.log({
        'epoch': epoch + 1,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'val_mse': val_mse,
        'val_r2': val_r2
    })

    print(f'Epoch {epoch + 1}/{wandb.config.num_epochs}:')
    print(f'Average training loss: {avg_train_loss:.4f}')
    print(f'Average validation loss: {avg_val_loss:.4f}')
    print(f'Validation MSE: {val_mse:.4f}')
    print(f'Validation R²: {val_r2:.4f}')

    # Save best model: overwrite the checkpoint whenever validation loss improves
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained('models/roberta_regression')
        tokenizer.save_pretrained('models/roberta_regression')
        print('Saved best model!')

# Close wandb run
wandb.finish()

## Upload Model to Hugging Face Hub

In [None]:
# Create the repository on Hugging Face Hub if it does not exist yet.
# exist_ok=True lets reruns succeed while real failures (for example an
# invalid token) still raise instead of being reported as "already exists".
repo_name = "nutrivision/roberta-glycemic-load-regression"
create_repo(repo_name, token=HUGGINGFACE_TOKEN, repo_type="model", exist_ok=True)
print(f"Repository ready: {repo_name}")

# Load the best model (the checkpoint written during training)
model = RobertaForSequenceClassification.from_pretrained('models/roberta_regression')
tokenizer = RobertaTokenizer.from_pretrained('models/roberta_regression')

# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(repo_name, token=HUGGINGFACE_TOKEN)
tokenizer.push_to_hub(repo_name, token=HUGGINGFACE_TOKEN)

print(f"Model and tokenizer uploaded to {repo_name}")

## Test the Uploaded Model

In [None]:
# Load the model from Hugging Face Hub.
# function_to_apply="none" returns the raw model output: the text-classification
# pipeline may otherwise apply a sigmoid to single-output models, which would
# squash the regression value into (0, 1).
regression_pipeline = pipeline(
    "text-classification",
    model=repo_name,
    tokenizer=repo_name,
    device=0 if torch.cuda.is_available() else -1,
    function_to_apply="none"
)

# Test with a sample text
sample_text = "1 cup of cooked white rice"
prediction = regression_pipeline(sample_text)
print(f"Sample text: {sample_text}")
print(f"Predicted glycemic load: {prediction[0]['score']:.2f}")

## Visualize Results

In [None]:
# Predicted-vs-actual scatter with a dashed y = x reference line.
# all_targets / all_preds hold the validation results of the most recently
# completed epoch.
target_low, target_high = min(all_targets), max(all_targets)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(all_targets, all_preds, alpha=0.5)
ax.plot([target_low, target_high], [target_low, target_high], 'r--')
ax.set_xlabel('Actual Glycemic Load')
ax.set_ylabel('Predicted Glycemic Load')
ax.set_title('Predicted vs Actual Glycemic Load')
plt.show()

# Residuals (prediction minus target) against the prediction, with a zero line
residuals = np.asarray(all_preds) - np.asarray(all_targets)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(all_preds, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Glycemic Load')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()