In [1]:
# Install required packages
!pip install transformers datasets torch huggingface_hub wandb tqdm bitsandbytes

# Import libraries
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import login
import os
import wandb
from google.colab import files
from tqdm.auto import tqdm
from google.colab import userdata
import json
from datasets import Dataset
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [2]:
# Training hyperparameters
EPOCHS = 7
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

In [3]:
# Initialize wandb with API key from Colab secrets
wandb.login(key=userdata.get('WANDB_API_KEY'))
wandb.init(project="nutrivision-gl-prediction", name="roberta-finetuning")

# Login to Hugging Face using token from Colab secrets
login(token=userdata.get('HF_TOKEN'))

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mzoyahammadk[0m ([33mzoyahammadk-institute-of-space-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Upload your dataset file
uploaded = files.upload()
json_file = list(uploaded.keys())[0]  # Get the name of the uploaded file

# Load and process the dataset
with open(json_file) as f:
    data = json.load(f)

Saving roberta_training_data.json to roberta_training_data.json


In [5]:
# Convert the data to the correct format
processed_data = []
for item in data:
    processed_item = {
        "text": item["text"],
        "label": item["label"]["glycemic_load"]  # Extract the numerical GL value
    }
    processed_data.append(processed_item)

# Create dataset
dataset = Dataset.from_list(processed_data)
print("Dataset loaded with", len(dataset), "examples")

# Split dataset (90% train, 10% validation)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

Dataset loaded with 100 examples


In [6]:
# Load official RoBERTa large and tokenizer
model_name = "roberta-large"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Using regression (num_labels=1) since we're predicting continuous GL values
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Tokenizer function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Tokenize the dataset
print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True, desc="Tokenizing")

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing dataset...


Tokenizing:   0%|          | 0/90 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    labels = labels.squeeze()

    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)

    return {
        'mse': mse,
        'rmse': rmse,
        'r2': r2
    }

# Define training arguments
training_args = TrainingArguments(
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    optim=OPTIMIZER,
    push_to_hub=True,
    hub_model_id="zoya-hammadk/nutrivision-roberta",
    report_to="wandb",
    logging_steps=10,  # Log every 10 steps
    save_steps=10,  # Save every 10 steps
)

In [8]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # Add compute_metrics function
)

# Start training
print("Starting training...")
trainer.train()

# Push to HF Hub
print("Pushing model to Hugging Face Hub...")
trainer.push_to_hub()

# Finish wandb run
wandb.finish()

Starting training...




Step,Training Loss
10,466.1169
20,256.1882
30,185.7903
40,153.7761
50,147.5187
60,152.7735
70,126.1795
80,163.7998
90,119.9208
100,160.2809


Pushing model to Hugging Face Hub...


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

0,1
train/epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇██
train/grad_norm,█▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁
train/learning_rate,███▇▇▆▅▅▄▃▃▂▂▁▁▁
train/loss,█▄▃▂▂▂▁▂▁▂▁▁▂▂▂▂

0,1
total_flos,587114775152640.0
train/epoch,7.0
train/global_step,161.0
train/grad_norm,287.58557
train/learning_rate,0.0
train/loss,134.9642
train_loss,170.18433
train_runtime,286.5562
train_samples_per_second,2.199
train_steps_per_second,0.562
