# Nutrition5k – Scaled Training (ViT / ResNet)

This notebook trains calorie regression models on **larger Nutrition5k subsets**
using the official RGB train/test splits.

- Uses `rgb_train_ids.txt` and `rgb_test_ids.txt`.
- Allows flexible training size (e.g. 1k dishes vs full train split).
- Focuses on training a stronger **ViT-B/16** model (optionally ResNet-50).

In [1]:
from pathlib import Path

# ---------- EXPERIMENT CONFIG ----------
# Every knob a reader is expected to tune lives in this cell; later cells only
# read these names.

# How many rows to sample from the train metadata (rgb_train_ids.txt merged
# with the dish metadata):
# - None => use all train IDs
# - 1000 => use 1000 train IDs (for faster experiments)
# The sample is taken BEFORE the 10% validation hold-out and before dishes
# without a downloadable rgb.png are dropped, so the final training set is
# smaller than this number.
N_TRAIN_LIMIT = 1000  # set to None for the full train split

# Where the dataset (metadata + images) is stored.
USE_LOCAL_DISK = True  # True => /content/data (temporary session disk), False => Google Drive

# Which model(s) the training and evaluation cells run.
MODEL_ARCH = "vit"     # one of: "vit", "resnet", "both"

IMAGE_SIZE = 224     # passed to get_transforms(image_size=...)
BATCH_SIZE = 32      # shared by the train / val / test DataLoaders
NUM_EPOCHS = 15      # passed to train_model(num_epochs=...)
LR_VIT = 3e-5        # Adam learning rate for the ViT model
LR_RESNET = 1e-4     # Adam learning rate for the ResNet model
WEIGHT_DECAY = 1e-4  # Adam weight_decay, used for both models

In [2]:
!git clone https://github.com/swanframe/food-calorie-estimation.git
%cd food-calorie-estimation

!pip install timm pyyaml

import sys
import torch
from torch.utils.data import DataLoader
import pandas as pd

from src.data.nutrition5k_dataset import Nutrition5kOverheadDataset, get_transforms
from src.models.baseline_cnn import ResNetCalorieRegressor
from src.models.vit_regressor import ViTCalorieRegressor
from src.training.train_loop import train_model
from src.training.utils import set_seed
from src.evaluation.metrics import compute_regression_metrics, print_regression_metrics
from src.evaluation.plots import plot_true_vs_pred, plot_error_histogram

REPO_ROOT = Path.cwd()
sys.path.append(str(REPO_ROOT))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Repo root:", REPO_ROOT)
print("Using device:", device)

set_seed(42)

Cloning into 'food-calorie-estimation'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 60 (delta 23), reused 47 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 248.08 KiB | 10.34 MiB/s, done.
Resolving deltas: 100% (23/23), done.
/content/food-calorie-estimation
Repo root: /content/food-calorie-estimation
Using device: cuda


In [3]:
from google.colab import drive

# Google Drive is always mounted: checkpoints are written there even when the
# dataset itself lives on the local disk.
drive.mount("/content/drive")

# Dataset location (images + metadata): local disk or Drive, per USE_LOCAL_DISK.
DATA_ROOT = (
    Path("/content/data/nutrition5k")
    if USE_LOCAL_DISK
    else Path("/content/drive/MyDrive/data/nutrition5k")
)

# Checkpoints go to Drive; reports are written inside the cloned repository.
MODEL_DIR = Path("/content/drive/MyDrive/models/food-calorie-estimation")
REPORTS_DIR = REPO_ROOT / "reports"

# Create every output directory up front (no-op when it already exists).
for directory in (DATA_ROOT, MODEL_DIR, REPORTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

DATA_ROOT, MODEL_DIR, REPORTS_DIR

Mounted at /content/drive


(PosixPath('/content/data/nutrition5k'),
 PosixPath('/content/drive/MyDrive/models/food-calorie-estimation'),
 PosixPath('/content/food-calorie-estimation/reports'))

## 1. Download Nutrition5k metadata & RGB splits

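The cell below only downloads when `metadata/` or `dish_ids/` is missing under `DATA_ROOT`, so it is safe to re-run.
Run it even if the data is already present: it also defines `METADATA_DIR` and `DISH_IDS_DIR`, which later cells use.
When a download is needed, it copies the official Nutrition5k metadata and dish-ID files from the public GCS bucket.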

In [4]:
import os

METADATA_DIR = DATA_ROOT / "metadata"
DISH_IDS_DIR = DATA_ROOT / "dish_ids"

if not METADATA_DIR.exists() or not DISH_IDS_DIR.exists():
    DATA_ROOT_STR = str(DATA_ROOT)
    !gsutil -m cp -r "gs://nutrition5k_dataset/nutrition5k_dataset/metadata"  "$DATA_ROOT_STR"
    !gsutil -m cp -r "gs://nutrition5k_dataset/nutrition5k_dataset/dish_ids" "$DATA_ROOT_STR"

METADATA_DIR, DISH_IDS_DIR

Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe1.csv...
/ [0/3 files][    0.0 B/  2.2 MiB]   0% Done                                    Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe2.csv...
Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/ingredients_metadata.csv...
/ [0/3 files][    0.0 B/  2.2 MiB]   0% Done                                    / [0/3 files][    0.0 B/  2.2 MiB]   0% Done                                    / [1/3 files][ 99.4 KiB/  2.2 MiB]   4% Done                                    / [2/3 files][  1.2 MiB/  2.2 MiB]  51% Done                                    / [3/3 files][  2.2 MiB/  2.2 MiB] 100% Done                                    
Operation completed over 3 objects/2.2 MiB.                                      
Copying gs://nutrition5k_dataset/nutrition5k_dataset/dish_ids/README...
Copying gs://nutrition5k_dataset/nutrition5k_dataset/dish_ids/dish_ids_all.txt...
Copying g

(PosixPath('/content/data/nutrition5k/metadata'),
 PosixPath('/content/data/nutrition5k/dish_ids'))

In [5]:
# Load dish meta (same cleaning logic as notebook 01)

# Names for the leading, fixed part of every metadata row; any further columns
# are given placeholder names and discarded.
base_cols = [
    "dish_id",
    "total_calories",
    "total_mass",
    "total_fat",
    "total_carb",
    "total_protein",
    "num_ingrs",
]
numeric_cols = [col for col in base_cols if col != "dish_id"]


def load_dish_metadata(csv_path):
    """Read one dish-metadata CSV and return only the cleaned `base_cols`.

    Steps: read the header-less file, name the first ``len(base_cols)``
    columns, drop any row whose ``dish_id`` is the literal string "dish_id"
    (a stray header line), and convert the numeric columns, turning
    unparseable values into NaN.

    NOTE(review): ``on_bad_lines="skip"`` silently drops lines that pandas
    considers malformed (too many fields). If the rows in these files have
    varying lengths, some dishes are presumably lost here -- confirm against
    the number of dishes expected in the dataset.
    """
    raw = pd.read_csv(csv_path, engine="python", header=None, on_bad_lines="skip")

    n_extra = raw.shape[1] - len(base_cols)
    raw.columns = base_cols + [f"extra_{i}" for i in range(n_extra)]

    # .loc[...] + .copy() yields an independent frame, so the column
    # assignments below never write into a view of `raw`.
    meta = raw.loc[raw["dish_id"] != "dish_id", base_cols].copy()
    for col in numeric_cols:
        meta[col] = pd.to_numeric(meta[col], errors="coerce")
    return meta


cafe1 = load_dish_metadata(METADATA_DIR / "dish_metadata_cafe1.csv")
cafe2 = load_dish_metadata(METADATA_DIR / "dish_metadata_cafe2.csv")

dish_meta = pd.concat([cafe1, cafe2], ignore_index=True)

# Official RGB train/test IDs
splits_dir = DISH_IDS_DIR / "splits"
rgb_train_ids = pd.read_csv(splits_dir / "rgb_train_ids.txt", header=None, names=["dish_id"])
rgb_test_ids  = pd.read_csv(splits_dir / "rgb_test_ids.txt",  header=None, names=["dish_id"])

(len(rgb_train_ids), len(rgb_test_ids)), dish_meta.shape

((4059, 709), (4583, 7))

In [6]:
from sklearn.model_selection import train_test_split

# Merge metadata with official splits.
# Inner join: split IDs that have no metadata row are dropped.
train_meta = rgb_train_ids.merge(dish_meta, on="dish_id", how="inner")
test_meta  = rgb_test_ids.merge(dish_meta, on="dish_id", how="inner")

# Optionally limit train size
if N_TRAIN_LIMIT is not None:
    # Clamp to the rows actually available: DataFrame.sample raises ValueError
    # when n exceeds the population size (sampling without replacement).
    n_train_sample = min(N_TRAIN_LIMIT, len(train_meta))
    train_meta = train_meta.sample(n=n_train_sample, random_state=42).reset_index(drop=True)

# Create validation split from train_meta
train_df, val_df = train_test_split(
    train_meta,
    test_size=0.1,       # 10% of train → val
    random_state=42,
)

# The official test split is used as-is (never subsampled).
test_df = test_meta.reset_index(drop=True)

len(train_df), len(val_df), len(test_df)

(900, 100, 650)

## 2. Download overhead RGB images for selected dishes

We now download `rgb.png` for all dish_ids used in the train/val/test splits.

For larger experiments, images are stored on **local disk** (`/content/data/...`) when `USE_LOCAL_DISK=True`
to avoid filling up Google Drive.

In [7]:
import subprocess
from tqdm import tqdm

# Root folder for the overhead images: one sub-folder per dish, each holding rgb.png.
OVERHEAD_LOCAL_ROOT = DATA_ROOT / "imagery" / "realsense_overhead"
OVERHEAD_LOCAL_ROOT.mkdir(parents=True, exist_ok=True)

# Every dish referenced by any split, de-duplicated.
all_ids = pd.concat([train_df["dish_id"], val_df["dish_id"], test_df["dish_id"]]).unique()

# Dish IDs whose rgb.png could not be copied from the bucket.
missing_rgb = []

for dish_id in tqdm(all_ids):
    dish_id = str(dish_id)
    local_dish_dir = OVERHEAD_LOCAL_ROOT / dish_id
    local_rgb_path = local_dish_dir / "rgb.png"

    # Skip images that are already on disk so re-running this cell only
    # fetches what is still missing instead of repeating the full download.
    if local_rgb_path.exists():
        continue

    local_dish_dir.mkdir(parents=True, exist_ok=True)

    gs_path = f"gs://nutrition5k_dataset/nutrition5k_dataset/imagery/realsense_overhead/{dish_id}/rgb.png"

    try:
        subprocess.run(
            ["gsutil", "cp", gs_path, str(local_dish_dir)],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError:
        # gsutil exited non-zero: record the dish so it can be dropped from
        # the splits instead of aborting the whole download.
        missing_rgb.append(dish_id)

print("Missing rgb.png for", len(missing_rgb), "dishes")

100%|██████████| 1650/1650 [45:50<00:00,  1.67s/it]

Missing rgb.png for 570 dishes





In [8]:
# Remove every dish whose rgb.png could not be downloaded from all three
# splits, renumbering the rows of each filtered split from zero.
if missing_rgb:
    train_df, val_df, test_df = [
        split_df[~split_df["dish_id"].isin(missing_rgb)].reset_index(drop=True)
        for split_df in (train_df, val_df, test_df)
    ]

len(train_df), len(val_df), len(test_df)

(566, 64, 450)

In [9]:
train_transform, eval_transform = get_transforms(image_size=IMAGE_SIZE)


def _build_dataset(split_df, transform):
    """Wrap one split in a dataset that regresses `total_calories`."""
    return Nutrition5kOverheadDataset(
        data=split_df,
        images_root=OVERHEAD_LOCAL_ROOT,
        target_col="total_calories",
        transform=transform,
    )


def _build_loader(dataset, shuffle):
    """Create a DataLoader with the notebook-wide batch and worker settings."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=2,
        pin_memory=torch.cuda.is_available(),
    )


# Only the training split gets the training transform; val/test share the
# evaluation transform.
train_dataset = _build_dataset(train_df, train_transform)
val_dataset = _build_dataset(val_df, eval_transform)
test_dataset = _build_dataset(test_df, eval_transform)

# Only the training loader shuffles.
train_loader = _build_loader(train_dataset, shuffle=True)
val_loader = _build_loader(val_dataset, shuffle=False)
test_loader = _build_loader(test_dataset, shuffle=False)

len(train_dataset), len(val_dataset), len(test_dataset)

(566, 64, 450)

In [10]:
if MODEL_ARCH in ("resnet", "both"):
    # Re-seed immediately before building the model.
    set_seed(42)

    # ResNet-50 regressor with the backbone left trainable.
    resnet_model = ResNetCalorieRegressor(
        backbone_name="resnet50",
        pretrained=True,
        dropout_p=0.3,
        freeze_backbone=False,
    )
    resnet_model = resnet_model.to(device)

    # Optimise only the parameters that require gradients.
    resnet_trainable_params = [p for p in resnet_model.parameters() if p.requires_grad]
    optimizer_resnet = torch.optim.Adam(
        resnet_trainable_params,
        lr=LR_RESNET,
        weight_decay=WEIGHT_DECAY,
    )

    # The checkpoint name records the configured train-size limit ('full' when unset).
    resnet_ckpt_path = MODEL_DIR / f"resnet50_nutrition5k_{N_TRAIN_LIMIT or 'full'}.pt"

    resnet_train_kwargs = dict(
        model=resnet_model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer_resnet,
        device=device,
        loss_fn=torch.nn.MSELoss(),
        num_epochs=NUM_EPOCHS,
        use_amp=True,
        checkpoint_path=resnet_ckpt_path,
    )
    history_resnet = train_model(**resnet_train_kwargs)

In [11]:
if MODEL_ARCH in ("vit", "both"):
    # Re-seed immediately before building the model.
    set_seed(42)

    # ViT-B/16 regressor with the backbone left trainable.
    vit_model = ViTCalorieRegressor(
        model_name="vit_base_patch16_224",
        pretrained=True,
        freeze_backbone=False,
    )
    vit_model = vit_model.to(device)

    # Optimise only the parameters that require gradients.
    vit_trainable_params = [p for p in vit_model.parameters() if p.requires_grad]
    optimizer_vit = torch.optim.Adam(
        vit_trainable_params,
        lr=LR_VIT,
        weight_decay=WEIGHT_DECAY,
    )

    # The checkpoint name records the configured train-size limit ('full' when unset).
    vit_ckpt_path = MODEL_DIR / f"vit_base_patch16_224_nutrition5k_{N_TRAIN_LIMIT or 'full'}.pt"

    vit_train_kwargs = dict(
        model=vit_model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer_vit,
        device=device,
        loss_fn=torch.nn.MSELoss(),
        num_epochs=NUM_EPOCHS,
        use_amp=True,
        checkpoint_path=vit_ckpt_path,
    )
    history_vit = train_model(**vit_train_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(scaler is not None)):


Epoch [1/15] Train Loss: 90240.01, Train MAE: 221.18 | Val Loss: 82725.98, Val MAE: 197.13
  -> Saved new best model to /content/drive/MyDrive/models/food-calorie-estimation/vit_base_patch16_224_nutrition5k_1000.pt (Val MAE: 197.13)
Epoch [2/15] Train Loss: 76253.44, Train MAE: 192.09 | Val Loss: 74819.99, Val MAE: 180.27
  -> Saved new best model to /content/drive/MyDrive/models/food-calorie-estimation/vit_base_patch16_224_nutrition5k_1000.pt (Val MAE: 180.27)
Epoch [3/15] Train Loss: 73526.56, Train MAE: 186.71 | Val Loss: 74193.03, Val MAE: 179.09
  -> Saved new best model to /content/drive/MyDrive/models/food-calorie-estimation/vit_base_patch16_224_nutrition5k_1000.pt (Val MAE: 179.09)
Epoch [4/15] Train Loss: 73052.84, Train MAE: 185.90 | Val Loss: 73825.69, Val MAE: 178.42
  -> Saved new best model to /content/drive/MyDrive/models/food-calorie-estimation/vit_base_patch16_224_nutrition5k_1000.pt (Val MAE: 178.42)
Epoch [5/15] Train Loss: 72686.42, Train MAE: 185.25 | Val Loss: 734

In [12]:
import numpy as np

def collect_predictions(model, dataloader, device):
    """Run `model` over `dataloader` and gather targets and predictions.

    The model is switched to eval mode and run without gradient tracking.
    It is assumed to produce one scalar prediction per sample.

    Args:
        model: a torch module called as ``model(images)``.
        dataloader: iterable yielding ``(images, targets)`` tensor pairs.
        device: device the images are moved to before the forward pass.

    Returns:
        Tuple ``(y_true, y_pred)`` of 1-D NumPy arrays, in dataloader order.
    """
    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            preds = model(images)
            # reshape(-1) always yields a 1-D tensor. squeeze(-1) would turn a
            # (1,)-shaped output (final batch of size 1 from a model that
            # already returns shape (B,)) into a 0-d scalar, whose .tolist()
            # is a float that list.extend() cannot consume. Flattening targets
            # the same way keeps (B, 1) targets from producing nested lists.
            all_true.extend(targets.reshape(-1).cpu().numpy().tolist())
            all_pred.extend(preds.reshape(-1).cpu().numpy().tolist())
    return np.array(all_true), np.array(all_pred)

In [13]:
FIGURES_DIR = REPORTS_DIR / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# (model name, metrics dict) pairs, one per evaluated model.
results = []

if MODEL_ARCH in ("resnet", "both"):
    # pretrained=False: load_state_dict below replaces every weight with the
    # saved checkpoint, so fetching the pretrained weights first would only
    # cost an unnecessary download.
    resnet_model_eval = ResNetCalorieRegressor(
        backbone_name="resnet50",
        pretrained=False,
        dropout_p=0.3,
        freeze_backbone=False,
    ).to(device)
    resnet_ckpt = torch.load(resnet_ckpt_path, map_location=device)
    resnet_model_eval.load_state_dict(resnet_ckpt["model_state_dict"])

    y_true_r, y_pred_r = collect_predictions(resnet_model_eval, test_loader, device)
    metrics_r = compute_regression_metrics(y_true_r, y_pred_r)
    print("ResNet test metrics:")
    print_regression_metrics(metrics_r)
    results.append(("resnet50", metrics_r))

if MODEL_ARCH in ("vit", "both"):
    # pretrained=False for the same reason as above: the checkpoint supplies
    # all weights.
    vit_model_eval = ViTCalorieRegressor(
        model_name="vit_base_patch16_224",
        pretrained=False,
        freeze_backbone=False,
    ).to(device)
    vit_ckpt = torch.load(vit_ckpt_path, map_location=device)
    vit_model_eval.load_state_dict(vit_ckpt["model_state_dict"])

    y_true_v, y_pred_v = collect_predictions(vit_model_eval, test_loader, device)
    metrics_v = compute_regression_metrics(y_true_v, y_pred_v)
    print("\nViT test metrics:")
    print_regression_metrics(metrics_v)
    results.append(("vit_base_patch16_224", metrics_v))


ViT test metrics:
MAE:  186.87 kCal
RMSE: 268.86 kCal
MSE:  72284.48
MAPE: 1287716285.81 %
R²:   -0.724


In [14]:
# Split sizes are the same for every model evaluated in this run.
split_sizes = {
    "n_train": len(train_df),
    "n_val": len(val_df),
    "n_test": len(test_df),
}

# One row per evaluated model: its name, its metrics, then the split sizes.
metric_keys = ("mae", "rmse", "mse", "mape", "r2")
rows = [
    {
        "model": model_name,
        **{key: model_metrics[key] for key in metric_keys},
        **split_sizes,
    }
    for model_name, model_metrics in results
]

# Persist the table under reports/, tagged with the configured train-size limit.
metrics_df = pd.DataFrame(rows)
metrics_path = REPORTS_DIR / f"metrics_nutrition5k_{N_TRAIN_LIMIT or 'full'}.csv"
metrics_df.to_csv(metrics_path, index=False)
metrics_df

Unnamed: 0,model,mae,rmse,mse,mape,r2,n_train,n_val,n_test
0,vit_base_patch16_224,186.866948,268.857727,72284.477433,1287716000.0,-0.723542,566,64,450


In [15]:
# Example for ViT
# y_true_v / y_pred_v only exist when the ViT model was evaluated, so guard
# the plots; otherwise this cell raises NameError when MODEL_ARCH == "resnet".
if MODEL_ARCH in ("vit", "both"):
    # Scatter of true vs. predicted values on the test split.
    plot_true_vs_pred(
        y_true_v, y_pred_v,
        title=f"ViT-B/16 – Nutrition5k ({len(train_df)} train)",
        model_name="ViT-B/16",
        save_path=FIGURES_DIR / f"true_vs_pred_vit_nutrition5k_{N_TRAIN_LIMIT or 'full'}.png",
    )

    # Histogram of the prediction errors on the test split.
    plot_error_histogram(
        y_true_v, y_pred_v,
        title=f"ViT-B/16 – Error Histogram ({len(train_df)} train)",
        model_name="ViT-B/16",
        save_path=FIGURES_DIR / f"error_hist_vit_nutrition5k_{N_TRAIN_LIMIT or 'full'}.png",
    )