<a href="https://colab.research.google.com/github/tesfayeez/Beneficiarylist/blob/main/GG_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Nutrition5k EfficientNet Training Notebook for Google Colab with A100 GPU
# Optimized for A100 GPU with mixed precision training and enhanced performance

# %% [markdown]
# # 🍽️ Nutrition5k - Food Macro Prediction with EfficientNet (A100 Optimized)
#
# This notebook implements an MVP for predicting nutritional information (calories, protein, fat, carbs) from food images using the Nutrition5k dataset and EfficientNet models, optimized for A100 GPU.

# %% [markdown]
# ## 🚀 1. GPU Setup and Verification

In [124]:
# %%
# Cell 1: Check GPU availability and specs
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU Device:", torch.cuda.get_device_name(0))
    print("GPU Count:", torch.cuda.device_count())

    # Check if we have A100
    gpu_name = torch.cuda.get_device_name(0)
    if "A100" in gpu_name:
        print("✅ A100 GPU detected! Enabling optimizations...")
    else:
        print(f"⚠️ GPU detected: {gpu_name}. Code will still work but is optimized for A100.")

    # Memory info
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    # Enable TF32 for A100 (3x speedup for operations)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
else:
    print("❌ No GPU found! Please enable GPU in Runtime > Change runtime type > GPU (A100)")


PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
GPU Device: NVIDIA A100-SXM4-40GB
GPU Count: 1
✅ A100 GPU detected! Enabling optimizations...
GPU Memory: 39.56 GB


In [125]:
# %%
# Cell 2: Install required packages
!pip install -q efficientnet_pytorch torch torchvision pandas numpy matplotlib pillow tqdm scikit-learn tensorboard
!pip install -q albumentations opencv-python-headless
!pip install -q torch-optimizer  # Advanced optimizers
!pip install -q apex  # For mixed precision training (optional, we'll use native)


In [3]:
# %%
# Cell 3: Import all libraries
import os
import sys
import json
import random
import shutil
import warnings
import subprocess
import gc
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler  # Native AMP for A100
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from efficientnet_pytorch import EfficientNet

import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# Force CUDA device
device = torch.device('cuda')
print(f"Using device: {device}")


Using device: cuda


In [126]:
# %%
# Cell 4: Configuration optimized for A100
class Config:
    """Configuration for training - A100 optimized"""
    # Data settings
    DATA_DIR = '/content/nutrition5k_data'
    SUBSET_SIZE = 1000  # Larger subset for A100 (can handle more)
    SAMPLE_RATE = 5     # Sample every 5th frame from videos
    MAX_FRAMES_PER_VIDEO = 10  # Limit frames per video for memory

    # Model settings
    MODEL_NAME = 'efficientnet-b1'  # b1 for better accuracy with A100 power
    INPUT_SIZE = 240  # 224 for b0, 240 for b1

    # Training settings - A100 optimized
    BATCH_SIZE = 64  # Larger batch size for A100 (can go up to 128)
    ACCUMULATION_STEPS = 2  # Gradient accumulation for effective batch of 128
    EPOCHS = 30
    LEARNING_RATE = 3e-3
    WEIGHT_DECAY = 1e-4

    # Mixed precision training
    USE_AMP = True  # Automatic Mixed Precision for A100

    # Multi-worker data loading
    NUM_WORKERS = 4  # Parallel data loading
    PIN_MEMORY = True  # Pin memory for faster GPU transfer

    # Targets
    TARGET_COLS = ['calories', 'mass', 'fat', 'carb', 'protein']

    # Normalization stats (ImageNet)
    MEAN = [0.485, 0.456, 0.406]
    STD = [0.229, 0.224, 0.225]

    # Paths
    CHECKPOINT_DIR = '/content/checkpoints'
    LOG_DIR = '/content/logs'
    BEST_MODEL_PATH = '/content/best_model.pth'

    # Random seed
    SEED = 42

    # Early stopping
    PATIENCE = 5
    MIN_DELTA = 0.001

config = Config()

# Set seeds
random.seed(config.SEED)
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
torch.cuda.manual_seed_all(config.SEED)

# Create directories
os.makedirs(config.DATA_DIR, exist_ok=True)
os.makedirs(config.CHECKPOINT_DIR, exist_ok=True)
os.makedirs(config.LOG_DIR, exist_ok=True)


In [5]:
# %%
# Cell 5: Download metadata
%cd {config.DATA_DIR}

print("📥 Downloading metadata files...")
!gsutil -q cp gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe1.csv .
!gsutil -q cp gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe2.csv .
!gsutil -q cp gs://nutrition5k_dataset/nutrition5k_dataset/dish_ids/splits/* . 2>/dev/null || true

print("✅ Metadata downloaded!")


/content/nutrition5k_data
📥 Downloading metadata files...
✅ Metadata downloaded!


In [127]:
# Cell 6 (robust, keep all columns)

import csv

def _try_read_csv(path):
    """
    Try robust parsing strategies for Nutrition5k metadata.
    Keeps all columns.
    """
    # 1) Default C engine (fast)
    try:
        return pd.read_csv(path)
    except Exception:
        pass

    # 2) Python engine with strict quoting
    try:
        return pd.read_csv(
            path,
            engine="python",
            sep=",",
            quotechar='"',
            escapechar="\\"
        )
    except Exception:
        pass

    # 3) Auto-detect delimiter using csv.Sniffer
    try:
        with open(path, "r", errors="replace") as f:
            sample = f.read(4096)
            try:
                dialect = csv.Sniffer().sniff(sample, delimiters=[",", ";", "\t", "|"])
                sep = dialect.delimiter
            except Exception:
                sep = ","
        return pd.read_csv(
            path,
            engine="python",
            sep=sep,
            quotechar='"',
            escapechar="\\",
            on_bad_lines="skip"  # skip only truly broken lines
        )
    except Exception as e:
        raise e

def parse_metadata(csv_path):
    """Return full dataframe with all metadata columns preserved."""
    df = _try_read_csv(csv_path)
    df.columns = [c.strip().lower() for c in df.columns]  # normalize headers
    return df

# ---- Load metadata ----
print("📊 Loading metadata with all columns...")
df_cafe1 = parse_metadata("dish_metadata_cafe1.csv")
df_cafe2 = parse_metadata("dish_metadata_cafe2.csv")

df_all = pd.concat([df_cafe1, df_cafe2], ignore_index=True)
print(f"Total rows: {len(df_all)}")
print(f"Columns: {list(df_all.columns)[:15]} ... (total {len(df_all.columns)})")

# Pick subset
df_subset = df_all.sample(
    n=min(config.SUBSET_SIZE, len(df_all)), random_state=config.SEED
).reset_index(drop=True)

print("\nDataset statistics (numeric columns):")
print(df_subset.describe(include=[np.number]))
subset_dishes = df_subset.to_dict(orient="records")


📊 Loading metadata with all columns...
Total rows: 4581
Columns: ['dish_1561662216', '300.794281', '193.000000', '12.387489', '28.218290', '18.633970', 'ingr_0000000508', 'soy sauce', '3.398568', '1.80124104', '0.020391408', '0.166529832', '0.275284008', 'ingr_0000000122', 'garlic'] ... (total 177)

Dataset statistics (numeric columns):
        300.794281  193.000000   12.387489   28.218290   18.633970  \
count   958.000000  958.000000  958.000000  958.000000  958.000000   
mean    193.698663  168.960334    9.499092   16.273582   12.879130   
std     196.057981  137.316864   12.017462   27.098430   16.502249   
min       0.000000    1.000000    0.000000    0.000000    0.000000   
25%      53.598918   65.000000    0.420750    4.834961    1.317314   
50%     128.440010  131.500000    5.074835   12.065000    5.607500   
75%     284.152496  233.750000   13.635000   21.559999   18.553978   
max    2400.780029  886.000000  106.343002  732.300049  103.567314   

         3.398568  1.80124104 

In [128]:
# Cell 6 (robust, headerless-safe, keep ALL columns)

import csv

CANONICAL_FIRST6 = [
    "dish_id",          # 0
    "total_calories",   # 1
    "total_mass",       # 2
    "total_fat",        # 3
    "total_carb",       # 4
    "total_protein",    # 5
]

def _read_with_engine(path, sep=None, header="infer"):
    """Helper to read with either default C engine or python engine."""
    if sep is None:
        # Try fast path (C engine)
        try:
            return pd.read_csv(path, header=header)
        except Exception:
            pass
        # Try python engine (default comma)
        return pd.read_csv(path, engine="python", header=header)
    else:
        # Given sep: try C engine then python
        try:
            return pd.read_csv(path, sep=sep, header=header)
        except Exception:
            return pd.read_csv(path, sep=sep, engine="python", header=header, on_bad_lines="skip")

def _sniff_sep(path):
    with open(path, "r", errors="replace") as f:
        sample = f.read(4096)
        try:
            dialect = csv.Sniffer().sniff(sample, delimiters=[",",";","\t","|"])
            return dialect.delimiter
        except Exception:
            return ","  # fallback

def _ensure_headers(df):
    """
    If 'dish_id' not found among columns, assume the file had NO header:
    re-read with header=None and assign canonical names for the first 6 cols,
    keeping all remaining columns.
    """
    cols_lower = [c.strip().lower() for c in df.columns]
    if "dish_id" in cols_lower:
        # Normalize headers to lowercase/stripped for consistency
        df.columns = [c.strip().lower() for c in df.columns]
        return df

    # No dish_id in headers -> re-read with header=None
    sep = _sniff_sep(csv_path_global)  # will be set before calls
    df2 = _read_with_engine(csv_path_global, sep=sep, header=None)

    # Build column names: first 6 canonical, rest generic c6, c7, ...
    num_cols = df2.shape[1]
    names = []
    for i in range(num_cols):
        if i < len(CANONICAL_FIRST6):
            names.append(CANONICAL_FIRST6[i])
        else:
            names.append(f"col_{i}")
    df2.columns = names
    return df2

def parse_metadata_full(csv_path):
    """Read Nutrition5k CSV keeping ALL columns and ensuring 'dish_id' exists."""
    global csv_path_global
    csv_path_global = csv_path  # used by _ensure_headers

    # Try reading with inferred header first
    sep = _sniff_sep(csv_path)
    df = _read_with_engine(csv_path, sep=sep, header="infer")
    df = _ensure_headers(df)

    # Normalize dish_id to string
    if "dish_id" not in df.columns:
        raise ValueError(f"'dish_id' column still not found in {csv_path}. Columns: {list(df.columns)[:20]} ...")
    df["dish_id"] = df["dish_id"].astype(str).str.strip()

    return df

# -------- Load metadata (ALL columns) --------
print("📊 Loading metadata (full columns, headerless-safe)...")
df_cafe1 = parse_metadata_full('dish_metadata_cafe1.csv')
df_cafe2 = parse_metadata_full('dish_metadata_cafe2.csv')

df_all = pd.concat([df_cafe1, df_cafe2], ignore_index=True)

# OPTIONAL: coerce known numeric targets if present
for col in ["total_calories", "total_mass", "total_fat", "total_carb", "total_protein"]:
    if col in df_all.columns:
        df_all[col] = pd.to_numeric(df_all[col], errors="coerce")

print(f"Total rows available: {len(df_all)}")
print(f"First 12 columns: {list(df_all.columns)[:12]} ... (total {len(df_all.columns)})")

# Random subset
df_subset = df_all.sample(n=min(config.SUBSET_SIZE, len(df_all)), random_state=config.SEED).reset_index(drop=True)

print("\nDataset statistics (numeric columns):")
print(df_subset.describe(include=[np.number]))

# Back-compat for downstream code that expects a list of dicts
subset_dishes = df_subset.to_dict(orient="records")
print(f"\nSubset dishes available: {len(subset_dishes)}")


📊 Loading metadata (full columns, headerless-safe)...
Total rows available: 4583
First 12 columns: ['dish_id', 'total_calories', 'total_mass', 'total_fat', 'total_carb', 'total_protein', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11'] ... (total 125)

Dataset statistics (numeric columns):
       total_calories   total_mass    total_fat   total_carb  total_protein  \
count     1000.000000  1000.000000  1000.000000  1000.000000    1000.000000   
mean       194.046320   173.418000     9.563239    14.799060      13.900911   
std        197.802066   142.658637    12.457677    14.665273      18.476023   
min          0.000000     3.000000     0.000000     0.000000       0.000000   
25%         43.209999    70.000000     0.321750     3.904915       1.180366   
50%        124.320000   133.000000     5.112388    10.011065       6.300000   
75%        282.555786   230.250000    13.804787    22.225019      19.720807   
max       1158.989990  1102.000000    83.761002   101.256134     117.9

In [129]:
# Cell 7 (Kaggle-based download for Nutrition5k imagery)
import os, zipfile, subprocess, glob
from tqdm.auto import tqdm

DATASET_SLUG = "zygmuntyt/nutrition5k-dataset-side-angle-images"
DEST_DIR = os.path.join(config.DATA_DIR, "images")
os.makedirs(DEST_DIR, exist_ok=True)

print("📥 Downloading Nutrition5k side-angle images from Kaggle...")
# Requires you to upload kaggle.json first to /content (you already did earlier)
!mkdir -p ~/.kaggle && cp /content/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d $DATASET_SLUG -p $DEST_DIR --unzip

# Verify extraction
sample_files = glob.glob(f"{DEST_DIR}/**/*.jpeg", recursive=True)
print(f"✅ Found {len(sample_files)} JPEG images after unzip")
print("Example paths:", sample_files[:5])

# Normalize dish_id folders
print("🧩 Normalizing folder structure...")
for path in tqdm(sample_files[:20000]):
    dish = os.path.basename(os.path.dirname(os.path.dirname(path)))  # e.g. dish_1550704750
    new_dir = os.path.join(DEST_DIR, "side_angles", dish)
    os.makedirs(new_dir, exist_ok=True)
    new_path = os.path.join(new_dir, os.path.basename(path))
    os.rename(path, new_path)

print("✅ Organized images under /images/side_angles/<dish_id>/")


📥 Downloading Nutrition5k side-angle images from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/zygmuntyt/nutrition5k-dataset-side-angle-images
License(s): unknown
Downloading nutrition5k-dataset-side-angle-images.zip to /content/nutrition5k_data/images
 99% 5.91G/5.97G [00:15<00:00, 591MB/s]
100% 5.97G/5.97G [00:15<00:00, 416MB/s]
✅ Found 74845 JPEG images after unzip
Example paths: ['/content/nutrition5k_data/images/dish_1574280343/frames_sampled30/camera_A_frame_002.jpeg', '/content/nutrition5k_data/images/dish_1574280343/frames_sampled30/camera_D_frame_001.jpeg', '/content/nutrition5k_data/images/dish_1574280343/frames_sampled30/camera_B_frame_001.jpeg', '/content/nutrition5k_data/images/dish_1574280343/frames_sampled30/camera_C_frame_001.jpeg', '/content/nutrition5k_data/images/dish_1574280343/frames_sampled30/camera_D_frame_002.jpeg']
🧩 Normalizing folder structure...


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Organized images under /images/side_angles/<dish_id>/


In [72]:
import os, glob, shutil
from tqdm.auto import tqdm

ROOT = "/content/nutrition5k_data/images"
SIDE_DIR = os.path.join(ROOT, "side_angles")
os.makedirs(SIDE_DIR, exist_ok=True)

# Find all .jpeg files directly under images/
jpeg_files = glob.glob(os.path.join(ROOT, "*.jpeg"))
print(f"Found {len(jpeg_files)} loose JPEGs in root folder")

for path in tqdm(jpeg_files):
    filename = os.path.basename(path)
    # Extract dish id from file name like: dish_1550704750_camera_A_frame_001.jpeg
    parts = filename.split("_")
    dish_id = "_".join(parts[:2]) if filename.startswith("dish_") else "unknown_dish"
    target_dir = os.path.join(SIDE_DIR, dish_id)
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(path, os.path.join(target_dir, filename))

print("✅ Grouped all images into side_angles/<dish_id>/ folders")

# Verify
total_imgs = len(glob.glob(os.path.join(SIDE_DIR, "*/*.jpeg")))
dish_count = len(glob.glob(os.path.join(SIDE_DIR, "dish_*")))
print(f"✅ Found {dish_count} dishes with {total_imgs} total JPEG images")


Found 0 loose JPEGs in root folder


0it [00:00, ?it/s]

✅ Grouped all images into side_angles/<dish_id>/ folders
✅ Found 1690 dishes with 20000 total JPEG images


In [97]:
# %%
# Cell 8: Custom Dataset class
class Nutrition5kDataset(Dataset):
    """Dataset class for Nutrition5k with GPU optimizations"""

    def __init__(self, df, img_dir, transform=None, use_overhead=True, cache_size=100):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.use_overhead = use_overhead
        self.target_cols = config.TARGET_COLS

        # Build image paths
        self.image_paths = self._build_image_paths()

        # Normalize targets
        self.scaler = RobustScaler()
        self.targets = self.scaler.fit_transform(df[self.target_cols].values)

        # Image cache for frequently accessed images
        self.cache = {}
        self.cache_size = cache_size

    def _build_image_paths(self):
        """Build list of available image paths"""
        paths = []

        for _, row in self.df.iterrows():
            dish_id = row['dish_id']
            dish_paths = []

            # Side angle frames
            side_dir = f"{self.img_dir}/side_angles/{dish_id}"
            if os.path.exists(side_dir):
                for cam_dir in os.listdir(side_dir):
                    if 'frames' in cam_dir:
                        frame_dir = os.path.join(side_dir, cam_dir)
                        frames = [os.path.join(frame_dir, f) for f in os.listdir(frame_dir)
                                if f.endswith('.jpg')]
                        # Limit frames per video
                        dish_paths.extend(frames[:config.MAX_FRAMES_PER_VIDEO])

            # Overhead image
            if self.use_overhead:
                overhead_path = f"{self.img_dir}/overhead/{dish_id}/color.png"
                if os.path.exists(overhead_path):
                    dish_paths.append(overhead_path)

            # If no images found, use a placeholder
            if not dish_paths:
                dish_paths = [None]

            paths.append(dish_paths)

        return paths

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get random image from available paths
        img_paths = self.image_paths[idx]

        if img_paths[0] is None:
            # Create blank image if no images available
            img = np.zeros((config.INPUT_SIZE, config.INPUT_SIZE, 3), dtype=np.uint8)
        else:
            # Randomly select one image
            img_path = random.choice(img_paths)

            # Check cache first
            if img_path in self.cache:
                img = self.cache[img_path].copy()
            else:
                # Load image
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # Update cache (LRU-style)
                if len(self.cache) >= self.cache_size:
                    # Remove oldest entry
                    self.cache.pop(next(iter(self.cache)))
                self.cache[img_path] = img.copy()

        # Apply transforms
        if self.transform:
            augmented = self.transform(image=img)
            img = augmented['image']

        # Get targets
        targets = torch.tensor(self.targets[idx], dtype=torch.float32)

        return img, targets


In [109]:
# Cell 8B (fixed): Multi-View Dataset with safe padding
class Nutrition5kMultiViewDataset(Dataset):
    """
    Returns exactly k_views images per dish by sampling with replacement if needed.
    Guarantees consistent tensor shape for batching.
    """
    def __init__(self, df, img_dir, transform=None, use_overhead=True, k_views=3, cache_size=200):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.use_overhead = use_overhead
        self.target_cols = config.TARGET_COLS
        self.k_views = k_views

        self.image_paths = self._build_image_paths()
        self.scaler = RobustScaler()
        self.targets = self.scaler.fit_transform(df[self.target_cols].values)

        self.cache, self.cache_size = {}, cache_size

    def _build_image_paths(self):
        paths = []
        for _, row in self.df.iterrows():
            dish_id = row["dish_id"]
            dish_paths = glob.glob(os.path.join(self.img_dir, "side_angles", dish_id, "*.jpeg"))
            if self.use_overhead:
                overhead = os.path.join(self.img_dir, "overhead", dish_id, "color.png")
                if os.path.exists(overhead):
                    dish_paths.append(overhead)
            if not dish_paths:
                dish_paths = [None]
            paths.append(dish_paths)
        return paths

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_paths = self.image_paths[idx]
        if img_paths[0] is None:
            imgs = [np.zeros((config.INPUT_SIZE, config.INPUT_SIZE, 3), dtype=np.uint8)
                    for _ in range(self.k_views)]
        else:
            # Sample exactly k_views with replacement if not enough
            chosen = np.random.choice(img_paths,
                                      size=self.k_views,
                                      replace=len(img_paths) < self.k_views)
            imgs = []
            for p in chosen:
                if p in self.cache:
                    img = self.cache[p].copy()
                else:
                    img = cv2.imread(p)
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    if len(self.cache) >= self.cache_size:
                        self.cache.pop(next(iter(self.cache)))
                    self.cache[p] = img.copy()
                imgs.append(img)

        imgs = [self.transform(image=i)['image'] for i in imgs]
        imgs = torch.stack(imgs, dim=0)  # shape: [k, 3, H, W]

        target = torch.tensor(self.targets[idx], dtype=torch.float32)
        return imgs, target


In [99]:
# Cell 9: Define augmentation pipelines (version-robust for Albumentations)

import albumentations as A
from albumentations.pytorch import ToTensorV2

def get_transforms(phase='train'):
    """Get augmentation pipeline optimized for food images, robust across Albumentations versions."""
    size_tuple = (config.INPUT_SIZE, config.INPUT_SIZE)

    # Build TRAIN pipeline
    if phase == 'train':
        # Try v2-style API first (size=(H,W)); fall back to height/width
        try:
            rand_resized_crop = A.RandomResizedCrop(
                size=size_tuple,           # Albumentations >=2.0
                scale=(0.8, 1.0)
            )
        except TypeError:
            rand_resized_crop = A.RandomResizedCrop(
                height=config.INPUT_SIZE,  # Albumentations <=1.x
                width=config.INPUT_SIZE,
                scale=(0.8, 1.0)
            )

        return A.Compose([
            rand_resized_crop,

            # Color augmentations (important for food)
            A.OneOf([
                A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
                A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
                A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
            ], p=0.8),

            # Geometric transforms
            A.OneOf([
                A.HorizontalFlip(p=0.5),
                A.Rotate(limit=15, p=0.5),
                A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
            ], p=0.5),

            # Light noise/blur
            A.OneOf([
                A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),
                A.GaussianBlur(blur_limit=3, p=0.3),
                A.MedianBlur(blur_limit=3, p=0.3),
            ], p=0.2),

            A.Normalize(mean=config.MEAN, std=config.STD),
            ToTensorV2(),
        ])

    # Build VAL/TEST pipeline (safe across versions)
    else:
        return A.Compose([
            A.Resize(config.INPUT_SIZE, config.INPUT_SIZE),
            A.Normalize(mean=config.MEAN, std=config.STD),
            ToTensorV2(),
        ])


In [100]:

# %%
# Cell 10: Define model architecture
class NutritionPredictor(nn.Module):
    """EfficientNet-based model for nutrition prediction"""

    def __init__(self, model_name=config.MODEL_NAME, num_targets=len(config.TARGET_COLS),
                 dropout_rate=0.3, use_attention=True):
        super().__init__()

        # Load pretrained EfficientNet
        self.backbone = EfficientNet.from_pretrained(model_name)

        # Get feature dimension
        in_features = self.backbone._fc.in_features

        # Remove original classifier
        self.backbone._fc = nn.Identity()

        # Add attention module (optional)
        self.use_attention = use_attention
        if use_attention:
            self.attention = nn.Sequential(
                nn.Linear(in_features, in_features // 8),
                nn.ReLU(),
                nn.Linear(in_features // 8, in_features),
                nn.Sigmoid()
            )

        # Multi-task prediction head with residual connections
        self.head = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.7),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.5),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, num_targets)
        )

        # Task-specific heads (optional, for better multi-task learning)
        self.task_heads = nn.ModuleDict({
            'calories': nn.Linear(128, 1),
            'mass': nn.Linear(128, 1),
            'fat': nn.Linear(128, 1),
            'carb': nn.Linear(128, 1),
            'protein': nn.Linear(128, 1)
        })

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights using He initialization"""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        # Extract features
        features = self.backbone(x)

        # Apply attention if enabled
        if self.use_attention:
            attention_weights = self.attention(features)
            features = features * attention_weights

        # Get predictions
        output = self.head(features)

        return output


In [131]:
# Cell 11: Initialize model (EfficientNet base)
print("🏗️ Initializing base EfficientNet model...")

model = NutritionPredictor(
    model_name=config.MODEL_NAME,
    num_targets=len(config.TARGET_COLS),
    dropout_rate=0.3,
    use_attention=True
).to(device)

# Model summary
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Optional torch.compile (disabled by default)
USE_TORCH_COMPILE = False
if USE_TORCH_COMPILE and hasattr(torch, "compile"):
    print("📦 Compiling model for A100 optimization...")
    try:
        model = torch.compile(model, mode="reduce-overhead", fullgraph=False)
        print("✅ torch.compile enabled")
    except Exception as e:
        print(f"⚠️ torch.compile failed: {e}")


🏗️ Initializing base EfficientNet model...
Loaded pretrained weights for efficientnet-b1
Total parameters: 7,747,402
Trainable parameters: 7,747,402


In [132]:
# Cell 11B: Multi-View Wrapper
class MultiViewWrapper(nn.Module):
    """Averages model features over multiple frames before head."""
    def __init__(self, base_model):
        super().__init__()
        self.model = base_model

    def forward(self, x):
        # x shape: (B, K, C, H, W)
        b, k, c, h, w = x.shape
        x = x.view(b * k, c, h, w)
        out = self.model(x)
        out = out.view(b, k, -1).mean(1)  # average over K views
        return out

# Wrap the base model
multi_model = MultiViewWrapper(model).to(device)
print("✅ Multi-view model ready!")


✅ Multi-view model ready!


In [137]:
# Cell 11C: EMA wrapper
import copy
ema_decay = 0.999

ema_model = copy.deepcopy(multi_model).to(device)
for p in ema_model.parameters():
    p.requires_grad_(False)

@torch.no_grad()
def ema_update(ema, online, decay=ema_decay):
    for p_ema, p in zip(ema.parameters(), online.parameters()):
        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)

print("✅ EMA model ready")


✅ EMA model ready


In [103]:

# %%
# Cell 12: Define loss functions and metrics
class NutritionLoss(nn.Module):
    """Combined loss for nutrition prediction"""

    def __init__(self, weights=None):
        super().__init__()
        self.mse = nn.MSELoss()
        self.mae = nn.L1Loss()
        self.huber = nn.HuberLoss(delta=1.0)

        # Task weights (can be adjusted based on importance)
        if weights is None:
            self.weights = torch.ones(len(config.TARGET_COLS)) / len(config.TARGET_COLS)
        else:
            self.weights = torch.tensor(weights)

    def forward(self, pred, target):
        # Combine MSE and MAE for robustness
        mse_loss = self.mse(pred, target)
        mae_loss = self.mae(pred, target)
        huber_loss = self.huber(pred, target)

        # Weighted combination
        total_loss = 0.5 * mse_loss + 0.3 * mae_loss + 0.2 * huber_loss

        return total_loss

def calculate_metrics(pred, target, scaler):
    """Calculate evaluation metrics"""
    # Denormalize predictions and targets
    pred_denorm = scaler.inverse_transform(pred.cpu().numpy())
    target_denorm = scaler.inverse_transform(target.cpu().numpy())

    metrics = {}
    for i, col in enumerate(config.TARGET_COLS):
        mae = np.mean(np.abs(pred_denorm[:, i] - target_denorm[:, i]))
        mape = np.mean(np.abs((pred_denorm[:, i] - target_denorm[:, i]) / (target_denorm[:, i] + 1e-8))) * 100

        metrics[f'{col}_mae'] = mae
        metrics[f'{col}_mape'] = mape

    # Overall metrics
    metrics['overall_mae'] = np.mean([metrics[f'{col}_mae'] for col in config.TARGET_COLS])
    metrics['overall_mape'] = np.mean([metrics[f'{col}_mape'] for col in config.TARGET_COLS])

    return metrics


In [133]:
# %%
#[CELL 13B] Multi-view dataset setup (k_views=5)
print("📊 Creating multi-view dataloaders (5 views per dish)...")

scaler_global = RobustScaler().fit(train_df[config.TARGET_COLS].values)

train_dataset = Nutrition5kMultiViewDataset(
    train_df,
    config.DATA_DIR + '/images',
    transform=get_transforms('train'),
    k_views=5
)

val_dataset = Nutrition5kMultiViewDataset(
    val_df,
    config.DATA_DIR + '/images',
    transform=get_transforms('val'),
    k_views=5
)

train_loader = DataLoader(
    train_dataset,
    batch_size=max(1, config.BATCH_SIZE // 5),  # scale batch size by number of views
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    persistent_workers=False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=max(1, config.BATCH_SIZE // 5),
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=False
)

print(f"✅ Train: {len(train_dataset)} samples | Val: {len(val_dataset)} samples")
print(f"Each sample contains {train_dataset.k_views} random views per dish")


📊 Creating multi-view dataloaders (5 views per dish)...
✅ Train: 800 samples | Val: 200 samples
Each sample contains 5 random views per dish


In [105]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Example training image path: {train_dataset.image_paths[0][0]}")


Train dataset size: 800
Validation dataset size: 200
Example training image path: None


In [134]:
# %%
#[CELL 14] Training + validation with MixUp

def mixup(x, y, alpha=0.2):
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_mixed = lam * x + (1 - lam) * x[idx]
    y_mixed = lam * y + (1 - lam) * y[idx]
    return x_mixed, y_mixed


def train_epoch(model, loader, criterion, optimizer, scaler, device, ema_model=None):
    model.train()
    total_loss = 0.0
    all_preds, all_targets = [], []
    pbar = tqdm(loader, desc='Training')

    optimizer.zero_grad(set_to_none=True)

    for batch_idx, (images, targets) in enumerate(pbar):
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # 🔹 Apply MixUp augmentation
        images, targets = mixup(images, targets, alpha=0.2)

        if hasattr(torch, "compiler") and hasattr(torch.compiler, "cudagraph_mark_step_begin"):
            torch.compiler.cudagraph_mark_step_begin()

        with autocast(enabled=config.USE_AMP):
            outputs = model(images)
            loss = criterion(outputs, targets) / config.ACCUMULATION_STEPS

        scaler.scale(loss).backward()

        if (batch_idx + 1) % config.ACCUMULATION_STEPS == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            # 🔹 Update EMA if provided
            if ema_model is not None:
                ema_update(ema_model, model)

        total_loss += loss.item() * config.ACCUMULATION_STEPS
        all_preds.append(outputs.detach().cpu())
        all_targets.append(targets.detach().cpu())
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

        if batch_idx % 50 == 0:
            torch.cuda.empty_cache()

    all_preds, all_targets = torch.cat(all_preds), torch.cat(all_targets)
    metrics = calculate_metrics(all_preds, all_targets, train_dataset.scaler)
    metrics['loss'] = total_loss / len(loader)
    return metrics


In [135]:
# Cell 15: Setup training components
print("🎯 Setting up training...")

criterion = NutritionLoss().to(device)
optimizer = optim.AdamW(
    multi_model.parameters(),
    lr=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    betas=(0.9, 0.999)
)

scheduler = CosineAnnealingLR(optimizer, T_max=config.EPOCHS, eta_min=1e-6)
scaler = GradScaler(enabled=config.USE_AMP)

history = defaultdict(list)
best_val_loss = float('inf')
patience_counter = 0


🎯 Setting up training...


In [96]:

# %%
# Cell 16: Main training loop
print("🚀 Starting training on A100...")
print(f"Training for {config.EPOCHS} epochs")
print("=" * 50)

for epoch in range(config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    print("-" * 30)

    # Train
    train_metrics = train_epoch(model, train_loader, criterion, optimizer, scaler, device)

    # Validate
    val_metrics = validate_epoch(model, val_loader, criterion, device, train_dataset.scaler)

    # Update scheduler
    scheduler.step()

    # Log metrics
    print(f"Train Loss: {train_metrics['loss']:.4f} | Val Loss: {val_metrics['loss']:.4f}")
    print(f"Train MAE: {train_metrics['overall_mae']:.2f} | Val MAE: {val_metrics['overall_mae']:.2f}")
    print(f"Train MAPE: {train_metrics['overall_mape']:.2f}% | Val MAPE: {val_metrics['overall_mape']:.2f}%")

    # Detailed metrics
    print("\nPer-nutrient MAE:")
    for col in config.TARGET_COLS:
        print(f"  {col}: Train={train_metrics[f'{col}_mae']:.2f}, Val={val_metrics[f'{col}_mae']:.2f}")

    # Save history
    for key, value in train_metrics.items():
        history[f'train_{key}'].append(value)
    for key, value in val_metrics.items():
        history[f'val_{key}'].append(value)

    # Save best model
    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0

        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss,
            'config': config,
            'scaler_params': {
                'center_': train_dataset.scaler.center_,
                'scale_': train_dataset.scaler.scale_
            }
        }, config.BEST_MODEL_PATH)

        print("✅ Saved best model!")
    else:
        patience_counter += 1

    # Early stopping
    if patience_counter >= config.PATIENCE:
        print(f"\n⏹️ Early stopping triggered after {epoch+1} epochs")
        break

    # Clear GPU cache
    torch.cuda.empty_cache()
    gc.collect()

print("\n" + "=" * 50)
print("✅ Training complete!")

🚀 Starting training on A100...
Training for 30 epochs

Epoch 1/30
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [21, 3, 3, 241, 241]

In [None]:
# Cell 16B: Main multi-view training loop
print("🚀 Starting multi-view training on A100...")
print(f"Training for {config.EPOCHS} epochs")
print("=" * 50)

for epoch in range(config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    print("-" * 30)

    train_metrics = train_epoch(multi_model, train_loader, criterion, optimizer, scaler, device, ema_model=ema_model)
    val_metrics   = validate_epoch(ema_model,  val_loader,  criterion, device, scaler_global)
    scheduler.step()

    print(f"Train Loss: {train_metrics['loss']:.4f} | Val Loss: {val_metrics['loss']:.4f}")
    print(f"Train MAE: {train_metrics['overall_mae']:.2f} | Val MAE: {val_metrics['overall_mae']:.2f}")

    for key, value in train_metrics.items():
        history[f'train_{key}'].append(value)
    for key, value in val_metrics.items():
        history[f'val_{key}'].append(value)

    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0
        torch.save({
            'epoch': epoch,
            'model_state_dict': multi_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss,
            'config': config
        }, config.BEST_MODEL_PATH)
        print("✅ Saved best multi-view model!")
    else:
        patience_counter += 1

    if patience_counter >= config.PATIENCE:
        print("⏹️ Early stopping triggered.")
        break

print("\n✅ Multi-view training complete!")


🚀 Starting multi-view training on A100...
Training for 30 epochs

Epoch 1/30
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 2.2290 | Val Loss: 7.3676
Train MAE: 124.57 | Val MAE: 168.40
✅ Saved best multi-view model!

Epoch 2/30
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.7377 | Val Loss: 7.7005
Train MAE: 60.86 | Val MAE: 182.27

Epoch 3/30
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5975 | Val Loss: 8.2085
Train MAE: 54.26 | Val MAE: 198.99

Epoch 4/30
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

In [116]:
# --- Repair the checkpoint (PyTorch 2.6 weights_only issue) and resume training ---

import gc, torch

ckpt_path = config.BEST_MODEL_PATH  # '/content/best_model.pth'
print(f"🔄 Loading (unsafe allowed) from {ckpt_path}")

# 1) Load with weights_only=False (trusted source: your own file)
ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)

# 2) Sanitize: drop any non-primitive objects (e.g., class instances)
if 'config' in ckpt:
    print("🧹 Removing non-serializable 'config' object from checkpoint")
    ckpt.pop('config')

# 3) Re-save a clean checkpoint so future loads work with default settings
torch.save(ckpt, ckpt_path)
print("✅ Re-saved cleaned checkpoint")

# 4) Restore state and resume
multi_model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])
scaler.load_state_dict(ckpt['scaler_state_dict'])

best_val_loss = ckpt.get('best_val_loss', float('inf'))
start_epoch = ckpt.get('epoch', 0) + 1
extra_epochs = 30
config.EPOCHS = start_epoch + extra_epochs
print(f"▶️ Resuming from epoch {start_epoch} → {config.EPOCHS} (best val loss {best_val_loss:.4f})")

patience_counter = 0

for epoch in range(start_epoch, config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    print("-" * 30)

    train_metrics = train_epoch(multi_model, train_loader, criterion, optimizer, scaler, device)
    # IMPORTANT: use the same scaler you used to build datasets (scaler_global or train_dataset.scaler)
    val_metrics   = validate_epoch(multi_model, val_loader, criterion, device, train_dataset.scaler)
    scheduler.step()

    print(f"Train Loss: {train_metrics['loss']:.4f} | Val Loss: {val_metrics['loss']:.4f}")
    print(f"Train MAE: {train_metrics['overall_mae']:.2f} | Val MAE: {val_metrics['overall_mae']:.2f}")

    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0
        # Save a clean checkpoint (no class instances)
        torch.save({
            'epoch': epoch,
            'model_state_dict': multi_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss,
            # store scalar scaler params only if you like:
            'scaler_params': {
                'center_': getattr(train_dataset.scaler, 'center_', None),
                'scale_': getattr(train_dataset.scaler, 'scale_', None),
            }
        }, ckpt_path)
        print("✅ Saved improved (clean) checkpoint!")
    else:
        patience_counter += 1

    if patience_counter >= config.PATIENCE:
        print(f"⏹️ Early stopping triggered again at epoch {epoch+1}")
        break

    torch.cuda.empty_cache()
    gc.collect()

print("\n🎯 Resume training complete!")


🔄 Loading (unsafe allowed) from /content/best_model.pth
🧹 Removing non-serializable 'config' object from checkpoint
✅ Re-saved cleaned checkpoint
▶️ Resuming from epoch 7 → 37 (best val loss 0.5736)

Epoch 8/37
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.6284 | Val Loss: 0.6096
Train MAE: 55.79 | Val MAE: 51.86

Epoch 9/37
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.6093 | Val Loss: 0.6392
Train MAE: 53.52 | Val MAE: 59.61

Epoch 10/37
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.6335 | Val Loss: 0.6984
Train MAE: 54.41 | Val MAE: 63.18

Epoch 11/37
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.6321 | Val Loss: 0.5838
Train MAE: 53.47 | Val MAE: 53.82

Epoch 12/37
------------------------------


Training:   0%|          | 0/39 [00:00<?, ?it/s]

Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5982 | Val Loss: 0.6161
Train MAE: 53.57 | Val MAE: 56.67
⏹️ Early stopping triggered again at epoch 12

🎯 Resume training complete!


In [117]:
# === Evaluation Cell ===
import numpy as np
import torch
from torch.cuda.amp import autocast

multi_model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for X, y in val_loader:
        X = X.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        with autocast(enabled=config.USE_AMP):
            out = multi_model(X)
        all_preds.append(out.cpu())
        all_targets.append(y.cpu())

all_preds = torch.cat(all_preds).numpy()
all_targets = torch.cat(all_targets).numpy()

# Denormalize using the scaler from train_dataset
pred_denorm = train_dataset.scaler.inverse_transform(all_preds)
tgt_denorm  = train_dataset.scaler.inverse_transform(all_targets)

cols = config.TARGET_COLS
metrics = {}

for i, c in enumerate(cols):
    err = np.abs(pred_denorm[:, i] - tgt_denorm[:, i])
    mae = float(np.mean(err))
    mape = float(np.mean(err / (np.abs(tgt_denorm[:, i]) + 1e-8)) * 100.0)
    rmse = float(np.sqrt(np.mean((pred_denorm[:, i] - tgt_denorm[:, i])**2)))
    metrics[c] = {"MAE": mae, "MAPE%": mape, "RMSE": rmse}

print("📊 Final Validation Metrics:")
for c in cols:
    print(f"  {c:9s} | MAE={metrics[c]['MAE']:.2f} | RMSE={metrics[c]['RMSE']:.2f} | MAPE={metrics[c]['MAPE%']:.2f}%")

print("\nAverage MAE:", np.mean([metrics[c]['MAE'] for c in cols]))


📊 Final Validation Metrics:
  calories  | MAE=156.61 | RMSE=212.30 | MAPE=1389.27%
  mass      | MAE=98.14 | RMSE=134.90 | MAPE=955.29%
  fat       | MAE=8.32 | RMSE=11.52 | MAPE=5424.29%
  carb      | MAE=10.57 | RMSE=14.88 | MAPE=484.29%
  protein   | MAE=11.16 | RMSE=17.71 | MAPE=441.52%

Average MAE: 56.95838623046875


In [120]:
# ==== Turbo Tune: K=5 views + longer schedule + resume training ====

import os, gc, torch
from torch.utils.data import DataLoader

# 1) Rebuild datasets with K=5 and ONE global scaler (fit on train targets only)
scaler_global = RobustScaler().fit(train_df[config.TARGET_COLS].values)

train_dataset = Nutrition5kMultiViewDataset(
    train_df,
    config.DATA_DIR + '/images',
    transform=get_transforms('train'),
    k_views=5
)

val_dataset = Nutrition5kMultiViewDataset(
    val_df,
    config.DATA_DIR + '/images',
    transform=get_transforms('val'),
    k_views=5
)


# 2) DataLoaders (keep workers small to avoid Colab shutdown warnings)
NUM_WORKERS = 2  # set to 0 if you still see the shutdown assertion
PERSISTENT = False
PIN = True

train_loader = DataLoader(
    train_dataset,
    batch_size=max(1, config.BATCH_SIZE // 5),  # scale batch with K
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=PIN,
    persistent_workers=PERSISTENT,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=max(1, config.BATCH_SIZE // 5),
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN,
    persistent_workers=PERSISTENT,
)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

# 3) Optimizer/scheduler with a slightly lower LR
config.LEARNING_RATE = 2e-3
criterion = NutritionLoss().to(device)
optimizer = optim.AdamW(
    multi_model.parameters(),
    lr=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    betas=(0.9, 0.999),
)
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)
scaler = GradScaler(enabled=config.USE_AMP)

# 4) Load best checkpoint safely (PyTorch 2.6 changed defaults)
ckpt_path = config.BEST_MODEL_PATH
print(f"🔄 Loading best checkpoint from {ckpt_path}")
ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)

# Clean any non-primitive fields and re-save (future-safe)
if 'config' in ckpt:
    ckpt.pop('config')
    torch.save(ckpt, ckpt_path)

multi_model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])
scaler.load_state_dict(ckpt['scaler_state_dict'])

best_val_loss = ckpt.get('best_val_loss', float('inf'))
start_epoch = ckpt.get('epoch', 0) + 1

# 5) Longer schedule + higher patience
config.PATIENCE = 8
config.EPOCHS = start_epoch + 50
print(f"▶️ Resuming from epoch {start_epoch} → {config.EPOCHS} | best val loss {best_val_loss:.4f}")

history = defaultdict(list)
patience_counter = 0

for epoch in range(start_epoch, config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    print("-" * 30)

    train_metrics = train_epoch(multi_model, train_loader, criterion, optimizer, scaler, device)
    # IMPORTANT: use the SAME scaler used to build datasets
    val_metrics   = validate_epoch(multi_model, val_loader, criterion, device, scaler_global)
    scheduler.step()

    print(f"Train Loss: {train_metrics['loss']:.4f} | Val Loss: {val_metrics['loss']:.4f}")
    print(f"Train MAE: {train_metrics['overall_mae']:.2f} | Val MAE: {val_metrics['overall_mae']:.2f}")

    for k,v in train_metrics.items(): history[f"train_{k}"].append(v)
    for k,v in val_metrics.items():   history[f"val_{k}"].append(v)

    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0
        torch.save({
            'epoch': epoch,
            'model_state_dict': multi_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss,
            # store just scaler params (optional)
            'scaler_params': {
                'center_': getattr(train_dataset.scaler, 'center_', None),
                'scale_': getattr(train_dataset.scaler, 'scale_', None),
            }
        }, ckpt_path)
        print("✅ Saved improved (clean) checkpoint!")
    else:
        patience_counter += 1

    if patience_counter >= config.PATIENCE:
        print(f"⏹️ Early stopping triggered at epoch {epoch+1}")
        break

    torch.cuda.empty_cache()
    gc.collect()

print("\n🎯 Turbo tune complete! Now re-run the masked MAPE/SMAPE eval cell.")


Train samples: 800, Val samples: 200
🔄 Loading best checkpoint from /content/best_model.pth
▶️ Resuming from epoch 7 → 57 | best val loss 0.5736

Epoch 8/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.6290 | Val Loss: 0.6068
Train MAE: 55.93 | Val MAE: 55.58

Epoch 9/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.6234 | Val Loss: 0.6118
Train MAE: 55.66 | Val MAE: 57.78

Epoch 10/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.6039 | Val Loss: 0.5717
Train MAE: 54.13 | Val MAE: 54.06
✅ Saved improved (clean) checkpoint!

Epoch 11/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.6155 | Val Loss: 0.6295
Train MAE: 55.01 | Val MAE: 55.18

Epoch 12/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5988 | Val Loss: 0.6148
Train MAE: 53.65 | Val MAE: 54.43

Epoch 13/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5803 | Val Loss: 0.5754
Train MAE: 52.37 | Val MAE: 53.94

Epoch 14/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5536 | Val Loss: 0.5871
Train MAE: 50.95 | Val MAE: 53.99

Epoch 15/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5845 | Val Loss: 0.5934
Train MAE: 52.87 | Val MAE: 56.09

Epoch 16/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5772 | Val Loss: 0.5741
Train MAE: 52.94 | Val MAE: 51.23

Epoch 17/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5816 | Val Loss: 0.5742
Train MAE: 52.38 | Val MAE: 54.26

Epoch 18/57
------------------------------


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Validation:   0%|          | 0/17 [00:00<?, ?it/s]

Train Loss: 0.5682 | Val Loss: 0.5826
Train MAE: 52.15 | Val MAE: 52.11
⏹️ Early stopping triggered at epoch 18

🎯 Turbo tune complete! Now re-run the masked MAPE/SMAPE eval cell.


In [121]:
# Better evaluation: masked MAPE and SMAPE
import numpy as np, torch
from torch.cuda.amp import autocast

def masked_mape(pred, true, eps=1e-8, min_denom=5.0):
    denom = np.maximum(np.abs(true), min_denom)
    return np.mean(np.abs((pred - true) / denom)) * 100.0

def smape(pred, true, eps=1e-8):
    return np.mean(200.0 * np.abs(pred-true) / (np.abs(pred)+np.abs(true)+eps))

multi_model.eval()
preds, tgts = [], []
with torch.no_grad():
    for X, y in val_loader:
        X = X.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        with autocast(enabled=config.USE_AMP):
            o = multi_model(X)
        preds.append(o.cpu()); tgts.append(y.cpu())

preds = torch.cat(preds).numpy()
tgts  = torch.cat(tgts).numpy()

pred_den = train_dataset.scaler.inverse_transform(preds)
tgt_den  = train_dataset.scaler.inverse_transform(tgts)

cols = config.TARGET_COLS
print("📊 Metrics (MAE | RMSE | masked MAPE | SMAPE)")
for i,c in enumerate(cols):
    err  = np.abs(pred_den[:,i]-tgt_den[:,i])
    mae  = float(np.mean(err))
    rmse = float(np.sqrt(np.mean((pred_den[:,i]-tgt_den[:,i])**2)))
    mm   = float(masked_mape(pred_den[:,i], tgt_den[:,i], min_denom=10.0 if c!='calories' else 50.0))
    sm   = float(smape(pred_den[:,i], tgt_den[:,i]))
    print(f"  {c:9s}  {mae:7.2f} | {rmse:7.2f} | {mm:6.2f}% | {sm:6.2f}%")

print("\n(Notes) masked MAPE uses min denom 50 kcal for calories, 10 g for macros/mass to avoid divide-by-tiny-values.")


📊 Metrics (MAE | RMSE | masked MAPE | SMAPE)
  calories    140.77 |  207.16 | 106.91% |  92.47%
  mass         92.19 |  130.65 | 132.68% |  61.97%
  fat           7.72 |   11.39 |  49.23% | 114.28%
  carb         10.71 |   14.15 |  68.59% |  92.34%
  protein      10.14 |   16.96 |  49.51% |  87.34%

(Notes) masked MAPE uses min denom 50 kcal for calories, 10 g for macros/mass to avoid divide-by-tiny-values.
