In [3]:
import os
import tarfile
import urllib.request
from pathlib import Path
from tqdm import tqdm

In [4]:
# Cell 1: Download Food-101 with progress bar
class DownloadProgressBar(tqdm):
    """tqdm bar exposing a hook that can be passed as urlretrieve's reporthook"""
    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar so that it sits at ``b * bsize`` units.

        Args:
            b: Number of blocks transferred so far
            bsize: Size of one block
            tsize: Total size; adopted as the bar's total when not None
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() takes an increment, so feed it the distance from the
        # bar's current counter to the absolute amount transferred
        self.update(transferred - self.n)

def download_with_progress(url, output_path):
    """Download file with progress bar

    The data is first written to a sibling "<name>.part" file and only
    renamed to ``output_path`` once the transfer finished without error,
    so a failed or interrupted download never leaves a truncated file
    under the final name.

    Args:
        url: Address of the file to fetch; its last path segment is used
            as the progress bar description
        output_path: Destination file (str or Path)

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the final rename raises;
        the partial file is removed before the exception propagates.
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(url, filename=partial_path,
                                       reporthook=t.update_to)
        # Publish under the final name only after a complete transfer
        partial_path.replace(output_path)
    except BaseException:
        # BaseException so that a KeyboardInterrupt (kernel interrupt)
        # also cleans up the incomplete file
        if partial_path.exists():
            partial_path.unlink()
        raise

# Make sure the folder that holds every dataset exists
dataset_dir = Path.home().joinpath("calorie_estimator", "datasets")
dataset_dir.mkdir(parents=True, exist_ok=True)

# Banner describing what is about to be fetched
print(
    "="*60,
    "Downloading Food-101 dataset",
    "Size: ~5GB (4.65GB exact)",
    "Estimated time: 10-20 minutes (depends on internet speed)",
    "="*60,
    sep="\n",
)

# Source archive and where it is stored locally
url = "http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz"
tar_path = dataset_dir / "food-101.tar.gz"

# Skip the transfer when the archive is already on disk
if tar_path.exists():
    print(f"\n✓ Already downloaded: {tar_path}")
    print(f"  Size: {tar_path.stat().st_size / (1024**3):.2f} GB")
else:
    print(f"\nDownloading to: {tar_path}")
    download_with_progress(url, tar_path)
    print("\n✓ Download complete!")

# Extract with progress
# NOTE: the guard below only checks that the directory exists, so an
# extraction that was interrupted part-way is reported as already done;
# delete the folder manually to force a fresh extraction.
food101_dir = dataset_dir / "food-101"
if not food101_dir.exists():
    print("\n" + "="*60)
    print("Extracting dataset...")
    print("This will take 5-10 minutes")
    print("="*60)

    # The archive is fetched over plain HTTP, so treat it as untrusted:
    # where the running Python supports extraction filters, use the "data"
    # filter, which rejects members that would land outside dataset_dir.
    # Older interpreters without the feature keep the previous behavior.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(tar_path) as tar:
        members = tar.getmembers()
        # Extract member by member so tqdm can show per-file progress
        for member in tqdm(members, desc="Extracting"):
            tar.extract(member, dataset_dir, **extract_kwargs)

    print("\n✓ Extraction complete!")
else:
    print(f"\n✓ Already extracted: {food101_dir}")

print(f"\n{'='*60}")
print("Dataset Ready!")
print(f"{'='*60}")
print(f"Location: {food101_dir}")
print("Total size: ~7 GB (compressed + extracted)")

Downloading Food-101 dataset
Size: ~5GB (4.65GB exact)
Estimated time: 10-20 minutes (depends on internet speed)

✓ Already downloaded: /Users/jasonzhang/calorie_estimator/datasets/food-101.tar.gz
  Size: 4.65 GB

✓ Already extracted: /Users/jasonzhang/calorie_estimator/datasets/food-101

Dataset Ready!
Location: /Users/jasonzhang/calorie_estimator/datasets/food-101
Total size: ~7 GB (compressed + extracted)


In [5]:
# Cell 2: Explore dataset
food101_path = Path.home().joinpath("calorie_estimator", "datasets", "food-101")
classes_dir = food101_path / "images"

# Show where images and metadata live
print("Dataset structure:")
print(f"  Images: {food101_path / 'images'}")
print(f"  Metadata: {food101_path / 'meta'}")

# Every sub-directory of images/ is one food class
classes = sorted(entry.name for entry in classes_dir.iterdir() if entry.is_dir())
print(f"\nTotal classes: {len(classes)}")
print(f"First 10 classes: {classes[:10]}")

# Count the JPEGs of the alphabetically first class as a sanity check
sample_class = classes[0]
sample_images = list(classes_dir.joinpath(sample_class).glob("*.jpg"))
print(f"\nImages in '{sample_class}': {len(sample_images)}")

# The official train/test split is stored as one entry per line
train_file = food101_path / "meta" / "train.txt"
test_file = food101_path / "meta" / "test.txt"

with open(train_file) as f:
    train_list = list(f)
with open(test_file) as f:
    test_list = list(f)

print(f"\nTraining images: {len(train_list)}")
print(f"Test images: {len(test_list)}")
print(f"Total: {len(train_list) + len(test_list)}")

Dataset structure:
  Images: /Users/jasonzhang/calorie_estimator/datasets/food-101/images
  Metadata: /Users/jasonzhang/calorie_estimator/datasets/food-101/meta

Total classes: 101
First 10 classes: ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito']

Images in 'apple_pie': 1000

Training images: 75750
Test images: 25250
Total: 101000


In [7]:
# Cell 3: Convert Food-101 to YOLO format
import shutil
from tqdm import tqdm
import yaml

def _read_split_entries(list_file):
    """Return the non-blank, whitespace-stripped lines of a Food-101 split list."""
    with open(list_file) as f:
        return [line.strip() for line in f if line.strip()]


def _write_yolo_split(food101_path, output_path, split, entries, class_to_idx, desc):
    """Copy one split's images into the YOLO tree and write one label file per image.

    Args:
        food101_path: Path to food-101 directory (source of the images)
        output_path: Root of the YOLO format dataset
        split: Sub-directory under images/ and labels/ ("train" or "val")
        entries: "<class_name>/<image_name>" strings from the split list
        class_to_idx: Mapping from class name to YOLO class index
        desc: Text shown on the tqdm progress bar
    """
    images_dir = output_path / "images" / split
    labels_dir = output_path / "labels" / split

    for file_path in tqdm(entries, desc=desc):
        class_name, img_name = file_path.split('/')
        class_idx = class_to_idx[class_name]

        # Prefix with the class name so files from different classes
        # cannot collide inside the flat output directory
        stem = f"{class_name}_{img_name}"

        # Copy image
        src_img = food101_path / "images" / class_name / f"{img_name}.jpg"
        shutil.copy(src_img, images_dir / f"{stem}.jpg")

        # Create label (full image bounding box for classification)
        # Format: class_id x_center y_center width height (normalized)
        with open(labels_dir / f"{stem}.txt", 'w') as f:
            f.write(f"{class_idx} 0.5 0.5 1.0 1.0\n")


def convert_food101_to_yolo(food101_path, output_path, subset_classes=None):
    """
    Convert Food-101 dataset to YOLO format

    Images listed in meta/train.txt go to images/train, images listed in
    meta/test.txt go to images/val. Every image gets a label file with a
    single box covering the whole image, and a data.yaml describing the
    dataset is written to the output root.

    Note: existing files in ``output_path`` are only added to or
    overwritten, never removed. Converting a different class subset into
    the same folder therefore leaves the earlier run's images and labels
    in place; use a fresh output directory in that case.

    Args:
        food101_path: Path to food-101 directory
        output_path: Where to save YOLO format dataset
        subset_classes: List of classes to use (None = use all 101).
            A single class name given as a plain string is accepted too.
            Names that do not exist in the dataset are reported and ignored.

    Returns:
        Path to the generated data.yaml
    """

    food101_path = Path(food101_path)
    output_path = Path(output_path)

    # Create YOLO directory structure
    for kind in ("images", "labels"):
        for split in ("train", "val"):
            (output_path / kind / split).mkdir(parents=True, exist_ok=True)

    # Get all classes
    all_classes = sorted(d.name for d in (food101_path / "images").iterdir() if d.is_dir())

    # A bare string would otherwise be iterated character by character
    if isinstance(subset_classes, str):
        subset_classes = [subset_classes]

    # Use subset if specified
    if subset_classes:
        requested = set(subset_classes)
        classes = [c for c in all_classes if c in requested]
        unknown = sorted(requested.difference(all_classes), key=str)
        if unknown:
            # Surface typos instead of silently training on fewer classes
            print(f"Warning: requested classes not found in dataset: {unknown}")
        print(f"Using {len(classes)} classes: {classes}")
    else:
        classes = all_classes
        print(f"Using all {len(classes)} classes")

    # Create class_to_idx mapping
    class_to_idx = {cls_name: idx for idx, cls_name in enumerate(classes)}

    # Read train/test splits (blank lines are skipped so they cannot break
    # the "<class>/<image>" parsing later on)
    train_files = _read_split_entries(food101_path / "meta" / "train.txt")
    test_files = _read_split_entries(food101_path / "meta" / "test.txt")

    # Filter for selected classes
    if subset_classes:
        train_files = [f for f in train_files if f.split('/')[0] in class_to_idx]
        test_files = [f for f in test_files if f.split('/')[0] in class_to_idx]

    print(f"\nProcessing {len(train_files)} training images...")
    print(f"Processing {len(test_files)} validation images...")

    # Process training set, then test/validation set
    _write_yolo_split(food101_path, output_path, "train", train_files,
                      class_to_idx, "Converting train")
    _write_yolo_split(food101_path, output_path, "val", test_files,
                      class_to_idx, "Converting val")

    # Create data.yaml
    data_yaml = {
        'path': str(output_path.absolute()),
        'train': 'images/train',
        'val': 'images/val',
        'names': {idx: name for name, idx in class_to_idx.items()}
    }

    with open(output_path / "data.yaml", 'w') as f:
        yaml.dump(data_yaml, f, sort_keys=False)

    print("\n✓ Conversion complete!")
    print(f"✓ Dataset saved to: {output_path}")
    print(f"✓ Classes: {len(classes)}")
    print(f"✓ Training images: {len(train_files)}")
    print(f"✓ Validation images: {len(test_files)}")

    return output_path / "data.yaml"

In [8]:
# Cell 4: Run conversion
# Source (extracted Food-101) and destination (YOLO layout) folders
food101_path = Path.home().joinpath("calorie_estimator", "datasets", "food-101")
yolo_output = Path.home().joinpath("calorie_estimator", "datasets", "food101_yolo")

# Option A: Use ALL 101 classes (recommended for full project)
data_yaml_path = convert_food101_to_yolo(
    food101_path,
    yolo_output,
)

# Option B: Quick test with subset (10-20 classes) - faster training
# subset = ['pizza', 'apple_pie', 'hamburger', 'hot_dog', 'ice_cream', 
#           'french_fries', 'sushi', 'steak', 'chicken_wings', 'donuts']
# data_yaml_path = convert_food101_to_yolo(food101_path, yolo_output, subset)

# Report where the generated dataset description ended up
print(f"\ndata.yaml location: {data_yaml_path}")

Using all 101 classes

Processing 75750 training images...
Processing 25250 validation images...


Converting train: 100%|█████████████████████████████████████████| 75750/75750 [00:33<00:00, 2250.10it/s]
Converting val: 100%|███████████████████████████████████████████| 25250/25250 [00:10<00:00, 2319.51it/s]


✓ Conversion complete!
✓ Dataset saved to: /Users/jasonzhang/calorie_estimator/datasets/food101_yolo
✓ Classes: 101
✓ Training images: 75750
✓ Validation images: 25250

data.yaml location: /Users/jasonzhang/calorie_estimator/datasets/food101_yolo/data.yaml



