In [1]:
# 1) Drop any existing Drive mount, then mount again (forces the account-picker prompt).
from google.colab import drive

MOUNT_POINT = '/content/drive'

drive.flush_and_unmount()
drive.mount(MOUNT_POINT, force_remount=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [4]:
import glob
import os

# Quick peek at the top level of MyDrive (preview capped at 100 names).
print("List top level MyDrive:")
top_level_entries = os.listdir('/content/drive/MyDrive')
print(top_level_entries[:100])

# Recursive search of the mounted drive for anything named food-10*.
print("Searching for food-10 folders...")
food10_matches = glob.glob('/content/drive/MyDrive/**/food-10*', recursive=True)
print(food10_matches[:50])

# Recursive search for .pth checkpoint files (walks the whole drive).
print("Searching for .pth checkpoints (this may take a moment)...")
pth_matches = glob.glob('/content/drive/MyDrive/**/*.pth', recursive=True)
print(pth_matches[:200])

List top level MyDrive:
['Classroom', 'AD&ML Project- Construction Website', 'tik-qmcv-hkh – 31 May 2024.pdf', 'Colab Notebooks', 'food_project', 'food-10']
Searching for food-10 folders...
['/content/drive/MyDrive/food-10', '/content/drive/MyDrive/food-10/food-10']
Searching for .pth checkpoints (this may take a moment)...
['/content/drive/MyDrive/food-10/food-10/outputs/best_resnet50.pth', '/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_effnetb0_freeze1.pth', '/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_full.pth']


In [9]:
import os

# Checkpoint selected for the following cells; confirm it is reachable on Drive.
ckpt_path = "/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_effnetb0_freeze1.pth"
ckpt_found = os.path.exists(ckpt_path)
print("Exists:", ckpt_found)

Exists: True


In [10]:
import os
import shutil

# Copy the checkpoint named by `ckpt_path` (set in an earlier cell) into a
# local output directory, keeping its file name.
local_out = "/content/food10_outputs"
os.makedirs(local_out, exist_ok=True)
local_copy = os.path.join(local_out, os.path.basename(ckpt_path))
shutil.copy(ckpt_path, local_copy)
print("Copied to", local_out)

Copied to /content/food10_outputs


In [3]:
import glob
import json
import os

import torch

# Collect every "best_*.pth" checkpoint anywhere below the project folder.
ckpts = glob.glob("/content/drive/MyDrive/food-10/food-10/**/best_*.pth", recursive=True)
print("Found checkpoints:", ckpts)

# Load each checkpoint on CPU and print its keys, epoch, best F1 and model name.
for ckpt_file in ckpts:
    print("\n---", ckpt_file)
    ckpt_data = torch.load(ckpt_file, map_location='cpu')
    print("Keys in checkpoint:", list(ckpt_data.keys()))
    print("Epoch in ckpt:", ckpt_data.get('epoch', 'N/A'))
    # The score is looked up under 'best_f1' first, then 'best_val_f1'.
    if 'best_f1' in ckpt_data:
        best_score = ckpt_data['best_f1']
    else:
        best_score = ckpt_data.get('best_val_f1', 'N/A')
    print("Best val f1 in ckpt:", best_score)
    if 'cfg' in ckpt_data:
        print("cfg.model_name:", ckpt_data['cfg'].get('model_name', None))

# History JSONs: report the keys and the last / best validation F1 of each file.
hist_files = glob.glob("/content/drive/MyDrive/food-10/food-10/**/history*.json", recursive=True)
print("\nFound history files:", hist_files)
for hist_file in hist_files:
    print("History:", hist_file)
    try:
        with open(hist_file, 'r') as fh:
            hist = json.load(fh)
        print("history keys:", hist.keys())
        val_scores = hist.get('val_f1', [])
        if val_scores:
            print("last val_f1:", val_scores[-1], "best val_f1:", max(val_scores))
    except Exception as err:
        # Unreadable or malformed history files are reported, not fatal.
        print("Could not read history:", err)

Found checkpoints: ['/content/drive/MyDrive/food-10/food-10/outputs/best_resnet50.pth', '/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_effnetb0_freeze1.pth', '/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_full.pth']

--- /content/drive/MyDrive/food-10/food-10/outputs/best_resnet50.pth
Keys in checkpoint: ['model_state', 'optimizer_state', 'scheduler_state', 'epoch', 'best_f1', 'cfg']
Epoch in ckpt: 10
Best val f1 in ckpt: 0.9023548931635321
cfg.model_name: resnet50

--- /content/drive/MyDrive/food-10/food-10/outputs_experiments/best_effnetb0_freeze1.pth
Keys in checkpoint: ['model_state', 'optimizer_state', 'cfg', 'epoch', 'best_f1']
Epoch in ckpt: 11
Best val f1 in ckpt: 0.8739598559535151
cfg.model_name: None

--- /content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_full.pth
Keys in checkpoint: ['model_state', 'optimizer_state', 'cfg', 'epoch', 'best_f1']
Epoch in ckpt: 6
Best val f1 in ckpt: 0.9046313937669149
cfg.mo

In [4]:
# Imports come first so this cell runs on a fresh kernel; previously torch was
# used before the cell imported it and only worked via an earlier cell's import.
import timm, torch

# Checkpoint to resume from, loaded onto CPU.
ckpt_path = "/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_full.pth"
ck = torch.load(ckpt_path, map_location='cpu')

# Infer num_classes from the first dimension of the classifier weight.
ms = ck['model_state']
fc_key = next((k for k in ms.keys() if 'fc.weight' in k), None)
if fc_key is None:
    # Explicit error instead of the opaque StopIteration a bare next() raises.
    raise RuntimeError("Could not find fc.weight key in checkpoint.")
num_classes = ms[fc_key].shape[0]

# Rebuild the architecture without pretrained weights and load the saved weights.
model = timm.create_model('resnet101', pretrained=False, num_classes=num_classes)
model.load_state_dict(ms)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Recreate the optimizer, then restore its saved state from the checkpoint.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
optimizer.load_state_dict(ck['optimizer_state'])

# The checkpoint stores the epoch it was saved at; training continues with the next one.
start_epoch = ck['epoch'] + 1
best_f1 = ck['best_f1']

print("Resuming from epoch:", start_epoch, "Best F1:", best_f1)

Resuming from epoch: 7 Best F1: 0.9046313937669149


In [5]:
#Resuming the ResNet-101 training cell
import os, json, shutil, time
import numpy as np, pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch, timm
import torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.metrics import f1_score

# --- Config: paths and hyperparameters for the resumed run ---
ROOT = "/content/drive/MyDrive/food-10/food-10"
SPLIT_DIR = os.path.join(ROOT, "prepared_splits")
DRIVE_OUT_DIR = os.path.join(ROOT, "outputs_experiments")
LOCAL_OUT = "/content/food10_outputs"
for _out_dir in (LOCAL_OUT, DRIVE_OUT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

TOTAL_EPOCHS = 12            # final target epoch count
BATCH_SIZE = 32
IMG_SIZE = 224
LR = 3e-4
WEIGHT_DECAY = 1e-4
NUM_WORKERS = min(8, os.cpu_count() or 4)   # falls back to 4 if the CPU count is unknown

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# --- Checkpoint to resume from ---
ckpt_path = "/content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_full.pth"
assert os.path.exists(ckpt_path), "Checkpoint not found: " + ckpt_path

# --- Load the checkpoint on CPU and pull out the model weights ---
ck = torch.load(ckpt_path, map_location='cpu')
ms = ck['model_state']

# The class count is the first dimension of the first key containing 'fc.weight'.
fc_key = None
for _key in ms.keys():
    if 'fc.weight' in _key:
        fc_key = _key
        break
if fc_key is None:
    raise RuntimeError("Could not find fc.weight key in checkpoint.")
num_classes = ms[fc_key].shape[0]
print("Num classes inferred:", num_classes)

# --- Build the model, load the saved weights, move it to the device ---
model = timm.create_model('resnet101', pretrained=False, num_classes=num_classes)
model.load_state_dict(ms)
model.to(device)

# --- Rebuild data pipeline (intended to match the original training run) ---
train_df = pd.read_csv(os.path.join(SPLIT_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(SPLIT_DIR, "val.csv"))

# Both splits are read through the 'fullpath' and 'label' columns, so check
# both frames up front instead of failing later inside a DataLoader worker.
for _split_name, _split_df in (("train", train_df), ("val", val_df)):
    _missing_cols = {'fullpath', 'label'} - set(_split_df.columns)
    assert not _missing_cols, f"{_split_name}.csv is missing columns: {sorted(_missing_cols)}"

# Channel-wise normalisation constants shared by both pipelines
# (the commonly used ImageNet mean / std values).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop / flip / colour jitter, then tensor + normalise.
train_tf = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])
# Validation pipeline: deterministic resize to 1.15x the target, then centre crop.
val_tf = transforms.Compose([
    transforms.Resize(int(IMG_SIZE*1.15)),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

class Food10Dataset(Dataset):
    """Image-classification dataset backed by a DataFrame.

    Each row must provide a 'fullpath' column (path of the image file) and a
    'label' column (value convertible to int).

    Args:
        df: DataFrame describing the samples; its index is reset so that
            positional lookups line up with ``__len__``.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        r = self.df.iloc[idx]
        # Open inside a context manager so the file handle is released right
        # away; convert() returns an image that stays usable after the close.
        with Image.open(r['fullpath']) as src:
            img = src.convert('RGB')
        if self.transform:
            img = self.transform(img)
        label = int(r['label'])
        return img, label

# Wrap each split in the dataset class with its matching transform pipeline.
train_ds = Food10Dataset(train_df, transform=train_tf)
val_ds   = Food10Dataset(val_df, transform=val_tf)

# Options shared by both loaders; memory is pinned only when running on CUDA.
_loader_opts = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=(device.type=='cuda'),
)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_opts)    # reshuffled every epoch
val_loader   = DataLoader(val_ds, shuffle=False, **_loader_opts)     # fixed order

# --- Optimizer / scheduler / criterion / scaler ---
# Rebuild AdamW with the configured hyperparameters, then restore the saved
# optimizer state if present. A failed restore is a warning, not an error.
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
if 'optimizer_state' in ck:
    try:
        optimizer.load_state_dict(ck['optimizer_state'])
        print("Loaded optimizer state.")
    except Exception as e:
        print("Warning: couldn't load optimizer state:", e)

# Scheduler: build it once, then restore its state only when the checkpoint
# actually carries one (a missing key and a stored None are treated alike).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TOTAL_EPOCHS)
_scheduler_state = ck.get('scheduler_state')
if _scheduler_state is not None:
    try:
        scheduler.load_state_dict(_scheduler_state)
        print("Loaded scheduler state.")
    except Exception as e:
        print("Warning: couldn't load scheduler state:", e)
else:
    # Make the situation visible: the schedule is not a continuation of the old one.
    print("No scheduler state in checkpoint; starting a fresh CosineAnnealingLR schedule.")

criterion = nn.CrossEntropyLoss()

# torch.cuda.amp.GradScaler is deprecated in favour of torch.amp.GradScaler('cuda', ...);
# fall back to the old location on torch versions that lack the new one.
_use_amp = (device.type == 'cuda')
if hasattr(torch.amp, "GradScaler"):
    scaler = torch.amp.GradScaler('cuda', enabled=_use_amp)
else:
    scaler = torch.cuda.amp.GradScaler(enabled=_use_amp)

# --- History (load & continue) ---
import glob  # needed below; this cell's own import block does not bring it in

# Metric lists that the training loop appends to every epoch.
_HISTORY_KEYS = ("train_loss", "train_f1", "val_loss", "val_f1")
history_local = {key: [] for key in _HISTORY_KEYS}

# Candidate history files, in priority order:
#   1. the file this cell's training loop writes (history_resnet101_resumed.json),
#   2. any history_*.json next to the checkpoint,
#   3. any history_*.json in the local output directory.
# Glob results are sorted so the choice is deterministic.
# NOTE(review): the history_*.json patterns may also match another experiment's
# history; check the "Loaded history from" line below.
hist_candidates = [os.path.join(LOCAL_OUT, "history_resnet101_resumed.json")]
hist_candidates += sorted(glob.glob(os.path.join(os.path.dirname(ckpt_path), "history_*.json")))
hist_candidates += sorted(glob.glob(os.path.join(LOCAL_OUT, "history_*.json")))
hist_candidates = [p for p in hist_candidates if os.path.exists(p)]

# Use the first candidate that parses and holds a list for each expected key;
# otherwise keep the empty history so the per-epoch appends cannot fail.
for _hist_path in hist_candidates:
    try:
        with open(_hist_path, 'r') as f:
            _loaded_history = json.load(f)
    except (OSError, ValueError) as e:
        print("Couldn't load history file:", e)
        continue
    if isinstance(_loaded_history, dict) and all(
        isinstance(_loaded_history.get(key), list) for key in _HISTORY_KEYS
    ):
        history_local = _loaded_history
        print("Loaded history from", _hist_path)
        break
    print("Skipping history file with unexpected structure:", _hist_path)

# Resume bookkeeping: continue with the epoch after the one stored in the checkpoint.
best_f1 = ck.get('best_f1', 0.0)
start_epoch = int(ck.get('epoch', -1)) + 1
print("Resuming training from epoch", start_epoch, "best_f1 so far", best_f1)

# --- helper train/val functions (same as before) ---
from tqdm import tqdm
def train_one_epoch(model, loader, optimizer, criterion, device, scaler):
    """Run one training pass over ``loader``.

    Args:
        model: network to train (switched to train mode here).
        loader: iterable yielding ``(images, labels)`` batches.
        optimizer: optimizer stepped once per batch through ``scaler``.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: torch device the batches are moved to; autocast is enabled
            only when ``device.type == 'cuda'``.
        scaler: gradient scaler used for backward and the optimizer step.

    Returns:
        ``(mean_loss, macro_f1)`` over all batches, as Python floats.
    """
    model.train()
    losses=[]; preds=[]; targets=[]
    running_loss = 0.0  # running sum so the progress bar does not re-average the whole list each step
    loop = tqdm(loader, desc="Train", leave=False)
    for imgs, lbls in loop:
        imgs = imgs.to(device, non_blocking=True)
        lbls = lbls.to(device, non_blocking=True)
        optimizer.zero_grad()
        with torch.amp.autocast(device_type='cuda', enabled=(device.type=='cuda')):
            out = model(imgs)
            loss = criterion(out, lbls)
        # Scale the loss for backward, then let the scaler drive the step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        batch_loss = loss.item()
        losses.append(batch_loss)
        running_loss += batch_loss
        preds.extend(out.argmax(dim=1).cpu().numpy().tolist())
        targets.extend(lbls.cpu().numpy().tolist())
        loop.set_postfix(loss=running_loss / len(losses))
    return float(np.mean(losses)), float(f1_score(targets, preds, average='macro'))

def validate_model(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` with gradients disabled.

    Returns:
        ``(mean_loss, macro_f1, targets, preds)``, where ``targets`` and
        ``preds`` are flat Python lists collected over every batch.
    """
    model.eval()
    batch_losses = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Val", leave=False):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            logits = model(images)
            batch_losses.append(criterion(logits, labels).item())
            # Predicted class = index of the largest output along dim 1.
            all_preds += logits.argmax(dim=1).cpu().numpy().tolist()
            all_targets += labels.cpu().numpy().tolist()
    mean_loss = float(np.mean(batch_losses))
    macro_f1 = float(f1_score(all_targets, all_preds, average='macro'))
    return mean_loss, macro_f1, all_targets, all_preds

# --- Resume loop ---
# Everything is written to LOCAL_OUT first and then mirrored into DRIVE_OUT_DIR.
best_path = os.path.join(LOCAL_OUT, "best_resnet101_resumed.pth")
drive_best_path = os.path.join(DRIVE_OUT_DIR, os.path.basename(best_path))
hist_local_path = os.path.join(LOCAL_OUT, "history_resnet101_resumed.json")


def _copy_to_drive(src_path, what):
    """Best-effort copy of ``src_path`` into DRIVE_OUT_DIR; returns True on success."""
    dst_path = os.path.join(DRIVE_OUT_DIR, os.path.basename(src_path))
    try:
        shutil.copy(src_path, dst_path)
        return True
    except OSError as e:
        # A Drive hiccup must not abort training; the local copy still exists.
        print(f"Warning: could not copy {what} to Drive:", e)
        return False


if start_epoch >= TOTAL_EPOCHS:
    print(f"Nothing to train: start epoch {start_epoch} >= TOTAL_EPOCHS {TOTAL_EPOCHS}.")

for epoch in range(start_epoch, TOTAL_EPOCHS):
    print(f"\n=== Epoch {epoch+1}/{TOTAL_EPOCHS} ===")
    t0 = time.time()
    tr_loss, tr_f1 = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler)
    val_loss, val_f1, val_targets, val_preds = validate_model(model, val_loader, criterion, device)

    # Step the scheduler once per epoch; report a failure instead of hiding it.
    try:
        scheduler.step()
    except Exception as e:
        print("Warning: scheduler.step() failed:", e)

    history_local['train_loss'].append(tr_loss)
    history_local['train_f1'].append(tr_f1)
    history_local['val_loss'].append(val_loss)
    history_local['val_f1'].append(val_f1)

    print(f"Epoch {epoch+1} Train loss {tr_loss:.4f} f1 {tr_f1:.4f} | Val loss {val_loss:.4f} f1 {val_f1:.4f} | time {(time.time()-t0):.1f}s")

    improved = val_f1 > best_f1
    if improved:
        best_f1 = val_f1

    # State needed to resume after this epoch. The scheduler state is included
    # in every checkpoint so a later resume can restore the LR schedule too.
    resume_state = {
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": scheduler.state_dict(),
        "epoch": epoch,
        "best_f1": best_f1,
    }

    # Save the best checkpoint when validation F1 improved.
    if improved:
        ckpt = dict(resume_state, cfg={"model_name":"resnet101", "lr": LR})
        torch.save(ckpt, best_path)
        _copy_to_drive(best_path, "best ckpt")
        print("Saved NEW best model with val_f1:", best_f1)

    # Save a per-epoch checkpoint locally and mirror it to Drive.
    epoch_ckpt = os.path.join(LOCAL_OUT, f"ckpt_resnet101_epoch{epoch+1}.pth")
    torch.save(resume_state, epoch_ckpt)
    _copy_to_drive(epoch_ckpt, "epoch ckpt")

    # Rewrite the history file after every epoch so progress survives an interruption.
    with open(hist_local_path, "w") as f:
        json.dump(history_local, f)
    _copy_to_drive(hist_local_path, "history")

print("\nResume training finished. Best val F1:", best_f1)
# Only advertise checkpoint files that actually exist on disk.
if os.path.exists(best_path):
    print("Best checkpoint on local:", best_path)
if os.path.exists(drive_best_path):
    print("Best checkpoint on Drive:", drive_best_path)
if not (os.path.exists(best_path) or os.path.exists(drive_best_path)):
    print("No resumed best checkpoint was written; the previous best is still", ckpt_path)

Device: cuda
Num classes inferred: 10


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


Loaded optimizer state.
Resuming training from epoch 7 best_f1 so far 0.9046313937669149

=== Epoch 8/12 ===




Epoch 8 Train loss 0.3840 f1 0.8720 | Val loss 0.3043 f1 0.9055 | time 1756.5s
Saved NEW best model with val_f1: 0.9054960671834689

=== Epoch 9/12 ===




Epoch 9 Train loss 0.3413 f1 0.8876 | Val loss 0.3003 f1 0.9121 | time 97.0s
Saved NEW best model with val_f1: 0.912115214177519

=== Epoch 10/12 ===




Epoch 10 Train loss 0.3393 f1 0.8867 | Val loss 0.3005 f1 0.9096 | time 108.4s

=== Epoch 11/12 ===




Epoch 11 Train loss 0.3042 f1 0.9017 | Val loss 0.3068 f1 0.9076 | time 94.7s

=== Epoch 12/12 ===




Epoch 12 Train loss 0.3028 f1 0.8998 | Val loss 0.3175 f1 0.9097 | time 95.8s

Resume training finished. Best val F1: 0.912115214177519
Best checkpoint on local: /content/food10_outputs/best_resnet101_resumed.pth
Best checkpoint on Drive: /content/drive/MyDrive/food-10/food-10/outputs_experiments/best_resnet101_resumed.pth
