In [1]:
!pip install --quiet torch torchvision pandas scikit-learn pillow tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
!pip install kaggle



In [10]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"vishruthbharath","key":"f27c8b108dae01498970cf312cffe7ca"}'}

In [11]:
! mkdir ~/.dataset

In [12]:
cp kaggle.json ~/.dataset/

In [14]:
!chmod 600 ~/.dataset/kaggle.json

In [16]:
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/skin-cancer-mnist-ham10000


In [19]:
%%writefile dataset.py
# dataset.py – points to Kaggle’s read-only mount
import pathlib, pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T

KAGGLE_DIR = pathlib.Path("/kaggle/input/skin-cancer-mnist-ham10000")
IMG_DIR = KAGGLE_DIR if KAGGLE_DIR.exists() else pathlib.Path(__file__).parent / "dataset"

class HAMDataset(Dataset):
    def __init__(self, df, tfms):
        self.df, self.tfms = df.reset_index(drop=True), tfms
    def __len__(self):  return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        img = Image.open(IMG_DIR / r.image_rel_path).convert("RGB")
        return self.tfms(img), int(r.label)

def build_dataloaders(batch=32, val_split=0.15, seed=42, workers=2):
    meta = pd.read_csv(IMG_DIR / "HAM10000_metadata.csv")
    meta["image_rel_path"] = meta["image_id"].apply(
        lambda iid: f"HAM10000_images_part_1/{iid}.jpg"
        if (IMG_DIR / "HAM10000_images_part_1" / f"{iid}.jpg").exists()
        else f"HAM10000_images_part_2/{iid}.jpg"
    )
    label2idx = {d:i for i,d in enumerate(sorted(meta.dx.unique()))}
    meta["label"] = meta.dx.map(label2idx)

    train_df, val_df = train_test_split(meta, stratify=meta.label,
                                        test_size=val_split, random_state=seed)

    tfms = T.Compose([
        T.Resize((224,224)),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize([0.5]*3, [0.5]*3)
    ])
    train_ds, val_ds = HAMDataset(train_df, tfms), HAMDataset(val_df, tfms)

    # weighted sampler to handle class imbalance
    counts = train_df.label.value_counts().sort_index().values
    weights = 1. / torch.tensor(counts, dtype=torch.float)
    samp_w  = weights[train_df.label.values]
    sampler = WeightedRandomSampler(samp_w, len(samp_w), replacement=True)

    train_dl = DataLoader(train_ds, batch_size=batch, sampler=sampler,
                          num_workers=workers, pin_memory=True)
    val_dl   = DataLoader(val_ds,   batch_size=batch, shuffle=False,
                          num_workers=workers, pin_memory=True)
    return train_dl, val_dl, label2idx


Writing dataset.py


In [21]:
%%writefile train.py
# train.py – uses dataset.py
import argparse, torch, torch.nn as nn, torchvision.models as models
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
from dataset import build_dataloaders

def train_epoch(model, dl, loss_fn, opt, device):
    model.train(); run = 0.
    for x,y in tqdm(dl, leave=False):
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward(); opt.step()
        run += loss.item()*x.size(0)
    return run / len(dl.dataset)

@torch.no_grad()
def eval_epoch(model, dl, loss_fn, device):
    model.eval(); run, correct = 0., 0
    for x,y in dl:
        x, y = x.to(device), y.to(device)
        out  = model(x)
        run += loss_fn(out,y).item()*x.size(0)
        correct += (out.argmax(1)==y).sum().item()
    return run/len(dl.dataset), correct/len(dl.dataset)

def main(epochs, bs, lr):
    train_dl, val_dl, label2idx = build_dataloaders(batch=bs, workers=2)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(label2idx))
    model.to(device)

    loss_fn, opt = nn.CrossEntropyLoss(), torch.optim.AdamW(model.parameters(), lr=lr)
    sched = CosineAnnealingLR(opt, T_max=epochs)

    best = 0.
    for ep in range(1, epochs+1):
        tr = train_epoch(model, train_dl, loss_fn, opt, device)
        vl, acc = eval_epoch(model, val_dl, loss_fn, device)
        sched.step()
        print(f"[{ep}/{epochs}] train {tr:.4f} | val {vl:.4f} | acc {acc:.3%}")
        if acc > best:
            torch.save(model.state_dict(), "best.pt")
            best = acc
    print("✓ training done, best.pt saved to /kaggle/working/")

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--epochs", type=int, default=4)
    p.add_argument("--bs",     type=int, default=32)
    p.add_argument("--lr",     type=float, default=3e-4)
    args = p.parse_args()
    main(args.epochs, args.bs, args.lr)

Overwriting train.py


In [24]:
!python train.py --epochs 20 --bs 32

[1/20] train 0.6300 | val 0.6984 | acc 74.451%
[2/20] train 0.2777 | val 0.4836 | acc 83.566%
[3/20] train 0.2062 | val 0.5583 | acc 80.506%
[4/20] train 0.1687 | val 0.6125 | acc 77.645%
[5/20] train 0.1246 | val 0.4486 | acc 85.096%
[6/20] train 0.0982 | val 0.4298 | acc 86.693%
[7/20] train 0.0862 | val 0.4477 | acc 86.494%
[8/20] train 0.0717 | val 0.4259 | acc 87.558%
[9/20] train 0.0621 | val 0.3964 | acc 88.290%
[10/20] train 0.0424 | val 0.3906 | acc 89.288%
[11/20] train 0.0433 | val 0.4697 | acc 87.891%
[12/20] train 0.0328 | val 0.4097 | acc 89.820%
[13/20] train 0.0197 | val 0.3591 | acc 89.687%
[14/20] train 0.0191 | val 0.4076 | acc 89.754%
[15/20] train 0.0173 | val 0.3994 | acc 89.820%
[16/20] train 0.0134 | val 0.3950 | acc 89.887%
[17/20] train 0.0134 | val 0.3995 | acc 90.086%
[18/20] train 0.0166 | val 0.3903 | acc 90.818%
[19/20] train 0.0078 | val 0.4044 | acc 90.752%
[20/20] train 0.0117 | val 0.4042 | acc 89.953%
✓ training done, best.pt saved to /kaggle/working