In [6]:
# Cell 0: training + inference pipeline (PyTorch) for image classification
# Adjust data paths and hyperparams as needed.

import os
import random
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# Config
# --- Filesystem layout: every path is derived from DATA_DIR (change if needed).
DATA_DIR = Path(".")
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
TRAIN_DIR = DATA_DIR.joinpath("train")
TEST_DIR = DATA_DIR.joinpath("test")
SUBMISSION_PATH = DATA_DIR.joinpath("submission.csv")  # output

# --- Column names (kept in sync with train.csv).
ID_COL = "id_code"          # id column name
LABEL_COL = "diagnosis"     # column name in train.csv with the target
SUBMISSION_COL = LABEL_COL  # column name in submission; adjust if example differs

# --- Hyperparameters.
IMG_SIZE = 224
BATCH_SIZE = 32
NUM_EPOCHS = 8
LR = 1e-4
NUM_WORKERS = 4
RANDOM_SEED = 42

# --- Compute device: use the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- Seed every random number generator the pipeline touches.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Dataset
class ImageDataset(Dataset):
    """Dataset that loads images listed in a dataframe from a directory.

    Each row of ``df`` supplies an image id (column ``id_col``) and, unless
    ``is_test`` is set, a label (column ``label_col``).

    Args:
        df: dataframe with one row per sample; its index is reset internally.
        images_dir: directory containing the image files.
        transforms: optional callable applied to the opened RGB image.
        id_col: name of the id column.
        label_col: name of the label column (ignored when ``is_test``).
        label_encoder: optional fitted encoder with a ``transform`` method,
            used for labels that are not already integers.
        is_test: when true, items are ``(image, id)`` instead of
            ``(image, label)``.
    """

    # Extensions tried, in order, for ids that carry no file extension.
    _EXTENSIONS = ("png", "jpg", "jpeg")

    def __init__(self, df, images_dir, transforms=None, id_col=ID_COL, label_col=LABEL_COL, label_encoder=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.images_dir = Path(images_dir)
        self.transforms = transforms
        self.id_col = id_col
        self.label_col = label_col
        self.is_test = is_test
        self.label_encoder = label_encoder

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.df)

    def _open_image(self, img_path):
        """Open ``img_path`` and return it as an RGB image.

        The context manager releases the file handle as soon as the pixels
        are converted; ``convert`` returns an independent image, so the
        result stays usable after the file is closed.
        """
        with Image.open(img_path) as img:
            return img.convert("RGB")

    def _resolve_path(self, img_id):
        """Return the existing image path for ``img_id``.

        Tries ``<img_id>.<ext>`` for every entry of ``_EXTENSIONS`` and then
        ``img_id`` itself (for ids that already are full file names).

        Raises:
            FileNotFoundError: if none of the candidate paths exists. The
                message lists every path that was tried.
        """
        candidates = [self.images_dir / f"{img_id}.{ext}" for ext in self._EXTENSIONS]
        candidates.append(self.images_dir / img_id)
        for candidate in candidates:
            if candidate.exists():
                return candidate
        tried = ", ".join(str(c) for c in candidates)
        raise FileNotFoundError(f"No image found for id '{img_id}'; tried: {tried}")

    def _encode_label(self, label):
        """Return ``label`` as a plain ``int`` class index.

        - Integer labels (Python or NumPy) are assumed to be encoded already
          and are only cast.
        - Other labels go through ``label_encoder`` when one is set; if the
          encoder rejects the value, a plain ``int()`` cast is attempted.
        - Without an encoder the label is cast with ``int()``.
        """
        if self.label_encoder is None or isinstance(label, (int, np.integer)):
            return int(label)
        try:
            return int(self.label_encoder.transform([str(label)])[0])
        except (ValueError, TypeError):
            # The encoder does not know this value (e.g. a float such as 2.0
            # whose string form is "2.0"); fall back to a numeric cast, which
            # raises on its own if the label is not numeric either.
            return int(label)

    def __getitem__(self, idx):
        """Return ``(image, label)``, or ``(image, id)`` when ``is_test``."""
        row = self.df.iloc[idx]
        img_id = str(row[self.id_col])
        img = self._open_image(self._resolve_path(img_id))

        if self.transforms:
            img = self.transforms(img)

        if self.is_test:
            return img, img_id

        return img, self._encode_label(row[self.label_col])

# Read CSV
# Fail early with an actionable message instead of the bare errno raised by pandas.
if not TRAIN_CSV.is_file():
    raise FileNotFoundError(
        f"Training CSV not found at '{TRAIN_CSV.resolve()}'. "
        "Set DATA_DIR to the folder that contains train.csv, train/ and test/."
    )
train_df = pd.read_csv(TRAIN_CSV)
if LABEL_COL not in train_df.columns or ID_COL not in train_df.columns:
    raise RuntimeError(f"Expected columns '{ID_COL}' and '{LABEL_COL}' in {TRAIN_CSV}")

# Encode labels only when necessary.
# After this section train_df[LABEL_COL] always holds class indices 0..num_classes-1,
# which is what nn.CrossEntropyLoss requires as targets. `le` is None only when the
# raw labels already had that form; otherwise it maps indices back to raw labels.
le = None
if not pd.api.types.is_numeric_dtype(train_df[LABEL_COL]):
    le = LabelEncoder()
    train_df[LABEL_COL] = le.fit_transform(train_df[LABEL_COL].astype(str))
    num_classes = len(le.classes_)
else:
    train_df[LABEL_COL] = train_df[LABEL_COL].astype(int)
    observed_labels = np.sort(train_df[LABEL_COL].unique())
    if np.array_equal(observed_labels, np.arange(len(observed_labels))):
        # Already contiguous 0..K-1: keep numeric labels intact.
        num_classes = len(observed_labels)
    else:
        # Labels such as {1, 2, 3} or {0, 1, 2, 4}: counting unique values would
        # give a head with too few outputs for the largest label, so remap them.
        le = LabelEncoder()
        train_df[LABEL_COL] = le.fit_transform(train_df[LABEL_COL])
        num_classes = len(le.classes_)

# Split train/val (stratified so both parts keep the class proportions)
train_rows, val_rows = train_test_split(
    train_df,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=train_df[LABEL_COL],
)

# Transforms
def _make_pipeline(*augmentations):
    """Build resize -> optional augmentations -> tensor -> normalization."""
    steps = [transforms.Resize((IMG_SIZE, IMG_SIZE))]
    steps.extend(augmentations)
    steps.append(transforms.ToTensor())
    steps.append(
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        )
    )
    return transforms.Compose(steps)


# Training adds random flip/rotation; validation uses the deterministic steps only.
train_transforms = _make_pipeline(
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
)
val_transforms = _make_pipeline()

# Datasets and loaders
train_dataset = ImageDataset(train_rows, TRAIN_DIR, transforms=train_transforms, label_encoder=le, is_test=False)
val_dataset = ImageDataset(val_rows, TRAIN_DIR, transforms=val_transforms, label_encoder=le, is_test=False)

# Pinned host memory only helps host-to-GPU copies; requesting it on a CPU-only
# run makes PyTorch emit a warning and ignore the flag, so enable it just for CUDA.
_pin_memory = DEVICE.type == "cuda"

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=_pin_memory,
)
# Validation keeps a fixed order and uses a batch twice the training size.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE * 2,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=_pin_memory,
)

# Model (transfer learning with ResNet18)
# torchvision >= 0.13 replaced the boolean `pretrained` flag with a `weights`
# enum and deprecates the old flag. Use the enum when it exists and fall back
# to `pretrained=True` on older releases; both select the same checkpoint.
_resnet18_weights = getattr(models, "ResNet18_Weights", None)
if _resnet18_weights is not None:
    model = models.resnet18(weights=_resnet18_weights.IMAGENET1K_V1)
else:
    model = models.resnet18(pretrained=True)

# Swap the final fully connected layer for one with `num_classes` outputs.
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate every 3 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

# Training loop
# Each epoch: one optimisation pass over train_loader, one scheduler step, then a
# gradient-free pass over val_loader. The weights with the best validation
# accuracy are snapshotted in memory and on disk, and restored at the end.
best_val_acc = 0.0
best_state = None

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen so far this epoch
    correct = 0
    total = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} (train)", leave=False)
    for imgs, labels in loop:
        imgs = imgs.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by batch size so the
        # epoch average stays exact when the last batch is smaller.
        running_loss += loss.item() * imgs.size(0)
        _, preds = outputs.max(1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        loop.set_postfix(loss=running_loss/total, acc=correct/total)

    scheduler.step()

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(DEVICE, non_blocking=True)
            labels = labels.to(DEVICE, non_blocking=True)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * imgs.size(0)
            _, preds = outputs.max(1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_correct / val_total
    val_loss = val_loss / val_total
    print(f"Epoch {epoch}: train_loss={running_loss/total:.4f} train_acc={correct/total:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() hands out references to the live parameter tensors, and a
        # dict-level .copy() would still alias them, so later epochs would
        # overwrite the "best" snapshot. Clone every tensor to freeze the values.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        torch.save(best_state, "best_model.pth")

# Load best model
if best_state is not None:
    model.load_state_dict(best_state)

# Inference on test set and create submission
# Ids come from a sample submission when one is present (so the output matches
# the expected ids); otherwise they are derived from the image files in TEST_DIR.
test_files = []      # file stems, i.e. the ids written to the submission
test_filenames = []  # matching full file names, used to open the images
for p in sorted(TEST_DIR.iterdir()):
    if p.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
        test_files.append(p.stem)
        test_filenames.append(p.name)

# ImageDataset only probes lower-case .png/.jpg/.jpeg for extension-less ids, so a
# bare stem cannot locate files such as "x.bmp" or "x.JPG". For directory-derived
# ids, pass the full file name (the dataset falls back to using the id as a file
# name) and strip the suffix again when building the submission.
ids_are_filenames = True
example_paths = ["sample_submission.csv", "submission.csv"]
for pth in example_paths:
    if os.path.exists(pth):
        sample = pd.read_csv(pth)
        if ID_COL in sample.columns:
            test_ids = sample[ID_COL].astype(str).tolist()
            ids_are_filenames = False
        else:
            test_ids = test_filenames
        break
else:
    # No example file found at all.
    test_ids = test_filenames

# Build test DataFrame
test_df = pd.DataFrame({ID_COL: test_ids})
test_dataset = ImageDataset(test_df, TEST_DIR, transforms=val_transforms, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

model.eval()
preds_all = []
ids_all = []
with torch.no_grad():
    for imgs, ids in tqdm(test_loader, desc="Predict"):
        imgs = imgs.to(DEVICE)
        outputs = model(imgs)
        # softmax is monotonic, so the argmax of the raw logits is the same class.
        preds = outputs.argmax(dim=1).cpu().numpy()
        preds_all.extend(preds.tolist())
        ids_all.extend(ids)

if ids_are_filenames:
    # Turn "abc.png" back into the id "abc".
    ids_all = [Path(name).stem for name in ids_all]

# Convert numeric preds back to original labels
if le is not None:
    pred_labels = le.inverse_transform(preds_all)
else:
    pred_labels = [int(x) for x in preds_all]

submission = pd.DataFrame({ID_COL: ids_all, SUBMISSION_COL: pred_labels})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Saved submission to {SUBMISSION_PATH}")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'