# CNN Cancer Detection – Week‑3 Notebook
Kaggle Histopathologic Cancer Detection challenge

*Goal:* build & iterate CNN models to classify histology image tiles as cancer vs. normal.

In [1]:
# 📦----------------------------------------------------------------------
# Environment & secrets (run once per kernel / after pod restart)
%pip install -q kaggle wandb pandas matplotlib pillow

import os, random, pathlib, numpy as np, torch, wandb

# ── Reproducibility -----------------------------------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ── Paths ---------------------------------------------------------------
PROJECT_ROOT = pathlib.Path.cwd()                  # usually /workspace
DATA_DIR      = pathlib.Path("/workspace/data")    # persistent dataset
DATA_DIR.mkdir(exist_ok=True)

# ── Kaggle CLI: tell it where the token lives ---------------------------
os.environ["KAGGLE_CONFIG_DIR"] = "/workspace/.kaggle"

# ── W&B: persistent, prompt-once login ----------------------------------
KEY_FILE = pathlib.Path("/workspace/.wandb_api_key")
if KEY_FILE.exists():                         # reuse stored key
    os.environ["WANDB_API_KEY"] = KEY_FILE.read_text().strip()
    wandb.login(key=os.environ["WANDB_API_KEY"], relogin=True)
else:                                         # first run → ask & cache
    wandb.login()                             # paste key when prompted
    saved_key = os.getenv("WANDB_API_KEY")
    if saved_key:
        KEY_FILE.write_text(saved_key)
        KEY_FILE.chmod(0o600)                 # read/write for you only
        print(f"W&B API key saved to {KEY_FILE}")
# -----------------------------------------------------------------------

[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


## Data download & verification

In [1]:
# Only run first time on new server!!

from pathlib import Path
DATA_DIR = Path("/workspace/data")   # persistent volume
DATA_DIR.mkdir(exist_ok=True)
# Download competition data (~1.2 GB) to ./data
!kaggle competitions download -c histopathologic-cancer-detection -p data
!unzip -q data/histopathologic-cancer-detection.zip -d data
import glob, json, subprocess, pathlib
print(len(glob.glob('data/train/*.tif')), 'training tiles')

Downloading histopathologic-cancer-detection.zip to data
100%|██████████████████████████████████████▉| 6.30G/6.31G [00:19<00:00, 350MB/s]
100%|███████████████████████████████████████| 6.31G/6.31G [00:20<00:00, 331MB/s]
/bin/bash: line 1: unzip: command not found
0 training tiles


## Quick EDA

In [1]:
import pandas as pd, matplotlib.pyplot as plt, random, pathlib
from PIL import Image

DATA_DIR = pathlib.Path("/workspace/data")
df       = pd.read_csv(DATA_DIR / "train_labels.csv")

# Class balance — drawn on an explicit Axes rather than implicit pyplot state.
fig, ax = plt.subplots()
df.label.value_counts().sort_index().plot.bar(ax=ax, rot=0, width=0.6)
ax.set_title("Class balance (0 = benign, 1 = tumor)")
ax.set_ylabel("Tile count"); ax.set_xlabel("Label")
plt.show()

# 3×3 sample grid
GRID = 3
# Sample row positions (not ids) so each label comes from the same row as its
# id; this avoids a boolean-mask scan of the whole frame for every tile.
n_shown     = min(GRID * GRID, len(df))          # tolerate tiny label files
sample_rows = df.iloc[random.sample(range(len(df)), n_shown)]

fig, axes = plt.subplots(GRID, GRID, figsize=(6, 6))
for ax in axes.ravel():
    ax.axis("off")                               # also blanks any unused panel
for ax, row in zip(axes.ravel(), sample_rows.itertuples(index=False)):
    # Context manager closes the tile's file handle once it has been drawn.
    with Image.open(DATA_DIR / "train" / f"{row.id}.tif") as tile:
        ax.imshow(tile)
    ax.set_title(f"Label: {int(row.label)}")
plt.tight_layout(); plt.show()

ModuleNotFoundError: No module named 'pandas'

In [None]:
# TODO: show class balance & a 3×3 grid of random tiles
# NOTE(review): the "Quick EDA" cell above already plots the class balance and
# a 3×3 grid of random tiles, so this placeholder looks stale — confirm and remove.
pass

## Pre‑processing & Dataset class

In [None]:
# TODO: define HistologyDataset with transforms
pass

## Baseline model – ResNet‑18

In [None]:
# TODO: load pretrained ResNet‑18 and replace fc layer
pass

## Training loop & metrics

In [None]:
# TODO: implement epoch loop, log loss & AUC to wandb
pass

## Experiment variants (Model‑1, Model‑2, …)

In [None]:
# TODO: try deeper backbones / augmentations / samplers
pass

## Comparison & analysis

In [None]:
# TODO: pull runs via wandb API and plot val AUCs
pass

## Submission & leaderboard

In [None]:
# TODO: create submission.csv and submit via Kaggle CLI
pass

## Conclusions & next steps
*Key findings here.*

**TODO Week‑4:** …