# Lung and Colon Cancer Classification
## About Dataset
This dataset contains 25,000 histopathological images with 5 classes. All images are 768 x 768 pixels in size and are in jpeg file format.
The images were generated from an original sample of HIPAA-compliant and validated sources: 750 images of lung tissue (250 benign lung tissue, 250 lung adenocarcinomas, and 250 lung squamous cell carcinomas) and 500 images of colon tissue (250 benign colon tissue and 250 colon adenocarcinomas). These 1,250 images were augmented to 25,000 using the Augmentor package.
The dataset has five classes, each with 5,000 images:

* Lung benign tissue
* Lung adenocarcinoma
* Lung squamous cell carcinoma
* Colon adenocarcinoma
* Colon benign tissue


How to Cite this Dataset
If you use this dataset in your research, please credit its authors:

Original Article
Borkowski AA, Bui MM, Thomas LB, Wilson CP, DeLand LA, Mastorides SM. Lung and Colon Cancer Histopathological Image Dataset (LC25000). arXiv:1912.12142v1 [eess.IV], 2019

Relevant Links
https://arxiv.org/abs/1912.12142v1
https://github.com/tampapath/lung_colon_image_set
Dataset BibTeX
@article{borkowski2019lc25000,
title= {LC25000 Lung and colon histopathological image dataset},
keywords= {cancer,histopathology},
author= {Andrew A. Borkowski, Marilyn M. Bui, L. Brannon Thomas, Catherine P. Wilson, Lauren A. DeLand, Stephen M. Mastorides},
url= {https://github.com/tampapath/lung_colon_image_set}
}


## Imports

In [1]:
import os

import pyrootutils

# Locate the project root by searching from the parent of the current working
# directory for one of the marker files listed in `indicator`.
root = pyrootutils.setup_root(
    search_from=os.path.dirname(os.getcwd()),
    indicator=[".git", "pyproject.toml"],
    pythonpath=True,
    dotenv=True,
)

# Default DATA_ROOT to the project root, but never override a value that is
# already present in the environment.
if "DATA_ROOT" not in os.environ:
    os.environ["DATA_ROOT"] = str(root)

In [None]:
from pathlib import Path
from typing import Any

import hydra
import opendatasets as od
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from hydra import compose, initialize
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

In [3]:
# Register an OmegaConf resolver that maps a dtype name to the matching
# attribute of the `torch` module (e.g. "float32" -> torch.float32).
# `replace=True` keeps this cell re-runnable: without it, executing the cell a
# second time in the same kernel raises because the name is already registered.
OmegaConf.register_new_resolver("torch_dtype", lambda name: getattr(torch, name), replace=True)

## Download datasets

In [4]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Compose the Hydra "train" config directly inside the notebook.
# Approach taken from:
# https://gist.github.com/bdsaglam/586704a98336a0cf0a65a6e7c247d248
with initialize(config_path="../configs", version_base="1.2"):
    cfg = compose("train")
    # Quick sanity check that the config was loaded.
    print(cfg.paths.train_processed_dir)

datasets/processed/train


In [6]:
# Directory that holds the raw downloaded dataset, resolved against the project root.
DATASET_DIR = Path(root).joinpath(cfg.data.dataset_dir)

In [7]:
# Create the dataset directory together with any missing parents, so a nested
# `dataset_dir` does not fail with FileNotFoundError on a fresh checkout.
DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Download only when the directory is empty. `any()` stops at the first entry
# instead of materialising the whole directory listing.
if not any(DATASET_DIR.iterdir()):
    od.download(dataset_id_or_url=cfg.data.dataset_url, data_dir=str(DATASET_DIR))

## Loading Images

In [8]:
# Human-readable class labels, listed in alphabetical order.
# NOTE(review): presumably this matches the index order ImageFolder assigns to
# the class sub-folders — confirm against the processed directory names.
CLASS_NAMES = [
    "colon-adenocarcinoma",
    "colon-benign-tissue",
    "lung-adenocarcinoma",
    "lung-benign-tissue",
    "lung-squamous-cell-carcinoma",
]

# Integer class index -> class label.
class_mapping = dict(enumerate(CLASS_NAMES))
class_mapping

{0: 'colon-adenocarcinoma',
 1: 'colon-benign-tissue',
 2: 'lung-adenocarcinoma',
 3: 'lung-benign-tissue',
 4: 'lung-squamous-cell-carcinoma'}

In [9]:
class LungColonDataModule(pl.LightningDataModule):
    """Lightning data module that serves the train/valid/test image folders.

    Each split is read with ``ImageFolder`` from its own directory and wrapped
    in a ``DataLoader`` that shares one set of loader options.

    Args:
        train_processed_dir: Root directory of the training split.
        valid_processed_dir: Root directory of the validation split.
        test_processed_dir: Root directory of the test split.
        augmentations: Transform applied to training samples. May be a Hydra
            config (instantiated here), an already-built callable (used
            as-is), or ``None`` for no transform.
        valid_transforms: Transform applied to validation and test samples.
            Accepts the same forms as ``augmentations``.
        num_workers: Number of worker processes per ``DataLoader``.
        pin_memory: Forwarded to ``DataLoader``.
        persistent_workers: Forwarded to ``DataLoader``; ignored when
            ``num_workers`` is 0, because ``DataLoader`` rejects that
            combination.
        batch_size: Number of samples per batch.
    """

    def __init__(
        self,
        train_processed_dir: str,
        valid_processed_dir: str,
        test_processed_dir: str,
        augmentations: Any,
        valid_transforms: Any,
        num_workers: int = 8,
        pin_memory: bool = True,
        persistent_workers: bool = True,
        batch_size: int = 32,
    ):
        super().__init__()
        self.train_data_dir = train_processed_dir
        self.valid_data_dir = valid_processed_dir
        self.test_data_dir = test_processed_dir
        self.augmentations = self._build_transform(augmentations)
        self.valid_transforms = self._build_transform(valid_transforms)
        # Options shared by all three dataloaders.
        self.kwargs = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": pin_memory,
            # DataLoader raises if persistent_workers is set without any
            # worker processes, so only enable it when workers exist.
            "persistent_workers": bool(persistent_workers) and num_workers > 0,
        }

    @staticmethod
    def _build_transform(spec: Any) -> Any:
        """Turn a transform spec into a ready-to-use transform.

        ``None`` stays ``None``. A callable instance is treated as an
        already-built transform and returned unchanged, because
        ``hydra.utils.instantiate`` rejects such objects. Anything else
        (config objects, plain dicts/lists, classes) is handed to Hydra.
        """
        if spec is None:
            return None
        if callable(spec) and not isinstance(spec, type):
            return spec
        return hydra.utils.instantiate(spec)

    def prepare_data(self):
        # Nothing to download or preprocess here.
        pass

    def setup(self, stage=None):
        # Build all three datasets regardless of `stage`.
        self.train_dataset = ImageFolder(root=self.train_data_dir, transform=self.augmentations)
        self.val_dataset = ImageFolder(root=self.valid_data_dir, transform=self.valid_transforms)
        self.test_dataset = ImageFolder(root=self.test_data_dir, transform=self.valid_transforms)

    def train_dataloader(self) -> DataLoader:
        """Return the shuffled training dataloader."""
        return DataLoader(
            self.train_dataset,
            shuffle=True,
            **self.kwargs,
        )

    def val_dataloader(self) -> DataLoader:
        """Return the validation dataloader (not shuffled)."""
        return DataLoader(
            self.val_dataset,
            **self.kwargs,
        )

    def test_dataloader(self) -> DataLoader:
        """Return the test dataloader (not shuffled)."""
        return DataLoader(
            self.test_dataset,
            **self.kwargs,
        )

In [10]:
# Pass the raw transform configs: LungColonDataModule instantiates them itself,
# so instantiating them here as well would feed an already-built object back
# into `hydra.utils.instantiate`.
data_module = LungColonDataModule(
    train_processed_dir=cfg.paths.train_processed_dir,
    valid_processed_dir=cfg.paths.valid_processed_dir,
    test_processed_dir=cfg.paths.test_processed_dir,
    augmentations=cfg.datamodule.augmentations,
    valid_transforms=cfg.datamodule.valid_transforms,
    num_workers=cfg.datamodule.num_workers,
    pin_memory=cfg.datamodule.pin_memory,
    persistent_workers=cfg.datamodule.persistent_workers,
    batch_size=cfg.datamodule.batch_size,
)

## Model Definition

In [None]:
import torch.nn as nn


class Net(nn.Module):
    """Small CNN classifier: stacked conv blocks followed by a two-layer head.

    Every entry of ``output_dims`` adds one block of
    ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2)``, which
    halves the spatial size. The head flattens the features and applies
    ``Linear -> BatchNorm1d -> ReLU -> Linear`` to produce class logits.

    Args:
        input_shape: Shape of a single input as ``(channels, height, width)``.
        output_dims: Output channel count of each conv block, in order.
        num_classes: Number of output logits.
    """

    def __init__(self, input_shape: tuple[int, int, int], output_dims: list[int], num_classes: int) -> None:
        super().__init__()
        input_dim = input_shape[0]
        self.output_dims = output_dims
        layers: list[nn.Module] = []

        for out_dim in output_dims:
            layers.extend(
                [
                    # The bias is redundant because BatchNorm follows directly.
                    nn.Conv2d(input_dim, out_dim, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(out_dim),
                    nn.ReLU(),
                    nn.MaxPool2d(kernel_size=2, stride=2),
                ]
            )
            input_dim = out_dim

        self.conv_layers = nn.Sequential(*layers)
        self.flattener = nn.Flatten()
        # Size of the flattened conv features, needed by the first linear layer.
        self.flatten_dim = self._infer_flatten_dim(input_shape)

        fc_hidden_dim = 512
        self.classification_head = nn.Sequential(
            self.flattener,
            nn.Linear(self.flatten_dim, fc_hidden_dim, bias=False),
            nn.BatchNorm1d(fc_hidden_dim),
            nn.ReLU(),
            nn.Linear(fc_hidden_dim, num_classes),
        )

        self.model = nn.Sequential(self.conv_layers, self.classification_head)

    def _infer_flatten_dim(self, input_shape: tuple[int, int, int]) -> int:
        """Return the flattened feature size for one input of ``input_shape``.

        The probe runs with the conv stack in eval mode. In training mode the
        BatchNorm layers would fold the random probe into their running
        statistics before any real data has been seen. The previous mode is
        restored afterwards.
        """
        was_training = self.conv_layers.training
        self.conv_layers.eval()
        try:
            with torch.no_grad():
                probe = torch.randn(1, *input_shape)
                return self.flattener(self.conv_layers(probe)).shape[1]
        finally:
            self.conv_layers.train(was_training)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch ``x``."""
        return self.model(x)

In [19]:
from torchinfo import summary

# Build the network for 3-channel 224x224 inputs with three conv blocks, then
# show a per-layer summary for a single-image batch on the CPU.
net = Net(input_shape=(3, 224, 224), output_dims=[32, 64, 128], num_classes=len(CLASS_NAMES))
summary(net, input_size=(1, 3, 224, 224), device="cpu")

Net.forward() received input x with shape: torch.Size([1, 3, 224, 224])


Layer (type:depth-idx)                   Output Shape              Param #
Net                                      [1, 5]                    --
├─Sequential: 1-1                        [1, 5]                    --
│    └─Sequential: 2-1                   [1, 128, 28, 28]          --
│    │    └─Conv2d: 3-1                  [1, 32, 224, 224]         864
│    │    └─BatchNorm2d: 3-2             [1, 32, 224, 224]         64
│    │    └─ReLU: 3-3                    [1, 32, 224, 224]         --
│    │    └─MaxPool2d: 3-4               [1, 32, 112, 112]         --
│    │    └─Conv2d: 3-5                  [1, 64, 112, 112]         18,432
│    │    └─BatchNorm2d: 3-6             [1, 64, 112, 112]         128
│    │    └─ReLU: 3-7                    [1, 64, 112, 112]         --
│    │    └─MaxPool2d: 3-8               [1, 64, 56, 56]           --
│    │    └─Conv2d: 3-9                  [1, 128, 56, 56]          73,728
│    │    └─BatchNorm2d: 3-10            [1, 128, 56, 56]          256
│   

In [None]:
class LungColonClassifier(pl.LightningModule):
    def __init__(self, model: Any, lr: float = 1e-3):
        """Wrap ``model`` for training.

        Args:
            model: Network called by ``forward``.
            lr: Learning rate. Stored on the instance; presumably read when
                the optimizer is configured (not visible in this section).
        """
        super().__init__()
        # `forward` delegates to `self.model`, so it must be stored here;
        # otherwise the first forward pass fails with AttributeError.
        self.model = model
        self.lr = lr

    def forward(self, x):
        """Run ``x`` through the wrapped model and return its output."""
        output = self.model(x)
        return output

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss for one training batch.

        ``batch`` is unpacked into an (inputs, targets) pair; ``batch_idx`` is
        unused.
        """
        inputs, targets = batch
        logits = self(inputs)
        return F.cross_entropy(logits, targets)