# Multilabel Classification Using the ChestX-ray14 Dataset

## Step 0: Install PyHealth

In [None]:
# Install PyHealth (imported by every cell below) and ipywidgets into the
# kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may install a newer
# PyHealth than the one that produced the saved outputs — consider pinning.
%pip install pyhealth ipywidgets

## Step 1: Load Dataset

In [2]:
from pyhealth.datasets import ChestXray14Dataset

# Build the ChestX-ray14 dataset, downloading the data first (download=True).
# NOTE(review): `partial=True` appears to fetch only a subset of the image
# archives (the log below shows just images_01.tar.gz) — confirm against the
# PyHealth documentation.
dataset = ChestXray14Dataset(download=True, partial=True)
# Print summary statistics (number of patients and events).
dataset.stats()

Downloading ./images_01.tar.gz...
Checking MD5 checksum for ./images_01.tar.gz...
Extracting ./images_01.tar.gz...
Deleting ./images_01.tar.gz...
Download complete
Initializing ChestX-ray14 dataset from . (dev mode: False)
No cache_dir provided. Using default cache dir: /root/.cache/pyhealth/fb6e8a46-32a1-580b-bb6c-4015d54b1bc1
Scanning table: chestxray14 from /root/chestxray14-metadata-pyhealth.csv
Caching event dataframe to /root/.cache/pyhealth/fb6e8a46-32a1-580b-bb6c-4015d54b1bc1/global_event_df.parquet...
Dataset: ChestX-ray14
Dev mode: False
Number of patients: 1335
Number of events: 4999


## Step 2: Define Task

In [3]:
# No task is passed, so the dataset's default task is applied; the log below
# reports it as ChestXray14MultilabelClassification. `samples` is the processed
# sample dataset that the data loaders and the model are built from later on.
samples = dataset.set_task()

Setting task ChestXray14MultilabelClassification for ChestX-ray14 base dataset...
Applying task transformations on data with 1 workers...
Detected Jupyter notebook environment, setting num_workers to 1
Single worker mode, processing sequentially
Worker 0 started processing 1335 patients. (Polars threads: 22)


  0%|          | 0/1335 [00:00<?, ?it/s]

Rank 0 inferred the following `['bytes']` data format.


100%|██████████| 1335/1335 [00:04<00:00, 298.99it/s]

Worker 0 finished processing patients.





Fitting processors on the dataset...
Label labels vocab: {'atelectasis': 0, 'cardiomegaly': 1, 'consolidation': 2, 'edema': 3, 'effusion': 4, 'emphysema': 5, 'fibrosis': 6, 'hernia': 7, 'infiltration': 8, 'mass': 9, 'nodule': 10, 'pleural_thickening': 11, 'pneumonia': 12, 'pneumothorax': 13}
Processing samples and saving to /root/.cache/pyhealth/fb6e8a46-32a1-580b-bb6c-4015d54b1bc1/tasks/ChestXray14MultilabelClassification_f8cedbe4-72a8-53c3-922d-4cc8730f4c2d/samples_e4cb1532-b4bc-5434-aac5-9269556ad11e.ld...
Applying processors on data with 1 workers...
Detected Jupyter notebook environment, setting num_workers to 1
Single worker mode, processing sequentially
Worker 0 started processing 4999 samples. (0 to 4999)


  0%|          | 0/4999 [00:00<?, ?it/s]

Rank 0 inferred the following `['tensor', 'no_header_tensor:1']` data format.


100%|██████████| 4999/4999 [01:19<00:00, 62.88it/s]

Worker 0 finished processing samples.
Cached processed samples to /root/.cache/pyhealth/fb6e8a46-32a1-580b-bb6c-4015d54b1bc1/tasks/ChestXray14MultilabelClassification_f8cedbe4-72a8-53c3-922d-4cc8730f4c2d/samples_e4cb1532-b4bc-5434-aac5-9269556ad11e.ld





In [4]:
from pyhealth.datasets import get_dataloader, split_by_sample

# Fix the split seed so the train/val/test partition is the same on every
# Restart & Run All; without it the split (and therefore the reported metrics)
# changes from run to run.
SPLIT_SEED = 42
BATCH_SIZE = 16

# 70% train / 10% validation / 20% test, split at the sample (image) level.
# NOTE(review): the dataset has more images than patients (4999 vs. 1335), so
# images of one patient can land in different splits — consider a patient-level
# split (e.g. `split_by_patient`) if that leakage matters for your use case.
train_dataset, val_dataset, test_dataset = split_by_sample(
    samples, [0.7, 0.1, 0.2], seed=SPLIT_SEED
)

# Only the training loader is shuffled; validation and test keep a fixed order.
train_loader = get_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = get_dataloader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = get_dataloader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Step 3: Define Model

In [5]:
from pyhealth.models import CNN

# Instantiate PyHealth's CNN with its default hyperparameters; only the
# processed sample dataset is passed in.
# NOTE(review): presumably the model derives its input and label schema from
# `samples` — confirm against the PyHealth CNN documentation.
model = CNN(dataset=samples)

  import pkg_resources




## Step 4: Train Model

In [6]:
from pyhealth.trainer import Trainer

# Only measure accuracy: with the "partial" dataset it is likely that the
# validation and test sets do not contain positive samples of every label.
trainer = Trainer(model=model, metrics=["accuracy"])
# Train for a single epoch, evaluating on the validation loader.
trainer.train(train_dataloader=train_loader, val_dataloader=val_loader, epochs=1)

CNN(
  (embedding_model): EmbeddingModel(embedding_layers=ModuleDict())
  (cnn): ModuleDict(
    (image): CNNLayer(
      (cnn): ModuleList(
        (0): CNNBlock(
          (conv1): Sequential(
            (0): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU()
          )
          (conv2): Sequential(
            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (downsample): Sequential(
            (0): Conv2d(1, 128, kernel_size=(1, 1), stride=(1, 1))
            (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (relu): ReLU()
        )
      )
      (pooling): AdaptiveAvgPool2d(output_size=1)
    )
  )
  (fc): Linear(in_features=128, out_features=1

Epoch 0 / 1:   0%|          | 0/219 [00:00<?, ?it/s]

--- Train epoch-0, step-219 ---
loss: 0.2072


Evaluation: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]

--- Eval epoch-0, step-219 ---
accuracy: 0.9539
loss: 0.1717





## Step 5: Evaluate Model

In [7]:
# Evaluate the trained model on the held-out test loader; the returned metrics
# dict (accuracy and loss) is displayed as the cell output.
trainer.evaluate(test_loader)

Evaluation: 100%|██████████| 63/63 [00:48<00:00,  1.30it/s]


{'accuracy': 0.9532857142857143, 'loss': 0.1734933808209404}