<a href="https://colab.research.google.com/github/williamsdoug/skin_lesion_ml/blob/master/Explore%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from fastai.vision import *
import pandas as pd
import collections
import math
import csv

In [2]:
import random
import numpy as np
import torch

# Single source of truth for the seed: every RNG below is seeded with the same
# value, and it can be changed in one place.
SEED = 1234

# Seed each random number generator separately; they do not share state.
random.seed(SEED)
np.random.seed(SEED)
# Kept as the cell's last expression: torch.manual_seed returns a torch
# Generator object, which the notebook displays as this cell's output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x1840cb11fb0>

## Helper Functions

In [3]:
def get_labels(file_path, known_classes=('normal', 'bacteria', 'virus'), default='normal'):
    """Derive a class label from an image file name.

    Only the file's stem (the final path component without its last extension)
    is inspected; parent directory names are ignored. Each entry of
    `known_classes` is tried in order and the first one that occurs as a
    substring of the stem is returned.

    Args:
        file_path: Location of the image, as a `pathlib.Path` or a plain string.
        known_classes: Candidate labels, checked in order. Matching is
            case-sensitive.
        default: Label returned when no candidate occurs in the stem.

    Returns:
        The first matching entry of `known_classes`, or `default`.
    """
    # Wrapping in Path lets plain string paths work as well as Path objects.
    stem = Path(file_path).stem
    return next((label for label in known_classes if label in stem), default)

## Explore Data Folders

In [4]:
# Dataset root, expressed relative to the current working directory.
PATH = Path()
images = PATH.joinpath('data', 'chest_xray')

In [5]:
# Show the entries directly under the dataset root.
images.ls()

[WindowsPath('data/chest_xray/data'),
 WindowsPath('data/chest_xray/test'),
 WindowsPath('data/chest_xray/train'),
 WindowsPath('data/chest_xray/val')]

In [6]:
# Show the entries directly under the training folder.
images.joinpath('train').ls()

[WindowsPath('data/chest_xray/train/NORMAL'),
 WindowsPath('data/chest_xray/train/PNEUMONIA')]

In [7]:
# Peek at the first ten entries of the NORMAL training folder.
images.joinpath('train', 'NORMAL').ls()[:10]

[WindowsPath('data/chest_xray/train/NORMAL/IM-0115-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0117-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0119-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0122-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0125-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0127-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0128-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0129-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0131-0001.jpeg'),
 WindowsPath('data/chest_xray/train/NORMAL/IM-0133-0001.jpeg')]

## Extract Labels (from file name)

In [8]:
# Count how many files in the NORMAL training folder map to each label.
collections.Counter(
    str(get_labels(fn)) for fn in images.joinpath('train', 'NORMAL').ls()
)

Counter({'normal': 1341})

In [9]:
# Count how many files in the PNEUMONIA training folder map to each label.
collections.Counter(
    str(get_labels(fn)) for fn in images.joinpath('train', 'PNEUMONIA').ls()
)

Counter({'bacteria': 2530, 'virus': 1345})

In [10]:
# Spot-check: print the stem and derived label for the first two NORMAL files.
for fn in images.joinpath('train', 'NORMAL').ls()[:2]:
    base = fn.stem
    print(f'{base} {get_labels(fn)}')

IM-0115-0001 normal
IM-0117-0001 normal


In [11]:
# Spot-check: print the stem and derived label for the first two PNEUMONIA files.
for fn in images.joinpath('train', 'PNEUMONIA').ls()[:2]:
    base = fn.stem
    print(f'{base} {get_labels(fn)}')

person1000_bacteria_2931 bacteria
person1000_virus_1681 virus


## Construct DataBunch from Training Data

In [12]:
def get_databunch(path, bs=8, size=500, workers=1, valid_pct=0.2, seed=None, tfms=None):
    """Build a normalized image DataBunch from the images under `path`.

    Images are gathered with `ImageList.from_folder`, split at random into
    training and validation sets, labelled from their file names with the
    notebook-level `get_labels` helper, transformed, batched, and normalized
    with `imagenet_stats`.

    Args:
        path: Folder passed to `ImageList.from_folder`.
        bs: Batch size passed to `databunch`.
        size: Image size passed to `transform`.
        workers: Passed to `databunch` as `num_workers`.
        valid_pct: Fraction of items placed in the validation set.
        seed: Seed forwarded to `split_by_rand_pct` so the train/validation
            split can be reproduced. `None` (the default) passes no explicit
            seed, which matches the earlier behaviour of this function.
        tfms: Transforms passed to `transform`; when `None`, the result of
            `get_transforms()` is used.

    Returns:
        The normalized DataBunch.
    """
    if tfms is None:
        tfms = get_transforms()

    # `seed` was previously accepted but never used; forwarding it is what
    # makes the split reproducible on request.
    labelled = (ImageList.from_folder(path)
                .split_by_rand_pct(valid_pct, seed=seed)
                .label_from_func(get_labels))
    return (labelled
            .transform(tfms, size=size)
            .databunch(bs=bs, num_workers=workers)
            .normalize(imagenet_stats))

In [13]:
# Training images folder, relative to the current working directory.
path = Path().joinpath('data', 'chest_xray', 'train')

In [14]:
# Build the DataBunch with default settings, report the class count and class
# names, then leave `data` as the last expression so the notebook displays it.
data = get_databunch(path)
print(f'{data.c} {data.classes}')
data

3 ['bacteria', 'normal', 'virus']


ImageDataBunch;

Train: LabelList (4173 items)
x: ImageList
Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500)
y: CategoryList
normal,normal,normal,normal,normal
Path: data\chest_xray\train;

Valid: LabelList (1043 items)
x: ImageList
Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500),Image (3, 500, 500)
y: CategoryList
normal,normal,virus,bacteria,bacteria
Path: data\chest_xray\train;

Test: None