In [12]:
# Name of the environment variable that holds the path to the project root.
project_environment_variable = "SKIN_LESION_CLASSIFICATION"

import os
from pathlib import Path

# os.environ.get returns None when the variable is unset, and Path(None) would
# raise an opaque TypeError; fail early with a message that names the variable.
project_root = os.environ.get(project_environment_variable)
if not project_root:
    raise RuntimeError(
        f"Environment variable '{project_environment_variable}' is not set; "
        "set it to the project root directory before running this notebook."
    )
project_path = Path(project_root)

scripts_path = project_path.joinpath("scripts")

import sys
# Make the project's scripts importable. The membership check keeps sys.path
# free of duplicate entries when this cell is re-run.
if str(scripts_path) not in sys.path:
    sys.path.append(str(scripts_path))

from utils import path_setup
# Mapping of project subfolder names to their paths (used below as
# path["images"], path["models"]); the call also prints each entry.
path = path_setup.subfolders(project_path)

path['project'] : D:\projects\skin-lesion-classification
path['images'] : D:\projects\skin-lesion-classification\images
path['models'] : D:\projects\skin-lesion-classification\models
path['expository'] : D:\projects\skin-lesion-classification\expository
path['literature'] : D:\projects\skin-lesion-classification\literature
path['notebooks'] : D:\projects\skin-lesion-classification\notebooks
path['presentation'] : D:\projects\skin-lesion-classification\presentation
path['scripts'] : D:\projects\skin-lesion-classification\scripts


In [13]:
from multiclass_models_copy_and_experiment import process

In [16]:
from typing import Type

# process class attributes

# Path to directory containing metadata csv file.
data_dir: Path = path["images"]

# Name of metadata csv file.
filename: str = "metadata.csv"

# Train/val ratio (number of lesions represented in train set is approx. tvr
# times number represented in val set).
tvr: int = 3

# Random seed for selection of single image per lesion to be assigned to
# training set (if keep_first is False).
seed: int = 0

# If True, keep first image for each lesion; otherwise, select random image
# for each lesion.
keep_first: bool = False

# Subset of all diagnoses we are interested in classifying (missing dxs will
# be placed in "other" class).
dxs: list = ["nv", "mel", "bcc", "akiec"]

In [17]:
# Load and prepare the metadata table. Arguments are passed positionally, in
# the order: data_dir, filename, tvr, seed, keep_first, dxs.
metadata = process(data_dir, filename, tvr, seed, keep_first, dxs)

Successfully loaded file 'D:\projects\skin-lesion-classification\images\metadata.csv'.
Inserted 'num_images' column.
Inserted 'label' column.
Added 'set' column with tags 't1', 'v1', 'ta', and 'va'.


See `01_binary_classification.ipynb` for an explanation of the tags `t1`, `v1`, `ta`, and `va`.

In [18]:
# Preview the first four rows of the processed metadata frame.
metadata.df.head(n=4)

Unnamed: 0,lesion_id,num_images,image_id,dx,label,dx_type,age,sex,localization,set
0,HAM_0000118,2,ISIC_0027419,bkl,other,histo,80.0,male,scalp,va
1,HAM_0000118,2,ISIC_0025030,bkl,other,histo,80.0,male,scalp,v1
2,HAM_0002730,2,ISIC_0026769,bkl,other,histo,80.0,male,scalp,ta
3,HAM_0002730,2,ISIC_0025661,bkl,other,histo,80.0,male,scalp,t1


In [19]:
# Print the diagnosis distribution for every (counting unit, subset) pair:
# first across lesions, then across images; within each, overall/train/val.
counting_units = ("lesions", "images")
subset_names = ("all", "train", "val")
for across, subset in ((unit, name) for unit in counting_units for name in subset_names):
    process.dx_dist(metadata, subset=subset, across=across)

DISTRIBUTION OF LESIONS BY DIAGNOSIS: OVERALL


dx,nv,other,mel,bcc,akiec
freq,5403.0,898.0,614.0,327.0,228.0
%,72.33,12.02,8.22,4.38,3.05


Total lesions: 7470.

DISTRIBUTION OF LESIONS BY DIAGNOSIS: TRAIN


dx,nv,other,mel,bcc,akiec
freq,4024.0,681.0,476.0,251.0,170.0
%,71.83,12.16,8.5,4.48,3.03


Total lesions: 5602 (74.99% of all lesions).

DISTRIBUTION OF LESIONS BY DIAGNOSIS: VAL


dx,nv,other,mel,bcc,akiec
freq,1379.0,217.0,138.0,76.0,58.0
%,73.82,11.62,7.39,4.07,3.1


Total lesions: 1868 (25.01% of all lesions).

DISTRIBUTION OF IMAGES BY DIAGNOSIS: OVERALL


dx,nv,other,mel,bcc,akiec
freq,6705.0,1356.0,1113.0,514.0,327.0
%,66.95,13.54,11.11,5.13,3.27


Total images: 10015.

DISTRIBUTION OF IMAGES BY DIAGNOSIS: TRAIN


dx,nv,other,mel,bcc,akiec
freq,4999.0,1025.0,865.0,390.0,240.0
%,66.48,13.63,11.5,5.19,3.19


Total images: 7519 (75.08% of all images).

DISTRIBUTION OF IMAGES BY DIAGNOSIS: VAL


dx,nv,other,mel,bcc,akiec
freq,1706.0,331.0,248.0,124.0,87.0
%,68.35,13.26,9.94,4.97,3.49


Total images: 2496 (24.92% of all images).



In [20]:
# Draw a small, reproducible (fixed random_state) sample of rows from the
# metadata frame.
n_test_rows = 64
test_df = metadata.df.sample(n=n_test_rows, random_state=0)

In [21]:
import torchvision.transforms as transforms

# Per-channel ImageNet statistics used by the normalization step.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Preprocessing steps, applied in order to each image.
preprocessing_steps = [
    transforms.CenterCrop((300, 300)),   # Keep the central 300x300 region
    transforms.Resize((224, 224)),       # Resize images to fit ResNet input size
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),  # Normalize with ImageNet stats
]
transform = transforms.Compose(preprocessing_steps)

In [22]:
import pandas as pd
from typing import Type
from typing import List, Callable

# Arguments for the resnet18 model wrapper instantiated in the next cell.
# `dxs` and `transform` are re-bound to themselves only so that every argument
# is listed (and annotated) in one place.
df: pd.DataFrame = test_df            # Small sample defined above; use metadata.df instead for the full data set.
train_set: list = ["t1"]              # ["t1"] (one image per lesion in training set); ["t1", "ta"] (all images for each lesion in training set).
dxs: list = dxs                       # List of diagnoses (labels/targets) to classify.
label_codes: dict = metadata.label_codes
data_dir: Path = path["images"]       # Path to directory where images are stored.
model_dir: Path = path["models"]      # Path to directory where models/model info/model results are stored.
# `transform` is a single composed callable (transforms.Compose), not a list
# of callables, so it is annotated as such.
transform: transforms.Compose = transform  # Transform to be applied to images before feeding to ResNet-18
batch_size: int = 32                  # Mini-batch size: default 32.
epochs: int = 1                       # Number of epochs (all layers unfrozen from the start): default 10; set to 1 here.
base_learning_rate: float = 0.001     # Learning rate to start with: default 0.001. Using Adam optimizer.
filename_stem: str = "rn18mc"         # For saving model and related files. train set and num epochs will be appended automatically. Default "rn18mc".
filename_suffix: str = "test"         # Something descriptive and unique for future reference and to avoid over-writing other files. Default empty string ""; "test" here.

In [23]:
from multiclass_models import resnet18

# Gather the constructor arguments in the positional order resnet18 is called
# with, then build the model wrapper from them.
resnet18_args = (
    df,
    train_set,
    dxs,
    label_codes,
    data_dir,
    model_dir,
    transform,
    batch_size,
    epochs,
    base_learning_rate,
    filename_stem,
    filename_suffix,
)
resnet18mc_test = resnet18(*resnet18_args)

In [24]:
# Train the model. Per the printed output, this reports training/validation
# loss for each epoch, saves model.state_dict() as a .pth file in the models
# directory, and makes the `state_dict` and `epoch_losses` attributes available.
resnet18mc_test.train()

Epoch 1/1, Training Loss: 2.0213, Validation Loss: 0.8031
Saving model.state_dict() as D:\projects\skin-lesion-classification\models\rn18mc_t1_1e_test.pth.
model.state_dict() can now be accessed through state_dict attribute.
Train/val losses can now be accessed through epoch_losses attribute.


In [25]:
# Train/val losses recorded by train(): a dict with 'train_loss' and
# 'val_loss' arrays (one entry here, for the single epoch that was run).
resnet18mc_test.epoch_losses

{'train_loss': array([2.02126753]), 'val_loss': array([0.80305725])}

In [26]:
# Run inference with the trained model. The displayed result has one row per
# image_id and one prob_<label> column per class (other, nv, mel, bcc, akiec).
# NOTE(review): which images get scored is decided inside inference() — confirm.
prob_df = resnet18mc_test.inference()
prob_df

Unnamed: 0,image_id,prob_other,prob_nv,prob_mel,prob_bcc,prob_akiec
0,ISIC_0028664,0.166673,0.545753,0.151911,0.027902,0.107761
1,ISIC_0025998,0.089984,0.737531,0.090836,0.017457,0.064192
2,ISIC_0032817,0.056458,0.72,0.121035,0.037929,0.064578
3,ISIC_0026577,0.220658,0.381179,0.203652,0.044013,0.150498
4,ISIC_0026798,0.035833,0.01729,0.071095,0.631528,0.244253
...,...,...,...,...,...,...
27,ISIC_0027291,0.056162,0.000092,0.057993,0.79266,0.093092
28,ISIC_0026654,0.051349,0.78481,0.087964,0.014241,0.061636
29,ISIC_0033943,0.022072,0.927936,0.034078,0.003445,0.012468
30,ISIC_0033183,0.727905,0.011204,0.197099,0.037148,0.026644
