In [1]:
# Project bootstrap: locate the project root through an environment variable,
# make the project's "scripts" folder importable, and build the `path` mapping
# of project sub-folders that the rest of the notebook indexes by name.
import os
import sys
from pathlib import Path

# Name of the environment variable that must hold the project's root directory.
project_environment_variable = "SKIN_LESION_CLASSIFICATION"

# os.environ.get returns None for an unset variable, and Path(None) would fail
# with an unhelpful TypeError, so check explicitly and explain what is missing.
project_root = os.environ.get(project_environment_variable)
if project_root is None:
    raise EnvironmentError(
        f"Environment variable '{project_environment_variable}' is not set. "
        "Set it to the root directory of the project before running this notebook."
    )
project_path = Path(project_root)

scripts_path = project_path.joinpath("scripts")

# Guarded so that re-running this cell does not keep adding duplicate entries.
if str(scripts_path) not in sys.path:
    sys.path.append(str(scripts_path))

# `utils` is resolved through the sys.path entry added above, so this import
# has to come after it.
from utils import path_setup
path = path_setup.subfolders(project_path)

path['project'] : D:\projects\skin-lesion-classification
path['images'] : D:\projects\skin-lesion-classification\images
path['models'] : D:\projects\skin-lesion-classification\models
path['expository'] : D:\projects\skin-lesion-classification\expository
path['literature'] : D:\projects\skin-lesion-classification\literature
path['notebooks'] : D:\projects\skin-lesion-classification\notebooks
path['presentation'] : D:\projects\skin-lesion-classification\presentation
path['scripts'] : D:\projects\skin-lesion-classification\scripts


In [2]:
from processing import process

In [3]:
from typing import Type, Union
import pandas as pd

# --- Settings for the metadata-processing step ---

# Directory that contains the metadata CSV file.
data_dir: Path = path['images']

# Name of the metadata CSV file inside data_dir.
csv_filename: str = 'metadata.csv'

# Ratio of training set to validation set (3 means roughly 3:1).
tvr: int = 3

# Random seed for the parts of the process where randomness is called for.
seed: int = 0

# If False, a random image of each lesion is chosen for the training set.
# NOTE(review): presumably True picks each lesion's first image instead - confirm in `process`.
keep_first: bool = False

# If True, classes are stratified so that their proportions stay as stable as
# possible after the train/val split; if False, they will only be roughly similar.
stratified: bool = True

# Lesion types we want to classify. Any type not listed here is grouped into
# the 0-label class.
to_classify: list = ['mel', 'bcc', 'akiec', 'nv']

In [4]:
# Build the `process` object from the settings defined above. Each setting is
# forwarded by keyword under its own name.
metadata = process(
    data_dir=data_dir,
    csv_filename=csv_filename,
    tvr=tvr,
    seed=seed,
    keep_first=keep_first,
    stratified=stratified,
    to_classify=to_classify,
)

Successfully loaded file 'D:\projects\skin-lesion-classification\images\metadata.csv'.
Inserted 'num_images' column in dataframe, to the right of 'lesion_id' column.
Created label_dict (maps labels to indices).
Inserted 'label' column in dataframe, to the right of 'dx' column.
Added 'set' column to dataframe, with values 't1', 'v1', 'ta', and 'va', to the right of 'localization' column.


In [5]:
# Preview the first rows of the processed metadata table, which the `process`
# instance exposes as its `df` attribute. It now carries the 'num_images',
# 'label' and 'set' columns reported in the construction log.
metadata.df.head()

Unnamed: 0,lesion_id,num_images,image_id,dx,label,dx_type,age,sex,localization,set
0,HAM_0000118,2,ISIC_0027419,bkl,0,histo,80.0,male,scalp,ta
1,HAM_0000118,2,ISIC_0025030,bkl,0,histo,80.0,male,scalp,t1
2,HAM_0002730,2,ISIC_0026769,bkl,0,histo,80.0,male,scalp,va
3,HAM_0002730,2,ISIC_0025661,bkl,0,histo,80.0,male,scalp,v1
4,HAM_0001466,2,ISIC_0031633,bkl,0,histo,75.0,male,ear,va


In [6]:
# Print the diagnosis distribution six times: counted per lesion and per image,
# each for the whole dataset, the training split and the validation split.
# NOTE(review): if dx_dist is an ordinary instance method, metadata.dx_dist(...)
# would be the more usual spelling - confirm before changing the call form.
unit_and_split_pairs = [
    (unit, split)
    for unit in ("lesions", "images")
    for split in ("all", "train", "val")
]
for across, subset in unit_and_split_pairs:
    process.dx_dist(metadata, subset=subset, across=across)

DISTRIBUTION OF LESIONS BY DIAGNOSIS: OVERALL


dx,nv,other,mel,bcc,akiec
freq,5403.0,898.0,614.0,327.0,228.0
%,72.33,12.02,8.22,4.38,3.05


Total lesions: 7470.

DISTRIBUTION OF LESIONS BY DIAGNOSIS: TRAIN


dx,nv,other,mel,bcc,akiec
freq,4052.0,673.0,460.0,245.0,171.0
%,72.34,12.02,8.21,4.37,3.05


Total lesions: 5601 (74.98% of all lesions).

DISTRIBUTION OF LESIONS BY DIAGNOSIS: VAL


dx,nv,other,mel,bcc,akiec
freq,1351.0,225.0,154.0,82.0,57.0
%,72.28,12.04,8.24,4.39,3.05


Total lesions: 1869 (25.02% of all lesions).

DISTRIBUTION OF IMAGES BY DIAGNOSIS: OVERALL


dx,nv,other,mel,bcc,akiec
freq,6705.0,1356.0,1113.0,514.0,327.0
%,66.95,13.54,11.11,5.13,3.27


Total images: 10015.

DISTRIBUTION OF IMAGES BY DIAGNOSIS: TRAIN


dx,nv,other,mel,bcc,akiec
freq,5007.0,1008.0,831.0,384.0,250.0
%,66.94,13.48,11.11,5.13,3.34


Total images: 7480 (74.69% of all images).

DISTRIBUTION OF IMAGES BY DIAGNOSIS: VAL


dx,nv,other,mel,bcc,akiec
freq,1698.0,348.0,282.0,130.0,77.0
%,66.98,13.73,11.12,5.13,3.04


Total images: 2535 (25.31% of all images).



In [7]:
# Besides its constructor arguments, the `process` instance carries some
# underscore-prefixed attributes. get_hidden_attributes() returns them keyed
# by attribute name; list those names first.
metadata_hidden_attributes = metadata.get_hidden_attributes()
print([*metadata_hidden_attributes.keys()])
# Example: the mapping from integer label codes to label names.
metadata_hidden_attributes["_label_codes"]

['_csv_file_path', '_label_dict', '_label_codes', '_num_labels', '_df_train1', '_df_train_a', '_df_val1', '_df_val_a', '_df_sample_batch']


{0: 'other', 1: 'bcc', 2: 'mel', 3: 'akiec', 4: 'nv'}

In [8]:
# Next we set values for the attributes of our resnet18 class (the model we will
# use with our processed data). One of those attributes is the image
# transformation pipeline defined here.

import torchvision.transforms as transforms

transform = transforms.Compose(
    [
        # Keep only the central 300x300 region of each image.
        transforms.CenterCrop((300, 300)),
        # Resize images to fit the ResNet input size.
        transforms.Resize((224, 224)),
        # Convert the image to a tensor.
        transforms.ToTensor(),
        # Normalize each channel with the ImageNet statistics.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [9]:
import pandas as pd
from typing import Type, Union, List, Callable

# --- Settings for the resnet18 model (passed positionally, in this order, when the model is created) ---

# Background dataset for the model. metadata._df_sample_batch is a random
# selection of 64 rows of metadata.df; we use it for testing our code.
df: pd.DataFrame = metadata._df_sample_batch

# Training subset: "t1" (one image per lesion in the training set),
# ["t1", "ta"] (all images for each lesion in the training set), or another
# sub-dataframe of the background dataset.
train_set: Union[pd.DataFrame, list, str] = "t1"

# Validation subset; same conventions as train_set.
val_set: Union[pd.DataFrame, list, str] = "v1"

# Correspondence between label codes like 0 and label words like 'other'.
label_codes: dict = metadata._label_codes

# Directory where the images are stored.
data_dir: Path = path["images"]

# Directory where models, model info and model results are stored.
model_dir: Path = path["models"]

# Transform applied to images before they are fed to ResNet-18. This is the
# single transforms.Compose pipeline built earlier (one callable, not a list of
# callables), hence the Callable annotation.
transform: Callable = transform

# Mini-batch size (default 32).
batch_size: int = 32

# Number of epochs, with all layers unfrozen from the start. The default is 10;
# 1 is used here to keep this test run short.
epochs: int = 1

# Learning rate to start with (default 0.001), using the Adam optimizer.
base_learning_rate: float = 0.001

# Stem of the filename under which the model and related files are saved; the
# train set and number of epochs are appended automatically (default "rn18mc").
filename_stem: str = "rn18mc"

# Descriptive, unique suffix for future reference and to avoid over-writing
# other files (default: empty string "").
filename_suffix: str = "test"

In [10]:
# Create an instance of the resnet18 class with attribute values as above.
from multiclass_models import resnet18

# All arguments are passed positionally, so their order here must stay exactly
# as it is.
resnet18mc_test = resnet18(
    df, train_set, val_set, label_codes,     # data, splits and label names
    data_dir, model_dir,                     # image directory / model directory
    transform,                               # image preprocessing
    batch_size, epochs, base_learning_rate,  # training hyper-parameters
    filename_stem, filename_suffix,          # naming of the saved files
)

In [11]:
# Train the model on the specified training data by calling the train method.
# As the log below shows, it reports the training and validation loss per epoch,
# saves model.state_dict() as a .pth file in the models directory, and makes the
# results available through the state_dict and epoch_losses attributes.
resnet18mc_test.train()

Epoch 1/1, Training Loss: 1.7076, Validation Loss: 1.3961
Saving model.state_dict() as D:\projects\skin-lesion-classification\models\rn18mc_t1_1e_test.pth.
model.state_dict() can now be accessed through state_dict attribute.
Train/val losses can now be accessed through epoch_losses attribute.


In [12]:
# Inspect the training and validation loss for each epoch. The attribute is a
# dict with 'train_loss' and 'val_loss' arrays; with a single epoch each array
# holds one value.
resnet18mc_test.epoch_losses

{'train_loss': array([1.70763087]), 'val_loss': array([1.39607048])}

In [13]:
# The model is saved as a .pth file in the directory given by the model_dir attribute.
# Without the .pth extension, the filename is the value below; here it reads
# 'rn18mc_t1_1e_test', i.e. the filename stem, train set, number of epochs and
# suffix joined by underscores.
resnet18mc_test._filename

'rn18mc_t1_1e_test'

In [14]:
# We can feed our entire dataframe through the trained model to obtain predictions for all lesions/images.
# Data can be loaded from a pre-saved .pth file if it is not still in memory.
# The result has one row per image: its image_id followed by one probability
# column per label (prob_other, prob_bcc, prob_mel, prob_akiec, prob_nv).
inference_df = resnet18mc_test.inference()
inference_df

Unnamed: 0,image_id,prob_other,prob_bcc,prob_mel,prob_akiec,prob_nv
0,ISIC_0028664,0.029869,0.078758,0.003433,0.029163,0.858776
1,ISIC_0025998,0.144295,0.148403,0.021601,0.108468,0.577232
2,ISIC_0032817,0.19008,0.287954,0.040866,0.204013,0.277086
3,ISIC_0026577,0.123418,0.11784,0.066847,0.10148,0.590416
4,ISIC_0026798,0.00136,0.992008,0.000089,0.004273,0.00227
...,...,...,...,...,...,...
27,ISIC_0027291,0.056604,0.344542,0.045485,0.121775,0.431594
28,ISIC_0026654,0.110429,0.215946,0.067882,0.159917,0.445826
29,ISIC_0033943,0.0501,0.059992,0.734468,0.147653,0.007787
30,ISIC_0033183,0.102332,0.195741,0.341025,0.24973,0.111172


In [15]:
# Predictions can also be requested for a single item. prediction() is called
# here once with a lesion id (HAM_...) and once with an image id (ISIC_...),
# and each result is displayed as soon as it is computed.
for identifier in ("HAM_0001190", "ISIC_0032817"):
    display(resnet18mc_test.prediction(identifier))

Unnamed: 0,image_id,prob_other,prob_bcc,prob_mel,prob_akiec,prob_nv
0,ISIC_0026071,0.06553,0.085532,0.04628,0.079554,0.723104


Unnamed: 0,image_id,prob_other,prob_bcc,prob_mel,prob_akiec,prob_nv
0,ISIC_0032817,0.19008,0.287954,0.040866,0.204013,0.277086
