# Setup

In [1]:
import os
from pathlib import Path
import sys

# If we're using Google Colab, we set the environment variable to point to the relevant folder in our Google Drive:
if 'COLAB_GPU' in os.environ:
    from google.colab import drive
    drive.mount('/content/drive')
    os.environ['SKIN_LESION_CLASSIFICATION'] = '/content/drive/MyDrive/Colab Notebooks/skin-lesion-classification'

# Otherwise, we use the environment variable on our local system:
project_environment_variable = "SKIN_LESION_CLASSIFICATION"

# Path to the root directory of the project:
project_path = Path(os.environ.get(project_environment_variable))

# Relative path to /scripts (from where custom modules will be imported):
scripts_path = project_path.joinpath("scripts")

# Add this path to sys.path so that Python will look there for modules:
sys.path.append(str(scripts_path))

# Now import path_step from our custom utils module to create a dictionary to all subdirectories in our root directory:
from utils import path_setup
path = path_setup.subfolders(project_path)

path['project'] : D:\projects\skin-lesion-classification
path['images'] : D:\projects\skin-lesion-classification\images
path['models'] : D:\projects\skin-lesion-classification\models
path['expository'] : D:\projects\skin-lesion-classification\expository
path['literature'] : D:\projects\skin-lesion-classification\literature
path['notebooks'] : D:\projects\skin-lesion-classification\notebooks
path['presentation'] : D:\projects\skin-lesion-classification\presentation
path['scripts'] : D:\projects\skin-lesion-classification\scripts


<a id='trivial_models'></a>
# Trivial models

In [2]:
from typing import Type, Union      # For type hints
from processing import process      # Custom module for processing metadata

data_dir: Path = path["images"]     # Path to directory containing metadata.csv file
csv_filename: str = "metadata.csv"  # The filename
    
tvr: int = 3              # Ratio of training set to validation set. See discussion below for explanation.
seed: int = 0             # Random seed for parts of the process where randomness is called for.
keep_first: bool = False  # If False, then, for each lesion, we choose a random image to assign to our training set. 
stratified: bool = True   # If True, we stratify classes so that the proportions remain as stable as possible after train/val split. 
                          # If False, the proportions will be roughly similar.

to_classify: Union[list, dict] = ["mel",   # These are the lesion types we are interested in classifying. 
                                  "bcc",   # Any missing ones will be grouped together as the 0-label class: no need to write "other" here.
                                  "akiec", # If 'other' is not desired, use restrict_to attribute above
                                  "nv",]   # Can also be a dictionary, like { 'malignant' : ['mel', 'bcc'], 'benign' : ['nv', 'bkl']}

In [3]:
# Create an instance of the process class with attribute values as above.
trivial = process(data_dir=data_dir,
               csv_filename=csv_filename,
               tvr=tvr,
               seed=seed,
               keep_first=keep_first,
               stratified=stratified,
               to_classify=to_classify,)

- Loaded file 'D:\projects\skin-lesion-classification\images\metadata.csv'.
- Inserted 'num_images' column in dataframe, to the right of 'lesion_id' column.
- Inserted 'label' column in dataframe, to the right of 'dx' column: 
  {'df': 0, 'bkl': 0, 'vasc': 0, 'bcc': 1, 'nv': 2, 'akiec': 3, 'mel': 4}
- Added 'set' column to dataframe, with values 't1', 'v1', 'ta', and 'va', to the right of 'localization' column.
- Basic, overall dataframe (pre-train/test split): self.df
- Training set (not balanced, all images per lesion): self.df_train
- Validation set (not expanded, one image per lesion): self.df_val1
- Validation set (not expanded, use all images of each lesion): self.df_val_a
- Small sample dataframes for code testing: self._df_train_code_test, self._df_val1_code_test, self._df_val_a_code_test


In [5]:
from utils import print_header
import pandas as pd
from multiclass_models import trivial_prediction, final_prediction
from evaluation import weighted_average_f, confusion_matrix_with_metric, metric_dictionary

instance = trivial

y_train = instance.df_train['label']
y_val1 = instance.df_val1['label']
y_val_a = instance.df_val_a['label']
label_codes = instance.label_codes

_, prediction1, probabilities1 = trivial_prediction(y_train, 
                                                    num_preds=y_val1.shape[0],
                                                    label_codes=label_codes,
                                                    pos_label='majority_class',)

_, prediction_a, probabilities_a = trivial_prediction(y_train, 
                                                      num_preds=y_val_a.shape[0],
                                                      label_codes=label_codes,
                                                      pos_label='majority_class',)

instance.df_probabilities_val1 = instance.df_val1.copy()
instance.df_probabilities_val_a = instance.df_val_a.copy()
for i, dx in label_codes.items():
    instance.df_probabilities_val1['prob_' + dx] = probabilities1[:,i]
    instance.df_probabilities_val_a['prob_' + dx] = probabilities_a[:,i]
    
raw_probabilities_df1 = instance.df_probabilities_val1 
raw_probabilities_df_a = instance.df_probabilities_val_a

instance.df_pred_val1 = final_prediction(raw_probabilities_df=raw_probabilities_df1, 
                                         label_codes=label_codes,)   

instance.df_pred_val_a = final_prediction(raw_probabilities_df=raw_probabilities_df_a, 
                                         label_codes=label_codes,)  

map_labels = label_codes

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  

txp1 = pd.crosstab(target1,prediction1,margins=True,dropna=False)
txp_a = pd.crosstab(target_a,prediction_a,margins=True,dropna=False)

beta = 2
# Weights inversely proportional to relative class size in the training set, giving more importance to smaller classes.
weights = 1/instance.df_train['label'].value_counts(normalize=True).sort_index().values # None

instance.cm1 = confusion_matrix_with_metric(AxB=txp1,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

instance.cm_a = confusion_matrix_with_metric(AxB=txp_a,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 
probabilities1 = instance.df_probabilities_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  
probabilities_a = instance.df_probabilities_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

instance.metric_dict1 = metric_dictionary(target=target1, 
                                          prediction=prediction1, 
                                          probabilities=probabilities1)

instance.metric_dict_a = metric_dictionary(target=target_a, 
                                          prediction=prediction_a, 
                                          probabilities=probabilities_a)

print_header("Trivial prediction: all labels majority class")

print("\nOne image per lesion".upper())

display(instance.cm1.fillna('_'))
display(pd.DataFrame(instance.metric_dict1))

print("\nAll images per lesion".upper())
print("- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.")
print("- However, we include it here as a check.")
display(instance.cm_a.fillna('_'))
display(pd.DataFrame(instance.metric_dict_a))


TRIVIAL PREDICTION: ALL LABELS MAJORITY CLASS


ONE IMAGE PER LESION


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,225.0,0,0,225,0.0
bcc,0,0,82.0,0,0,82,0.0
nv,0,0,1351.0,0,0,1351,1.0
akiec,0,0,57.0,0,0,57,0.0
mel,0,0,154.0,0,0,154,0.0
All,0,0,1869.0,0,0,1869,_
precision,_,_,0.722846,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.722846,0.2,0.722846,0.2,0.153053,0.167826,0.185756,0.0,0.5,0.5,0.5



ALL IMAGES PER LESION
- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.
- However, we include it here as a check.


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,225.0,0,0,225,0.0
bcc,0,0,82.0,0,0,82,0.0
nv,0,0,1351.0,0,0,1351,1.0
akiec,0,0,57.0,0,0,57,0.0
mel,0,0,154.0,0,0,154,0.0
All,0,0,1869.0,0,0,1869,_
precision,_,_,0.722846,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.722846,0.2,0.722846,0.2,0.153053,0.167826,0.185756,0.0,0.5,0.5,0.5


In [6]:
from utils import print_header
import pandas as pd
from multiclass_models import trivial_prediction, final_prediction
from evaluation import weighted_average_f, confusion_matrix_with_metric, metric_dictionary

instance = trivial

y_train = instance.df_train['label']
y_val1 = instance.df_val1['label']
y_val_a = instance.df_val_a['label']
label_codes = instance.label_codes

_, prediction1, probabilities1 = trivial_prediction(y_train, 
                                                    num_preds=y_val1.shape[0],
                                                    label_codes=label_codes,
                                                    pos_label='minority_class',)

_, prediction_a, probabilities_a = trivial_prediction(y_train, 
                                                      num_preds=y_val_a.shape[0],
                                                      label_codes=label_codes,
                                                      pos_label='minority_class',)

instance.df_probabilities_val1 = instance.df_val1.copy()
instance.df_probabilities_val_a = instance.df_val_a.copy()
for i, dx in label_codes.items():
    instance.df_probabilities_val1['prob_' + dx] = probabilities1[:,i]
    instance.df_probabilities_val_a['prob_' + dx] = probabilities_a[:,i]
    
raw_probabilities_df1 = instance.df_probabilities_val1 
raw_probabilities_df_a = instance.df_probabilities_val_a

instance.df_pred_val1 = final_prediction(raw_probabilities_df=raw_probabilities_df1, 
                                         label_codes=label_codes,)   

instance.df_pred_val_a = final_prediction(raw_probabilities_df=raw_probabilities_df_a, 
                                         label_codes=label_codes,)  

map_labels = label_codes

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  

txp1 = pd.crosstab(target1,prediction1,margins=True,dropna=False)
txp_a = pd.crosstab(target_a,prediction_a,margins=True,dropna=False)

beta = 2
# Weights inversely proportional to relative class size in the training set, giving more importance to smaller classes.
weights = 1/instance.df_train['label'].value_counts(normalize=True).sort_index().values # None

instance.cm1 = confusion_matrix_with_metric(AxB=txp1,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

instance.cm_a = confusion_matrix_with_metric(AxB=txp_a,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 
probabilities1 = instance.df_probabilities_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  
probabilities_a = instance.df_probabilities_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

instance.metric_dict1 = metric_dictionary(target=target1, 
                                          prediction=prediction1, 
                                          probabilities=probabilities1)

instance.metric_dict_a = metric_dictionary(target=target_a, 
                                          prediction=prediction_a, 
                                          probabilities=probabilities_a)

print_header("Trivial prediction: all labels minority class")

print("\nOne image per lesion".upper())

display(instance.cm1.fillna('_'))
display(pd.DataFrame(instance.metric_dict1))

print("\nAll images per lesion".upper())
print("- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.")
print("- However, we include it here as a check.")
display(instance.cm_a.fillna('_'))
display(pd.DataFrame(instance.metric_dict_a))


TRIVIAL PREDICTION: ALL LABELS MINORITY CLASS


ONE IMAGE PER LESION


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,0,225.0,0,225,0.0
bcc,0,0,0,82.0,0,82,0.0
nv,0,0,0,1351.0,0,1351,0.0
akiec,0,0,0,57.0,0,57,1.0
mel,0,0,0,154.0,0,154,0.0
All,0,0,0,1869.0,0,1869,_
precision,_,_,_,0.030498,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.030498,0.2,0.030498,0.2,0.007567,0.011838,0.027182,0.0,0.5,0.5,0.5



ALL IMAGES PER LESION
- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.
- However, we include it here as a check.


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,0,225.0,0,225,0.0
bcc,0,0,0,82.0,0,82,0.0
nv,0,0,0,1351.0,0,1351,0.0
akiec,0,0,0,57.0,0,57,1.0
mel,0,0,0,154.0,0,154,0.0
All,0,0,0,1869.0,0,1869,_
precision,_,_,_,0.030498,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.030498,0.2,0.030498,0.2,0.007567,0.011838,0.027182,0.0,0.5,0.5,0.5


In [7]:
from utils import print_header
import pandas as pd
from multiclass_models import trivial_prediction, final_prediction
from evaluation import weighted_average_f, confusion_matrix_with_metric, metric_dictionary

instance = trivial

y_train = instance.df_train['label']
y_val1 = instance.df_val1['label']
y_val_a = instance.df_val_a['label']
label_codes = instance.label_codes

_, prediction1, probabilities1 = trivial_prediction(y_train, 
                                                    num_preds=y_val1.shape[0],
                                                    label_codes=label_codes,
                                                    pos_label='mel',)

_, prediction_a, probabilities_a = trivial_prediction(y_train, 
                                                      num_preds=y_val_a.shape[0],
                                                      label_codes=label_codes,
                                                      pos_label='mel',)

instance.df_probabilities_val1 = instance.df_val1.copy()
instance.df_probabilities_val_a = instance.df_val_a.copy()
for i, dx in label_codes.items():
    instance.df_probabilities_val1['prob_' + dx] = probabilities1[:,i]
    instance.df_probabilities_val_a['prob_' + dx] = probabilities_a[:,i]
    
raw_probabilities_df1 = instance.df_probabilities_val1 
raw_probabilities_df_a = instance.df_probabilities_val_a

instance.df_pred_val1 = final_prediction(raw_probabilities_df=raw_probabilities_df1, 
                                         label_codes=label_codes,)   

instance.df_pred_val_a = final_prediction(raw_probabilities_df=raw_probabilities_df_a, 
                                         label_codes=label_codes,)  

map_labels = label_codes

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  

txp1 = pd.crosstab(target1,prediction1,margins=True,dropna=False)
txp_a = pd.crosstab(target_a,prediction_a,margins=True,dropna=False)

beta = 2
# Weights inversely proportional to relative class size in the training set, giving more importance to smaller classes.
weights = 1/instance.df_train['label'].value_counts(normalize=True).sort_index().values # None

instance.cm1 = confusion_matrix_with_metric(AxB=txp1,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

instance.cm_a = confusion_matrix_with_metric(AxB=txp_a,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 
probabilities1 = instance.df_probabilities_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  
probabilities_a = instance.df_probabilities_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

instance.metric_dict1 = metric_dictionary(target=target1, 
                                          prediction=prediction1, 
                                          probabilities=probabilities1)

instance.metric_dict_a = metric_dictionary(target=target_a, 
                                          prediction=prediction_a, 
                                          probabilities=probabilities_a)

print_header("Trivial prediction: all labels mel")

print("\nOne image per lesion".upper())

display(instance.cm1.fillna('_'))
display(pd.DataFrame(instance.metric_dict1))

print("\nAll images per lesion".upper())
print("- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.")
print("- However, we include it here as a check.")
display(instance.cm_a.fillna('_'))
display(pd.DataFrame(instance.metric_dict_a))


TRIVIAL PREDICTION: ALL LABELS MEL


ONE IMAGE PER LESION


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,0,0,225.0,225,0.0
bcc,0,0,0,0,82.0,82,0.0
nv,0,0,0,0,1351.0,1351,0.0
akiec,0,0,0,0,57.0,57,0.0
mel,0,0,0,0,154.0,154,1.0
All,0,0,0,0,1869.0,1869,_
precision,_,_,_,_,0.082397,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.082397,0.2,0.082397,0.2,0.020183,0.03045,0.061972,0.0,0.5,0.5,0.5



ALL IMAGES PER LESION
- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.
- However, we include it here as a check.


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,0,0,0,225.0,225,0.0
bcc,0,0,0,0,82.0,82,0.0
nv,0,0,0,0,1351.0,1351,0.0
akiec,0,0,0,0,57.0,57,0.0
mel,0,0,0,0,154.0,154,1.0
All,0,0,0,0,1869.0,1869,_
precision,_,_,_,_,0.082397,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.082397,0.2,0.082397,0.2,0.020183,0.03045,0.061972,0.0,0.5,0.5,0.5


In [8]:
from utils import print_header
import pandas as pd
from multiclass_models import trivial_prediction, final_prediction
from evaluation import weighted_average_f, confusion_matrix_with_metric, metric_dictionary

instance = trivial

y_train = instance.df_train['label']
y_val1 = instance.df_val1['label']
y_val_a = instance.df_val_a['label']
label_codes = instance.label_codes

_, prediction1, probabilities1 = trivial_prediction(y_train, 
                                                    num_preds=y_val1.shape[0],
                                                    label_codes=label_codes,
                                                    pos_label='bcc',)

_, prediction_a, probabilities_a = trivial_prediction(y_train, 
                                                      num_preds=y_val_a.shape[0],
                                                      label_codes=label_codes,
                                                      pos_label='bcc',)

instance.df_probabilities_val1 = instance.df_val1.copy()
instance.df_probabilities_val_a = instance.df_val_a.copy()
for i, dx in label_codes.items():
    instance.df_probabilities_val1['prob_' + dx] = probabilities1[:,i]
    instance.df_probabilities_val_a['prob_' + dx] = probabilities_a[:,i]
    
raw_probabilities_df1 = instance.df_probabilities_val1 
raw_probabilities_df_a = instance.df_probabilities_val_a

instance.df_pred_val1 = final_prediction(raw_probabilities_df=raw_probabilities_df1, 
                                         label_codes=label_codes,)   

instance.df_pred_val_a = final_prediction(raw_probabilities_df=raw_probabilities_df_a, 
                                         label_codes=label_codes,)  

map_labels = label_codes

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  

txp1 = pd.crosstab(target1,prediction1,margins=True,dropna=False)
txp_a = pd.crosstab(target_a,prediction_a,margins=True,dropna=False)

beta = 2
# Weights inversely proportional to relative class size in the training set, giving more importance to smaller classes.
weights = 1/instance.df_train['label'].value_counts(normalize=True).sort_index().values # None

instance.cm1 = confusion_matrix_with_metric(AxB=txp1,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

instance.cm_a = confusion_matrix_with_metric(AxB=txp_a,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 
probabilities1 = instance.df_probabilities_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  
probabilities_a = instance.df_probabilities_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

instance.metric_dict1 = metric_dictionary(target=target1, 
                                          prediction=prediction1, 
                                          probabilities=probabilities1)

instance.metric_dict_a = metric_dictionary(target=target_a, 
                                          prediction=prediction_a, 
                                          probabilities=probabilities_a)

print_header("Trivial prediction: all labels bcc")

print("\nOne image per lesion".upper())

display(instance.cm1.fillna('_'))
display(pd.DataFrame(instance.metric_dict1))

print("\nAll images per lesion".upper())
print("- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.")
print("- However, we include it here as a check.")
display(instance.cm_a.fillna('_'))
display(pd.DataFrame(instance.metric_dict_a))


TRIVIAL PREDICTION: ALL LABELS BCC


ONE IMAGE PER LESION


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,225.0,0,0,0,225,0.0
bcc,0,82.0,0,0,0,82,1.0
nv,0,1351.0,0,0,0,1351,0.0
akiec,0,57.0,0,0,0,57,0.0
mel,0,154.0,0,0,0,154,0.0
All,0,1869.0,0,0,0,1869,_
precision,_,0.043874,_,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.043874,0.2,0.043874,0.2,0.010849,0.016812,0.037324,0.0,0.5,0.5,0.5



ALL IMAGES PER LESION
- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.
- However, we include it here as a check.


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,0,225.0,0,0,0,225,0.0
bcc,0,82.0,0,0,0,82,1.0
nv,0,1351.0,0,0,0,1351,0.0
akiec,0,57.0,0,0,0,57,0.0
mel,0,154.0,0,0,0,154,0.0
All,0,1869.0,0,0,0,1869,_
precision,_,0.043874,_,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.043874,0.2,0.043874,0.2,0.010849,0.016812,0.037324,0.0,0.5,0.5,0.5


In [9]:
from utils import print_header
import pandas as pd
from multiclass_models import trivial_prediction, final_prediction
from evaluation import weighted_average_f, confusion_matrix_with_metric, metric_dictionary

instance = trivial

y_train = instance.df_train['label']
y_val1 = instance.df_val1['label']
y_val_a = instance.df_val_a['label']
label_codes = instance.label_codes

_, prediction1, probabilities1 = trivial_prediction(y_train, 
                                                    num_preds=y_val1.shape[0],
                                                    label_codes=label_codes,
                                                    pos_label='other',)

_, prediction_a, probabilities_a = trivial_prediction(y_train, 
                                                      num_preds=y_val_a.shape[0],
                                                      label_codes=label_codes,
                                                      pos_label='other',)

instance.df_probabilities_val1 = instance.df_val1.copy()
instance.df_probabilities_val_a = instance.df_val_a.copy()
for i, dx in label_codes.items():
    instance.df_probabilities_val1['prob_' + dx] = probabilities1[:,i]
    instance.df_probabilities_val_a['prob_' + dx] = probabilities_a[:,i]
    
raw_probabilities_df1 = instance.df_probabilities_val1 
raw_probabilities_df_a = instance.df_probabilities_val_a

instance.df_pred_val1 = final_prediction(raw_probabilities_df=raw_probabilities_df1, 
                                         label_codes=label_codes,)   

instance.df_pred_val_a = final_prediction(raw_probabilities_df=raw_probabilities_df_a, 
                                         label_codes=label_codes,)  

map_labels = label_codes

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  

txp1 = pd.crosstab(target1,prediction1,margins=True,dropna=False)
txp_a = pd.crosstab(target_a,prediction_a,margins=True,dropna=False)

beta = 2
# Weights inversely proportional to relative class size in the training set, giving more importance to smaller classes.
weights = 1/instance.df_train['label'].value_counts(normalize=True).sort_index().values # None

instance.cm1 = confusion_matrix_with_metric(AxB=txp1,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

instance.cm_a = confusion_matrix_with_metric(AxB=txp_a,
                                            lst=None,
                                            full_pad=True,
                                            func=weighted_average_f,
                                            beta=beta,
                                            weights=weights,
                                            percentage=False,
                                            map_labels=map_labels)

target1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['label'] 
prediction1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id')['pred_final'] 
probabilities1 = instance.df_probabilities_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities1 = instance.df_pred_val1.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

target_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['label']  
prediction_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id')['pred_final']  
probabilities_a = instance.df_probabilities_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_')
agg_probabilities_a = instance.df_pred_val_a.drop_duplicates(subset='lesion_id').filter(regex=r'^prob_') 

instance.metric_dict1 = metric_dictionary(target=target1, 
                                          prediction=prediction1, 
                                          probabilities=probabilities1)

instance.metric_dict_a = metric_dictionary(target=target_a, 
                                          prediction=prediction_a, 
                                          probabilities=probabilities_a)

print_header("Trivial prediction: all labels other")

print("\nOne image per lesion".upper())

display(instance.cm1.fillna('_'))
display(pd.DataFrame(instance.metric_dict1))

print("\nAll images per lesion".upper())
print("- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.")
print("- However, we include it here as a check.")
display(instance.cm_a.fillna('_'))
display(pd.DataFrame(instance.metric_dict_a))


TRIVIAL PREDICTION: ALL LABELS OTHER


ONE IMAGE PER LESION


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,225.0,0,0,0,0,225,1.0
bcc,82.0,0,0,0,0,82,0.0
nv,1351.0,0,0,0,0,1351,0.0
akiec,57.0,0,0,0,0,57,0.0
mel,154.0,0,0,0,0,154,0.0
All,1869.0,0,0,0,0,1869,_
precision,0.120385,_,_,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.120385,0.2,0.120385,0.2,0.029217,0.04298,0.081257,0.0,0.5,0.5,0.5



ALL IMAGES PER LESION
- This is the same as the one image per lesion case, because the validation sets are identical after dropping duplicates.
- However, we include it here as a check.


predicted,other,bcc,nv,akiec,mel,All,recall
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
other,225.0,0,0,0,0,225,1.0
bcc,82.0,0,0,0,0,82,0.0
nv,1351.0,0,0,0,0,1351,0.0
akiec,57.0,0,0,0,0,57,0.0
mel,154.0,0,0,0,0,154,0.0
All,1869.0,0,0,0,0,1869,_
precision,0.120385,_,_,_,_,_,_


Unnamed: 0,ACC,BACC,precision,recall,F1/2,F1,F2,MCC,ROC-AUC mac,ROC-AUC wt,ROC-AUC wt*
0,0.120385,0.2,0.120385,0.2,0.029217,0.04298,0.081257,0.0,0.5,0.5,0.5


<a id='baseline_models'></a>
# Baseline models
↑↑ [Contents](#contents) ↑ [Trivial models](#trivial_models) ↓ [Models with balancing](#models_with)

In [74]:
from typing import Type, Union      # For type hints
from processing import process      # Custom module for processing metadata

data_dir: Path = path["images"]     # Path to directory containing metadata.csv file
csv_filename: str = "metadata.csv"  # The filename
    
tvr: int = 3              # Ratio of training set to validation set. See discussion below for explanation.
seed: int = 0             # Random seed for parts of the process where randomness is called for.
keep_first: bool = False  # If False, then, for each lesion, we choose a random image to assign to our training set. 
stratified: bool = True   # If True, we stratify classes so that the proportions remain as stable as possible after train/val split. 
                          # If False, the proportions will be roughly similar.

to_classify: Union[list, dict] = ["mel",   # These are the lesion types we are interested in classifying. 
                                  "bcc",   # Any missing ones will be grouped together as the 0-label class: no need to write "other" here.
                                  "akiec", # If 'other' is not desired, use restrict_to attribute above
                                  "nv",]   # Can also be a dictionary, like { 'malignant' : ['mel', 'bcc'], 'benign' : ['nv', 'bkl']}
    
train_one_img_per_lesion: Union[None, bool] = True

In [75]:
# Create an instance of the process class with attribute values as above.
rn18_defaults = process(data_dir=data_dir,
                        csv_filename=csv_filename,
                        tvr=tvr,
                        seed=seed,
                        keep_first=keep_first,
                        stratified=stratified,
                        to_classify=to_classify,
                        train_one_img_per_lesion=train_one_img_per_lesion,)

- Loaded file 'D:\projects\skin-lesion-classification\images\metadata.csv'.
- Inserted 'num_images' column in dataframe, to the right of 'lesion_id' column.
- Inserted 'label' column in dataframe, to the right of 'dx' column: 
  {'bkl': 0, 'vasc': 0, 'df': 0, 'mel': 1, 'bcc': 2, 'akiec': 3, 'nv': 4}
- Added 'set' column to dataframe, with values 't1', 'v1', 'ta', and 'va', to the right of 'localization' column.
- Basic, overall dataframe (pre-train/test split): self.df
- Training set (not balanced, one image per lesion): self.df_train
- Validation set (not expanded, one image per lesion): self.df_val1
- Validation set (not expanded, use all images of each lesion): self.df_val_a
- Small sample dataframes for code testing: self._df_train_code_test, self._df_val1_code_test, self._df_val_a_code_test


In [76]:
transform = transforms.Compose([
transforms.Resize((224,224)), # Resize images to fit ResNet input size
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet stats
])        

In [77]:
import pandas as pd
from typing import Union, List, Callable
import torchvision.models as models
import torchvision.transforms as transforms

source: Union[process, pd.DataFrame] = rn18_defaults  # Processed data to be fed into model for training.
                                                      # Must either be an instance of the process class, or a dataframe of the same format as source.df if source were an instance of the process class.
model_dir: Path = path["models"]                      # Path to directory where models/model info/model results are stored.
    
transform: Union[None, 
                 transforms.Compose, 
                 List[Callable]] = transform     # Transform to be applied to images before feeding into neural network.
    
filename_stem: Union[None, str] = "rn18"         # For saving model and related files. Default "rn18" (if ResNet model) or "EffNet" (if EfficientNet), or "cnn".
filename_suffix: Union[None, str] = "defaults"   # Something descriptive and unique for future reference. Default empty string "".

# model: Union[None, models.ResNet, models.EfficientNet] = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT) # Pre-trained model. Default: ResNet18.   
model: Union[None, models.ResNet, models.EfficientNet] = models.resnet18(weights="ResNet18_Weights.DEFAULT") 

In [78]:
# Create an instance of the resnet18 class with attribute values as above.
from multiclass_models import cnn

rn18_defaults = cnn(source=source,                                           
                    model_dir=model_dir,
                    transform=transform,
                    filename_stem=filename_stem,
                    filename_suffix=filename_suffix,                         
                    model=model,)

New files will be created. 
Base filename: rn18_t1_10e_defaults_00
Attributes saved to file: D:\projects\skin-lesion-classification\models\rn18_t1_10e_defaults_00_attributes.json


In [79]:
# from utils import print_header
import torch
import torch.nn as nn

instance = rn18_defaults

model = models.resnet18()  
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(instance.label_codes))

# state_dict = torch.load(file_path_pth)
# model.load_state_dict(state_dict)

# model = models.efficientnet_b0()  
# num_ftrs = model.classifier[1].in_features
# model.classifier[1] = nn.Linear(num_ftrs, len(instance.label_codes))

instance.model = model

file_path = instance.model_dir.joinpath(instance._filename + ".pth")
print(f"Saving model.state_dict() as {file_path}.")
torch.save(model.state_dict(), file_path)

Saving model.state_dict() as D:\projects\skin-lesion-classification\models\rn18_t1_10e_defaults_00.pth.


In [81]:
# from utils import print_header
import torch
import torch.nn as nn

instance = rn18_defaults

if instance.state_dict is None:
    print("Loading model and state dictionary from file\n".upper())
    file_path_pth = instance.model_dir.joinpath(instance._filename + ".pth")

    # model = models.efficientnet_b0()  
    model = models.resnet18()  
    if isinstance(model,models.ResNet):
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, len(instance.label_codes))
    elif isinstance(model,models.EfficientNet):
        num_ftrs = model.classifier[1].in_features
        model.classifier[1] = nn.Linear(num_ftrs, len(instance.label_codes))

    # Load the state dictionary into the model
    state_dict = torch.load(file_path_pth)
    model.load_state_dict(state_dict)

    instance.model = model
    instance.state_dict = state_dict
    
print_header("Model architecture")
print(f"Note: \'out_features = {len(instance.label_codes)}\' at the end".upper())
display(instance.model)


MODEL ARCHITECTURE

NOTE: 'OUT_FEATURES = 5' AT THE END


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [126]:
from utils import print_header
from multiclass_models import get_probabilities

instance = rn18_defaults

instance.df_probabilities_val1 = get_probabilities(df=instance.df_val1,
                                                   data_dir=instance.data_dir,
                                                   model_dir=instance.model_dir,
                                                   model=instance.model,
                                                   filename=instance._filename,
                                                   label_codes=instance.label_codes,
                                                   transform=instance.transform,
                                                   batch_size=instance.batch_size,
                                                   Print=False,
                                                   save_as=instance._filename + "_val1",)

instance.df_probabilities_val_a = get_probabilities(df=instance.df_val_a,
                                                    data_dir=instance.data_dir,
                                                    model_dir=instance.model_dir,
                                                    model=instance.model,
                                                    filename=instance._filename,
                                                    label_codes=instance.label_codes,
                                                    transform=instance.transform,
                                                    batch_size=instance.batch_size,
                                                    Print=False,
                                                    save_as=instance._filename + "_val_a",)

In [111]:
file_path1 = instance.model_dir.joinpath("rn18_t1_10e_defaults_00_val1_probabilities.csv")
file_path_a = instance.model_dir.joinpath("rn18_t1_10e_defaults_00_val_a_probabilities.csv")

instance.df_probabilities_val1 = pd.read_csv(file_path1, index_col=0)
instance.df_probabilities_val_a = pd.read_csv(file_path_a, index_col=0)

In [96]:
print_header("...")
display_columns = ['lesion_id', 'image_id', 'dx'] + [col for col in instance.df_probabilities_val1.columns if col.startswith('prob')]
display(instance.df_probabilities_val1[display_columns].head())

print_header("...")
display(instance.df_probabilities_val_a[display_columns].head())


===
...
===



KeyError: "['image_id'] not in index"

<a id='models_with'></a>
# Models with balancing
↑↑ [Contents](#contents) ↑ [Baseline models](#baseline_models) 