# Skin Cancer Detection


Training of neural networks for automated diagnosis of pigmented skin lesions is hampered by the small size and lack of diversity of available datasets of dermatoscopic images. We tackle this problem by releasing the HAM10000 ("Human Against Machine with 10000 training images") dataset. We collected dermatoscopic images from different populations, acquired and stored by different modalities. The final dataset consists of 10015 dermatoscopic images, which can serve as a training set for academic machine learning purposes. Cases include a representative collection of all important diagnostic categories in the realm of pigmented lesions: Actinic keratoses and intraepithelial carcinoma / Bowen's disease (akiec), basal cell carcinoma (bcc), benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses, bkl), dermatofibroma (df), melanoma (mel), melanocytic nevi (nv) and vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage, vasc).

More than 50% of lesions are confirmed through histopathology (histo); the ground truth for the rest of the cases is either follow-up examination (follow_up), expert consensus (consensus), or confirmation by in-vivo confocal microscopy (confocal). The dataset includes lesions with multiple images, which can be tracked by the `lesion_id` column within the HAM10000_metadata file.

![skin cancer](http://www.justscience.in/wp-content/uploads/2017/12/what-causes-skin-cancer.jpg)





In [None]:
from fastai import *
from fastai.vision import *
from fastai.callbacks.hooks import *

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import auc,roc_curve

import os
# Print the directory entries found under /content/ (quick look at what is available to the notebook).
print(os.listdir("/content/"))

## Exploratory Data Analysis

In [None]:
# Root directory of the data, and the metadata CSV that lives directly inside it
path = '/content/'
csv_file = f'{path}HAM10000_metadata.csv'

In [None]:
# Load the metadata table, then key it by image id so rows can be looked up per image
df = pd.read_csv(csv_file)
df = df.set_index('image_id')
df.head()

In [None]:
# Readable lesion name for each diagnosis code found in the `dx` column
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

In [None]:
# Treat the diagnosis code as a categorical column
df['dx'] = df['dx'].astype('category', copy=True)
# Integer code of each diagnosis category
df['labels'] = df['dx'].cat.codes
# Readable lesion name looked up from lesion_type_dict
df['lesion'] = df['dx'].map(lesion_type_dict)
df.head()

In [None]:
# Number of rows per lesion name
print(df['lesion'].value_counts())


In [None]:
# Example lookup: lesion name of a single image, addressed by its image_id index value.
df.loc['ISIC_0027419','lesion']

## Countplot
Here we notice that the classes are imbalanced.

In [None]:
# Single axes, 10x5 inches, holding one horizontal bar per lesion name
fig, ax1 = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, y='lesion', hue='lesion', ax=ax1)

## Dataset

In [None]:
class CustomImageItemList(ImageItemList):
    """Image item list whose labels are read from a metadata DataFrame.

    Each image is matched to its metadata row through its file name without
    the extension, and labelled with that row's ``lesion`` value.
    """

    def custom_label(self, df, **kwargs) -> 'LabelList':
        """Label every item with the ``lesion`` value of its metadata row.

        Args:
            df: DataFrame indexed by image id (the image file name without
                its extension) that has a ``lesion`` column.
            **kwargs: Accepted but not used by this method (presumably so
                that extra keyword arguments passed along by the data block
                API do not raise -- confirm against the fastai version in
                use).

        Returns:
            Whatever ``self._label_list`` returns for these items paired
            with a ``CategoryList`` of lesion names.

        Raises:
            KeyError: Raised by current pandas versions when an image id is
                not present in ``df``'s index.
        """
        # Imported locally so the method does not depend on what the
        # notebook's star-imports happen to export.
        from pathlib import Path

        # self.items holds one path per image. ``stem`` drops both the
        # directory and the extension regardless of the path separator or
        # the extension length, and the comprehension also copes with an
        # empty item list.
        image_ids = [Path(str(item)).stem for item in self.items]
        labels = df.loc[image_ids, 'lesion']
        y = CategoryList(items=labels)
        return self._label_list(x=self, y=y)

In [None]:
def get_data(bs, size, *, path='/content/', valid_pct=0.15, num_workers=2, seed=None):
    """Build the normalised image DataBunch used for training and inference.

    Images are collected from ``path`` (``.jpg`` files), randomly split into
    training and validation sets, labelled from the module-level ``df`` via
    ``CustomImageItemList.custom_label``, augmented with
    ``get_transforms(flip_vert=True)``, resized to ``size`` and normalised
    with ``imagenet_stats``.

    Args:
        bs: Batch size passed to ``databunch``.
        size: Image size passed to ``transform``.
        path: Folder searched for images. Defaults to ``'/content/'``.
        valid_pct: Fraction of the images put in the validation set.
            Defaults to ``0.15``.
        num_workers: Number of data-loading workers. Defaults to ``2``.
        seed: When given, NumPy's global random generator is seeded with it
            before splitting so that repeated calls produce the same split
            (relies on the split drawing from NumPy's global generator, as
            fastai v1 does). ``None`` keeps the split random.

    Returns:
        The normalised DataBunch.
    """
    if seed is not None:
        # Side effect: reseeds NumPy's global generator for the whole process.
        np.random.seed(seed)

    data_bunch = (CustomImageItemList.from_folder(path, extensions='.jpg')
                  .random_split_by_pct(valid_pct)
                  .custom_label(df)
                  .transform(tfms=get_transforms(flip_vert=True), size=size)
                  .databunch(num_workers=num_workers, bs=bs)
                  .normalize(imagenet_stats))
    return data_bunch

In [None]:
# DataBunch for training: batch size 16, images resized to 224
data = get_data(bs=16, size=224)

In [None]:
# Set the class list and class count on the DataBunch explicitly from the
# distinct lesion names in the metadata (np.unique returns them sorted).
data.classes=list(np.unique(df.lesion))  
data.c= len(np.unique(df.lesion))  

In [None]:
# Display a few samples from the DataBunch (rows=3) as a visual sanity check.
data.show_batch(rows=3)

## Model ResNet50 

In [1]:
# Build a learner on `data` with the ResNet50 architecture, reporting accuracy;
# model_dir is set to /tmp/model/ (where save/load below read and write).
learner=create_cnn(data,models.resnet50,metrics=[accuracy], model_dir="/tmp/model/")

NameError: ignored

In [2]:
# Explicitly use cross-entropy as the training loss.
learner.loss_func=nn.CrossEntropyLoss()

NameError: ignored

In [None]:
# Run the learning-rate finder and plot the recorded curve to pick a learning rate.
learner.lr_find()
learner.recorder.plot()

In [None]:
# Train for 10 epochs with the one-cycle schedule and a maximum learning rate of 3e-3.
learner.fit_one_cycle(10, 3e-3)

In [None]:
# Unfreeze the model so that the following training also updates the earlier layers.
learner.unfreeze()

In [None]:
# Run the learning-rate finder again (the model was unfrozen just before) and plot the curve.
learner.lr_find()
learner.recorder.plot()

In [None]:
# Base learning rate for fine-tuning.
lr=1e-6
# 3 more epochs; the slice passes a learning-rate range of 3e-6 .. 1e-5.
learner.fit_one_cycle(3, slice(3*lr,10*lr))

In [None]:
# Save the learner's model under the name 'stage-1' (loaded again in the inference section).
learner.save('stage-1')

In [None]:
# Build the interpretation helper from the trained learner.
interp = ClassificationInterpretation.from_learner(learner)

In [None]:
# Plot the confusion matrix on a 10x8 inch figure.
interp.plot_confusion_matrix(figsize=(10,8))

In [None]:
# List the class pairs the model confuses most often.
interp.most_confused()

## Inference

In [None]:
# DataBunch for inference: batch size 16, images resized to 224.
# NOTE(review): get_data performs a new random split on every call, so the
# validation set here need not match the one used during training.
pred_data = get_data(bs=16, size=224)

In [None]:
# Set the class list and class count on the inference DataBunch from the
# distinct lesion names in the metadata (np.unique returns them sorted).
pred_data.classes=list(np.unique(df.lesion))  
pred_data.c= len(np.unique(df.lesion)) 

In [None]:
# NOTE(review): the return value is not assigned, so apart from being displayed
# by the notebook this call does not appear to change pred_data -- confirm
# whether the result was meant to be kept.
pred_data.single_from_classes(path, pred_data.classes)

In [None]:
# Recreate a ResNet50 learner on pred_data and load the model saved as 'stage-1'.
predictor = create_cnn(pred_data, models.resnet50, model_dir="/tmp/model/").load('stage-1')

In [None]:
# Open one sample image; the bare `img` expression displays it in the notebook.
img = open_image('/content/ham10000_images_part_2/ISIC_0029886.jpg')
img

In [None]:
# Predict the sample image; predict returns three values, unpacked here as the
# predicted class, its index and the raw outputs. Only the class is displayed.
pred_class,pred_idx,outputs = predictor.predict(img)
pred_class

## Predictions

In [None]:
# Predictions and targets for the validation data (what get_preds() returns
# when called without arguments in fastai v1).
preds_val, y_val=learner.get_preds()

### ROC Curve
With the ROC curve we will measure how good our model is.

In [None]:
# ROC curve, one-vs-rest for class index 1: column 1 of the predictions is the
# score and targets equal to 1 are treated as the positive class.
# NOTE(review): the dataset has more than two classes, so this evaluates a
# single class only -- confirm that index 1 is the class of interest.
fpr, tpr, thresholds = roc_curve(y_val.numpy(), preds_val.numpy()[:,1], pos_label=1)

# Area under that ROC curve
pred_score = auc(fpr, tpr)
print(f'ROC area is {pred_score}')

In [None]:
plt.figure()

# ROC curve of the model plus the diagonal as the chance-level reference
plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % pred_score)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')

# Slight margins so the curve is not drawn on top of the axes
plt.xlim(-0.01, 1.0)
plt.ylim(0.0, 1.01)

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")

## Heatmap

In [None]:
# Take item 2 of the validation set as (image, label), show the image, and
# display its label as the cell's last expression.
x,y = data.valid_ds[2]
x.show()
data.valid_ds.y[2]

In [None]:
def heatMap(x,y,data, learner, size=(0,224,224,0)):
    """Overlay the channel-averaged activations of the model body on an image.

    The image is passed through the model in evaluation mode while the output
    of ``m[0]`` (the first child of ``learner.model``) is captured with a
    hook. The captured activations are averaged over their first dimension
    and drawn semi-transparently on top of the denormalised input image.

    Args:
        x: Image item to explain, as accepted by ``data.one_item``.
        y: Label of ``x``. Kept for signature compatibility; the plot does
            not depend on it.
        data: DataBunch used to turn ``x`` into a batch and to denormalise
            it for display.
        learner: Learner whose ``model`` is evaluated.
        size: ``extent`` passed to ``imshow`` for the overlay. The default
            matches 224x224 images.

    Returns:
        None. The heat map is drawn on a new matplotlib figure.
    """
    # Evaluation mode
    m = learner.model.eval()

    # Single-item batch, plus a denormalised copy for display
    xb, _ = data.one_item(x)
    xb_im = Image(data.denorm(xb)[0])

    # Send the batch to whichever device the model lives on instead of
    # assuming CUDA, so the function also works on CPU-only machines.
    xb = xb.to(next(m.parameters()).device)

    # Hook the activations. Only the forward pass is needed: the plot uses
    # activations alone, and a backward pass here would leave gradients
    # accumulated on the model's parameters.
    with hook_output(m[0]) as hook_a:
        m(xb)

    # Activations of the single item in the batch
    acts = hook_a.stored[0].cpu()

    # Average over the first (channel) dimension
    avg_acts = acts.mean(0)

    # Show the heat map over the image
    _, ax = plt.subplots()
    xb_im.show(ax)
    ax.imshow(avg_acts, alpha=0.6, extent=size,
              interpolation='bilinear', cmap='magma')
    

In [None]:
# Draw the activation heat map for the validation item selected above
heatMap(x, y, data=pred_data, learner=learner)