In [1]:
# General
import os, pickle
from pathlib import Path
from tqdm import tqdm

# Data manipulation
import pandas as pd
import numpy as np
from random import sample, shuffle, randint
from sklearn import metrics, model_selection
from math import ceil

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Image manipulation
import PIL, cv2
from skimage.io import imread, imshow

# Visualizations
# `plt` must be the pyplot module: the cells below call plt.figure, plt.subplots,
# plt.pie, plt.xticks, ... which the top-level `matplotlib` package does not provide.
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import fastai2
# Matplotlib options
plt.rcParams['font.size']=14

# DL libraries
from fastai2.basics import *
from fastai2.callback.all import *
from fastai2.vision.all import *
from torch import torch
from keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img, load_img



KeyboardInterrupt



In [None]:
# Pie chart of how many samples carry each ground-truth label.
GT = pd.read_csv('/content/GroundTruth.csv')

# Sum only the label columns. The first column (presumably the image identifier --
# confirm against the CSV) is skipped *before* summing; previously every column was
# summed and the first result was thrown away afterwards.
gt = GT.iloc[:, 1:].sum().to_frame().reset_index()
gt.columns = ['toc', 'sum']

plt.figure(figsize=(15,9))
# Pull the second slice out of the pie. The list is sized from the data so that
# plt.pie does not fail when the number of label columns is not exactly seven.
explode = [0.1 if i == 1 else 0 for i in range(len(gt))]
plt.pie(gt['sum'], labels=gt['toc'],explode=explode, autopct='%.0f%%')
plt.show()

In [None]:
# Root folder of the HAM10000 download; every later cell resolves files relative to it.
data_path = Path("/content") / "skin-cancer-mnist-ham10000"

# Show the top-level entries of the dataset folder.
dataset_entries = os.listdir(data_path)
print(dataset_entries)

In [None]:
# Load the per-image metadata table and key it by image id.
csv_path = data_path.joinpath('HAM10000_metadata.csv')
metadata_raw = pd.read_csv(csv_path)
df_data = metadata_raw.set_index('image_id')
df_data.head()

In [None]:
def bar_plt(y, x, title):
    """Draw a seaborn bar chart of `y` against `x` on the current matplotlib axes.

    `x` supplies the category labels and `y` the bar heights. The chart gets
    `title` as its heading, x tick labels rotated by 60 degrees at font size 11,
    and y tick labels at font size 14. Nothing is returned.
    """
    sns.barplot(x=x, y=y, palette="dark")

    # Heading and tick styling are independent of each other.
    plt.title(title, pad=20, fontsize=14)
    plt.yticks(fontsize=14)
    plt.xticks(fontsize=11, rotation=60)

In [None]:
def show_examples(df, col, n_samples = 5, random_state=None):
    """Show up to `n_samples` random images for each distinct value in `col` of `df`.

    One subplot row is drawn per group of `df.groupby(col)`; the first axes of
    each row is titled with the group's value. Images are read with `imread`
    from the `path` column of the sampled rows.

    Parameters
    ----------
    df : DataFrame containing `col` and a `path` column.
    col : name of the column to group by.
    n_samples : number of image columns in the grid. Groups with fewer rows
        simply leave the remaining axes of their row empty.
    random_state : forwarded to `DataFrame.sample`; pass a value to make the
        selection repeatable. The default (None) keeps the draw random.
    """
    groups = df.sort_values([col]).groupby(col)
    n_rows = len(groups)
    # squeeze=False keeps the axes grid two-dimensional, so the row/column
    # iteration below also works for a single group or n_samples == 1.
    fig, m_axs = plt.subplots(n_rows, n_samples, figsize = (3*n_samples, 2*n_rows), squeeze=False)
    for n_axs, (type_name, type_rows) in zip(m_axs, groups):
        n_axs[0].set_title(type_name)
        # Never request more rows than the group holds; sample() would raise.
        n_take = min(n_samples, len(type_rows))
        picked = type_rows.sample(n_take, random_state=random_state)
        for c_ax, (_, c_row) in zip(n_axs, picked.iterrows()):
            c_ax.imshow(imread(c_row['path']))
        # Hide the frame on every axes of the row, including unused ones.
        for c_ax in n_axs:
            c_ax.axis('off')

In [None]:
# Human-readable names for the different diagnosis codes
# (thanks to kaggle.com/ingbiodanielh/skin-cancer-classification-with-resnet-50-fastai)
# NOTE(review): 'Benign keratosis ' carries a trailing space -- confirm whether that is intended.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Store the diagnosis code as a categorical column ...
df_data['dx'] = df_data['dx'].astype('category')
# ... so that its integer category codes can serve as numeric labels,
df_data['label'] = df_data['dx'].cat.codes
# and attach the readable lesion name next to each code.
df_data['lesion_type'] = df_data['dx'].map(lesion_type_dict)
df_data.head()

In [None]:
# The images are spread over two folders, HAM10000_images_part_1 and ...part_2.
# Add a column to the df with the full path of each image, whichever folder it is in.

# {image id : path} for every .jpg one folder level below data_path.
# Path.stem gives the file name without its extension, independent of the path separator.
imageid_path_dict = {x.stem: str(x) for x in data_path.glob('*/*.jpg')}

# Use the globbed path directly instead of rebuilding it from a hard-coded
# path component index, which only worked for one specific depth of data_path.
# An image id without a matching file raises KeyError.
df_data['path'] = [Path(imageid_path_dict[fn]) for fn in df_data.index.values]

In [None]:
# Inspect the first record, including the columns added in the cells above (label, lesion_type, path).
df_data.iloc[0]

In [None]:
# Count the missing values per column. Some patient ages are missing, which we can probably ignore.
df_data.isna().sum()

In [None]:
# One row of three bar charts: patient ages, patient sex and lesion localization.
fig, axes = plt.subplots(1, 3, figsize=(20, 3))

# (column, chart title, how to turn a category value into its tick label)
panel_specs = [
    ('age', 'Patient ages', int),                                # ages shown as whole numbers
    ('sex', 'Sex of patients', lambda value: value),             # labels used as they are
    ('localization', 'Lesion localization', lambda value: value[:8]),  # labels cut to 8 characters
]

for panel_idx, (panel_col, panel_title, label_fn) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_idx)
    # Frequency of each distinct value in the column
    data = df_data[panel_col].value_counts()
    bar_plt(data.tolist(), [label_fn(key) for key in data.keys().tolist()], title=panel_title)