*(Project introduction: to be filled in.)*

> Step 1. Data analysis and preprocessing

> Step 2. Model selection

> Step 3. Model training

> Step 4. Model evaluation

> Step 5. Model prediction

> Step 6. Model submission

> Step 7. Conclusion

First, import the necessary libraries.

In [56]:
# Python library
import os
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import glob

import yaml

from PIL import Image

# Sklearn library
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Pytorch library
import torch
from torch import optim, nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms

# CV2 library
import cv2

In [34]:
# Load configuration from config.yaml in the current working directory.
# NOTE(review): yaml.load with FullLoader is fine for a trusted local file;
# prefer yaml.safe_load if this config can ever come from an outside source.
with open('config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Project information.
# Each label now names the key it prints: previously the description was
# printed under "Project version" and the version under "Project author".
print('Project name: ', config['project']['name'])
print('Project description: ', config['project']['description'])
print('Project version: ', config['project']['version'])


Project name:  Cancer Detection
Project version:  DPL302m Project - Fall 2024 of Group 1
Project author:  1.0.0
Data path:  {'data_path': 'data/', 'csv_path': 'data/HAM10000_metadata.csv', 'images_path': 'data/images/'}


In [36]:
# Data location: read the dataset paths from the 'data' section of the config
print('Data path: ', config['data'])
data_path = config['data']['data_path']
csv_path = config['data']['csv_path']
images_path = config['data']['images_path']

# Data reading: load the metadata CSV into a DataFrame
data = pd.read_csv(csv_path)

# Create a dictionary for images location
# NOTE(review): no code follows this comment in the cell as shown, yet the
# recorded output of this cell also reports a "Number of images" count. The
# statements that built the dictionary appear to be missing; TODO restore them
# or remove this comment.

Data path:  {'data_path': 'data/', 'csv_path': 'data/HAM10000_metadata.csv', 'images_path': 'data/images/'}
Number of images:  10015


In [55]:
def data_check(data):
    """Print a quick quality report for a metadata DataFrame.

    Four sections are written to stdout: the column dtypes, the distinct
    values of every column except ``image_id`` and ``lesion_id``, the number
    of null entries per column, and the number of entries per column that
    equal the string ``'unknown'``. A blank line separates the sections.
    """
    id_columns = ('image_id', 'lesion_id')

    # Section 1: dtypes of all columns
    print('Data types:')
    print(data.dtypes)
    print()

    # Section 2: distinct values, skipping the identifier columns
    print('Unique values for each column:')
    for name in data.columns:
        if name not in id_columns:
            print(name, data[name].unique())
    print()

    # Section 3: null counts
    print('Nan values in each column:')
    for name in data.columns:
        print(name, data[name].isnull().sum())
    print()

    # Section 4: rows whose value is the literal string 'unknown'
    print('Unknown values in each column:')
    for name in data.columns:
        unknown_rows = data[data[name] == 'unknown']
        print(name, len(unknown_rows))

# Print the quality report for the current `data` frame
data_check(data)

Data types:
lesion_id       object
image_id        object
dx              object
dx_type         object
age              int32
sex             object
localization    object
dtype: object

Unique values for each column:
dx ['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
dx_type ['histo' 'consensus' 'confocal' 'follow_up']
age [80 75 60 70 55 85 65 40 50 45 35  0 30  5 25 20 10 15]
sex ['male' 'female']
localization ['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']

Nan values in each column:
lesion_id 0
image_id 0
dx 0
dx_type 0
age 0
sex 0
localization 0

Unknown values in each column:
lesion_id 0
image_id 0
dx 0
dx_type 0
age 0
sex 0
localization 0


In [58]:
# Drop rows that repeat the same (image_id, lesion_id) pair
data = data.drop_duplicates(subset=['image_id', 'lesion_id'])

# Drop nan and "unknown" values
data = data.dropna()
data = data[data['age'] != 'unknown']
data = data[data['sex'] != 'unknown']
data = data[data['localization'] != 'unknown']

# Take an independent copy before adding columns: assigning into the result
# of the boolean filters above can raise pandas' SettingWithCopyWarning.
data = data.copy()

# Convert age to int
data['age'] = data['age'].astype(int)

# Encode labels.
# Each column gets its own fitted encoder so codes can be mapped back to
# names later (e.g. label_encoders['dx'].inverse_transform(...)). Re-fitting
# one shared encoder keeps only the classes of the last column fitted.
label_encoders = {
    name: LabelEncoder()
    for name in ('dx', 'dx_type', 'sex', 'localization')
}
data['dx_code'] = label_encoders['dx'].fit_transform(data['dx'])
data['dx_type_code'] = label_encoders['dx_type'].fit_transform(data['dx_type'])
data['sex_code'] = label_encoders['sex'].fit_transform(data['sex'])
data['localization_code'] = label_encoders['localization'].fit_transform(data['localization'])

# Keep the old name bound, in the state the single-encoder version ended in
# (fitted on 'localization'), so any later cell that reads it still works.
label_encoder = label_encoders['localization']

print('Data after cleaning:')
data_check(data)

Data after cleaning:
Data types:
lesion_id            object
image_id             object
dx                   object
dx_type              object
age                   int32
sex                  object
localization         object
dx_code               int32
dx_type_code          int32
sex_code              int32
localization_code     int32
dtype: object

Unique values for each column:
dx ['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
dx_type ['histo' 'consensus' 'confocal' 'follow_up']
age [80 75 60 70 55 85 65 40 50 45 35  0 30  5 25 20 10 15]
sex ['male' 'female']
localization ['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']
dx_code [2 5 3 4 6 1 0]
dx_type_code [3 1 0 2]
sex_code [1 0]
localization_code [11  4  5  2 12  3 13  0  9  7 10  8  6  1]

Nan values in each column:
lesion_id 0
image_id 0
dx 0
dx_type 0
age 0
sex 0
localization 0
dx_code 0
dx_type_code 0
sex_code 0
localization_code 0

Unknown val

In [59]:
# Image loading
def load_images(data, images_path, *, size=(224, 224), return_index=False):
    """Load, resize and colour-convert the images listed in ``data``.

    Args:
        data: DataFrame with an ``image_id`` column; each id names the file
            ``<image_id>.jpg`` inside ``images_path``.
        images_path: Directory that contains the image files.
        size: ``(width, height)`` handed to ``cv2.resize``. Defaults to the
            previously hard-coded ``(224, 224)``.
        return_index: When true, also return the ``data`` index labels of the
            rows whose image was loaded, so labels can be re-aligned with
            ``data.loc[kept]`` after unreadable files were skipped.

    Returns:
        A list with one RGB image per readable file, in row order. With
        ``return_index=True`` a ``(images, kept)`` tuple is returned instead.
    """
    images = []
    kept = []
    # Only image_id is needed, so iterate that column instead of building a
    # full row Series per record with iterrows().
    for index, image_id in data['image_id'].items():
        image_path = os.path.join(images_path, image_id + '.jpg')
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing or unreadable file by returning None
            print('Image not found: ', image_path)
            continue
        # image preprocessing
        image = cv2.resize(image, size)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads as BGR
        images.append(image)
        kept.append(index)
    if return_index:
        return images, kept
    return images

# Load images
# NOTE(review): load_images skips files it cannot read, so `images` can be
# shorter than `data` and is then no longer row-aligned with it. Confirm the
# counts match before pairing images with labels.
images = load_images(data, images_path)
# Print the number of images
print('Number of images: ', len(images))

Number of images:  9761
