In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [3]:
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.20G/5.20G [04:26<00:00, 21.7MB/s]
100% 5.20G/5.20G [04:26<00:00, 21.0MB/s]


In [6]:
import zipfile
zip_ref = zipfile.ZipFile('/content/skin-cancer-mnist-ham10000.zip','r')
zip_ref.extractall('/content')
zip_ref.close()

In [7]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [9]:

# Paths to your Excel files
train_confirm_path = '/content/KCDH2024_Training_LesionGroupings.xlsx'
train_labels_path = '/content/KCDH2024_Training_GroundTruth.xlsx'
test_labels_path = '/content/KCDH2024_Test_GroundTruth.xlsx'


In [11]:
# Load the Excel files
train_confirm_df = pd.read_excel(train_confirm_path)
train_labels_df = pd.read_excel(train_labels_path)
test_labels_df = pd.read_excel(test_labels_path)

In [12]:
# Print the first few rows of the dataframes to understand their structure
print("Training Confirm DF:")
print(train_confirm_df.head())
print("Training Labels DF:")
print(train_labels_df.head())
print("Test Labels DF:")
print(test_labels_df.head())

Training Confirm DF:
          image    lesion_id            diagnosis_confirm_type
0  ISIC_0024306  HAM_0000550  serial imaging showing no change
1  ISIC_0024307  HAM_0003577  serial imaging showing no change
2  ISIC_0024308  HAM_0001477  serial imaging showing no change
3  ISIC_0024309  HAM_0000484  serial imaging showing no change
4  ISIC_0024310  HAM_0003350                    histopathology
Training Labels DF:
          image  MEL  NV  BCC  AKIEC  BKL  DF  VASC
0  ISIC_0024306    0   1    0      0    0   0     0
1  ISIC_0024307    0   1    0      0    0   0     0
2  ISIC_0024308    0   1    0      0    0   0     0
3  ISIC_0024309    0   1    0      0    0   0     0
4  ISIC_0024310    1   0    0      0    0   0     0
Test Labels DF:
          image  MEL  NV  BCC  AKIEC  BKL  DF  VASC
0  ISIC_0034524    0   1    0      0    0   0     0
1  ISIC_0034525    0   1    0      0    0   0     0
2  ISIC_0034526    0   0    0      0    1   0     0
3  ISIC_0034527    0   1    0      0    0   0

In [13]:
import zipfile
zip_ref = zipfile.ZipFile('/content/drive/MyDrive/KCDH2024_Test_Input.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [14]:

# Define image paths
train_image_dir = '/content/HAM10000_images_part_1'
test_image_dir = '/content/KCDH2024_Test_Input'

# Image parameters
IMG_HEIGHT = 128
IMG_WIDTH = 128

def load_images(image_dir, dataframe, disease_categories):
    images = []
    labels = []
    missing_files = []
    for index, row in dataframe.iterrows():
        img_path = os.path.join(image_dir, row['image'] + '.jpg')
        img = cv2.imread(img_path)
        if img is not None:
            img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
            images.append(img)
            labels.append(row[disease_categories].values.astype('float32'))
        else:
            missing_files.append(img_path)
    images = np.array(images) / 255.0
    labels = np.array(labels)
    if missing_files:
        print(f"Missing files: {missing_files}")
    return images, labels



In [15]:
# Disease categories
disease_categories = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']

In [16]:
# # Load training and test images
X_train, y_train = load_images(train_image_dir, train_labels_df, disease_categories)
X_test, y_test = load_images(test_image_dir, test_labels_df, disease_categories)
print(f'Training data shape: {X_train.shape}')
print(f'Test data shape: {X_test.shape}')

Missing files: ['/content/HAM10000_images_part_1/ISIC_0029306.jpg', '/content/HAM10000_images_part_1/ISIC_0029307.jpg', '/content/HAM10000_images_part_1/ISIC_0029308.jpg', '/content/HAM10000_images_part_1/ISIC_0029309.jpg', '/content/HAM10000_images_part_1/ISIC_0029310.jpg', '/content/HAM10000_images_part_1/ISIC_0029311.jpg', '/content/HAM10000_images_part_1/ISIC_0029312.jpg', '/content/HAM10000_images_part_1/ISIC_0029313.jpg', '/content/HAM10000_images_part_1/ISIC_0029314.jpg', '/content/HAM10000_images_part_1/ISIC_0029315.jpg', '/content/HAM10000_images_part_1/ISIC_0029316.jpg', '/content/HAM10000_images_part_1/ISIC_0029317.jpg', '/content/HAM10000_images_part_1/ISIC_0029318.jpg', '/content/HAM10000_images_part_1/ISIC_0029319.jpg', '/content/HAM10000_images_part_1/ISIC_0029320.jpg', '/content/HAM10000_images_part_1/ISIC_0029321.jpg', '/content/HAM10000_images_part_1/ISIC_0029322.jpg', '/content/HAM10000_images_part_1/ISIC_0029323.jpg', '/content/HAM10000_images_part_1/ISIC_0029324.jp

In [19]:
# Check if the data is loaded correctly
if X_train.size == 0 or y_train.size == 0:
    raise ValueError("Training data or labels not loaded correctly.")
if X_test.size == 0 or y_test.size == 0:
    raise ValueError("Test data or labels not loaded correctly.")

In [20]:

# Convert labels to float32
y_train = np.array(y_train).astype('float32')
y_test = np.array(y_test).astype('float32')


In [21]:
# Build the model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(disease_categories), activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, batch_size=32)

Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9803 - loss: 0.0562 - val_accuracy: 0.7450 - val_loss: 2.7066
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9870 - loss: 0.0416 - val_accuracy: 0.7510 - val_loss: 2.9427
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9907 - loss: 0.0340 - val_accuracy: 0.7250 - val_loss: 3.0912
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9754 - loss: 0.0634 - val_accuracy: 0.7380 - val_loss: 2.9569
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9893 - loss: 0.0397 - val_accuracy: 0.7370 - val_loss: 2.7407
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9835 - loss: 0.0466 - val_accuracy: 0.7390 - val_loss: 3.0821
Epoch 7/50
[1m125/125

In [32]:
# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_accuracy}')


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6688 - loss: 6.2026
Test accuracy: 0.6256613731384277


In [34]:

# Save the model
model.save('ham10000_classifier.keras')