In [None]:
# Mount Google Drive at /content/drive (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

# Work from the dataset root: later cells build their paths from os.getcwd()
# and read the metadata CSV through a relative path.
os.chdir('/content/drive/MyDrive/FYP/data')

In this section, we will:
1. Combine both image folders into a single folder (all_images_dir)
2. Reorganize the combined folder into 7 subfolders based on the image labels (data_dir)



```
all_images_dir = /content/drive/MyDrive/FYP/data/all_images/
data_dir = /content/drive/MyDrive/FYP/data/reorganized/
```



### Combine folders
Since the original dataset at https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000 splits the images into 2 parts, we have to combine the contents of both folders (HAM10000_images_part_1 & HAM10000_images_part_2) into a single folder (all_images).

In [None]:
# Destination folder that will hold the images from both source parts.
all_images_dir = os.getcwd() + "/all_images/"
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already a directory, so no separate (race-prone) existence check is needed.
# Unlike the exists() check, it raises if a regular file occupies the path.
os.makedirs(all_images_dir, exist_ok=True)

# Paths to the two source folders of the HAM10000 dataset
part_1_dir = os.getcwd() + "/HAM10000_images_part_1/"
part_2_dir = os.getcwd() + "/HAM10000_images_part_2/"


In [None]:
# Copy every image from both source folders into 'all_images'.
# Files that were already in the destination before this cell started are
# skipped, so an interrupted run can be re-executed to copy only what is
# still missing (an "is the folder empty?" check would skip the whole copy).
existing_files = set(os.listdir(all_images_dir))
copied_count = 0

for source_dir in (part_1_dir, part_2_dir):
    for filename in os.listdir(source_dir):
        source_path = os.path.join(source_dir, filename)
        # Skip files copied by an earlier run and anything that is not a
        # regular file (shutil.copy cannot copy a directory).
        if filename in existing_files or not os.path.isfile(source_path):
            continue
        shutil.copy(source_path, all_images_dir)
        copied_count += 1

if copied_count:
    print(f"Copied {copied_count} file(s) into 'all_images'.")
else:
    print("'all_images' directory is already up to date. No files copied.")

## Folder reorganization

In this step, we will reorganize the preprocessed data into subfolders based on their labels,

then use keras image_dataset_from_directory to read the images with
the folder names as labels.

The images are reorganized into subfolders (/reorganized) named after their labels, where:
    
    nv: Melanocytic nevi
    mel: Melanoma
    bkl: Benign keratosis-like lesions
    bcc: Basal cell carcinoma
    akiec: Actinic keratoses
    vasc: Vascular lesions
    df: Dermatofibroma



In [None]:
import pandas as pd
import os
import shutil

# Set the source and destination directories
# destination directory: subfolders will be created based on the labels.
all_images_dir = os.getcwd() + "/all_images/"
data_dir = os.getcwd() + "/reorganized/"

# Create the destination directory. exist_ok=True makes this a no-op when the
# directory already exists, so no separate (race-prone) existence check is needed.
# Unlike the exists() check, it raises if a regular file occupies the path.
os.makedirs(data_dir, exist_ok=True)

# Read the CSV file containing image names and corresponding labels into a dataframe.
# The path is relative, so it is resolved against the current working directory.
# The copy step below reads the 'image_id' and 'dx' columns of this dataframe.
skin_df = pd.read_csv('HAM10000_metadata.csv')


In [None]:
# Copy every image listed in the metadata into a subfolder named after its label.
# Images already present in their label folder are skipped, so an interrupted
# run can be re-executed to copy only what is still missing (an "is the folder
# empty?" check would skip the whole step after a partial run).
copied_count = 0

# One pass over the metadata grouped by label; sort=False keeps the labels in
# order of first appearance. Rows with a missing label are dropped by groupby.
for label, label_rows in skin_df.groupby('dx', sort=False):
    label_dir = os.path.join(data_dir, str(label))
    os.makedirs(label_dir, exist_ok=True)

    # List the label folder once instead of probing the disk for every image
    existing_files = set(os.listdir(label_dir))

    for image_id in label_rows['image_id']:
        filename = f"{image_id}.jpg"
        if filename in existing_files:
            continue
        source_path = os.path.join(all_images_dir, filename)
        destination_path = os.path.join(label_dir, filename)
        # Raises FileNotFoundError when the metadata lists an image that is
        # not present in all_images_dir.
        shutil.copyfile(source_path, destination_path)
        copied_count += 1

if copied_count:
    print(f"Copied {copied_count} image(s) into 'reorganized'.")
else:
    print("'reorganized' directory is already up to date. No files copied.")

The reorganized structure is as follows:


```
reorganized/
...nv/
......nv_image_1.jpg
......nv_image_2.jpg
...mel/
......mel_image_1.jpg
......mel_image_2.jpg
...bkl/
......bkl_image_1.jpg
......bkl_image_2.jpg
...bcc/
......bcc_image_1.jpg
......bcc_image_2.jpg
...akiec/
......akiec_image_1.jpg
......akiec_image_2.jpg
...vasc/
......vasc_image_1.jpg
......vasc_image_2.jpg
...df/
......df_image_1.jpg
......df_image_2.jpg
```

