Understanding Data

In [1]:
import os
from pathlib import Path

In [2]:
from skinCancerDiagnosis.constants import*
from skinCancerDiagnosis.utils.common import read_yaml

In [3]:
%pwd

'/Volumes/T7/DL_Skin_Cancer_Project/skin_cancer_diagnosis/research'

In [4]:
# Step up from the notebook folder to the project root so that relative
# paths (the config is later reported as loaded from config/config.yaml)
# resolve. The guard keeps the cell idempotent: re-running it will not
# keep climbing further up the directory tree.
if Path.cwd().name == "research":
    os.chdir("../")

In [5]:
%pwd

'/Volumes/T7/DL_Skin_Cancer_Project/skin_cancer_diagnosis'

Get Data Path

In [6]:
# Load the project configuration. CONFIG_FILE_PATH is not imported by name in
# this notebook — presumably it comes from the star import of
# skinCancerDiagnosis.constants (TODO confirm).
config = read_yaml(CONFIG_FILE_PATH)

[2024-04-18 08:57:40,963: INFO: common: yaml file: config/config.yaml loaded successfully]


In [7]:
# Dataset root directory, read from the `data_path` attribute of the loaded config.
data_path = config.data_path

print(data_path)

artifacts/data_ingestion/ISIC - 2019


Data Classification

In [8]:
# os.listdir returns entries in arbitrary order; sort so the displayed
# list of split folders is stable across runs and file systems.
sorted(os.listdir(data_path))

['test', 'train', 'val']

In [9]:
train_dir = os.path.join(data_path,"train")

# List the directory once. Keep only sub-directories (a stray file such as
# .DS_Store is not a class) and sort, because os.listdir order is arbitrary.
class_names = sorted(
    name for name in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, name))
)

print("Classes in the Dataset:")

print("-----------------")

for class_name in class_names:
    print(class_name)

print("-----------------")
print("Total no.of classes : {}".format(len(class_names)))

Classes in the Dataset:
-----------------
Actinic keratosis
Basal cell carcinoma
Benign keratosis
Dermatofibroma
Melanocytic nevus
Melanoma
Squamous cell carcinoma
Vascular lesion
-----------------
Total no.of classes : 8


Understanding Data Distribution

In [10]:
from tabulate import tabulate
import matplotlib.pyplot as plt

In [11]:
def count_images_per_class(root_dir, split, extensions=('.jpg', '.png')):
    """Count the image files in each class sub-directory of a dataset split.

    Args:
        root_dir: Directory that contains the split folders.
        split: Name of the split folder inside ``root_dir`` (e.g. "train").
        extensions: File-name suffix, or iterable of suffixes, treated as
            images. Matching is case-insensitive, so ".JPG" is counted as
            well as ".jpg". Defaults to ('.jpg', '.png').

    Returns:
        A list of ``[class_name, num_images]`` pairs, one per class
        sub-directory, sorted by class name. The list is empty when the
        split directory does not exist.
    """
    split_path = os.path.join(root_dir, split)
    if not os.path.isdir(split_path):
        return []

    # Accept a single suffix string as well as an iterable of suffixes.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    data = []
    # os.listdir returns entries in arbitrary order; sort so the rows are
    # stable across runs and file systems.
    for class_dir in sorted(os.listdir(split_path)):
        class_path = os.path.join(split_path, class_dir)
        # Plain files sitting next to the class folders are not classes.
        if not os.path.isdir(class_path):
            continue
        num_images = sum(
            1 for f in os.listdir(class_path) if f.lower().endswith(suffixes)
        )
        data.append([class_dir, num_images])

    return data

In [12]:
# The dataset root holds one folder per split (train / test / val)
root_directory = data_path

# Collect the per-class image counts for every split, in train/test/val order
train_data, test_data, val_data = (
    count_images_per_class(root_directory, split_name)
    for split_name in ("train", "test", "val")
)

Train data Distribution

In [13]:

# Render the per-class counts of the train split as a grid-style text table.
print(tabulate(train_data, headers=["Class", "Number of Images"], tablefmt="grid"))

+-------------------------+--------------------+
| Class                   |   Number of Images |
+=========================+====================+
| Actinic keratosis       |                716 |
+-------------------------+--------------------+
| Basal cell carcinoma    |               2820 |
+-------------------------+--------------------+
| Benign keratosis        |               2215 |
+-------------------------+--------------------+
| Dermatofibroma          |                206 |
+-------------------------+--------------------+
| Melanocytic nevus       |              10979 |
+-------------------------+--------------------+
| Melanoma                |               3812 |
+-------------------------+--------------------+
| Squamous cell carcinoma |                541 |
+-------------------------+--------------------+
| Vascular lesion         |                202 |
+-------------------------+--------------------+


Test data Distribution

In [14]:

# Render the per-class counts of the test split as a grid-style text table.
print(tabulate(test_data, headers=["Class", "Number of Images"], tablefmt="grid"))

+-------------------------+--------------------+
| Class                   |   Number of Images |
+=========================+====================+
| Actinic keratosis       |                 75 |
+-------------------------+--------------------+
| Basal cell carcinoma    |                250 |
+-------------------------+--------------------+
| Benign keratosis        |                203 |
+-------------------------+--------------------+
| Dermatofibroma          |                 11 |
+-------------------------+--------------------+
| Melanocytic nevus       |                965 |
+-------------------------+--------------------+
| Melanoma                |                360 |
+-------------------------+--------------------+
| Squamous cell carcinoma |                 42 |
+-------------------------+--------------------+
| Vascular lesion         |                 24 |
+-------------------------+--------------------+


Validation data Distribution

In [15]:

# Render the per-class counts of the validation split as a grid-style text table.
print(tabulate(val_data, headers=["Class", "Number of Images"], tablefmt="grid"))

+-------------------------+--------------------+
| Class                   |   Number of Images |
+=========================+====================+
| Actinic keratosis       |                 76 |
+-------------------------+--------------------+
| Basal cell carcinoma    |                253 |
+-------------------------+--------------------+
| Benign keratosis        |                206 |
+-------------------------+--------------------+
| Dermatofibroma          |                 22 |
+-------------------------+--------------------+
| Melanocytic nevus       |                931 |
+-------------------------+--------------------+
| Melanoma                |                350 |
+-------------------------+--------------------+
| Squamous cell carcinoma |                 45 |
+-------------------------+--------------------+
| Vascular lesion         |                 27 |
+-------------------------+--------------------+
