In [2]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import shutil
from sklearn.model_selection import train_test_split

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
data_pd = pd.read_csv('D:/New_Capstone/Casptone/Data/HAM10000/HAM10000_metadata.csv')
data_pd.head()

In [None]:
train_dir = os.path.join('D:/New_Capstone/Casptone/Data/Data-model', 'train_dir')
vali_dir = os.path.join('D:/New_Capstone/Casptone/Data/Data-model', 'vali_dir')

In [None]:
test_dir = os.path.join('D:/New_Capstone/Casptone/Data/Data-model', 'test_dir')

In [None]:
df_count = data_pd.groupby('lesion_id').count()
df_count.head()

In [None]:
df_count = df_count[df_count['dx'] == 1]
df_count.reset_index(inplace=True)

In [None]:
def duplicates(x):
    unique = set(df_count['lesion_id'])
    if x in unique:
        return 'no' 
    else:
        return 'duplicates'

In [None]:
data_pd['is_duplicate'] = data_pd['lesion_id'].apply(duplicates)
data_pd.head()

In [None]:
df_count = data_pd[data_pd['is_duplicate'] == 'no']

In [None]:
train, vali_df = train_test_split(df_count, test_size=0.45, stratify=df_count['dx'])

In [None]:
test_df = pd.read_csv('D:/New_Capstone/Casptone/Data/HAM10000/ISIC2018_Task3_Test_GroundTruth.csv')

In [None]:
def identify_trainOrtest(x):
    vali_data = set(vali_df['image_id'])
    if str(x) in vali_data:
        return 'vali'
    else:
        return 'train'

#creating train_df
data_pd['train_vali_split'] = data_pd['image_id'].apply(identify_trainOrtest)
train_df = data_pd[data_pd['train_vali_split'] == 'train']
train_df.head()

In [None]:
train_df= pd.read_csv('D:/New_Capstone/Casptone/Data/Data-model/Dataset_model_split45_clean.csv')
vali_df= pd.read_csv('D:/New_Capstone/Casptone/Ham10000 models/Model without Soft Attention/vali_df_split45.csv')
test_df= pd.read_csv('D:/New_Capstone/Casptone/Data/HAM10000/ISIC2018_Task3_Test_GroundTruth.csv')

In [None]:
train_df.head()

In [None]:
vali_df.head()

In [None]:
test_df.head()

In [None]:
# Image id of train and test images
train_list = list(train_df['image_id'])
vali_list = list(vali_df['image_id'])

In [None]:
test_list = list(test_df['image_id'])

In [None]:
so_luong_benh = train_df['dx'].value_counts()
so_luong_benh

In [None]:
so_luong_benh = vali_df['dx'].value_counts()
so_luong_benh

In [None]:
so_luong_benh = test_df['dx'].value_counts()
so_luong_benh

In [None]:
split = 45
df_count.to_csv(f'df_count_split{split}.csv')
train_df.to_csv(f'train_df_split{split}.csv')
vali_df.to_csv(f'vali_df_split{split}.csv')

In [None]:
len(test_list)

In [None]:
len(vali_list)

In [None]:
len(train_list)

In [None]:
# Set the image_id as the index in data_pd
train_df.set_index('image_id', inplace=True)

In [None]:
vali_df.set_index('image_id', inplace=True)

In [None]:
test_df.set_index('image_id', inplace=True)

In [None]:
os.mkdir(train_dir)
os.mkdir(vali_dir)

In [None]:
os.mkdir(test_dir)

In [None]:
targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

In [None]:
for i in targetnames:
  directory1=train_dir+'/'+i
  directory2=vali_dir+'/'+i
  os.mkdir(directory1)
  os.mkdir(directory2)

In [None]:
for i in targetnames:
  directory3=test_dir+'/'+i
  os.mkdir(directory3)

In [None]:
for image in train_list:
    file_name = image+'.jpg'
    label = train_df.loc[image, 'dx']

    # path of source image 
    source = os.path.join('D:/New_Capstone/Casptone/Data/Data-model/Dataset_images_split45', file_name)

    # copying the image from the source to target file
    target = os.path.join(train_dir, label, file_name)

    shutil.copyfile(source, target)

In [None]:
for image in vali_list:

    file_name = image+'.jpg'
    label = vali_df.loc[image, 'dx']

    # path of source image 
    source = os.path.join('D:/New_Capstone/Casptone/Data/HAM10000/HAM10000_images', file_name)

    # copying the image from the source to target file
    target = os.path.join(vali_dir, label, file_name)

    shutil.copyfile(source, target)

In [None]:
for image in test_list:

    file_name = image+'.jpg'
    label = test_df.loc[image, 'dx']

    # path of source image 
    source = os.path.join('D:/New_Capstone/Casptone/Data/HAM10000/ISIC2018_Task3_Test_Images/ISIC2018_Task3_Test_Images', file_name)

    # copying the image from the source to target file
    target = os.path.join(test_dir, label, file_name)

    shutil.copyfile(source, target)

In [4]:
targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

# Augmenting images and storing them in temporary directories 
for img_class in targetnames:

    #creating temporary directories
    # creating a base directory
    aug_dir = 'D:/New_Capstone/Casptone/Data/Data-model/aug_dir'
    os.mkdir(aug_dir)
    # creating a subdirectory inside the base directory for images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    img_list = os.listdir('D:/New_Capstone/Casptone/Data/Data-model/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir 
    for file_name in img_list:

        # path of source image in training directory
        source = os.path.join('D:/New_Capstone/Casptone/Data/Data-model/train_dir/' + img_class, file_name)

        # creating a target directory to send images 
        target = os.path.join(img_dir, file_name)

        # copying the image from the source to target file
        shutil.copyfile(source, target)

    # Temporary augumented dataset directory.
    source_path = aug_dir

    # Augmented images will be saved to training directory
    save_path = 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/' + img_class

    # Creating Image Data Generator to augment images
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest',
        brightness_range=[0.2,1.2],
    )

    
    batch_size = 50

    aug_datagen = datagen.flow_from_directory(source_path,save_to_dir=save_path,save_format='jpg',target_size=(224, 224),batch_size=batch_size)
    
    # Generate the augmented images
    aug_images = 20000 

    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((aug_images - num_files) / batch_size))

    # creating 8000 augmented images per class
    for i in range(0, num_batches):
        images, labels = next(aug_datagen)

    # delete temporary directory 
    shutil.rmtree('D:/New_Capstone/Casptone/Data/Data-model/aug_dir')


Found 2068 images belonging to 1 classes.
Found 4707 images belonging to 1 classes.
Found 4572 images belonging to 1 classes.
Found 239 images belonging to 1 classes.
Found 5153 images belonging to 1 classes.
Found 4415 images belonging to 1 classes.
Found 253 images belonging to 1 classes.


In [5]:
import os

def dem_so_luong_anh(folder_path, extensions=['jpg', 'jpeg', 'png', 'gif']):
    if not os.path.exists(folder_path):
        print(f"Thư mục '{folder_path}' không tồn tại.")
        return

    danh_sach_anh = [file for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]

    so_luong_anh = 0
    for file in danh_sach_anh:
        phan_mo_rong = file.split('.')[-1].lower()
        if phan_mo_rong in extensions:
            so_luong_anh += 1

    print(f"Số lượng file hình ảnh trong thư mục '{folder_path}': {so_luong_anh}")

folder_path = 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/'
targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
for i in targetnames:
    dem_so_luong_anh(folder_path+i)


Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/akiec': 19762
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/bcc': 19878
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/bkl': 19938
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/df': 19170
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/mel': 19909
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/nv': 19910
Số lượng file hình ảnh trong thư mục 'D:/New_Capstone/Casptone/Data/Data-model/train_dir/vasc': 16948
