Melakukan Augmentasi Data pada tanggal 13 Juni 2024 jam 21.00 WIB

In [1]:
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img, save_img
from PIL import Image

def augment_images(dataset_path, output_path, output_csv, augment_count=1000, target_size=(128, 128)):
    # Create an ImageDataGenerator object with augmentation parameters
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0,
        height_shift_range=0,
        shear_range=0,
        zoom_range=0,
        horizontal_flip=False,
        vertical_flip=False,
        fill_mode='nearest'
    )
    
    labels = []

    for class_dir in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_dir)
        if not os.path.isdir(class_path):
            continue
        
        class_labels = []  # Keep track of labels for each class
        
        img_files = [f for f in os.listdir(class_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
        num_images = len(img_files)
        
        if num_images == 0:
            continue
        
        augmentations_per_image = augment_count // num_images
        
        for img_file in img_files:
            img_path = os.path.join(class_path, img_file)
            
            img = load_img(img_path)
            x = img_to_array(img)
            x = x.reshape((1,) + x.shape)
            
            i = 0
            for batch in datagen.flow(x, batch_size=1):
                augmented_img = array_to_img(batch[0])
                
                # Resize the augmented image
                augmented_img = augmented_img.resize(target_size, Image.LANCZOS)
                
                augmented_img_name = f'{os.path.splitext(img_file)[0]}_aug_{i}.png'
                augmented_img_path = os.path.join(output_path, class_dir)
                if not os.path.exists(augmented_img_path):
                    os.makedirs(augmented_img_path)
                
                save_img(os.path.join(augmented_img_path, augmented_img_name), augmented_img)
                
                class_labels.append([os.path.join(augmented_img_path, augmented_img_name), class_dir])
                
                i += 1
                if i >= augmentations_per_image:
                    break
        
        labels.extend(class_labels)  # Append the labels for this class to the main list
    
    # Save labels to CSV
    df = pd.DataFrame(labels, columns=['file_path', 'label'])
    df.to_csv(output_csv, index=False)
    print(f'Augmented images and labels CSV file created at: {output_csv}')

# Usage
dataset_path = "../data/data_original/Pengenalan Aksara Jawa-tensorflow/CustomData/v0.1 - Clasifikasi Class"
output_path = "../data/data_preprocessing/v2.3/"
output_csv = '../data/data_preprocessing/v2.3/augmented_labels.csv'

augment_images(dataset_path, output_path, output_csv, target_size=(128, 128))

Augmented images and labels CSV file created at: ../data/data_preprocessing/v2.3/augmented_labels.csv


In [2]:
# Lokasi direktori dataset asli
dataset_path = output_path

# Daftar kelas (nama subfolder)
classes = os.listdir(dataset_path)

# Dictionary untuk menyimpan jumlah gambar dalam setiap kelas
class_counts = {}

# Iterasi melalui setiap kelas
for class_name in classes:
    class_folder = os.path.join(dataset_path, class_name)
    if os.path.isdir(class_folder):
        # Hitung jumlah file gambar dalam subfolder (kelas)
        num_images = len([name for name in os.listdir(class_folder) if os.path.isfile(os.path.join(class_folder, name))])
        class_counts[class_name] = num_images

# Tampilkan jumlah gambar dalam setiap kelas
for class_name, count in class_counts.items():
    print(f"Kelas {class_name}: {count} gambar")

Kelas class 1: 990 gambar
Kelas class 2: 992 gambar
Kelas class 3: 990 gambar
Kelas class 4: 988 gambar
Kelas class 5: 990 gambar
Kelas class 6: 986 gambar
Kelas class 7: 988 gambar


Mengatasi Imbalanced Dataset atau ketidak seimbangan data menggunakan Metode Random Oversampling

In [None]:
# # Path dataset
# data = output_img_aug

# # List kelas
# kelas_aksara = ['ba', 'ca', 'da', 'dha', 'ga', 'ha', 'ja', 'ka', 'la', 'ma', 'na', 'nga', 'nya', 'pa', 'ra', 'sa', 'ta', 'tha', 'wa', 'ya']

# # Target jumlah gambar per kelas
# target_jumlah = 493

# # Loop untuk setiap kelas
# for kelas in kelas_aksara:
#     # Path kelas
#     kelas_path = os.path.join(data, kelas)
    
#     # Jumlah gambar saat ini
#     jumlah_gambar = len(os.listdir(kelas_path))
    
#     # Selisih gambar yang perlu ditambahkan
#     selisih = target_jumlah - jumlah_gambar
    
#     # Jika selisih positif, lakukan oversampling
#     if selisih > 0:
#         # Ambil sampel acak dari gambar yang sudah ada
#         gambar_oversampling = random.sample(os.listdir(kelas_path), selisih)
        
#         # Copy gambar oversampling ke dalam kelas
#         for gambar in gambar_oversampling:
#             source_path = os.path.join(kelas_path, gambar)
            
#             # Generate sufiks acak
#             sufiks_acak = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
            
#             # Tentukan path tujuan dengan menambahkan sufiks acak
#             target_path = os.path.join(kelas_path, f'{os.path.splitext(gambar)[0]}_{sufiks_acak}{os.path.splitext(gambar)[1]}')
            
#             # Salin gambar ke target path
#             shutil.copy(source_path, target_path)

# # Cetak ulang jumlah gambar per kelas setelah penyamaan
# for kelas in kelas_aksara:
#     kelas_path = os.path.join(data, kelas)
#     jumlah_gambar = len(os.listdir(kelas_path))
#     print(f'Aksara {kelas} jumlah: {jumlah_gambar}')