In [21]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Confirm the imports succeeded and report the installed TensorFlow version
print(
    "Semua pustaka penting berhasil diimpor!",
    f"TensorFlow Version: {tf.__version__}",
    sep="\n",
)

Semua pustaka penting berhasil diimpor!
TensorFlow Version: 2.20.0


In [22]:
# Di file model_code/01_data_preprocessing.ipynb - Sel 1

import pandas as pd             # Wajib untuk membaca Excel/CSV
import numpy as np              # Wajib untuk operasi array
import os                       # Wajib untuk operasi file/path
import tensorflow as tf         # Untuk verifikasi dan Deep Learning
from sklearn.model_selection import train_test_split 
from sklearn.utils import compute_class_weight

# Report the installed TensorFlow version, then confirm the imports succeeded
print(
    f"TensorFlow Version: {tf.__version__}",
    "Pustaka Berhasil Diimpor!",
    sep="\n",
)

# --- FILE LOCATIONS ---
# All paths are relative, so they are resolved against the notebook's working directory.
IMAGE_DIR = '../data/all_image/'                      # folder holding the image files
EXCEL_PATH_NORMAL = '../data/metadata_normal.xlsx'    # metadata workbook for the Normal class
EXCEL_PATH_TB = '../data/metadata_tuberkulosis.xlsx'  # metadata workbook for the TB class

# --- EXCEL COLUMN NAME ---
# Only one column is needed: the one holding the image file name.
# It must match the header in the workbook exactly (upper case).
NAMA_KOLOM_FILE = 'FILE NAME'

TensorFlow Version: 2.20.0
Pustaka Berhasil Diimpor!


In [23]:
def _read_labelled_metadata(excel_path, label, nama_set):
    """Read one metadata workbook and return it with standardised columns.

    The column named by NAMA_KOLOM_FILE is renamed to 'File_Name_Clean' and a
    constant 'Label' column is added. The frame returned by read_excel is not
    modified in place.

    Args:
        excel_path: Path of the Excel workbook to read.
        label: Class label written to every row of the 'Label' column.
        nama_set: Short dataset name used in the printed messages ('TB', 'Normal').

    Returns:
        A DataFrame with the 'File_Name_Clean' and 'Label' columns.

    Raises:
        FileNotFoundError: If the workbook does not exist (a hint is printed first).
        KeyError: If the workbook has no NAMA_KOLOM_FILE column.
    """
    try:
        df_mentah = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"ERROR: File {nama_set} tidak ditemukan. Mohon periksa nama file dan path-nya.")
        raise

    # rename() ignores unknown column names by default, so a wrong header would
    # only surface later as a confusing KeyError on 'File_Name_Clean'.
    if NAMA_KOLOM_FILE not in df_mentah.columns:
        raise KeyError(
            f"Kolom '{NAMA_KOLOM_FILE}' tidak ditemukan di {excel_path}. "
            f"Kolom yang tersedia: {list(df_mentah.columns)}"
        )

    df_berlabel = (
        df_mentah
        .rename(columns={NAMA_KOLOM_FILE: 'File_Name_Clean'})
        .assign(Label=label)
    )
    print(f"Data {nama_set} dimuat: {len(df_berlabel)} baris")
    return df_berlabel


# A. Read the Tuberculosis (TB) metadata
df_tb = _read_labelled_metadata(EXCEL_PATH_TB, 'Tuberculosis', 'TB')

# B. Read the Normal metadata
df_normal = _read_labelled_metadata(EXCEL_PATH_NORMAL, 'Normal', 'Normal')

# C. Combine both DataFrames
df = pd.concat([df_tb, df_normal], ignore_index=True)

# 3. Build the full image path
# Strip stray leading/trailing whitespace coming from the Excel cells, append the
# '.png' extension of the image files, then prefix the image directory.
nama_file_bersih = df['File_Name_Clean'].astype(str).str.strip()
df['path'] = (nama_file_bersih + '.png').map(
    lambda nama_file: os.path.join(IMAGE_DIR, nama_file)
)

# 4. Final verification
print("\n--- Verifikasi DataFrame Gabungan ---")
print("Total Data Keseluruhan:", len(df))
print("\nDistribusi Kelas:\n", df['Label'].value_counts())
print("\nContoh 5 Baris Pertama:\n", df[['path', 'Label']].head())


Data TB dimuat: 700 baris
Data Normal dimuat: 3500 baris

--- Verifikasi DataFrame Gabungan ---
Total Data Keseluruhan: 4200

Distribusi Kelas:
 Label
Normal          3500
Tuberculosis     700
Name: count, dtype: int64

Contoh 5 Baris Pertama:
                                    path         Label
0  ../data/all_image/Tuberculosis-1.png  Tuberculosis
1  ../data/all_image/Tuberculosis-2.png  Tuberculosis
2  ../data/all_image/Tuberculosis-3.png  Tuberculosis
3  ../data/all_image/Tuberculosis-4.png  Tuberculosis
4  ../data/all_image/Tuberculosis-5.png  Tuberculosis


In [24]:
from sklearn.model_selection import train_test_split

# --- PRECONDITION: df has already been loaded and combined ---

# First split: 70% training, 30% held out.
# Stratifying on 'Label' keeps the Normal/TB class ratio the same in every subset.
df_train, df_temp = train_test_split(
    df, stratify=df['Label'], test_size=0.30, random_state=42
)

# Second split: halve the held-out 30% into validation and test (15% each overall)
df_val, df_test = train_test_split(
    df_temp, stratify=df_temp['Label'], test_size=0.50, random_state=42
)

# Report the number of samples in each subset
for keterangan, subset in (
    ("Latih (Training)", df_train),
    ("Validasi (Validation)", df_val),
    ("Uji (Testing)", df_test),
):
    print(f"Total Citra {keterangan}: {len(subset)}")

print("\nDistribusi Kelas di Data Latih:")
print(df_train['Label'].value_counts())

Total Citra Latih (Training): 2940
Total Citra Validasi (Validation): 630
Total Citra Uji (Testing): 630

Distribusi Kelas di Data Latih:
Label
Normal          2450
Tuberculosis     490
Name: count, dtype: int64


In [25]:
# Switch the image extension from '.png' to '.jpg'.
# The image directory is kept in front of the file name: assigning only
# "<name>.jpg" would leave 'path' without IMAGE_DIR, unlike the paths built when
# df was first assembled.
# NOTE(review): in notebook order this runs after the train/val/test split, so
# df_train / df_val / df_test presumably still carry the earlier '.png' paths —
# confirm whether this cell is still needed.
df['path'] = (df['File_Name_Clean'].astype(str) + '.jpg').apply(
    lambda nama_file: os.path.join(IMAGE_DIR, nama_file)
)

In [26]:
from sklearn.utils import compute_class_weight

# Sorted array of every distinct class label present in the full DataFrame
nama_kelas_unik = np.sort(df['Label'].unique())

# 'balanced' weights, estimated from the training labels only
bobot_kelas_array = compute_class_weight(
    'balanced',
    classes=nama_kelas_unik,
    y=df_train['Label'],
)

# Map each label to its position in the sorted label array
class_indices = {label: posisi for posisi, label in enumerate(nama_kelas_unik)}

# Keras expects the integer class index (0, 1) as the dictionary key; the weight
# array is ordered like nama_kelas_unik, so position i belongs to class index i
bobot_kelas_dictionary = dict(enumerate(bobot_kelas_array))

print("\nClass Weights (Bobot Kelas) Dihitung:")
print(bobot_kelas_dictionary)


Class Weights (Bobot Kelas) Dihitung:
{0: np.float64(0.6), 1: np.float64(3.0)}


In [27]:
# Sel 1: Impor Pustaka dan Definisi Jalur Absolut

import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Report the installed TensorFlow version
print(f"TensorFlow Version: {tf.__version__}")

# --- ABSOLUTE IMAGE DIRECTORY ---
# Resolved from the same relative '../data' location used for the Excel files
# below, instead of a hard-coded user-specific path, so the notebook is not tied
# to one machine. Like the Excel paths, this relies on the working directory
# being the notebook's own folder.
# Path separators are normalised to '/' and exactly one trailing '/' is kept,
# because the directory is later joined to file names by plain string concatenation.
IMAGE_DIR_ABSOLUT = (
    os.path.abspath('../data/all_image').replace(os.sep, '/').rstrip('/') + '/'
)

EXCEL_PATH_TB = '../data/metadata_tuberkulosis.xlsx'
EXCEL_PATH_NORMAL = '../data/metadata_normal.xlsx'

# --- EXCEL COLUMN NAME (must match the workbook header exactly) ---
NAMA_KOLOM_FILE = 'FILE NAME'
# No label column name is needed: labels are assigned manually per workbook.

TensorFlow Version: 2.20.0


In [28]:
# In file model_code/01_data_preprocessing.ipynb - Cell 2

# ASSUMPTION: EXCEL_PATH_TB, EXCEL_PATH_NORMAL and IMAGE_DIR_ABSOLUT are defined in Cell 1
# ASSUMPTION: NAMA_KOLOM_FILE = 'FILE NAME'
# ASSUMPTION: the class label is assigned manually, one label per workbook.


def _load_metadata_with_label(excel_path, label, nama_set):
    """Read one metadata workbook, standardise the file-name column and add a label.

    Args:
        excel_path: Path of the Excel workbook to read.
        label: Class label written to every row of the 'Label' column.
        nama_set: Short dataset name used in the error hint ('TB', 'Normal').

    Returns:
        A DataFrame in which the NAMA_KOLOM_FILE column is renamed to
        'File_Name_Clean' and a constant 'Label' column is present.

    Raises:
        FileNotFoundError: If the workbook does not exist (a hint is printed first).
        KeyError: If the workbook has no NAMA_KOLOM_FILE column.
    """
    try:
        df_mentah = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"ERROR: File {nama_set} tidak ditemukan. Periksa jalur: {excel_path}")
        raise

    # rename() ignores unknown column names by default; fail here with a clear
    # message instead of a later KeyError on 'File_Name_Clean'.
    if NAMA_KOLOM_FILE not in df_mentah.columns:
        raise KeyError(
            f"Kolom '{NAMA_KOLOM_FILE}' tidak ditemukan di {excel_path}. "
            f"Kolom yang tersedia: {list(df_mentah.columns)}"
        )

    return (
        df_mentah
        .rename(columns={NAMA_KOLOM_FILE: 'File_Name_Clean'})
        .assign(Label=label)
    )


# --- A. Read the Tuberculosis (TB) metadata ---
df_tb = _load_metadata_with_label(EXCEL_PATH_TB, 'Tuberculosis', 'TB')

# --- B. Read the Normal metadata ---
df_normal = _load_metadata_with_label(EXCEL_PATH_NORMAL, 'Normal', 'Normal')

# --- C. Combine both DataFrames ---
df = pd.concat([df_tb, df_normal], ignore_index=True)


# --- 1. CLEAN THE FILE NAME AND ADD THE EXTENSION ---
# Strip leading/trailing whitespace (common in Excel cells), then append '.png'
df['File_Name_Clean'] = df['File_Name_Clean'].astype(str).str.strip() + '.png'


# --- 2. BUILD THE ABSOLUTE IMAGE PATH ---
# Plain string concatenation is used, so make sure the directory uses '/' and
# ends with exactly one '/'; otherwise directory and file name would run together.
direktori_citra = IMAGE_DIR_ABSOLUT.replace('\\', '/').rstrip('/') + '/'
df['path'] = direktori_citra + df['File_Name_Clean']
df['Label'] = df['Label'].astype(str)  # make sure Label is a string column

# 3. Verify the combined data
print("--- Verifikasi DataFrame Gabungan ---")
print("Total Data Keseluruhan:", len(df))
print("\nDistribusi Kelas:\n", df['Label'].value_counts())
print("\nContoh 5 Jalur Citra yang Dicari Keras:")
print(df['path'].head())

--- Verifikasi DataFrame Gabungan ---
Total Data Keseluruhan: 4200

Distribusi Kelas:
 Label
Normal          3500
Tuberculosis     700
Name: count, dtype: int64

Contoh 5 Jalur Citra yang Dicari Keras:
0    C:/Users/Administrator/Documents/project/tb_sk...
1    C:/Users/Administrator/Documents/project/tb_sk...
2    C:/Users/Administrator/Documents/project/tb_sk...
3    C:/Users/Administrator/Documents/project/tb_sk...
4    C:/Users/Administrator/Documents/project/tb_sk...
Name: path, dtype: object


In [29]:
# Cell 3: train / validation / test split

# Step 1: hold out 30% of the rows; the remaining 70% is the training set
df_train, df_temp = train_test_split(
    df, test_size=0.30, random_state=42, stratify=df['Label']
)

# Step 2: split the held-out part in half -> 15% validation and 15% test
df_val, df_test = train_test_split(
    df_temp, test_size=0.50, random_state=42, stratify=df_temp['Label']
)

jumlah_latih, jumlah_validasi, jumlah_uji = (
    len(bagian) for bagian in (df_train, df_val, df_test)
)
print(f"Total Citra Latih: {jumlah_latih}, Validasi: {jumlah_validasi}, Uji: {jumlah_uji}")

Total Citra Latih: 2940, Validasi: 630, Uji: 630


In [30]:
# Cell 4: class-weight computation

# Sorted array of the distinct labels found in the full DataFrame
nama_kelas_unik = np.sort(df['Label'].unique())

# 'balanced' weights, estimated from the training labels only
bobot_kelas_array = compute_class_weight(
    class_weight='balanced',
    y=df_train['Label'],
    classes=nama_kelas_unik,
)

# label -> integer index, following the sorted label order
class_indices = dict(zip(nama_kelas_unik, range(len(nama_kelas_unik))))

# integer index -> weight, in the dictionary form used for Keras
bobot_kelas_dictionary = {
    class_indices[label]: bobot
    for label, bobot in zip(nama_kelas_unik, bobot_kelas_array)
}

print("\nClass Weights (Bobot Kelas) Dihitung:")
print(bobot_kelas_dictionary)


Class Weights (Bobot Kelas) Dihitung:
{0: np.float64(0.6), 1: np.float64(3.0)}


In [31]:
# Ganti semua '\' dengan '/' dan tambahkan '/' di akhir
# IMAGE_DIR_ABSOLUT = 'C:/Users/NamaAnda/Documents/project/tb_skripsi/data/all_image/'

In [32]:
# Cell 5: data generator setup

TARGET_SIZE = (224, 224)   # size every image is resized to
BATCH_SIZE = 32
GENERATOR_SEED = 42        # fixed seed so the training shuffle/augmentation order is repeatable

# 1. Generator for TRAINING (rescaling plus augmentation)
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    zoom_range=0.15,
    horizontal_flip=True,
)

# 2. Generator for VALIDATION and TESTING (rescaling only, no augmentation)
val_test_datagen = ImageDataGenerator(rescale=1./255)

# 3. Build the iterators from the DataFrames.
# Arguments shared by all three flow_from_dataframe calls.
opsi_generator = dict(
    x_col='path',
    y_col='Label',
    target_size=TARGET_SIZE,
    class_mode='binary',
    batch_size=BATCH_SIZE,
)

# Training data is shuffled; the seed makes that order (and the random
# transformations) repeatable between runs.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    shuffle=True,
    seed=GENERATOR_SEED,
    **opsi_generator,
)

# Evaluation data keeps the DataFrame order, so predictions stay aligned with
# the generator's labels.
val_generator = val_test_datagen.flow_from_dataframe(
    dataframe=df_val,
    shuffle=False,
    **opsi_generator,
)

test_generator = val_test_datagen.flow_from_dataframe(
    dataframe=df_test,
    shuffle=False,
    **opsi_generator,
)

Found 2940 validated image filenames belonging to 2 classes.
Found 630 validated image filenames belonging to 2 classes.
Found 630 validated image filenames belonging to 2 classes.
