In [6]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive first: every path configured below lives under the mount point.
print("Menghubungkan ke Google Drive...")
drive.mount('/content/drive')

# --- PATH CONFIGURATION ---
BASE_DRIVE_PATH = "/content/drive/MyDrive/Earthquake_Prediction/"
BASE_IMAGE_DIR = os.path.join(BASE_DRIVE_PATH, "dataset_gambar_3komponen/")
CSV_DIR = os.path.join(BASE_DRIVE_PATH, "dataset_numerik/")

# Number of augmented variants per synthetic sample.
# Keep this identical to the value used in the augmentation script.
NUM_AUGMENTATIONS_PER_SAMPLE = 5

# Feature CSV files to validate, keyed by dataset class.
CSV_FILES_TO_VALIDATE = {}
CSV_FILES_TO_VALIDATE['precursor'] = os.path.join(CSV_DIR, "hasil_fitur_precursor.csv")
CSV_FILES_TO_VALIDATE['normal'] = os.path.join(CSV_DIR, "hasil_fitur_normal.csv")
CSV_FILES_TO_VALIDATE['sintetik'] = os.path.join(CSV_DIR, "hasil_fitur_sintetik.csv")

Menghubungkan ke Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import numpy as np # Import numpy here
import pandas as pd
import os
from google.colab import drive

def validate_and_clean_dataset(df_path, data_class, is_synthetic=False,
                               image_dir=None, num_augmentations=None):
    """
    Validate a feature CSV against the image dataset and write a cleaned copy.

    Each row is kept only if all three component images (``_Z.png``,
    ``_H.png``, ``_D.png``) exist for it. The expected image name is
    ``{Stasiun}_PC3_{Tanggal}_{Jam}{suffix}_{component}.png``, looked up in
    ``<image root>/<data_class>`` (or ``<image root>/precursor_sintetik`` when
    ``is_synthetic`` is true). Before the lookup, any ``Jam`` value of 24 is
    rewritten to 0.

    For synthetic data every CSV row is expanded into one row per
    augmentation (suffixes ``_aug1`` .. ``_augN``). The helper ``aug_suffix``
    column is dropped before saving, so the cleaned synthetic CSV holds one
    (otherwise identical) row per augmentation whose images were found.

    Args:
        df_path: Path of the CSV to validate. The cleaned file is written
            next to it with ``_cleaned`` inserted before the extension.
        data_class: Label used in the log output and, for non-synthetic
            data, as the image sub-folder name.
        is_synthetic: Whether to expand rows per augmentation and look in
            the ``precursor_sintetik`` folder.
        image_dir: Root folder of the image dataset. Defaults to the
            module-level ``BASE_IMAGE_DIR``.
        num_augmentations: Augmentations per synthetic sample. Defaults to
            the module-level ``NUM_AUGMENTATIONS_PER_SAMPLE``.

    Returns:
        None. Results are reported via ``print`` and, when at least one row
        is valid, written to the cleaned CSV. A missing input file is
        reported and skipped.
    """
    print(f"\n--- Memvalidasi dataset: {data_class} ---")

    try:
        df = pd.read_csv(df_path)
    except FileNotFoundError:
        print(f"Peringatan: File {df_path} tidak ditemukan. Dilewati.")
        return

    # The globals are only consulted when the caller did not override them.
    image_root = BASE_IMAGE_DIR if image_dir is None else image_dir

    # Normalize hour 24 to hour 0.
    # NOTE(review): this changes the image filename that is looked up
    # ("..._0" instead of "..._24") while 'Tanggal' is left untouched.
    # Confirm the images were generated with that naming; otherwise every
    # corrected row will be reported as missing.
    if 'Jam' in df.columns and (df['Jam'] == 24).any():
        print(f"  Ditemukan { (df['Jam'] == 24).sum() } baris dengan Jam=24. Memperbaiki menjadi Jam=0...")
        df['Jam'] = df['Jam'].replace(24, 0)

    df_to_check = df
    if is_synthetic:
        n_aug = NUM_AUGMENTATIONS_PER_SAMPLE if num_augmentations is None else num_augmentations
        # Repeat each row n_aug times consecutively. Positional indexing keeps
        # the column dtypes, which a round-trip through df.values would lose.
        repeated_positions = np.repeat(np.arange(len(df)), n_aug)
        df_to_check = df.iloc[repeated_positions].reset_index(drop=True)
        aug_suffixes = [f"_aug{i + 1}" for i in range(n_aug)]
        df_to_check['aug_suffix'] = np.tile(aug_suffixes, len(df))

    folder_name = 'precursor_sintetik' if is_synthetic else data_class
    image_folder = os.path.join(image_root, folder_name)

    keep_mask = []          # one boolean per row of df_to_check
    invalid_hour_count = 0  # rows whose 'Jam' cannot be turned into an int
    for record in df_to_check.to_dict('records'):
        suffix = record.get('aug_suffix', '')
        if pd.isna(suffix):
            suffix = ''

        try:
            hour = int(record['Jam'])
        except (TypeError, ValueError, OverflowError):
            # A missing or unparseable hour cannot map to any image file, so
            # drop the row instead of aborting the whole validation.
            invalid_hour_count += 1
            keep_mask.append(False)
            continue

        base_filename = f"{record['Stasiun']}_PC3_{record['Tanggal']}_{hour}{suffix}"
        keep_mask.append(all(
            os.path.exists(os.path.join(image_folder, f"{base_filename}_{component}.png"))
            for component in ('Z', 'H', 'D')
        ))

    valid_count = sum(keep_mask)
    missing_files_count = len(keep_mask) - valid_count

    print(f"Total baris awal: {len(df_to_check)}")
    print(f"Jumlah baris valid (semua gambar ditemukan): {valid_count}")
    print(f"Jumlah baris dengan file hilang (dibuang): {missing_files_count}")
    if invalid_hour_count:
        print(f"  (termasuk {invalid_hour_count} baris dengan nilai Jam kosong/tidak valid)")

    if valid_count:
        # Filtering with a mask (rather than rebuilding the frame from row
        # Series) preserves the original column dtypes in the saved CSV.
        df_cleaned = df_to_check.loc[np.asarray(keep_mask, dtype=bool)]
        if 'aug_suffix' in df_cleaned.columns:
            df_cleaned = df_cleaned.drop(columns=['aug_suffix'])

        # Insert "_cleaned" before the extension only. A plain str.replace
        # would also rewrite ".csv" inside directory names, and would return
        # the input path unchanged (overwriting the source file) when the
        # extension is not exactly ".csv".
        path_root, path_ext = os.path.splitext(df_path)
        cleaned_csv_path = f"{path_root}_cleaned{path_ext}"
        df_cleaned.to_csv(cleaned_csv_path, index=False)
        print(f"✅ Dataset bersih disimpan di: {cleaned_csv_path}")
    else:
        print("Tidak ada baris valid yang ditemukan.")

In [8]:
# Validate every configured dataset in turn; only the synthetic set is
# expanded per augmentation.
for _dataset_class, _synthetic in (('precursor', False), ('normal', False), ('sintetik', True)):
    validate_and_clean_dataset(
        CSV_FILES_TO_VALIDATE[_dataset_class],
        _dataset_class,
        is_synthetic=_synthetic,
    )


--- Memvalidasi dataset: precursor ---
  Ditemukan 34 baris dengan Jam=24. Memperbaiki menjadi Jam=0...
Total baris awal: 552
Jumlah baris valid (semua gambar ditemukan): 518
Jumlah baris dengan file hilang (dibuang): 34
✅ Dataset bersih disimpan di: /content/drive/MyDrive/Earthquake_Prediction/dataset_numerik/hasil_fitur_precursor_cleaned.csv

--- Memvalidasi dataset: normal ---
Total baris awal: 672
Jumlah baris valid (semua gambar ditemukan): 10
Jumlah baris dengan file hilang (dibuang): 662
✅ Dataset bersih disimpan di: /content/drive/MyDrive/Earthquake_Prediction/dataset_numerik/hasil_fitur_normal_cleaned.csv

--- Memvalidasi dataset: sintetik ---
  Ditemukan 170 baris dengan Jam=24. Memperbaiki menjadi Jam=0...
Total baris awal: 13800
Jumlah baris valid (semua gambar ditemukan): 12950
Jumlah baris dengan file hilang (dibuang): 850
✅ Dataset bersih disimpan di: /content/drive/MyDrive/Earthquake_Prediction/dataset_numerik/hasil_fitur_sintetik_cleaned.csv


In [9]:
# Downstream cells should read the validated CSV (the "_cleaned" file written
# by validate_and_clean_dataset) instead of the raw feature file, i.e. replace
#     PRECURSOR_CSV_PATH = os.path.join(CSV_DIR, "hasil_fitur_precursor.csv")
# with:
PRECURSOR_CSV_PATH = os.path.join(CSV_DIR, "hasil_fitur_precursor_cleaned.csv")

# Do the same for NORMAL_CSV_PATH and SYNTHETIC_CSV_PATH.