In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import skfuzzy as fuzz
from skfuzzy import cluster
from pathlib import Path
import os
from glob import glob

In [8]:
# Directory from which the raw per-commodity CSV files are read.
BASE_DIR = Path("dataset") / "train"


In [9]:
# Commodity key -> name of the CSV file holding that commodity's data.
# Insertion order is preserved, so iteration follows the order listed here.
KOMODITAS = dict(
    bawang_merah='Bawang Merah.csv',
    beras_medium='Beras Medium.csv',
    beras_premium='Beras Premium.csv',
    telur_ayam='Telur Ayam Ras.csv',
    gula='Gula Konsumsi.csv',
    Bawang_Putih_Bonggol='Bawang Putih Bonggol.csv',
    Cabai_Merah_Keriting='Cabai Merah Keriting.csv',
    Daging_Ayam_Ras='Daging Ayam Ras.csv',
    Daging_Sapi_Murni='Daging Sapi Murni.csv',
    Tepung_Terigu_Curah='Tepung Terigu (Curah).csv',
)



In [10]:
# Load every raw commodity CSV into `data`, keyed by commodity.
# Each table gets its 'Date' column parsed to datetimes and is ordered
# chronologically with a fresh 0..n-1 index.
data = {}
for commodity_key, csv_name in KOMODITAS.items():
    df = (
        pd.read_csv(BASE_DIR / csv_name)
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values('Date')
        .reset_index(drop=True)
    )
    data[commodity_key] = df
    print(f"✓ Loaded {commodity_key}: {df.shape}")


✓ Loaded bawang_merah: (1004, 35)
✓ Loaded beras_medium: (1004, 35)
✓ Loaded beras_premium: (1004, 35)
✓ Loaded telur_ayam: (1004, 35)
✓ Loaded gula: (1004, 35)
✓ Loaded Bawang_Putih_Bonggol: (1004, 35)
✓ Loaded Cabai_Merah_Keriting: (1004, 35)
✓ Loaded Daging_Ayam_Ras: (1004, 35)
✓ Loaded Daging_Sapi_Murni: (1004, 35)
✓ Loaded Tepung_Terigu_Curah: (1004, 35)


In [11]:
# Directory containing the cleaned datasets (files named "<commodity>_cleaned.csv").
cleaned_path = "path/to/output/cleaned/"
cleaned_files = glob(os.path.join(cleaned_path, "*_cleaned.csv"))

# Load each cleaned CSV into `data_cleaned`, keyed by the commodity name
# recovered from the file name.
data_cleaned = {}
for csv_file in cleaned_files:
    # Strip the "_cleaned.csv" marker from the base name to get the key.
    commodity_name = os.path.basename(csv_file).replace("_cleaned.csv", "")
    df = (
        pd.read_csv(csv_file)
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values('Date')
        .reset_index(drop=True)
    )
    data_cleaned[commodity_name] = df

print(f"✓ Loaded {len(data_cleaned)} cleaned datasets")


✓ Loaded 11 cleaned datasets


In [12]:
# Validation report: compare every raw commodity table in `data` with its
# counterpart in `data_cleaned` (mean, standard deviation, missing values),
# then classify how much the cleaning step changed the spread of each one.

# Thresholds on the relative STD change, in percent (negative = STD dropped).
OVER_SMOOTHING_PCT = -30        # a drop beyond this flags potential over-smoothing
ACCEPTABLE_SMOOTHING_PCT = -10  # a drop beyond this (but above the one above) is acceptable

print("\n" + "="*80)
print("VALIDATION: BEFORE vs AFTER CLEANING")
print("="*80)

validation_results = []
# Unrounded STD change per commodity. The classification below must use
# these values: thresholding the one-decimal display string would
# misclassify values just past a threshold (e.g. -30.04 shown as "-30.0").
std_change_by_commodity = {}

for commodity in data.keys():
    if commodity not in data_cleaned:
        print(f"⚠️ Warning: {commodity} not found in cleaned data!")
        continue

    df_before = data[commodity].drop('Date', axis=1)
    df_after = data_cleaned[commodity].drop('Date', axis=1)

    # Summary statistics: per-column value, averaged over all columns
    mean_before = df_before.mean().mean()
    mean_after = df_after.mean().mean()
    std_before = df_before.std().mean()
    std_after = df_after.std().mean()

    # Relative STD change (over-smoothing indicator); 0 when there is no
    # positive baseline to compare against
    std_change_pct = ((std_after - std_before) / std_before) * 100 if std_before > 0 else 0
    std_change_by_commodity[commodity] = std_change_pct

    # Missing values across the whole table
    missing_before = df_before.isnull().sum().sum()
    missing_after = df_after.isnull().sum().sum()

    # Values are stored pre-formatted as strings for display
    validation_results.append({
        'Commodity': commodity,
        'Mean Before': f'{mean_before:.0f}',
        'Mean After': f'{mean_after:.0f}',
        'Std Before': f'{std_before:.2f}',
        'Std After': f'{std_after:.2f}',
        'Std Change (%)': f'{std_change_pct:.1f}',
        'Missing Before': missing_before,
        'Missing After': missing_after
    })

# Display the results
df_validation = pd.DataFrame(validation_results)
print(df_validation.to_string(index=False))

# Flag commodities with over-smoothing (STD drop >30%)
print("\n" + "="*80)
print("SMOOTHING ANALYSIS")
print("="*80)

for commodity, std_change in std_change_by_commodity.items():
    if std_change < OVER_SMOOTHING_PCT:
        print(f"⚠️ {commodity}: STD dropped {std_change:.1f}% → Potential over-smoothing!")
    elif std_change < ACCEPTABLE_SMOOTHING_PCT:
        print(f"✓ {commodity}: STD dropped {std_change:.1f}% → Acceptable smoothing")
    else:
        print(f"✓ {commodity}: STD change {std_change:.1f}% → Good preservation")


VALIDATION: BEFORE vs AFTER CLEANING
           Commodity Mean Before Mean After Std Before Std After Std Change (%)  Missing Before  Missing After
        bawang_merah       36560      37155    7270.40   7876.62            8.3            1241              0
        beras_medium       12399      12351    1222.85   1233.70            0.9            1241              0
       beras_premium       14180      14127    1399.35   1409.41            0.7            1241              0
          telur_ayam       29375      29356    2152.68   2127.32           -1.2            1241              0
                gula       15738      15707    1583.04   1569.38           -0.9            1241              0
Bawang_Putih_Bonggol       35152      34946    5929.89   5960.69            0.5            1241              0
Cabai_Merah_Keriting       48729      49417   12626.02  13184.12            4.4            1244            140
     Daging_Ayam_Ras       37216      37275    2209.18   2215.46          