In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import skfuzzy as fuzz
from skfuzzy import cluster
from pathlib import Path
import os
from glob import glob
from scipy import stats


In [8]:
# Directory that holds the per-commodity `<commodity>_cleaned.csv` input files.
DATA_CLEANED_PATH = 'path/to/output/cleaned'
# Directory that receives the CSV outputs of this notebook.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
OUTPUT_PATH = '/home/kali/AI/'
# A province series with fewer valid (non-NaN) observations than this is skipped.
MIN_VALID_POINTS = 100
# Number of observations assumed per series; only used to express the threshold
# as a percentage in the banner below (the loader output reports 1004 rows per file).
EXPECTED_OBSERVATIONS = 1004

# Commodity identifiers. Each one is used verbatim to build the file name
# `<commodity>_cleaned.csv`, so spelling and letter case must match the files.
commodities = [
    'bawang_merah',                           # lowercase
    'Bawang_Putih_Bonggol',
    'beras_medium',                           # lowercase
    'beras_premium',                          # lowercase
    'Cabai_Merah_Keriting',
    'Daging_Ayam_Ras',
    'Daging_Sapi_Murni',
    'gula',                                   # not Gula_Pasir_Lokal
    'telur_ayam',                             # not Telur_Ayam_Ras
    'Tepung_Terigu_Curah'
]
print("="*80)
print("CONFIGURATION")
print("="*80)
print(f"MIN_VALID_POINTS threshold: {MIN_VALID_POINTS} (~{MIN_VALID_POINTS/EXPECTED_OBSERVATIONS*100:.1f}% of data)")


CONFIGURATION
MIN_VALID_POINTS threshold: 100 (~10.0% of data)


In [9]:
# Load every commodity's cleaned CSV into `data_cleaned` (keyed by commodity
# name). Failures are recorded in `loading_errors` and reported, so one bad
# file does not stop the remaining commodities from loading.
print("\n" + "="*80)
print("LOADING CLEANED DATA")
print("="*80)

data_cleaned = {}
loading_errors = []

for commodity in commodities:
    filepath = os.path.join(DATA_CLEANED_PATH, f"{commodity}_cleaned.csv")

    # Guard clause: a missing file is logged and the loop moves on.
    if not os.path.exists(filepath):
        loading_errors.append({'commodity': commodity, 'error': 'File not found'})
        print(f"‚ùå NOT FOUND: {filepath}")
        continue

    try:
        df = pd.read_csv(filepath, parse_dates=['Date'])
        data_cleaned[commodity] = df

        rows, cols = df.shape
        # Share of missing cells over every column except the date column.
        value_columns = df.drop('Date', axis=1)
        missing_cells = value_columns.isnull().sum().sum()
        missing_pct = (missing_cells / value_columns.size) * 100

        print(f"‚úÖ {commodity:40s} | Shape: {rows:4d} √ó {cols:2d} | Missing: {missing_pct:5.2f}%")
    except Exception as e:
        loading_errors.append({'commodity': commodity, 'error': str(e)})
        print(f"‚ùå ERROR loading {commodity}: {str(e)}")



LOADING CLEANED DATA
‚úÖ bawang_merah                             | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ Bawang_Putih_Bonggol                     | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ beras_medium                             | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ beras_premium                            | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ Cabai_Merah_Keriting                     | Shape: 1004 √ó 35 | Missing:  0.41%
‚úÖ Daging_Ayam_Ras                          | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ Daging_Sapi_Murni                        | Shape: 1004 √ó 35 | Missing:  1.34%
‚úÖ gula                                     | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ telur_ayam                               | Shape: 1004 √ó 35 | Missing:  0.00%
‚úÖ Tepung_Terigu_Curah                      | Shape: 1004 √ó 35 | Missing:  0.00%


In [10]:
# Summarise the loading step: how many commodities were read and which failed.
print(f"\n{'='*80}")
print(f"‚úÖ Successfully loaded: {len(data_cleaned)} / {len(commodities)} commodities")
if not loading_errors:
    print("‚úÖ All commodities loaded successfully!")
else:
    print(f"‚ùå Failed to load: {len(loading_errors)} commodities")
    for err in loading_errors:
        print(f"   - {err['commodity']}: {err['error']}")

# The following cells iterate over `data_cleaned`; abort when it is empty.
if not data_cleaned:
    raise Exception("‚ùå CRITICAL: No data loaded! Check your DATA_CLEANED_PATH")



‚úÖ Successfully loaded: 10 / 10 commodities
‚úÖ All commodities loaded successfully!


In [11]:
def _confidence_label(valid_pct):
    """Map the percentage of valid observations to 'High', 'Medium' or 'Low'."""
    if valid_pct >= 80:
        return 'High'
    if valid_pct >= 50:
        return 'Medium'
    return 'Low'


def _series_features(column):
    """Compute the five statistical features of a single price column.

    Parameters
    ----------
    column : pd.Series
        One column of a commodity frame, possibly containing NaN.

    Returns
    -------
    dict
        Keys 'Mean', 'CV', 'Trend_Slope', 'Autocorr', 'Skewness'.
    """
    series = column.dropna()

    # 1. MEAN (price level)
    mean_price = series.mean()

    # 2. CV (coefficient of variation); 0 when the mean is not positive.
    cv = series.std() / mean_price if mean_price > 0 else 0

    # 3. TREND_SLOPE, normalised by the mean.
    # The slope is fitted against the row positions of the valid observations
    # rather than 0..n-1, so gaps left by NaN rows are not compressed away.
    # For a column without NaN the positions are exactly 0..n-1.
    if len(series) > 1:
        positions = np.flatnonzero(column.notna().values)
        slope = np.polyfit(positions, series.values, 1)[0]
        trend_slope = slope / mean_price if mean_price > 0 else 0
    else:
        trend_slope = 0

    # 4. AUTOCORR (lag 1)
    # Computed on the column with its NaN rows still in place: pandas ignores
    # pairs containing NaN, so only truly consecutive rows are correlated.
    if len(series) > 2:
        autocorr = column.autocorr(lag=1)
        autocorr = 0 if np.isnan(autocorr) else autocorr
    else:
        autocorr = 0

    # 5. SKEWNESS
    if len(series) > 3:
        skewness = stats.skew(series.values)
        skewness = 0 if np.isnan(skewness) else skewness
    else:
        skewness = 0

    return {
        'Mean': mean_price,
        'CV': cv,
        'Trend_Slope': trend_slope,
        'Autocorr': autocorr,
        'Skewness': skewness,
    }


print("\n" + "="*80)
print("FEATURE EXTRACTION: 5 STATISTICAL FEATURES")
print("="*80)
print("Features: Mean, CV, Trend_Slope, Autocorr, Skewness\n")

features = []        # one record per (province, commodity) pair that was extracted
skipped = []         # pairs dropped for having fewer than MIN_VALID_POINTS valid values
low_confidence = []  # extracted pairs whose confidence is not 'High'

for commodity, df in data_cleaned.items():
    print(f"\nProcessing: {commodity}")

    total_count = len(df)  # identical for every column, so computed once per frame
    province_count = 0
    for col in df.columns:
        if col == 'Date':
            continue

        valid_count = int(df[col].count())  # number of non-NaN observations
        # A zero-row frame would otherwise raise ZeroDivisionError here.
        valid_pct = (valid_count / total_count) * 100 if total_count else 0.0

        # Check threshold
        if valid_count < MIN_VALID_POINTS:
            skipped.append({
                'Commodity': commodity,
                'Province': col,
                'Valid_Count': valid_count,
                'Valid_Pct': valid_pct,
                'Reason': f'Below threshold ({MIN_VALID_POINTS} points)'
            })
            print(f"  ‚ö†Ô∏è SKIP: {col} (only {valid_count}/{total_count} valid = {valid_pct:.1f}%)")
            continue

        # Confidence level
        confidence = _confidence_label(valid_pct)
        if confidence != 'High':
            low_confidence.append({
                'Commodity': commodity,
                'Province': col,
                'Valid_Pct': valid_pct,
                'Confidence': confidence
            })

        stat_values = _series_features(df[col])
        features.append({
            'Province': col,
            'Commodity': commodity,
            'Mean': stat_values['Mean'],
            'CV': stat_values['CV'],
            'Trend_Slope': stat_values['Trend_Slope'],
            'Autocorr': stat_values['Autocorr'],
            'Skewness': stat_values['Skewness'],
            'Valid_Count': valid_count,
            'Valid_Pct': valid_pct,
            'Confidence': confidence
        })
        province_count += 1

    print(f"  ‚úÖ Extracted features for {province_count} provinces")

df_features = pd.DataFrame(features)
df_skipped = pd.DataFrame(skipped)
df_low_conf = pd.DataFrame(low_confidence)

print(f"\n{'='*80}")
print(f"‚úÖ Total features extracted: {len(df_features)}")
print(f"‚ö†Ô∏è Skipped (below threshold): {len(df_skipped)}")
print(f"‚ö†Ô∏è Low/Medium confidence: {len(df_low_conf)}")




FEATURE EXTRACTION: 5 STATISTICAL FEATURES
Features: Mean, CV, Trend_Slope, Autocorr, Skewness


Processing: bawang_merah
  ‚úÖ Extracted features for 34 provinces

Processing: Bawang_Putih_Bonggol
  ‚úÖ Extracted features for 34 provinces

Processing: beras_medium
  ‚úÖ Extracted features for 34 provinces

Processing: beras_premium
  ‚úÖ Extracted features for 34 provinces

Processing: Cabai_Merah_Keriting
  ‚úÖ Extracted features for 34 provinces

Processing: Daging_Ayam_Ras
  ‚úÖ Extracted features for 34 provinces

Processing: Daging_Sapi_Murni
  ‚úÖ Extracted features for 34 provinces

Processing: gula
  ‚úÖ Extracted features for 34 provinces

Processing: telur_ayam
  ‚úÖ Extracted features for 34 provinces

Processing: Tepung_Terigu_Curah
  ‚úÖ Extracted features for 34 provinces

‚úÖ Total features extracted: 340
‚ö†Ô∏è Skipped (below threshold): 0
‚ö†Ô∏è Low/Medium confidence: 0


In [15]:
# Reshape the long feature table into one row per province, with one column
# per (statistic, commodity) combination.
print("\n" + "="*80)
print("TRANSFORMING TO WIDE FORMAT FOR FCM")
print("="*80)

# One pivot per statistic; columns are named "<prefix><commodity>".
pivot_mean, pivot_cv, pivot_trend, pivot_autocorr, pivot_skewness = (
    df_features
    .pivot(index='Province', columns='Commodity', values=stat_column)
    .add_prefix(column_prefix)
    for stat_column, column_prefix in (
        ('Mean', 'Mean_'),
        ('CV', 'CV_'),
        ('Trend_Slope', 'Trend_'),
        ('Autocorr', 'Autocorr_'),
        ('Skewness', 'Skewness_'),
    )
)

# Place the five blocks side by side, aligned on the Province index.
wide_blocks = [pivot_mean, pivot_cv, pivot_trend, pivot_autocorr, pivot_skewness]
df_fcm_features = pd.concat(wide_blocks, axis=1)

num_features = 5
num_commodities = len(data_cleaned)
expected_features = num_features * num_commodities

# NOTE(review): the "Expected" line echoes the actual row count, so only the
# column count can be compared by eye; nothing is asserted here.
print(f"‚úÖ Feature matrix shape: {df_fcm_features.shape}")
print(f"‚úÖ Expected: ({df_fcm_features.shape[0]} provinces, {expected_features} features)")
print(f"   [{num_commodities} commodities √ó {num_features} features = {expected_features}]")




TRANSFORMING TO WIDE FORMAT FOR FCM
‚úÖ Feature matrix shape: (34, 50)
‚úÖ Expected: (34 provinces, 50 features)
   [10 commodities √ó 5 features = 50]


In [16]:
# Report NaN cells in the wide feature matrix, column by column.
nan_per_column = df_fcm_features.isnull().sum()
missing_in_matrix = nan_per_column.sum()
if missing_in_matrix <= 0:
    print("\n‚úÖ No missing values in feature matrix")
else:
    print(f"\n‚ö†Ô∏è WARNING: {missing_in_matrix} NaN values in feature matrix")
    print("\nColumns with NaN:")
    # Only columns that contain at least one NaN, in matrix column order.
    nan_cols = nan_per_column[nan_per_column > 0].index.tolist()
    for col in nan_cols:
        nan_count = nan_per_column[col]
        print(f"  - {col}: {nan_count} NaN values")




‚úÖ No missing values in feature matrix


In [17]:
# Print per-commodity summaries (across provinces) of the Mean and CV features.
print("\n" + "="*80)
print("FEATURE SUMMARY STATISTICS")
print("="*80)

print("\n1. MEAN PRICES per commodity:")
print("-" * 60)
mean_cols = [name for name in df_fcm_features.columns if name.startswith('Mean_')]
for col in sorted(mean_cols):
    province_means = df_fcm_features[col]
    commodity_name = col.replace('Mean_', '')
    mean_val, min_val, max_val = province_means.mean(), province_means.min(), province_means.max()
    print(f"  {commodity_name:40s} | Rp {mean_val:8,.0f} | Range: Rp {min_val:8,.0f} - Rp {max_val:8,.0f}")

print("\n2. VOLATILITY (CV) per commodity:")
print("-" * 60)
# Lower bounds (exclusive) of each volatility band, highest first; a value
# that exceeds none of them is labelled "Low".
cv_bands = ((0.25, "Very High"), (0.15, "High"), (0.10, "Moderate"))
cv_cols = [name for name in df_fcm_features.columns if name.startswith('CV_')]
for col in sorted(cv_cols):
    province_cvs = df_fcm_features[col]
    commodity_name = col.replace('CV_', '')
    mean_cv, min_cv, max_cv = province_cvs.mean(), province_cvs.min(), province_cvs.max()

    category = next((label for floor, label in cv_bands if mean_cv > floor), "Low")

    print(f"  {commodity_name:40s} | CV: {mean_cv:.3f} | Range: {min_cv:.3f} - {max_cv:.3f} | [{category}]")




FEATURE SUMMARY STATISTICS

1. MEAN PRICES per commodity:
------------------------------------------------------------
  Bawang_Putih_Bonggol                     | Rp   34,946 | Range: Rp   28,664 - Rp   48,534
  Cabai_Merah_Keriting                     | Rp   49,417 | Range: Rp   34,116 - Rp   69,142
  Daging_Ayam_Ras                          | Rp   37,275 | Range: Rp   27,690 - Rp   48,281
  Daging_Sapi_Murni                        | Rp  136,550 | Range: Rp  113,471 - Rp  159,027
  Tepung_Terigu_Curah                      | Rp   10,746 | Range: Rp    9,503 - Rp   12,914
  bawang_merah                             | Rp   37,155 | Range: Rp   28,714 - Rp   56,893
  beras_medium                             | Rp   12,351 | Range: Rp   11,177 - Rp   14,156
  beras_premium                            | Rp   14,127 | Range: Rp   12,678 - Rp   16,422
  gula                                     | Rp   15,707 | Range: Rp   14,468 - Rp   17,521
  telur_ayam                               | Rp   29

In [19]:
# STEP 5: SAVE OUTPUTS
# -----------------------------------------------------------------------------
# Writes the wide feature matrix and the long per-pair report to OUTPUT_PATH,
# plus the skipped / low-confidence lists when they are non-empty.
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Create the output directory up front; without it every to_csv call below
# fails when OUTPUT_PATH does not exist yet.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Wide matrix: the Province index is written as the first CSV column.
output_features = os.path.join(OUTPUT_PATH, 'fcm_features_raw.csv')
df_fcm_features.to_csv(output_features)
print(f"‚úÖ Feature matrix saved: {output_features}")
print(f"   Shape: {df_fcm_features.shape}")

# Long report: one row per (province, commodity) pair.
output_report = os.path.join(OUTPUT_PATH, 'feature_extraction_report.csv')
df_features.to_csv(output_report, index=False)
print(f"‚úÖ Detailed report saved: {output_report}")
print(f"   Rows: {len(df_features)}")

# Optional reports are only written when there is something to report.
if len(df_skipped) > 0:
    output_skipped = os.path.join(OUTPUT_PATH, 'feature_extraction_skipped.csv')
    df_skipped.to_csv(output_skipped, index=False)
    print(f"‚úÖ Skipped pairs saved: {output_skipped}")

if len(df_low_conf) > 0:
    output_lowconf = os.path.join(OUTPUT_PATH, 'feature_extraction_low_confidence.csv')
    df_low_conf.to_csv(output_lowconf, index=False)
    print(f"‚úÖ Low confidence pairs saved: {output_lowconf}")

print("\n" + "="*80)
print("FEATURE EXTRACTION COMPLETED SUCCESSFULLY")
print("="*80)
print(f"üìä Summary:")
print(f"   ‚Ä¢ Commodities loaded: {len(data_cleaned)}")
print(f"   ‚Ä¢ Features extracted: {len(df_features)}")
print(f"   ‚Ä¢ Feature matrix shape: {df_fcm_features.shape}")
print(f"   ‚Ä¢ Output files saved: {OUTPUT_PATH}")


SAVING RESULTS
‚úÖ Feature matrix saved: /home/kali/AI/fcm_features_raw.csv
   Shape: (34, 50)
‚úÖ Detailed report saved: /home/kali/AI/feature_extraction_report.csv
   Rows: 340

FEATURE EXTRACTION COMPLETED SUCCESSFULLY
üìä Summary:
   ‚Ä¢ Commodities loaded: 10
   ‚Ä¢ Features extracted: 340
   ‚Ä¢ Feature matrix shape: (34, 50)
   ‚Ä¢ Output files saved: /home/kali/AI/


## 3.4 Feature Extraction

### 3.4.1 Rationale
Fuzzy C-Means (FCM) clustering memerlukan input berupa **feature matrix**, 
bukan raw time-series data. Oleh karena itu, dilakukan **feature extraction** 
untuk mengubah time-series harga komoditas (1,004 hari × 34 provinsi × 10 komoditas) 
menjadi **statistical features** yang merepresentasikan karakteristik harga 
di setiap provinsi.

### 3.4.4 Validation

**Data Quality Checks:**
1. ✓ **Completeness**: Semua 340 pairs berhasil diekstrak (100% coverage)
2. ✓ **No missing values**: Feature matrix tidak memiliki NaN
3. ✓ **Valid data threshold**: Minimal 100 valid points per series (~10%)
4. ✓ **Confidence level**: 100% pairs memiliki valid data ≥80%

**Feature Statistics Summary:**

| Komoditas | Mean (Rp) | CV (Volatilitas) | Category |
|-----------|-----------|------------------|----------|
| Cabai Merah Keriting | 49,417 | 0.330 | Very High Volatility |
| Bawang Merah | 37,155 | 0.270 | High Volatility |
| Bawang Putih Bonggol | 34,946 | 0.220 | Moderate Volatility |
| Daging Ayam Ras | 37,275 | 0.160 | Moderate Volatility |
| Telur Ayam | 29,356 | 0.130 | Low-Moderate Volatility |
| Beras Premium | 14,127 | 0.130 | Low Volatility |
| Beras Medium | 12,351 | 0.130 | Low Volatility |
| Gula | 15,707 | 0.110 | Low Volatility |
| Tepung Terigu Curah | 10,746 | 0.110 | Low Volatility |
| Daging Sapi Murni | 136,550 | 0.090 | Very Low Volatility |

**Interpretasi:**
- **Komoditas dengan CV tinggi** (Cabai, Bawang) memerlukan perhatian khusus 
  dalam clustering karena high inter-province variability
- **Komoditas stabil** (Beras, Gula) cenderung uniform across provinces

