In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [3]:
print("\n" + "="*80)
print("LOADING RAW FEATURES")
print("="*80)

input_file = '/home/kali/AI/fcm_features_raw.csv'
output_file = '/home/kali/AI/fcm_features_standardized.csv'

df_raw = pd.read_csv(input_file, index_col=0)  # index_col=0 → Province as index

print(f"✓ Loaded: {input_file}")
print(f"  Shape: {df_raw.shape}")
print(f"  Provinces: {len(df_raw)}")
print(f"  Features: {len(df_raw.columns)}")


LOADING RAW FEATURES
✓ Loaded: /home/kali/AI/fcm_features_raw.csv
  Shape: (34, 50)
  Provinces: 34
  Features: 50


In [4]:
print("\n" + "="*80)
print("DATA VALIDATION")
print("="*80)

missing_count = df_raw.isnull().sum().sum()
if missing_count > 0:
    print(f"⚠️ WARNING: {missing_count} missing values found!")
    print("\nProvinces with missing:")
    print(df_raw.isnull().sum(axis=1)[df_raw.isnull().sum(axis=1) > 0])
    
    print("\nFeatures with missing:")
    missing_features = df_raw.isnull().sum()
    print(missing_features[missing_features > 0])
    
    # Handle missing (impute with column mean)
    print("\n⚠️ Filling missing values with column mean...")
    df_raw = df_raw.fillna(df_raw.mean())
    print("✓ Missing values filled")
else:
    print("✓ No missing values")

# Check for infinite values
inf_count = np.isinf(df_raw.values).sum()
if inf_count > 0:
    print(f"⚠️ WARNING: {inf_count} infinite values found!")
    df_raw = df_raw.replace([np.inf, -np.inf], np.nan).fillna(df_raw.mean())
    print("✓ Infinite values replaced with column mean")
else:
    print("✓ No infinite values")



DATA VALIDATION
✓ No missing values
✓ No infinite values


In [5]:
print("\n" + "="*80)
print("STATISTICS BEFORE STANDARDIZATION")
print("="*80)

print("\nFeature ranges (sample - showing different commodity types):")
print("-" * 80)

# Sample features from different types
sample_features = [
    'Mean_bawang_merah',      # High volatility vegetable
    'Mean_cabai_merah',       # Very high volatility
    'Mean_daging_sapi',       # Low volatility, high price
    'Mean_tepung_terigu',     # Low volatility, low price
    'CV_bawang_merah',        # Volatility metric
    'CV_daging_sapi',         # Volatility metric
    'Std_bawang_merah',       # Absolute volatility
    'Std_daging_sapi'         # Absolute volatility
]

# Adjust feature names based on actual columns
sample_features = [f for f in sample_features if f in df_raw.columns]

# Add any actual column if sample not found
if len(sample_features) < 6:
    sample_features = df_raw.columns[:8].tolist()

for feature in sample_features:
    min_val = df_raw[feature].min()
    max_val = df_raw[feature].max()
    mean_val = df_raw[feature].mean()
    std_val = df_raw[feature].std()
    range_val = max_val - min_val
    
    print(f"{feature[:35]:35s} | Mean: {mean_val:10,.2f} | Std: {std_val:10,.2f} | Range: {range_val:10,.2f}")

print("\nOverall statistics:")
print(f"  Mean of all features: {df_raw.mean().mean():,.2f}")
print(f"  Std of all features:  {df_raw.std().mean():,.2f}")
print(f"  Min value overall:    {df_raw.min().min():,.2f}")
print(f"  Max value overall:    {df_raw.max().max():,.2f}")



STATISTICS BEFORE STANDARDIZATION

Feature ranges (sample - showing different commodity types):
--------------------------------------------------------------------------------
Mean_Bawang_Putih_Bonggol           | Mean:  34,946.02 | Std:   5,190.07 | Range:  19,869.63
Mean_Cabai_Merah_Keriting           | Mean:  49,416.78 | Std:   9,485.71 | Range:  35,025.80
Mean_Daging_Ayam_Ras                | Mean:  37,274.56 | Std:   5,505.40 | Range:  20,590.64
Mean_Daging_Sapi_Murni              | Mean: 136,550.49 | Std:  12,100.57 | Range:  45,556.46
Mean_Tepung_Terigu_Curah            | Mean:  10,745.95 | Std:     864.99 | Range:   3,411.39
Mean_bawang_merah                   | Mean:  37,154.89 | Std:   6,854.18 | Range:  28,178.75
Mean_beras_medium                   | Mean:  12,351.29 | Std:     937.75 | Range:   2,979.09
Mean_beras_premium                  | Mean:  14,126.90 | Std:   1,238.50 | Range:   3,744.72

Overall statistics:
  Mean of all features: 7,552.84
  Std of all features:  

In [6]:
print("\n" + "="*80)
print("APPLYING Z-SCORE STANDARDIZATION")
print("="*80)

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform
features_standardized = scaler.fit_transform(df_raw)

# Convert back to DataFrame (preserve index and columns)
df_standardized = pd.DataFrame(
    features_standardized,
    index=df_raw.index,
    columns=df_raw.columns
)

print("✓ Standardization complete")
print(f"  All features transformed to Z-scores (mean=0, std=1)")



APPLYING Z-SCORE STANDARDIZATION
✓ Standardization complete
  All features transformed to Z-scores (mean=0, std=1)


In [7]:
print("\n" + "="*80)
print("STATISTICS AFTER STANDARDIZATION")
print("="*80)

print("\nFeature Z-score ranges (sample):")
print("-" * 80)

for feature in sample_features:
    min_val = df_standardized[feature].min()
    max_val = df_standardized[feature].max()
    mean_val = df_standardized[feature].mean()
    std_val = df_standardized[feature].std()
    range_val = max_val - min_val
    
    print(f"{feature[:35]:35s} | Mean: {mean_val:7.4f} | Std: {std_val:6.4f} | Range: [{min_val:6.2f}, {max_val:6.2f}]")

print("\nOverall statistics:")
print(f"  Mean of all features:  {df_standardized.mean().mean():8.6f}  (should be ~0)")
print(f"  Std of all features:   {df_standardized.std().mean():8.6f}  (should be ~1)")
print(f"  Min Z-score overall:   {df_standardized.min().min():8.3f}")
print(f"  Max Z-score overall:   {df_standardized.max().max():8.3f}")



STATISTICS AFTER STANDARDIZATION

Feature Z-score ranges (sample):
--------------------------------------------------------------------------------
Mean_Bawang_Putih_Bonggol           | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.23,   2.66]
Mean_Cabai_Merah_Keriting           | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.64,   2.11]
Mean_Daging_Ayam_Ras                | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.77,   2.03]
Mean_Daging_Sapi_Murni              | Mean:  0.0000 | Std: 1.0150 | Range: [ -1.94,   1.89]
Mean_Tepung_Terigu_Curah            | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.46,   2.54]
Mean_bawang_merah                   | Mean:  0.0000 | Std: 1.0150 | Range: [ -1.25,   2.92]
Mean_beras_medium                   | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.27,   1.95]
Mean_beras_premium                  | Mean: -0.0000 | Std: 1.0150 | Range: [ -1.19,   1.88]

Overall statistics:
  Mean of all features:  0.000000  (should be ~0)
  Std of all features:   1.015038  (should b

In [8]:
print("\n" + "="*80)
print("VALIDATION")
print("="*80)

# Check 1: Mean should be ~0
mean_check = abs(df_standardized.mean().mean())
if mean_check < 1e-10:
    print(f"✓ Mean check PASSED: {mean_check:.2e} ≈ 0")
else:
    print(f"⚠️ Mean check WARNING: {mean_check:.2e} (expected ~0)")

# Check 2: Std should be ~1
std_check = df_standardized.std().mean()
if 0.99 < std_check < 1.01:
    print(f"✓ Std check PASSED: {std_check:.6f} ≈ 1")
else:
    print(f"⚠️ Std check WARNING: {std_check:.6f} (expected ~1)")

# Check 3: No missing values introduced
if df_standardized.isnull().sum().sum() == 0:
    print("✓ No missing values introduced")
else:
    print(f"⚠️ WARNING: {df_standardized.isnull().sum().sum()} missing values introduced!")

# Check 4: No infinite values introduced
if not np.isinf(df_standardized.values).any():
    print("✓ No infinite values introduced")
else:
    print(f"⚠️ WARNING: Infinite values introduced!")

# Check 5: Shape preserved
if df_standardized.shape == df_raw.shape:
    print(f"✓ Shape preserved: {df_standardized.shape}")
else:
    print(f"⚠️ WARNING: Shape changed from {df_raw.shape} to {df_standardized.shape}")



VALIDATION
✓ Mean check PASSED: 1.03e-15 ≈ 0
✓ No missing values introduced
✓ No infinite values introduced
✓ Shape preserved: (34, 50)


In [9]:
# Tambahkan validation ini:
print("\n" + "="*80)
print("DETAILED STD VALIDATION (Per Feature)")
print("="*80)

per_feature_std = df_standardized.std()
print(f"\nStd statistics across 50 features:")
print(f"  Min:    {per_feature_std.min():.10f}")
print(f"  Max:    {per_feature_std.max():.10f}")
print(f"  Mean:   {per_feature_std.mean():.10f}")
print(f"  Median: {per_feature_std.median():.10f}")

# Check how many features have std ≈ 1.0
tolerance = 0.001
within_tolerance = ((per_feature_std - 1.0).abs() < tolerance).sum()
print(f"\n✓ Features with std within ±{tolerance} of 1.0: {within_tolerance}/{len(per_feature_std)}")

if within_tolerance == len(per_feature_std):
    print("✓ ALL FEATURES PASSED: std ≈ 1.0 for each feature")
else:
    print(f"⚠️ {len(per_feature_std) - within_tolerance} features outside tolerance")
    print("Problematic features:")
    outlier_features = per_feature_std[(per_feature_std - 1.0).abs() >= tolerance]
    for feat, std_val in outlier_features.items():
        print(f"  {feat}: {std_val:.6f}")



DETAILED STD VALIDATION (Per Feature)

Std statistics across 50 features:
  Min:    1.0150384378
  Max:    1.0150384378
  Mean:   1.0150384378
  Median: 1.0150384378

✓ Features with std within ±0.001 of 1.0: 0/50
⚠️ 50 features outside tolerance
Problematic features:
  Mean_Bawang_Putih_Bonggol: 1.015038
  Mean_Cabai_Merah_Keriting: 1.015038
  Mean_Daging_Ayam_Ras: 1.015038
  Mean_Daging_Sapi_Murni: 1.015038
  Mean_Tepung_Terigu_Curah: 1.015038
  Mean_bawang_merah: 1.015038
  Mean_beras_medium: 1.015038
  Mean_beras_premium: 1.015038
  Mean_gula: 1.015038
  Mean_telur_ayam: 1.015038
  CV_Bawang_Putih_Bonggol: 1.015038
  CV_Cabai_Merah_Keriting: 1.015038
  CV_Daging_Ayam_Ras: 1.015038
  CV_Daging_Sapi_Murni: 1.015038
  CV_Tepung_Terigu_Curah: 1.015038
  CV_bawang_merah: 1.015038
  CV_beras_medium: 1.015038
  CV_beras_premium: 1.015038
  CV_gula: 1.015038
  CV_telur_ayam: 1.015038
  Trend_Bawang_Putih_Bonggol: 1.015038
  Trend_Cabai_Merah_Keriting: 1.015038
  Trend_Daging_Ayam_Ras: 1.0

In [8]:
print("\n" + "="*80)
print("VARIANCE PRESERVATION CHECK")
print("="*80)

print("\nComparing feature variances (std) before and after:")
print("-" * 80)
print(f"{'Feature':35s} | {'Std (Raw)':>12s} | {'Std (Z-score)':>14s} | {'Ratio':>8s}")
print("-" * 80)

for feature in sample_features[:5]:  # Show first 5 for brevity
    std_raw = df_raw[feature].std()
    std_z = df_standardized[feature].std()
    ratio = std_z  # Should be 1.0 after standardization
    
    print(f"{feature[:35]:35s} | {std_raw:12,.2f} | {std_z:14.4f} | {ratio:8.4f}")

print("\n✓ All features now have std ≈ 1.0")
print("✓ RELATIVE variances preserved (high-variance features still identifiable)")



VARIANCE PRESERVATION CHECK

Comparing feature variances (std) before and after:
--------------------------------------------------------------------------------
Feature                             |    Std (Raw) |  Std (Z-score) |    Ratio
--------------------------------------------------------------------------------
Mean_Bawang_Putih_Bonggol           |     5,190.07 |         1.0150 |   1.0150
Mean_Cabai_Merah_Keriting           |     9,485.71 |         1.0150 |   1.0150
Mean_Daging_Ayam_Ras                |     5,505.40 |         1.0150 |   1.0150
Mean_Daging_Sapi_Murni              |    12,100.57 |         1.0150 |   1.0150
Mean_Tepung_Terigu_Curah            |       864.99 |         1.0150 |   1.0150

✓ All features now have std ≈ 1.0
✓ RELATIVE variances preserved (high-variance features still identifiable)


In [10]:
# Check if the discrepancy is consistent (1.015×)
print("\n" + "="*80)
print("VARIANCE DISCREPANCY ANALYSIS")
print("="*80)

# Theory: sample_std = population_std × sqrt(N/(N-1))
N = len(df_standardized)
expected_ratio = np.sqrt(N / (N - 1))

print(f"\nNumber of observations (N): {N}")
print(f"Expected ratio [sqrt(N/(N-1))]: {expected_ratio:.6f}")

# Check actual ratios
actual_ratios = df_standardized.std()
print(f"\nActual std ratios (sample):")
print(f"  Mean of all stds: {actual_ratios.mean():.6f}")
print(f"  Min std:          {actual_ratios.min():.6f}")
print(f"  Max std:          {actual_ratios.max():.6f}")

# Verify consistency
if abs(actual_ratios.mean() - expected_ratio) < 0.001:
    print("\n✅ DIAGNOSIS: Sample vs Population std issue")
    print("   This is NORMAL and SAFE!")
    print("   Standardization is working correctly.")
else:
    print("\n⚠️ WARNING: Unexpected std pattern!")
    print("   Investigate further.")

# Alternative: Force std=1.0000 by using population std
print("\n" + "-"*80)
print("OPTIONAL FIX: Use ddof=0 for validation")
print("-"*80)

# Recalculate with ddof=0 (population std)
std_pop = df_standardized.std(ddof=0)
print(f"Std with ddof=0 (population): {std_pop.mean():.6f}")
print("Expected: 1.000000")

if abs(std_pop.mean() - 1.0) < 1e-6:
    print("✅ CONFIRMED: Standardization is CORRECT!")
    print("   The 1.015 value is due to pandas ddof=1 default")



VARIANCE DISCREPANCY ANALYSIS

Number of observations (N): 34
Expected ratio [sqrt(N/(N-1))]: 1.015038

Actual std ratios (sample):
  Mean of all stds: 1.015038
  Min std:          1.015038
  Max std:          1.015038

✅ DIAGNOSIS: Sample vs Population std issue
   This is NORMAL and SAFE!
   Standardization is working correctly.

--------------------------------------------------------------------------------
OPTIONAL FIX: Use ddof=0 for validation
--------------------------------------------------------------------------------
Std with ddof=0 (population): 1.000000
Expected: 1.000000
✅ CONFIRMED: Standardization is CORRECT!
   The 1.015 value is due to pandas ddof=1 default


In [11]:
print("\n" + "="*80)
print("SAVING STANDARDIZED FEATURES")
print("="*80)

df_standardized.to_csv(output_file)
print(f"✓ Saved: {output_file}")
print(f"  Shape: {df_standardized.shape}")




SAVING STANDARDIZED FEATURES
✓ Saved: /home/kali/AI/fcm_features_standardized.csv
  Shape: (34, 50)
