In [1]:
import pandas as pd
import numpy as np
import os
import calendar
import re

# --- CONFIGURATION ---
YEAR = 2025

# Mapping Indonesian Month names to integers
MONTH_MAP = {
    'Januari': 1, 'Februari': 2, 'Maret': 3, 'April': 4, 'Mei': 5, 'Juni': 6,
    'Juli': 7, 'Agustus': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Desember': 12
}

def clean_province_name(prov):
    """Standardizes province names to ensure merging works."""
    if pd.isna(prov): return ""
    prov = str(prov).strip().upper()
    return prov

def load_and_melt(filepath, value_name):
    """
    Loads the BPS specific format (header on row 3), cleans it,
    and melts it from Wide (Months) to Long (Rows).
    """
    # Read CSV, skipping first 3 metadata rows
    df = pd.read_csv(filepath, header=3)

    # Rename first column (usually unnamed or index) to 'Provinsi'
    df.columns.values[0] = 'Provinsi'

    # Filter rows: Remove 'INDONESIA', 'Tahunan', empty rows, or purely numeric rows
    df['Provinsi'] = df['Provinsi'].apply(clean_province_name)
    df = df[df['Provinsi'] != '']
    df = df[df['Provinsi'] != 'INDONESIA']
    df = df[~df['Provinsi'].str.contains('PROVINSI', case=False)] # Header repeats

    # Drop 'Tahunan' column if exists
    if 'Tahunan' in df.columns:
        df = df.drop(columns=['Tahunan'])

    # Melt (Columns Jan-Dec becomes rows)
    # Filter columns to only include keys in MONTH_MAP or 'Provinsi'
    valid_cols = ['Provinsi'] + [col for col in df.columns if col in MONTH_MAP]
    df = df[valid_cols]

    df_long = df.melt(id_vars=['Provinsi'], var_name='Bulan', value_name=value_name)

    # Map month names to numbers
    df_long['Month_Num'] = df_long['Bulan'].map(MONTH_MAP)

    # Clean numeric values
    def clean_num(x):
        try:
            return float(x)
        except:
            return 0.0

    df_long[value_name] = df_long[value_name].apply(clean_num)

    return df_long

def disaggregate_to_daily(row):
    """
    Takes a monthly row and returns a DataFrame of daily rows.
    """
    prov = row['Provinsi']
    month = int(row['Month_Num'])
    val = row['Value'] # This will be either Luas Panen or Produksi
    col_name = row['Type'] # 'Luas Panen' or 'Produksi'

    # Get number of days in this specific month/year
    # monthrange returns (weekday_of_first_day, number_of_days)
    _, num_days = calendar.monthrange(YEAR, month)

    # Generate Date Range
    start_date = f"{YEAR}-{month:02d}-01"
    dates = pd.date_range(start=start_date, periods=num_days, freq='D')

    # --- Randomization Logic ---
    # Generate random weights
    np.random.seed(hash(prov) % 10000 + month + int(val)) # Deterministic seed per row
    weights = np.random.uniform(0.3, 1.7, size=num_days)

    # Normalize weights so they sum to exactly 1.0
    normalized_weights = weights / weights.sum()

    # Calculate daily values
    daily_values = val * normalized_weights

    # Create DataFrame
    temp_df = pd.DataFrame({
        'Date': dates,
        'Provinsi': prov,
        col_name: daily_values
    })

    return temp_df

def process_commodity(lp_file, prod_file, output_name):
    print(f"Processing {output_name}...")

    # 1. Load and Melt
    df_lp = load_and_melt(lp_file, 'Luas Panen')
    df_prod = load_and_melt(prod_file, 'Produksi')

    # 2. Expand Luas Panen to Daily
    print(f"  - Disaggregating Luas Panen...")
    lp_daily_list = []
    # Prepare row for function
    df_lp['Type'] = 'Luas Panen'
    df_lp.rename(columns={'Luas Panen': 'Value'}, inplace=True)

    for _, row in df_lp.iterrows():
        lp_daily_list.append(disaggregate_to_daily(row))

    df_lp_daily = pd.concat(lp_daily_list, ignore_index=True)

    # 3. Expand Produksi to Daily
    print(f"  - Disaggregating Produksi...")
    prod_daily_list = []
    df_prod['Type'] = 'Produksi'
    df_prod.rename(columns={'Produksi': 'Value'}, inplace=True)

    for _, row in df_prod.iterrows():
        prod_daily_list.append(disaggregate_to_daily(row))

    df_prod_daily = pd.concat(prod_daily_list, ignore_index=True)

    # 4. Merge
    print(f"  - Merging datasets...")
    # Merge on Date and Provinsi
    df_final = pd.merge(df_lp_daily, df_prod_daily, on=['Date', 'Provinsi'], how='outer')

    # Fill NaNs with 0
    df_final.fillna(0, inplace=True)

    # Sort
    df_final = df_final.sort_values(by=['Date', 'Provinsi'])

    # Save
    df_final.to_csv(output_name, index=False)
    print(f"✅ Saved to {output_name}")

# --- MAIN EXECUTION ---

# Get Script Directory
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Define File Pairs
files = {
    'Padi': {
        'lp': 'Luas Panen Padi Menurut Provinsi (Bulanan), 2025.csv',
        'prod': 'Produksi Padi Menurut Provinsi (Bulanan), 2025.csv',
        'out': 'Daily_Padi_2025.csv'
    },
    'Jagung': {
        'lp': 'Luas Panen Jagung Pipilan Menurut Provinsi (Bulanan), 2025.csv',
        'prod': 'Produksi Jagung Pipilan Kering Kadar Air 14 Persen Menurut Provinsi (Bulanan), 2025.csv',
        'out': 'Daily_Jagung_2025.csv'
    }
}

# Run Process
for commodity, paths in files.items():
    lp_path = os.path.join(script_dir, paths['lp'])
    prod_path = os.path.join(script_dir, paths['prod'])
    out_path = os.path.join(script_dir, paths['out'])

    if os.path.exists(lp_path) and os.path.exists(prod_path):
        process_commodity(lp_path, prod_path, out_path)
    else:
        print(f"❌ Files for {commodity} not found in {script_dir}")

Processing /content/Daily_Padi_2025.csv...
  - Disaggregating Luas Panen...
  - Disaggregating Produksi...
  - Merging datasets...
✅ Saved to /content/Daily_Padi_2025.csv
Processing /content/Daily_Jagung_2025.csv...
  - Disaggregating Luas Panen...
  - Disaggregating Produksi...
  - Merging datasets...
✅ Saved to /content/Daily_Jagung_2025.csv


In [4]:
import pandas as pd
import numpy as np
import os
import calendar
import re

# --- CONFIGURATION ---
# We will process these years
YEARS_TO_PROCESS = [2023, 2024]
REFERENCE_YEAR = 2025  # Used for seasonality weights

# Mapping Indonesian Month names to integers
MONTH_MAP = {
    'Januari': 1, 'Februari': 2, 'Maret': 3, 'April': 4, 'Mei': 5, 'Juni': 6,
    'Juli': 7, 'Agustus': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Desember': 12
}

def clean_province_name(prov):
    """Standardizes province names to ensure merging works."""
    if pd.isna(prov): return ""
    prov = str(prov).strip().upper()
    return prov

def load_seasonality_weights(filepath):
    """
    Loads 2025 Monthly data to calculate the proportion of harvest/production
    per month for each province.
    """
    # Read CSV, skipping first 3 metadata rows (BPS format specific to 2025 files)
    df = pd.read_csv(filepath, header=3)

    # Rename first column
    df.columns.values[0] = 'Provinsi'

    # Filter valid provinces
    df['Provinsi'] = df['Provinsi'].apply(clean_province_name)
    df = df[df['Provinsi'] != '']
    df = df[df['Provinsi'] != 'INDONESIA']
    df = df[~df['Provinsi'].str.contains('PROVINSI', case=False)]

    if 'Tahunan' in df.columns:
        df = df.drop(columns=['Tahunan'])

    # Keep only month columns and Provinsi
    valid_cols = ['Provinsi'] + [col for col in df.columns if col in MONTH_MAP]
    df = df[valid_cols]

    # Clean numeric values
    for col in valid_cols[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Set Index
    df.set_index('Provinsi', inplace=True)

    # Calculate Weights (Row-wise normalization)
    row_sums = df.sum(axis=1)
    weights = df.div(row_sums, axis=0)

    # Handle NaN weights (if total was 0, use uniform distribution)
    weights.fillna(1/12, inplace=True)

    # Rename columns to 1..12
    weights.columns = [MONTH_MAP[c] for c in weights.columns]

    return weights

def load_annual_data(filepath):
    """
    Loads the 2023/2024 Annual Combined files.
    Format is typically header at row 2 (index 2).
    Cols: Provinsi, Luas Panen, Produktivitas, Produksi
    """
    # Read with header at index 2 (Row 3 in Excel/CSV)
    df = pd.read_csv(filepath, header=2)

    # The row immediately after header often contains years "2023, 2023...", drop it
    if str(df.iloc[0, 1]).strip() in ['2023', '2024'] or pd.isna(df.iloc[0, 0]):
        df = df.iloc[1:].reset_index(drop=True)

    # Explicitly Select and Rename Columns
    # Index 0: Provinsi
    # Index 1: Luas Panen
    # Index 2: Produktivitas (DROPPED/IGNORED)
    # Index 3: Produksi

    # We rename only the ones we need
    df.rename(columns={
        df.columns[0]: 'Provinsi',
        df.columns[1]: 'Luas Panen',
        df.columns[3]: 'Produksi'
    }, inplace=True)

    # Filter only these 3 columns (Dropping Produktivitas here)
    df = df[['Provinsi', 'Luas Panen', 'Produksi']]

    # Clean Province
    df['Provinsi'] = df['Provinsi'].apply(clean_province_name)
    df = df[df['Provinsi'] != '']
    df = df[~df['Provinsi'].str.contains('PROVINSI', case=False)]

    # Clean Numerics
    for col in ['Luas Panen', 'Produksi']:
        if df[col].dtype == object:
             df[col] = df[col].astype(str).str.replace(',', '').replace('-', '0')
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    return df

def generate_daily_from_annual_row(row, year, lp_weights, prod_weights):
    """
    Disaggregates a single annual row into 365/366 daily rows.
    """
    prov = row['Provinsi']
    annual_lp = row['Luas Panen']
    annual_prod = row['Produksi']

    # Get Weights for this province (default to uniform if missing)
    if prov in lp_weights.index:
        w_lp = lp_weights.loc[prov].values
    else:
        w_lp = np.full(12, 1/12)

    if prov in prod_weights.index:
        w_prod = prod_weights.loc[prov].values
    else:
        w_prod = np.full(12, 1/12)

    # --- Weight Jittering (The "Small Role" Logic) ---
    # We perturb the 2025 weights slightly so 2023/24 aren't exact copies
    # Fix: Ensure seed is within valid 32-bit range for numpy
    seed_val = (abs(hash(prov)) + year) % (2**32)
    np.random.seed(seed_val)

    # Add noise: random factor between 0.8 and 1.2
    noise_lp = np.random.uniform(0.8, 1.2, size=12)
    noise_prod = np.random.uniform(0.8, 1.2, size=12)

    # Apply noise and re-normalize
    w_lp_adj = w_lp * noise_lp
    w_lp_adj = w_lp_adj / w_lp_adj.sum()

    w_prod_adj = w_prod * noise_prod
    w_prod_adj = w_prod_adj / w_prod_adj.sum()

    # --- Generate Daily Data ---
    all_dates = []
    all_lp = []
    all_prod = []

    for month_idx in range(12):
        month_num = month_idx + 1

        # Monthly Total for this year based on adjusted weights
        month_lp_total = annual_lp * w_lp_adj[month_idx]
        month_prod_total = annual_prod * w_prod_adj[month_idx]

        # Days in month (handle leap year for 2024)
        _, num_days = calendar.monthrange(year, month_num)

        # Daily Weights (Random non-normal distribution within month)
        # Using a fresh seed for daily variation
        daily_noise = np.random.uniform(0.3, 1.7, size=num_days)
        daily_w = daily_noise / daily_noise.sum()

        # Calculate daily values
        daily_lp_vals = month_lp_total * daily_w
        daily_prod_vals = month_prod_total * daily_w

        # Generate Dates
        start = f"{year}-{month_num:02d}-01"
        dates = pd.date_range(start=start, periods=num_days, freq='D')

        all_dates.extend(dates)
        all_lp.extend(daily_lp_vals)
        all_prod.extend(daily_prod_vals)

    # Create DataFrame
    return pd.DataFrame({
        'Date': all_dates,
        'Provinsi': prov,
        'Luas Panen': all_lp,
        'Produksi': all_prod
    })

def process_commodity_year(commodity, year, annual_file, ref_lp_file, ref_prod_file, output_name):
    print(f"Processing {commodity} for {year}...")

    # 1. Load Seasonality Weights (from 2025 Data)
    lp_weights_df = load_seasonality_weights(ref_lp_file)
    prod_weights_df = load_seasonality_weights(ref_prod_file)

    # 2. Load Annual Data
    df_annual = load_annual_data(annual_file)

    # 3. Process Rows
    daily_dfs = []
    for _, row in df_annual.iterrows():
        daily_dfs.append(generate_daily_from_annual_row(row, year, lp_weights_df, prod_weights_df))

    # 4. Combine and Save
    if daily_dfs:
        df_final = pd.concat(daily_dfs, ignore_index=True)
        df_final = df_final.sort_values(by=['Date', 'Provinsi'])
        df_final.to_csv(output_name, index=False)
        print(f"✅ Saved to {output_name}")
    else:
        print("❌ No data processed.")

# --- MAIN EXECUTION ---

try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Configuration Dictionary
tasks = {
    'Padi': {
        'ref_lp': 'Luas Panen Padi Menurut Provinsi (Bulanan), 2025.csv',
        'ref_prod': 'Produksi Padi Menurut Provinsi (Bulanan), 2025.csv',
        'years': {
            2023: 'Luas Panen, Produksi, dan Produktivitas Padi Menurut Provinsi, 2023.csv',
            2024: 'Luas Panen, Produksi, dan Produktivitas Padi Menurut Provinsi, 2024.csv'
        }
    },
    'Jagung': {
        'ref_lp': 'Luas Panen Jagung Pipilan Menurut Provinsi (Bulanan), 2025.csv',
        'ref_prod': 'Produksi Jagung Pipilan Kering Kadar Air 14 Persen Menurut Provinsi (Bulanan), 2025.csv',
        'years': {
            2023: 'Luas Panen, Produksi, dan Produktivitas Jagung Menurut Provinsi, 2023.csv',
            2024: 'Luas Panen, Produksi, dan Produktivitas Jagung Menurut Provinsi, 2024.csv'
        }
    }
}

for commodity, config in tasks.items():
    ref_lp_path = os.path.join(script_dir, config['ref_lp'])
    ref_prod_path = os.path.join(script_dir, config['ref_prod'])

    # Check if reference files exist
    if not (os.path.exists(ref_lp_path) and os.path.exists(ref_prod_path)):
        print(f"⚠️ Reference files for {commodity} (2025) missing. Skipping.")
        continue

    for year, filename in config['years'].items():
        annual_path = os.path.join(script_dir, filename)
        output_filename = f"Daily_{commodity}_{year}.csv"
        output_path = os.path.join(script_dir, output_filename)

        if os.path.exists(annual_path):
            process_commodity_year(
                commodity,
                year,
                annual_path,
                ref_lp_path,
                ref_prod_path,
                output_path
            )
        else:
            print(f"❌ Annual file for {commodity} {year} not found: {filename}")

Processing Padi for 2023...
✅ Saved to /content/Daily_Padi_2023.csv
Processing Padi for 2024...
✅ Saved to /content/Daily_Padi_2024.csv
Processing Jagung for 2023...
✅ Saved to /content/Daily_Jagung_2023.csv
Processing Jagung for 2024...
✅ Saved to /content/Daily_Jagung_2024.csv


In [6]:
import pandas as pd
import os

# --- CONFIGURATION ---
VALID_PROVINCES = {
    "ACEH", "SUMATERA UTARA", "SUMATERA BARAT", "RIAU", "JAMBI",
    "SUMATERA SELATAN", "BENGKULU", "LAMPUNG", "KEP. BANGKA BELITUNG",
    "KEP. RIAU", "DKI JAKARTA", "JAWA BARAT", "JAWA TENGAH",
    "DI YOGYAKARTA", "JAWA TIMUR", "BANTEN", "BALI",
    "NUSA TENGGARA BARAT", "NUSA TENGGARA TIMUR", "KALIMANTAN BARAT",
    "KALIMANTAN TENGAH", "KALIMANTAN SELATAN", "KALIMANTAN TIMUR",
    "KALIMANTAN UTARA", "SULAWESI UTARA", "SULAWESI TENGAH",
    "SULAWESI SELATAN", "SULAWESI TENGGARA", "GORONTALO",
    "SULAWESI BARAT", "MALUKU", "MALUKU UTARA", "PAPUA BARAT",
    "PAPUA", "PAPUA SELATAN", "PAPUA TENGAH",
    "PAPUA PEGUNUNGAN", "PAPUA BARAT DAYA"
}

# Define Date Filter Range
START_DATE_FILTER = "2023-11-01"
END_DATE_FILTER = "2025-10-31"

def clean_province_name(prov):
    if pd.isna(prov):
        return ""
    # Standardize: Uppercase and Strip
    return str(prov).strip().upper()

def process_commodity_group(commodity_name, file_list, output_filename):
    print(f"\nProcessing {commodity_name}...")
    dfs = []

    # 1. Read and Concatenate Files
    for filename in file_list:
        if os.path.exists(filename):
            print(f"  - Reading {filename}...")
            df = pd.read_csv(filename)
            dfs.append(df)
        else:
            print(f"  ❌ Warning: {filename} not found. Skipping.")

    if not dfs:
        print("  No files found to combine.")
        return

    combined_df = pd.concat(dfs, ignore_index=True)
    original_count = len(combined_df)

    # 2. Clean Province Names
    combined_df['Provinsi'] = combined_df['Provinsi'].apply(clean_province_name)

    # 3. Filter Logic
    # 3a. Date Filter (Nov 1 2023 - Oct 31 2025)
    if 'Date' in combined_df.columns:
        combined_df['Date'] = pd.to_datetime(combined_df['Date'])

        start_ts = pd.Timestamp(START_DATE_FILTER)
        end_ts = pd.Timestamp(END_DATE_FILTER)

        # Filter mask
        date_mask = (combined_df['Date'] >= start_ts) & (combined_df['Date'] <= end_ts)
        combined_df = combined_df[date_mask]

    # 3b. Province Filter
    # Keep row IF Province is in VALID_PROVINCES
    # This automatically drops 'INDONESIA', typos, or header repeats
    combined_df = combined_df[combined_df['Provinsi'].isin(VALID_PROVINCES)]

    filtered_count = len(combined_df)
    dropped_count = original_count - filtered_count

    # 4. Sort (Date -> Province)
    combined_df = combined_df.sort_values(by=['Date', 'Provinsi'])

    # 5. Save
    combined_df.to_csv(output_filename, index=False)

    print(f"  ✅ Saved to {output_filename}")
    print(f"  - Original Rows: {original_count}")
    print(f"  - Rows After Filtering: {filtered_count} (Dropped {dropped_count})")
    print(f"  - Unique Provinces: {combined_df['Provinsi'].nunique()}")
    if not combined_df.empty:
        print(f"  - Date Range: {combined_df['Date'].min().date()} to {combined_df['Date'].max().date()}")

# --- MAIN EXECUTION ---

# Define the file sets
padi_files = [
    'Daily_Padi_2023.csv',
    'Daily_Padi_2024.csv',
    'Daily_Padi_2025.csv'
]

jagung_files = [
    'Daily_Jagung_2023.csv',
    'Daily_Jagung_2024.csv',
    'Daily_Jagung_2025.csv'
]

# Run Combination
process_commodity_group("Padi", padi_files, "Combined_Daily_Padi_Nov23_Oct25.csv")
process_commodity_group("Jagung", jagung_files, "Combined_Daily_Jagung_Nov23_Oct25.csv")


Processing Padi...
  - Reading Daily_Padi_2023.csv...
  - Reading Daily_Padi_2024.csv...
  - Reading Daily_Padi_2025.csv...
  ✅ Saved to Combined_Daily_Padi_Nov23_Oct25.csv
  - Original Rows: 42379
  - Rows After Filtering: 27778 (Dropped 14601)
  - Unique Provinces: 38
  - Date Range: 2023-11-01 to 2025-10-31

Processing Jagung...
  - Reading Daily_Jagung_2023.csv...
  - Reading Daily_Jagung_2024.csv...
  - Reading Daily_Jagung_2025.csv...
  ✅ Saved to Combined_Daily_Jagung_Nov23_Oct25.csv
  - Original Rows: 42379
  - Rows After Filtering: 27778 (Dropped 14601)
  - Unique Provinces: 38
  - Date Range: 2023-11-01 to 2025-10-31
