In [17]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from darts import TimeSeries
from darts.models import RandomForestModel
from darts.metrics import mape, mae, mse
from darts.utils.model_selection import train_test_split
from darts.dataprocessing.transformers import Scaler, Diff
from darts.utils.missing_values import fill_missing_values
import warnings
import traceback

warnings.filterwarnings("ignore")

# ============================================================
# ⚙️ CONFIGURATION
# ============================================================

# Input folder with the prepared CSV files, and the output locations.
# Both output folders are created at import time if they do not exist yet.
BASE_PATH = r"C:\wilson\Thesis\Data\final_data"
RESULT_PATH = r"C:\wilson\Thesis\Results"
PREDICTION_DIR = os.path.join(RESULT_PATH, "rf_predictionsss_final")
os.makedirs(RESULT_PATH, exist_ok=True)
os.makedirs(PREDICTION_DIR, exist_ok=True)

# One entry per target index: the CSV is "<name>_final.csv" under BASE_PATH
# and the target column is "close_<name>".
DATA_FILES = {
    index_name: {
        "path": os.path.join(BASE_PATH, f"{index_name}_final.csv"),
        "target_col": f"close_{index_name}",
    }
    for index_name in ("ihsg", "lq45", "kompas100")
}

# Covariate column groups.
COVARIATES_NONE = []
COVARIATES_DOMESTIC = ["close_inflation", "close_gdp", "close_cpi", "close_unemployment"]
COVARIATES_GLOBAL = ["close_usd", "close_brent", "close_wti"]
COVARIATES_ALL = [*COVARIATES_DOMESTIC, *COVARIATES_GLOBAL]

# Every covariate configuration that is evaluated: the four groups above
# followed by one single-column set per covariate.
COVARIATE_SETS = {
    "None": COVARIATES_NONE,
    "Domestic": COVARIATES_DOMESTIC,
    "Global": COVARIATES_GLOBAL,
    "All": COVARIATES_ALL,
    **{
        label: [column]
        for label, column in (
            ("Inflation", "close_inflation"),
            ("GDP", "close_gdp"),
            ("CPI", "close_cpi"),
            ("Unemployment", "close_unemployment"),
            ("USD", "close_usd"),
            ("Brent", "close_brent"),
            ("WTI", "close_wti"),
        )
    },
}

# Forecasting scenarios: "window" is the number of lags fed to the model,
# "horizon" is the number of steps predicted per forecast block.
SCENARIOS = {
    label: {"window": lookback, "horizon": steps}
    for label, lookback, steps in (
        ("Short-Term Sprint", 30, 1),
        ("Medium-Term Pace", 60, 5),
        ("Long-Term Marathon", 90, 10),
    )
}

# Keyword arguments forwarded to RandomForestModel.
RF_CONFIG = dict(
    n_estimators=100,
    max_depth=5,
    max_samples=0.7,
    n_jobs=-1,
    random_state=42,
)

# ============================================================
# 🧩 FUNCTIONS
# ============================================================

def load_data(path, target_col):
    """Read a CSV file and return its rows ordered chronologically.

    The "date" column is parsed to datetimes, the rows are sorted by it in
    ascending order, and the index is renumbered from zero.

    Args:
        path: Location of the CSV file; it must contain a "date" column.
        target_col: Name of the target column. Accepted for interface
            compatibility; it is not used by this function.

    Returns:
        The loaded DataFrame, sorted by "date" with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(path)
    parsed = raw.assign(date=pd.to_datetime(raw["date"]))
    return parsed.sort_values(by="date", ignore_index=True)

def to_series(df, target_col, covariates):
    """Convert a DataFrame into darts series for the target and its covariates.

    Each series is built on a business-day ("B") index taken from the "date"
    column; dates missing from that index are inserted and the resulting gaps
    are filled with ``fill_missing_values`` using its default settings.

    Args:
        df: DataFrame with a "date" column plus the requested value columns.
        target_col: Name of the column holding the target values.
        covariates: List of covariate column names; an empty list means no
            covariates.

    Returns:
        A ``(target_series, covariate_series)`` tuple. ``covariate_series``
        is ``None`` when ``covariates`` is empty.
    """
    def _build(columns):
        # Shared construction path for the target and the covariate series.
        series = TimeSeries.from_dataframe(
            df,
            time_col="date",
            value_cols=columns,
            fill_missing_dates=True,
            freq="B",
        )
        return fill_missing_values(series)

    # The target is built first, then the covariates (if any were requested).
    target = _build(target_col)
    extras = _build(covariates) if covariates else None
    return target, extras

def train_and_evaluate_rf(target_series, covariate_series, window, horizon):
    """Train a random forest on scaled log-returns and back-test it on the last 20%.

    Steps:
      1. Split the target chronologically (80% train / 20% validation).
      2. Log-transform the prices and take first differences (log-returns).
      3. Scale the log-returns (and covariates, if given) with scalers that
         are fitted on the training segment only, so that no statistic of
         the validation period leaks into preprocessing.
      4. Fit a ``RandomForestModel`` on the scaled training segment.
      5. Produce non-overlapping forecast blocks of ``horizon`` steps over
         the validation period without retraining.
      6. Turn every block back into prices: undo the scaling, cumulate the
         predicted log-returns on top of the last *actual* log-price before
         the block, and exponentiate.
      7. Compute MAPE, MAE and MSE on the dates that received a prediction.

    Args:
        target_series: darts ``TimeSeries`` of strictly positive prices.
        covariate_series: darts ``TimeSeries`` of past covariates, or
            ``None`` to train without covariates. When given, it must cover
            the start of the validation period.
        window: Number of lags of the target (and of the covariates) used
            as model input.
        horizon: Number of steps per forecast block; also used as the
            model's ``output_chunk_length`` and as the back-test stride.

    Returns:
        ``(mape, mae, mse, final_df)`` where ``mape`` is in percent and
        ``final_df`` has the columns ``date``, ``actual``, ``predicted``
        (NaN where no forecast exists) and ``train_flag`` (1 for dates in
        the training segment, 0 otherwise).

    Raises:
        ValueError: If the target contains non-positive values, if a
            forecast block starts at the first observation (no anchor
            price), or if no prediction lines up with an actual value.
    """
    has_covariates = covariate_series is not None

    # The log transform is undefined for non-positive prices; fail loudly
    # instead of propagating -inf/NaN through the pipeline.
    if np.any(target_series.values() <= 0):
        raise ValueError(
            "Target series must be strictly positive to apply the log transform."
        )

    # 1. Split Data
    train, val = train_test_split(target_series, test_size=0.2)
    val_start = val.start_time()

    # 2. Log-Transform & Differencing
    ts_log = target_series.map(np.log)
    differencer = Diff(lags=1)
    ts_log_diff = differencer.fit_transform(ts_log)

    # 3. Scaling -- fit on the training segment only, then apply the same
    #    transformation to the full series used for back-testing.
    train_log_diff, _ = ts_log_diff.split_before(val_start)
    scaler = Scaler()
    train_scaled = scaler.fit_transform(train_log_diff)
    ts_scaled = scaler.transform(ts_log_diff)

    # Covariates are scaled the same way: statistics from training data only.
    cov_transformed = None
    if has_covariates:
        cov_train, _ = covariate_series.split_before(val_start)
        cov_scaler = Scaler()
        cov_scaler.fit(cov_train)
        cov_transformed = cov_scaler.transform(covariate_series)

    # 4. Train Model
    model = RandomForestModel(
        lags=window,
        lags_past_covariates=window if has_covariates else None,
        output_chunk_length=horizon,
        **RF_CONFIG
    )
    model.fit(train_scaled, past_covariates=cov_transformed)

    # 5. Block-by-Block Forecast (stride == horizon, so blocks do not overlap)
    forecast_list_scaled = model.historical_forecasts(
        series=ts_scaled,
        past_covariates=cov_transformed,
        start=val_start,
        forecast_horizon=horizon,
        stride=horizon,
        retrain=False,
        last_points_only=False,
        verbose=False
    )

    if isinstance(forecast_list_scaled, TimeSeries):
        forecast_list_scaled = [forecast_list_scaled]

    # 6. Reconstruction Loop
    # Actual log-prices of the first component, indexed like target_series.
    log_prices = ts_log.values()[:, 0]
    all_pred_dates = []
    all_pred_prices = []

    for chunk_scaled in forecast_list_scaled:
        # A. Inverse Scale -> predicted log-returns
        chunk_diff_log = scaler.inverse_transform(chunk_scaled)
        chunk_dates = chunk_diff_log.time_index
        chunk_log_returns = chunk_diff_log.values().flatten()

        # B. Anchor Price: last actual log-price before the block starts
        idx = target_series.get_index_at_point(chunk_dates[0])
        if idx < 1:
            # idx - 1 would wrap around to the end of the series.
            raise ValueError(
                "Forecast block starts at the first observation; no anchor price available."
            )
        anchor_log_price = log_prices[idx - 1]

        # C. Reconstruct prices from cumulated log-returns
        chunk_log_prices = anchor_log_price + np.cumsum(chunk_log_returns)
        chunk_prices = np.exp(chunk_log_prices)

        all_pred_dates.extend(chunk_dates)
        all_pred_prices.extend(chunk_prices)

    # 7. Metrics & DataFrame
    prediction_df = pd.DataFrame({
        "date": pd.to_datetime(all_pred_dates),
        "predicted": all_pred_prices
    })

    full_df = target_series.to_dataframe().reset_index()
    full_df.columns = ["date", "actual"]

    # Left join keeps every actual observation; dates without a forecast
    # get NaN in "predicted".
    final_df = pd.merge(full_df, prediction_df, on="date", how="left")
    final_df["train_flag"] = 0
    final_df.loc[final_df["date"] <= train.end_time(), "train_flag"] = 1

    # Evaluate only where a prediction exists
    eval_df = final_df.dropna(subset=["predicted"])
    if eval_df.empty:
        raise ValueError(
            "No forecasts overlap the actual series; cannot compute metrics."
        )
    y_true = eval_df["actual"].values
    y_pred = eval_df["predicted"].values
    errors = y_true - y_pred

    mape_val = np.mean(np.abs(errors / y_true)) * 100
    mae_val = np.mean(np.abs(errors))
    mse_val = np.mean(errors ** 2)

    return mape_val, mae_val, mse_val, final_df

# ============================================================
# 🚀 MAIN LOOP (MODIFIED)
# ============================================================

# Run every (target, scenario, covariate set) combination, write one
# prediction CSV per run, and collect one summary row per run in `results`.
results = []
start_time = datetime.now()
print(f"üìÖ Experiment started: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

for target_key, info in DATA_FILES.items():
    df = load_data(info["path"], info["target_col"])
    print(f"\nüìÇ Processing {target_key.upper()}")

    for scenario_name, sc in SCENARIOS.items():
        window, horizon = sc["window"], sc["horizon"]

        for cov_name, cov_list in COVARIATE_SETS.items():
            print(f"[‚ñ∂] {target_key.upper()} | {scenario_name} | Covariates={cov_name}")

            # Identifying fields shared by success and failure rows, so a
            # failed run can be traced back to its exact configuration.
            run_info = {
                "Target": target_key.upper(),
                "Scenario": scenario_name,
                "Covariates": cov_name,
                "Window": window,
                "Horizon": horizon,
            }
            try:
                target_series, covariate_series = to_series(df, info["target_col"], cov_list)

                mape_val, mae_val, mse_val, pred_df = train_and_evaluate_rf(
                    target_series, covariate_series, window, horizon
                )

                # Metadata columns so that each prediction file is
                # self-describing for downstream consumers.
                pred_df["covariate_set"] = cov_name
                pred_df["target"] = target_key
                pred_df["scenario"] = scenario_name

                fname = f"rf_pred_{target_key}_{scenario_name.replace(' ','_')}_{cov_name}.csv"
                pred_df.to_csv(os.path.join(PREDICTION_DIR, fname), index=False)

                results.append({
                    **run_info,
                    "MAPE": round(mape_val, 4),
                    "MAE": round(mae_val, 4),
                    "MSE": round(mse_val, 4)
                })
                print(f"   ‚úÖ MAPE={mape_val:.2f}% | MAE={mae_val:.2f}")

            except Exception as e:
                # Best effort: log the failure, record it in the summary,
                # and continue with the next configuration.
                traceback.print_exc()
                print(f"   ‚ùå Error: {e}")
                results.append({**run_info, "Error": str(e)})

# Persist the summary of all runs under a timestamped file name.
results_df = pd.DataFrame(results)
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
out_path = os.path.join(RESULT_PATH, f"rf_results_summary_{timestamp}.csv")
results_df.to_csv(out_path, index=False)
print(f"\n‚úÖ All experiments done! Saved to {out_path}")

üìÖ Experiment started: 2026-01-06 21:41:44

üìÇ Processing IHSG
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=None
   ‚úÖ MAPE=0.54% | MAE=37.81
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=Domestic
   ‚úÖ MAPE=0.54% | MAE=37.82
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=Global
   ‚úÖ MAPE=0.55% | MAE=39.02
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=All
   ‚úÖ MAPE=0.55% | MAE=39.06
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=Inflation
   ‚úÖ MAPE=0.54% | MAE=37.78
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=GDP
   ‚úÖ MAPE=0.54% | MAE=37.80
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=CPI
   ‚úÖ MAPE=0.54% | MAE=37.81
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=Unemployment
   ‚úÖ MAPE=0.54% | MAE=37.79
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=USD
   ‚úÖ MAPE=0.55% | MAE=38.71
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=Brent
   ‚úÖ MAPE=0.54% | MAE=37.81
[‚ñ∂] IHSG | Short-Term Sprint | Covariates=WTI
   ‚úÖ MAPE=0.54% | MAE=37.81
[‚ñ∂] IHSG | Medium-Term Pace | Covariates=None
 