In [None]:
import warnings
warnings.filterwarnings('ignore')

import logging, joblib, os, contextlib
logging.getLogger('lightgbm').setLevel(logging.WARNING)
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import shap
import matplotlib.pyplot as plt

# Locations of the pre-split CSV files
TRAIN_PATH = '/kaggle/input/data-btc/fix2/train.csv'
VAL_PATH   = '/kaggle/input/data-btc/fix2/val.csv'
TEST_PATH  = '/kaggle/input/data-btc/fix2/test.csv'

# Read the three splits in order (train, val, test); the timestamp column is
# parsed on load and then both time columns are discarded.
df_tr, df_va, df_te = (
    pd.read_csv(csv_path, parse_dates=['timestamp'])
      .drop(columns=['timestamp', 'close_time'])
    for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Constants: default STL period, and horizon name -> number of rows the
# `close` column is shifted by when building labels.
PERIOD   = 24
horizons = {
    '1h': 1, '2h': 2, '3h': 3, '6h': 6, '12h': 12,
    '1d': 24, '3d': 72, '7d': 168, '15d': 360, '30d': 720,
}

# Helper: build the shifted label column
def prepare(df, h):
    """Return a labelled copy of ``df`` for a forecast horizon of ``h`` rows.

    A ``label`` column is added holding ``close`` shifted ``h`` rows ahead.
    Rows whose label is missing after the shift are removed and the index is
    renumbered from zero.  The input frame is not modified.
    """
    labelled = df.assign(label=df['close'].shift(-h))
    labelled = labelled.dropna(subset=['label'])
    return labelled.reset_index(drop=True)

# Helper: fit STL with stderr silenced
def fit_stl_quiet(series, period=PERIOD):
    """Fit a robust STL decomposition of ``series`` with stderr discarded.

    ``sys.stderr`` is redirected to the null device while the model is built
    and fitted.  Returns whatever ``STL(...).fit()`` returns; callers in this
    file read its ``trend`` and ``seasonal`` attributes.
    """
    with open(os.devnull, 'w') as sink:
        with contextlib.redirect_stderr(sink):
            decomposer = STL(series, period=period, robust=True)
            return decomposer.fit()

# Helper: Optuna hyper-parameter tuning
def tune_lgbm(X, y):
    """Search LightGBM hyper-parameters with Optuna using time-ordered CV.

    Each of the 20 trials is scored by the mean MSE over the folds of a
    3-split ``TimeSeriesSplit``.

    Args:
        X: feature ``DataFrame``; folds are selected positionally via ``iloc``.
        y: target values aligned row-for-row with ``X``.  Converted to a NumPy
            array so the positional fold indices work for any array-like.

    Returns:
        dict of keyword arguments for ``LGBMRegressor``: the best trial's
        sampled parameters plus the fixed ``bagging_freq`` they were tuned
        with.
    """
    y = np.asarray(y)

    # LightGBM only applies ``bagging_fraction`` when ``bagging_freq`` is
    # non-zero.  Without it the tuned fraction is a no-op, so it is fixed here
    # and returned with the result so the final model is trained the same way.
    fixed = {'bagging_freq': 1}

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'num_leaves': trial.suggest_int('num_leaves', 16, 64),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
            'verbose': -1, 'device': 'gpu',
            **fixed,
        }
        fold_mse = []
        # Expanding-window folds: every validation fold lies after its
        # training fold in row order.
        for train_idx, valid_idx in TimeSeriesSplit(n_splits=3).split(X):
            m = LGBMRegressor(random_state=42, **params)
            m.fit(X.iloc[train_idx], y[train_idx])
            fold_mse.append(
                mean_squared_error(y[valid_idx], m.predict(X.iloc[valid_idx]))
            )
        return float(np.mean(fold_mse))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20, show_progress_bar=False)
    return {**study.best_trial.params, **fixed}

# Phase 1: retrain at every step using 3 features
def walk_forward_phase1(history_df, test_df, label):
    """Walk-forward evaluation that refits a default LightGBM model per step.

    For each of the first (at most 10) rows of ``test_df``:

    1. STL is fitted on the history closes plus the current close.
    2. A fresh ``LGBMRegressor`` is trained on the history to predict
       ``label - (trend + season)`` from ``close``, ``trend`` and ``season``.
    3. The forecast is ``trend + season + predicted residual`` for the
       current row, which is then appended to the history.

    RMSE and MAPE over the evaluated rows are shown in the title of an
    actual-vs-predicted plot.

    Args:
        history_df: frame with at least ``close`` and ``label`` columns.
        test_df: frame with at least ``close`` and ``label`` columns.
        label: horizon name used in the plot title.

    Returns:
        None.  Nothing is plotted when ``test_df`` is empty.
    """
    pts = min(10, len(test_df))
    if pts == 0:
        # The sklearn metrics raise on empty input; skip so a loop over
        # several horizons can carry on with the remaining ones.
        print(f'{label} Phase1: no rows to evaluate, skipping')
        return

    history = history_df.copy()
    history_close = history['close'].to_numpy()
    preds, trues = [], []

    for i in range(pts):
        tc = test_df['close'].iloc[i]
        full = np.append(history_close, tc)
        stl = fit_stl_quiet(full)
        trend, season = stl.trend, stl.seasonal
        n = len(history)  # index of the current observation inside `full`

        # Build the training frame from the history; trend/season/residual
        # are always recomputed from this step's decomposition.
        # NOTE(review): rows appended from test_df keep their `label`, which
        # is then used as a training target on later steps — confirm that
        # this does not look ahead of the step being forecast.
        df_re = history.copy()
        df_re['trend']    = trend[:n]
        df_re['season']   = season[:n]
        df_re['residual'] = df_re['label'] - (df_re['trend'] + df_re['season'])

        Xr = df_re[['close', 'trend', 'season']]
        yr = df_re['residual'].values

        mdl = LGBMRegressor(random_state=42)
        mdl.fit(Xr, yr)

        feat = test_df.iloc[[i]].copy()
        feat['trend']  = trend[n]
        feat['season'] = season[n]
        Xf = feat[['close', 'trend', 'season']]

        rp = mdl.predict(Xf)[0]
        fc = trend[n] + season[n] + rp

        preds.append(fc)
        trues.append(test_df['label'].iloc[i])

        history = pd.concat([history, test_df.iloc[[i]]], ignore_index=True)
        history_close = np.append(history_close, tc)

    # `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
    # taking the square root explicitly works on every version.
    rmse = float(np.sqrt(mean_squared_error(trues, preds)))
    mape = mean_absolute_percentage_error(trues, preds) * 100

    plt.figure(figsize=(9, 4))
    plt.plot(trues, label='Actual')
    plt.plot(preds, '--', label='Predicted')
    plt.title(f'{label} Phase1: RMSE={rmse:.2f}, MAPE={mape:.2f}%')
    plt.legend(); plt.tight_layout(); plt.show()

# Phase 2: predict with the final (already fitted) model
def walk_forward_phase2(history_df, test_df, model, feature_cols, label):
    """Walk-forward evaluation of a pre-trained residual model.

    For each of the first (at most 10) rows of ``test_df`` STL is fitted on
    the history closes plus the current close.  The current row is given the
    resulting ``trend`` and ``season`` values, ``model`` predicts the residual
    from ``feature_cols``, and the forecast is
    ``trend + season + predicted residual``.  RMSE and MAPE are shown in the
    title of an actual-vs-predicted plot.

    Args:
        history_df: frame with at least a ``close`` column.
        test_df: frame with ``close``, ``label`` and every non-STL column
            named in ``feature_cols``.
        model: fitted estimator exposing ``predict``.
        feature_cols: column names fed to ``model`` (may include ``trend``
            and ``season``, which are filled in here).
        label: horizon name used in the plot title.

    Returns:
        None.  Nothing is plotted when ``test_df`` is empty.
    """
    pts = min(10, len(test_df))
    if pts == 0:
        # The sklearn metrics raise on empty input; skip so a loop over
        # several horizons can carry on with the remaining ones.
        print(f'{label} Phase2: no rows to evaluate, skipping')
        return

    # Only the close series is needed: the model is not refitted, so no
    # per-step copy of the full history frame is kept.
    history_close = history_df['close'].to_numpy()
    preds, trues = [], []

    for i in range(pts):
        tc = test_df['close'].iloc[i]
        full = np.append(history_close, tc)
        stl = fit_stl_quiet(full)
        trend, season = stl.trend, stl.seasonal
        n = len(history_close)  # index of the current observation in `full`

        feat = test_df.iloc[[i]].copy()
        feat['trend']  = trend[n]
        feat['season'] = season[n]
        Xf = feat[feature_cols]

        rp = model.predict(Xf)[0]
        fc = trend[n] + season[n] + rp

        preds.append(fc)
        trues.append(test_df['label'].iloc[i])

        history_close = full

    # `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
    # taking the square root explicitly works on every version.
    rmse = float(np.sqrt(mean_squared_error(trues, preds)))
    mape = mean_absolute_percentage_error(trues, preds) * 100

    plt.figure(figsize=(9, 4))
    plt.plot(trues, label='Actual')
    plt.plot(preds, '--', label='Predicted')
    plt.title(f'{label} Phase2: RMSE={rmse:.2f}, MAPE={mape:.2f}%')
    plt.legend(); plt.tight_layout(); plt.show()

# ===== PHASE 1: TRAIN → VAL =====
print("===== FASE 1: TRAIN → VAL =====")
for label, h in horizons.items():
    # Label both splits for this horizon.  No STL fit is done here:
    # walk_forward_phase1 recomputes trend/season/residual from its own
    # decomposition on every step, so columns set here would be overwritten.
    df_tr_p = prepare(df_tr, h)
    df_va_p = prepare(df_va, h)
    print(f'-- {label} --')
    walk_forward_phase1(df_tr_p, df_va_p, label)

# ===== PHASE 2: (TRAIN+VAL) → TEST [1h & 30d] =====
# For each selected horizon: label the combined train+val history, fit STL on
# its close series, tune and train a LightGBM model on the STL residual of the
# label, save the model, report SHAP importances and evaluate on the test split.
print("\n===== FASE 2: (TRAIN+VAL) → TEST =====")
for label, h in {'1h':1, '30d':720}.items():
    print(f"\n--- Horizon {label} ---")
    # Train and val are concatenated before labelling, so labels may reach
    # across the train/val boundary.
    df_hist = prepare(pd.concat([df_tr, df_va], ignore_index=True), h)
    # NOTE(review): df_test is empty when the test split has no more than h
    # rows — confirm the split is long enough for the 30d horizon.
    df_test= prepare(df_te, h)

    # STL + residual: the target is the part of the label that is not
    # explained by the current trend + seasonal components.
    stl = fit_stl_quiet(df_hist['close'])
    df_hist['trend'], df_hist['season'] = stl.trend, stl.seasonal
    df_hist['residual'] = df_hist['label'] - (df_hist['trend'] + df_hist['season'])

    # Exclude the targets ('label', 'residual') and every label_* column;
    # all remaining columns (including close, trend and season) are features.
    feature_cols = [c for c in df_hist.columns
                    if c not in ('label','residual') and not c.startswith('label_')]

    X = df_hist[feature_cols]
    y = df_hist['residual'].values

    # Tune hyper-parameters, then train the final model on the full history
    best = tune_lgbm(X, y)
    print(" best_params:", best)
    model = LGBMRegressor(random_state=42, **best)
    model.fit(X, y)
    joblib.dump(model, f'/kaggle/working/model_{label}.pkl')

    # SHAP top-20: rank features by mean absolute SHAP value over X
    expl = shap.TreeExplainer(model)
    sv   = expl.shap_values(X)
    df_sv= pd.DataFrame(np.abs(sv), columns=feature_cols)
    imp20 = df_sv.mean().sort_values(ascending=False).head(20)
    print("\nTop 20 SHAP importances:")
    print(imp20.to_string())

    # Evaluation: walk-forward forecast on the test rows
    walk_forward_phase2(df_hist, df_test, model, feature_cols, label)


SHAP

In [None]:
import warnings
warnings.filterwarnings('ignore')

import logging, joblib, os, contextlib
logging.getLogger('lightgbm').setLevel(logging.WARNING)
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import shap
import matplotlib.pyplot as plt

# 1) Load data
TRAIN_PATH = '/kaggle/input/data-btc/fix3/fix2/train.csv'
VAL_PATH   = '/kaggle/input/data-btc/fix3/fix2/val.csv'
TEST_PATH  = '/kaggle/input/data-btc/fix3/fix2/test.csv'

# Read the three splits in order (train, val, test); the timestamp column is
# parsed on load and then both time columns are discarded.
df_tr, df_va, df_te = (
    pd.read_csv(csv_path, parse_dates=['timestamp'])
      .drop(columns=['timestamp', 'close_time'])
    for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# 2) Helper functions
def prepare(df, h):
    """Return a labelled copy of ``df`` for a forecast horizon of ``h`` rows.

    The ``label`` column is ``close`` shifted ``h`` rows ahead.  Rows left
    without a label are dropped and the index is rebuilt from zero.  The
    caller's frame is left unchanged.
    """
    out = df.copy()
    out['label'] = out['close'].shift(-h)
    return out.dropna(subset=['label']).reset_index(drop=True)

def fit_stl_quiet(series, period=24):
    """Fit a robust STL decomposition of ``series`` with stderr discarded.

    ``sys.stderr`` is redirected to the null device while the model is built
    and fitted.  Returns whatever ``STL(...).fit()`` returns; callers in this
    file read its ``trend`` and ``seasonal`` attributes.
    """
    with open(os.devnull, 'w') as sink:
        with contextlib.redirect_stderr(sink):
            decomposer = STL(series, period=period, robust=True)
            return decomposer.fit()

def tune_lgbm(X, y):
    """Search LightGBM hyper-parameters with Optuna using time-ordered CV.

    Each of the 20 trials is scored by the mean MSE over the folds of a
    3-split ``TimeSeriesSplit``.

    Args:
        X: feature ``DataFrame``; folds are selected positionally via ``iloc``.
        y: target values aligned row-for-row with ``X``.  Converted to a NumPy
            array so the positional fold indices work for any array-like.

    Returns:
        dict of keyword arguments for ``LGBMRegressor``: the best trial's
        sampled parameters plus the fixed ``bagging_freq`` they were tuned
        with.
    """
    y = np.asarray(y)

    # LightGBM only applies ``bagging_fraction`` when ``bagging_freq`` is
    # non-zero.  Without it the tuned fraction is a no-op, so it is fixed here
    # and returned with the result so the final model is trained the same way.
    fixed = {'bagging_freq': 1}

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'num_leaves': trial.suggest_int('num_leaves', 16, 64),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
            'verbose': -1, 'device': 'gpu',
            **fixed,
        }
        fold_mse = []
        # Expanding-window folds: every validation fold lies after its
        # training fold in row order.
        for train_idx, valid_idx in TimeSeriesSplit(n_splits=3).split(X):
            m = LGBMRegressor(random_state=42, **params)
            m.fit(X.iloc[train_idx], y[train_idx])
            fold_mse.append(
                mean_squared_error(y[valid_idx], m.predict(X.iloc[valid_idx]))
            )
        return float(np.mean(fold_mse))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20, show_progress_bar=False)
    return {**study.best_trial.params, **fixed}

def walk_forward(history_df, test_df, model, feature_cols, period=24):
    """Walk-forward evaluation of a pre-trained residual model.

    For each of the first (at most 10) rows of ``test_df`` STL is fitted on
    the history closes plus the current close.  The current row is given the
    resulting ``trend`` and ``season`` values, ``model`` predicts the residual
    from ``feature_cols``, and the forecast is
    ``trend + season + predicted residual``.  RMSE and MAPE are shown in the
    title of an actual-vs-predicted plot.

    Args:
        history_df: frame with at least a ``close`` column.
        test_df: frame with ``close``, ``label`` and every non-STL column
            named in ``feature_cols``.
        model: fitted estimator exposing ``predict``.
        feature_cols: column names fed to ``model`` (may include ``trend``
            and ``season``, which are filled in here).
        period: seasonal period passed to the STL fit.

    Returns:
        None.  Nothing is plotted when ``test_df`` is empty.
    """
    pts = min(10, len(test_df))
    if pts == 0:
        # The sklearn metrics raise on empty input; skip so a loop over
        # several horizons can carry on with the remaining ones.
        print('Walk-forward: no rows to evaluate, skipping')
        return

    # Only the close series is needed: the model is not refitted, so no
    # per-step copy of the full history frame is kept.
    history_close = history_df['close'].to_numpy()
    preds, trues = [], []

    for i in range(pts):
        tc = test_df['close'].iloc[i]
        full = np.append(history_close, tc)
        stl = fit_stl_quiet(full, period)
        trend, season = stl.trend, stl.seasonal
        n = len(history_close)  # index of the current observation in `full`

        feat = test_df.iloc[[i]].copy()
        feat['trend']  = trend[n]
        feat['season'] = season[n]
        Xf = feat[feature_cols]

        rp = model.predict(Xf)[0]
        fc = trend[n] + season[n] + rp

        preds.append(fc)
        trues.append(test_df['label'].iloc[i])

        history_close = full

    # `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
    # taking the square root explicitly works on every version.
    rmse = float(np.sqrt(mean_squared_error(trues, preds)))
    mape = mean_absolute_percentage_error(trues, preds) * 100

    plt.figure(figsize=(9, 4))
    plt.plot(trues, label='Actual')
    plt.plot(preds, '--', label='Predicted')
    # Report the number of points actually evaluated (10 unless test_df is shorter).
    plt.title(f'Forecast vs Actual ({pts}) — RMSE={rmse:.2f}, MAPE={mape:.2f}%')
    plt.legend(); plt.tight_layout(); plt.show()


# 3) PHASE 2 — horizons 1h, 1d, 30d
# For each horizon: label the combined train+val history, fit STL on its close
# series, tune and train a LightGBM model on the STL residual of the label,
# save the model, report SHAP importances and evaluate on the test split.
for label, h in {'1h':1, '1d':24, '30d':720}.items():
    print(f"\n=== Horizon {label} ===")

    # a) prepare the data (train and val are concatenated before labelling)
    df_hist = prepare(pd.concat([df_tr, df_va], ignore_index=True), h)
    # NOTE(review): df_test is empty when the test split has no more than h
    # rows — confirm the split is long enough for the 30d horizon.
    df_test = prepare(df_te, h)

    # b) STL + residual as the target: the part of the label that is not
    #    explained by the current trend + seasonal components
    stl = fit_stl_quiet(df_hist['close'])
    df_hist['trend'], df_hist['season'] = stl.trend, stl.seasonal
    df_hist['residual'] = df_hist['label'] - (df_hist['trend'] + df_hist['season'])

    # c) choose feature_cols: drop label, residual, and every label_* column;
    #    all remaining columns (including close, trend and season) are features
    feature_cols = [
        c for c in df_hist.columns
        if c not in ('label','residual') and not c.startswith('label_')
    ]

    # d) tune hyper-parameters, then train the final model on the full history
    X = df_hist[feature_cols]
    y = df_hist['residual'].values
    best = tune_lgbm(X, y)
    print(" best_params:", best)

    model = LGBMRegressor(random_state=42, **best)
    model.fit(X, y)
    joblib.dump(model, f'/kaggle/working/model_{label}.pkl')

    # e) SHAP — top 20 features by mean(|SHAP|) over X
    expl = shap.TreeExplainer(model)
    sv   = expl.shap_values(X)
    df_sv = pd.DataFrame(np.abs(sv), columns=feature_cols)
    imp20 = df_sv.mean().sort_values(ascending=False).head(20)
    print("\nTop 20 SHAP importances:")
    print(imp20.to_string())

    # f) evaluation via walk_forward (uses its default STL period)
    walk_forward(df_hist, df_test, model, feature_cols)
