## Pure Lightgbm approach forecast 5 days volatolity

In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [9]:
# 1. read eth price
eth = pd.read_csv("ETH_price.csv")
eth['date'] = pd.to_datetime(eth['Date(UTC)'])  
eth = eth[['date', 'Value']].rename(columns={'Value': 'ETH_close'})

# 2. read vix
vix = pd.read_csv("vix-daily.csv")
vix['date'] = pd.to_datetime(vix['DATE'])       
vix = vix[['date', 'CLOSE']].rename(columns={'CLOSE': 'VIX_close'})

# 3. read BTC + SP500 + GOLD + DXY
mkt = pd.read_csv("btc_full_sp500_gold_dxy_close_2017_onward.csv")
mkt['date'] = pd.to_datetime(mkt['date'])

# 4. aggregate by date
df = mkt.merge(eth, on='date', how='inner').merge(vix, on='date', how='left')

df = df.sort_values('date').reset_index(drop=True)
print(df.head())
print(df.tail())


FileNotFoundError: [Errno 2] No such file or directory: 'ETH_price.csv'

In [None]:
window = 5
df['eth_ret'] = df['ETH_close'].pct_change()

# past 5 days realized vol
df['eth_vol_5d_past'] = df['eth_ret'].rolling(window).std()
df['eth_vol_5d_past'] = df['eth_vol_5d_past'] * np.sqrt(252 / window)

# next 5 days realized vol
df['eth_vol_5d_next'] = df['eth_ret'].rolling(window).std().shift(-1)
df['eth_vol_5d_next'] = df['eth_vol_5d_next'] * np.sqrt(252 / window)

# define feature
# return
df['eth_ret_today']   = df['ETH_close'].pct_change()
df['btc_ret_today']   = df['BTC_close'].pct_change()
df['sp500_ret_today'] = df['SP500_close'].pct_change()
df['gold_ret_today']  = df['GOLD_close'].pct_change()
df['dxy_ret_today']   = df['DXY_close'].pct_change()
df['vix_ret_today']   = df['VIX_close'].pct_change()

# lag1
df['eth_ret_1d_lag1']   = df['eth_ret_today']
df['btc_ret_1d_lag1']   = df['btc_ret_today']
df['sp500_ret_1d_lag1'] = df['sp500_ret_today']
df['gold_ret_1d_lag1']  = df['gold_ret_today']
df['dxy_ret_1d_lag1']   = df['dxy_ret_today']
df['vix_ret_1d_lag1']   = df['vix_ret_today']

# keep some level factor
df['VIX_level'] = df['VIX_close']

df_model = df.dropna().copy()

In [None]:
# cut the data
train_end = "2023-12-31"
val_end   = "2024-12-31"

train = df_model[df_model['date'] <= train_end]
val   = df_model[(df_model['date'] > train_end) & (df_model['date'] <= val_end)]
test  = df_model[df_model['date'] > val_end]

# feature list
features = [
    # raw price features (safe to use)
    'BTC_open', 'BTC_high', 'BTC_low', 'BTC_close', 'BTC_volume',
    'SP500_close', 'GOLD_close', 'DXY_close',
    'ETH_close', 'VIX_close',

    # same-day returns
    #'eth_ret_today', 'btc_ret_today', 'sp500_ret_today',
    #'gold_ret_today', 'dxy_ret_today', 'vix_ret_today',

    # lag1 returns
    #'eth_ret_1d_lag1', 'btc_ret_1d_lag1', 'sp500_ret_1d_lag1',
    #'gold_ret_1d_lag1', 'dxy_ret_1d_lag1', 'vix_ret_1d_lag1',

    # past volatility (critical)
    'eth_vol_5d_past',
]

target = 'eth_vol_5d_next'

X_train, y_train = train[features], train[target]
X_val,   y_val   = val[features],   val[target]
X_test,  y_test  = test[features],  test[target]

print("Train size:", X_train.shape, "Val size:", X_val.shape, "Test size:", X_test.shape)


Train size: (1754, 11) Val size: (252, 11) Test size: (218, 11)


In [None]:
params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'n_jobs': -1,
    'random_state': 42,
}

model = lgb.LGBMRegressor(**params)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 1754, number of used features: 11
[LightGBM] [Info] Start training from score 0.373439


In [None]:
# test model
mse_val = mean_squared_error(y_val, pred_val)
mse_test = mean_squared_error(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred_test)
r2 = r2_score(y_test, pred_test)

print("MSE :", mse)
print("RMSE:", rmse)
print("MAE :", mae)
print("R^2 :", r2)

pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

print("Baseline LGBM - Val MSE:", mse_val)
print("Baseline LGBM - Test MSE:", mse_test)

corr = np.corrcoef(y_test, pred_test)[0, 1]
print("Correlation:", corr)


MSE : 0.007896210761399795
RMSE: 0.08886062548395546
MAE : 0.06087979899541354
R^2 : 0.6248843261643355
Baseline LGBM - Val MSE: 0.007097034134371079
Baseline LGBM - Test MSE: 0.007896210761399795
Correlation: 0.7936985136379506


In [None]:
# build a DataFrame for plotting
plot_df = pd.DataFrame({
    'date': test['date'].values,   # time index
    'actual': y_test.values,       # actual future ETH return
    'predicted': pred_test         # model prediction
})

# set date as index
plot_df = plot_df.set_index('date')

# plot
plt.figure(figsize=(12,6))

# plot actual returns
plt.plot(plot_df.index, plot_df['actual'], label='Actual Vol', alpha=0.8)

# plot predicted returns
plt.plot(plot_df.index, plot_df['predicted'], label='Predicted Vol', alpha=0.8)

# styling
plt.title("Predicted vs Actual 5-Day ETH Volatility")
plt.xlabel("Date")
plt.ylabel("Return")
plt.legend()
plt.grid(True, alpha=0.3)

plt.show()



NameError: name 'pd' is not defined

In [None]:
# ===== Naive baseline: vol_{t+1} = vol_t =====
y_test_vol = test['eth_vol_5d_next']
y_pred_naive_vol = test['eth_vol_5d_past']  # 直接拿过去5日vol

mse_naive_v  = mean_squared_error(y_test_vol, y_pred_naive_vol)
rmse_naive_v = np.sqrt(mse_naive_v)
mae_naive_v  = mean_absolute_error(y_test_vol, y_pred_naive_vol)
r2_naive_v   = r2_score(y_test_vol, y_pred_naive_vol)

print("=== Naive Vol Baseline (vol_{t+1} = vol_t) ===")
print(f"Test RMSE: {rmse_naive_v}")
print(f"Test MAE : {mae_naive_v}")
print(f"Test R^2 : {r2_naive_v}")



=== Naive Vol Baseline (vol_{t+1} = vol_t) ===
Test RMSE: 0.09389632411316844
Test MAE : 0.05633971584868596
Test R^2 : 0.5811643304212561
