In [564]:
# =========================
# 1. IMPORTS
# =========================
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt

In [565]:
# =========================
# 2. LOAD Processed DATA
# =========================
df = pd.read_csv("../data/processing/processed_v2.csv")
volatility = "volatility_rolling_28"
zscore = "zscore_rolling_28"

df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)
df["zscore_scaled"] = np.tanh(df[zscore] / 6)
# df["ema_dist"] = (df["ema_9"] - df["ema_21"]) / df["close"] * 10

In [566]:
df.columns

Index(['Unnamed: 0', 'date', 'close', 'high', 'low', 'open', 'volume',
       'log_returns', 'simple_returns', 'zscore_rolling_28',
       'volatility_rolling_28', 'ema_9', 'ema_21', 'zscore_scaled'],
      dtype='str')

In [567]:
# px.line(
#     df,
#     x="date",
#     y=["log_returns", "zscore_scaled", volatility, "ema_dist"],
# )

In [568]:
# =========================
# 4. SUPERVISED DATASET
# =========================

features = ["log_returns", volatility, "zscore_scaled"]
target = "log_returns"

lookback = 7
train_ratio = 0.8


def make_sequences(df_feat, target_col, lookback):
    X, y = [], []

    for i in range(len(df_feat) - lookback):
        X.append(df_feat.iloc[i : i + lookback].values.flatten())
        y.append(df_feat.iloc[i + lookback][target_col])

    return np.array(X), np.array(y)


# ===== build sequences
data = df[features]
X, y = make_sequences(data, target, lookback)

split = int(len(X) * train_ratio)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


# =========================
# EXPORT CSV
# =========================
col_names = [f"{f}_t-{lookback-t}" for t in range(lookback) for f in features]

df_train = pd.DataFrame(X_train, columns=col_names)
df_train["target"] = y_train

df_test = pd.DataFrame(X_test, columns=col_names)
df_test["target"] = y_test

df_train.to_csv("../data/datasets/train_sequences.csv", index=False)
df_test.to_csv("../data/datasets/test_sequences.csv", index=False)

In [569]:
X_train = X_train.reshape(-1, lookback, len(features))
X_test  = X_test.reshape(-1, lookback, len(features))

In [570]:

# =========================
# 5. NORMALIZATION
# =========================
X_mu = X_train.mean(axis=(0, 1), keepdims=True)
X_std = X_train.std(axis=(0, 1), keepdims=True) + 1e-8

X_train = (X_train - X_mu) / X_std
X_test = (X_test - X_mu) / X_std

y_mu = y_train.mean()
y_std = y_train.std() + 1e-8

y_train = (y_train - y_mu) / y_std
y_test = (y_test - y_mu) / y_std

In [571]:
# =========================
# 6. DATASET + DATALOADER
# =========================
class ReturnDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float().unsqueeze(-1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_loader = DataLoader(ReturnDataset(X_train, y_train), batch_size=32, shuffle=False)

test_loader = DataLoader(ReturnDataset(X_test, y_test), batch_size=32, shuffle=False)


In [None]:
# # =========================
# # 7. MODEL
# # =========================
# class LSTMRegressor(nn.Module):
#     def __init__(self, input_dim=4, hidden_dim=16):
#         super().__init__()
#         self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, 1)

#     def forward(self, x):
#         out, _ = self.lstm(x)
#         h_last = out[:, -1, :]
#         return self.fc(h_last)


class LSTMRegressor(nn.Module):
    def __init__(
        self,
        input_dim: int = 3,
        hidden_dim: int = 32,
        num_layers: int = 2,
    ):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

        # chỉ 1 linear layer
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, _ = self.lstm(x)

        # lấy timestep cuối
        last_hidden = out[:, -1, :]

        out = self.fc(last_hidden)

        return out


model = LSTMRegressor(input_dim=len(features))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [573]:
log_dir = f"runs/lstm_regressor_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
writer = SummaryWriter(log_dir)

In [574]:
epochs = 300

for epoch in range(epochs):
    model.train()
    train_loss_sum = 0.0

    for xb, yb in train_loader:
        optimizer.zero_grad()

        preds = model(xb)
        loss = criterion(preds, yb)

        loss.backward()

        # vẫn giữ clip để tránh exploding gradient (nên giữ cái này)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        train_loss_sum += loss.item() * len(xb)

    train_loss = train_loss_sum / len(train_loader.dataset)

    print(f"Epoch {epoch+1:3d} | Loss {train_loss:.6f}")

    # ===== epoch log =====
    writer.add_scalar("train/loss", train_loss, epoch)

    print(f"Epoch {epoch+1:3d} | Loss {train_loss:.6f}")

    # ===== optional: weight histogram =====
    for name, param in model.named_parameters():
        writer.add_histogram(name, param, epoch)

Epoch   1 | Loss 1.002737
Epoch   1 | Loss 1.002737
Epoch   2 | Loss 1.002661
Epoch   2 | Loss 1.002661
Epoch   3 | Loss 1.001591
Epoch   3 | Loss 1.001591
Epoch   4 | Loss 1.000771
Epoch   4 | Loss 1.000771
Epoch   5 | Loss 0.999888
Epoch   5 | Loss 0.999888
Epoch   6 | Loss 0.998464
Epoch   6 | Loss 0.998464
Epoch   7 | Loss 0.996165
Epoch   7 | Loss 0.996165
Epoch   8 | Loss 0.993313
Epoch   8 | Loss 0.993313
Epoch   9 | Loss 1.000284
Epoch   9 | Loss 1.000284
Epoch  10 | Loss 0.995624
Epoch  10 | Loss 0.995624
Epoch  11 | Loss 0.990582
Epoch  11 | Loss 0.990582
Epoch  12 | Loss 0.983799
Epoch  12 | Loss 0.983799
Epoch  13 | Loss 0.991071
Epoch  13 | Loss 0.991071
Epoch  14 | Loss 0.984683
Epoch  14 | Loss 0.984683
Epoch  15 | Loss 0.992437
Epoch  15 | Loss 0.992437
Epoch  16 | Loss 0.985923
Epoch  16 | Loss 0.985923
Epoch  17 | Loss 0.969430
Epoch  17 | Loss 0.969430
Epoch  18 | Loss 0.959276
Epoch  18 | Loss 0.959276
Epoch  19 | Loss 0.952401
Epoch  19 | Loss 0.952401
Epoch  20 | 

In [590]:
torch.save(model.state_dict(), "lstm_model.pt")

In [575]:
def predict(loader):
    preds, trues = [], []

    model.eval()
    with torch.no_grad():
        for xb, yb in loader:
            out = model(xb)

            preds.append(out.squeeze(-1).cpu().numpy())
            trues.append(yb.squeeze(-1).cpu().numpy())

    return np.concatenate(preds), np.concatenate(trues)

def evaluate_and_log(name, y_pred, y_true, epoch):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    writer.add_scalar(f"{name}/mse", mse, epoch)
    writer.add_scalar(f"{name}/rmse", rmse, epoch)
    writer.add_scalar(f"{name}/mae", mae, epoch)
    writer.add_scalar(f"{name}/r2", r2, epoch)

    print(
        f"{name.upper()} | "
        f"MSE {mse:.6f} | RMSE {rmse:.6f} | MAE {mae:.6f} | R2 {r2:.4f}"
    )

    fig = plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, s=5, alpha=0.5)
    plt.plot([y_true.min(), y_true.max()],
             [y_true.min(), y_true.max()],
             linestyle="--")
    writer.add_figure(f"{name}/pred_vs_true", fig, epoch)
    plt.close()


In [576]:
def update_ret_pers(prev_ret_pers, prev_log_ret, new_log_ret, scale=20):
    """
    Cập nhật ret_pers dựa trên logic run-length
    """
    if np.sign(new_log_ret) == np.sign(prev_log_ret) and new_log_ret != 0:
        run = prev_ret_pers * scale + 1
    else:
        run = 1 if new_log_ret != 0 else 0

    return np.tanh(run / scale)


In [577]:
def recursive_forecast_log_return(
    model,
    df,
    start_index,
    n_steps,
    lookback,
    X_mu,
    X_std,
    y_mu,
    y_std,
    features,
    target,
):
    """
    Recursive forecast log_returns, then reconstruct close
    """

    model.eval()

    # ===== 1. Initial window (REAL DATA)
    window = df.loc[
        start_index - lookback : start_index - 1,
        features,
    ].values.copy()

    # store predictions
    logret_preds = []

    target_idx = features.index(target)
    z_idx = features.index("zscore_scaled") if "zscore_scaled" in features else None
    vol_idx = features.index(volatility) if volatility in features else None
    ema_dist_idx = features.index("ema_dist") if "ema_dist" in features else None

    # init rolling stats from real data
    ret_hist = df["log_returns"].iloc[start_index - 30 : start_index].tolist()
    last_close = df["close"].iloc[start_index - 1]
    last_ema_9 = df["ema_9"].iloc[start_index - 1]
    last_ema_21 = df["ema_21"].iloc[start_index - 1]

    alpha_9 = 2 / (9 + 1)
    alpha_21 = 2 / (21 + 1)

    for _ in range(n_steps):

        # normalize window
        x = (window - X_mu.squeeze()) / X_std.squeeze()
        x = torch.tensor(x).float().unsqueeze(0)

        # predict log_returns
        with torch.no_grad():
            y_hat_norm = model(x).item()

        new_log_ret = y_hat_norm * y_std + y_mu
        logret_preds.append(new_log_ret)

        # ===== update derived features
        # update close & EMA
        new_close = last_close * np.exp(new_log_ret)
        new_ema_9 = alpha_9 * new_close + (1 - alpha_9) * last_ema_9
        new_ema_21 = alpha_21 * new_close + (1 - alpha_21) * last_ema_21
        new_ema_dist = new_ema_9 - new_ema_21

        # update rolling stats for log_returns
        ret_hist.append(new_log_ret)
        if len(ret_hist) > 30:
            ret_hist = ret_hist[-30:]
        mu = float(np.mean(ret_hist))
        sigma = float(np.std(ret_hist)) + 1e-8
        new_z = (new_log_ret - mu) / sigma
        new_vol = sigma

        # build new row
        new_row = window[-1].copy()
        new_row[target_idx] = new_log_ret
        if z_idx is not None:
            new_row[z_idx] = new_z
        if vol_idx is not None:
            new_row[vol_idx] = new_vol
        if ema_dist_idx is not None:
            new_row[ema_dist_idx] = new_ema_dist

        # roll window
        window = np.vstack([window[1:], new_row])

        # update memory
        last_close = new_close
        last_ema_9 = new_ema_9
        last_ema_21 = new_ema_21

    return np.array(logret_preds)


In [578]:
import plotly.graph_objects as go

# ===== predict =====
y_pred_norm, y_true_norm = predict(train_loader)

# inverse transform
y_pred = y_pred_norm * y_std + y_mu
y_true = y_true_norm * y_std + y_mu


# ===== map dates =====
all_target_idx = np.arange(lookback, len(df))
train_target_idx = all_target_idx[:split]
dates_train = df["date"].iloc[train_target_idx].reset_index(drop=True)


# =========================
# Plotly
# =========================
fig = go.Figure()

# actual
fig.add_trace(go.Scatter(
    x=dates_train,
    y=y_true,
    mode="lines",
    name="Actual log_returns",
    line=dict(width=1.5)
))

# predicted
fig.add_trace(go.Scatter(
    x=dates_train,
    y=y_pred,
    mode="lines",
    name="Predicted log_returns",
    opacity=0.8
))

# zero line
fig.add_hline(y=0, line_width=1, line_dash="dash")


# ===== layout tuning =====
fig.update_layout(
    title="Log Returns - Train Set (Actual vs Pred)",
    xaxis_title="Date",
    yaxis_title="log_returns",
    hovermode="x unified",
    height=450,
    legend=dict(orientation="h", y=1.05)
)

fig.show()


In [579]:
import plotly.graph_objects as go

# ===== predict =====
y_pred_norm, y_true_norm = predict(test_loader)

# inverse transform
y_pred = y_pred_norm * y_std + y_mu
y_true = y_true_norm * y_std + y_mu


# ===== map dates =====
all_target_idx = np.arange(lookback, len(df))
test_target_idx = all_target_idx[split:]
dates_test = df["date"].iloc[test_target_idx].reset_index(drop=True)


# =========================
# 📈 Plotly
# =========================
fig = go.Figure()

# actual
fig.add_trace(go.Scatter(
    x=dates_test,
    y=y_true,
    mode="lines",
    name="Actual log_returns",
    line=dict(width=1.5)
))

# predicted
fig.add_trace(go.Scatter(
    x=dates_test,
    y=y_pred,
    mode="lines",
    name="Predicted log_returns",
    opacity=0.8
))

# zero line
fig.add_hline(y=0, line_width=1, line_dash="dash")


# ===== layout tuning =====
fig.update_layout(
    title="Log Returns — Test Set (Actual vs Pred)",
    xaxis_title="Date",
    yaxis_title="log_returns",
    hovermode="x unified",
    height=450,
    legend=dict(orientation="h", y=1.05)
)

fig.show()


In [580]:
START_INDEX = split + lookback  # điểm bắt đầu dự đoán
N_STEPS = 430  # số ngày dự đoán tiếp

logret_pred = recursive_forecast_log_return(
    model=model,
    df=df,
    start_index=START_INDEX,
    n_steps=N_STEPS,
    lookback=lookback,
    X_mu=X_mu,
    X_std=X_std,
    y_mu=y_mu,
    y_std=y_std,
    features=features,
    target=target,
)

In [581]:
anchor_close = df["close"].iloc[START_INDEX - 1]

close_pred = anchor_close * np.exp(np.cumsum(logret_pred))


In [582]:
df_rec = df.iloc[START_INDEX : START_INDEX + N_STEPS].copy()
df_rec["close_pred"] = close_pred


In [583]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_rec["date"],
        y=df_rec["close"],
        name="Actual Close",
        line=dict(color="black"),
    )
)

fig.add_trace(
    go.Scatter(
        x=df_rec["date"],
        y=df_rec["close_pred"],
        name="Recursive Prediction",
        line=dict(color="red"),
    )
)

start_date = df_rec["date"].iloc[0]

fig.add_shape(
    type="line",
    x0=start_date,
    x1=start_date,
    y0=0,
    y1=1,
    xref="x",
    yref="paper",
    line=dict(color="blue", dash="dash"),
)

fig.add_annotation(
    x=start_date,
    y=1,
    yref="paper",
    text="Prediction Start",
    showarrow=False,
    xanchor="left",
    yanchor="bottom",
)

fig.update_layout(
    title="Recursive Forecast (log_return → close)",
    xaxis_title="Time",
    yaxis_title="Price",
    legend=dict(x=0.01, y=0.99),
)

fig.show()

In [584]:
cols = features
df_corr = df[cols].copy()


In [585]:
corr_matrix = df_corr.corr(method="pearson")
print(corr_matrix)

                       log_returns  volatility_rolling_28  zscore_scaled
log_returns               1.000000              -0.027602       0.900913
volatility_rolling_28    -0.027602               1.000000      -0.028801
zscore_scaled             0.900913              -0.028801       1.000000


In [586]:
# X shape: (N, lookback, 4)
X_flat = X.reshape(X.shape[0], -1)

col_names = []
for t in range(lookback):
    for f in features:
        col_names.append(f"{f}_t-{lookback-t}")

df_flat = pd.DataFrame(X_flat, columns=col_names)


In [587]:
corr_flat = df_flat.corr()

In [588]:
y_series = pd.Series(y, name="y_log_returns")

corr_with_y = df_flat.apply(lambda col: col.corr(y_series))
corr_with_y.sort_values(key=np.abs, ascending=False).head(30)


log_returns_t-7             -0.043575
zscore_scaled_t-5            0.030958
log_returns_t-3              0.023922
log_returns_t-1             -0.021812
zscore_scaled_t-7           -0.021358
log_returns_t-6             -0.020699
zscore_scaled_t-3            0.019208
log_returns_t-4             -0.018990
zscore_scaled_t-4           -0.015055
log_returns_t-5              0.010303
volatility_rolling_28_t-6    0.008424
volatility_rolling_28_t-7    0.007988
volatility_rolling_28_t-4   -0.007031
volatility_rolling_28_t-1   -0.005697
zscore_scaled_t-2           -0.004682
volatility_rolling_28_t-2   -0.003862
log_returns_t-2              0.002921
volatility_rolling_28_t-3   -0.002797
volatility_rolling_28_t-5    0.002785
zscore_scaled_t-1           -0.000727
zscore_scaled_t-6           -0.000069
dtype: float64

In [589]:
import plotly.graph_objects as go

# ===== predict =====
y_pred_norm, y_true_norm = predict(test_loader)

# inverse transform
y_pred = y_pred_norm * y_std + y_mu
y_true = y_true_norm * y_std + y_mu


# ===== map dates =====
all_target_idx = np.arange(lookback, len(df))
test_target_idx = all_target_idx[split:]
dates_test = df["date"].iloc[test_target_idx].reset_index(drop=True)


# =========================
# 📈 Plotly
# =========================
fig = go.Figure()

# actual
fig.add_trace(go.Scatter(
    x=dates_test,
    y=y_true,
    mode="lines",
    name="Actual log_returns",
    line=dict(width=1.5)
))

# predicted
fig.add_trace(go.Scatter(
    x=dates_test,
    y=y_pred,
    mode="lines",
    name="Predicted log_returns",
    opacity=0.8
))

# zero line
fig.add_hline(y=0, line_width=1, line_dash="dash")


# ===== layout tuning =====
fig.update_layout(
    title="Log Returns — Test Set (Actual vs Pred)",
    xaxis_title="Date",
    yaxis_title="log_returns",
    hovermode="x unified",
    height=450,
    legend=dict(orientation="h", y=1.05)
)

fig.show()
