In [None]:
%%writefile /kaggle/working/RWKV-TS/configs/train_config.yaml

data:
  path: "/kaggle/working/BNB_OHLC_5m.csv"
  freq: "5T"                # Data resolution: 5 minutes
  train_ratio: 1.0        

model:
  model_type: "lstm"    # "lstm", "lstm_attention", "optimize"
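  enc_in: 26                # Number of input features
  d_model: 128               # Hidden size. NOTE: an earlier note here said "reduced from 128 -> 96", but the value is 128 - confirm which is intended
  e_layers: 1               # Number of LSTM layers
  n_heads: 4                # Number of attention heads
  seq_len: 288              # 24h = 288 * 5 min (larger context window)
  pred_len: 36              # 3h = 36 * 5 min
  dropout: 0.5              # Higher dropout to reduce overfitting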
  output_dim: 1

training:
  epochs: 100               
  batch_size: 128           
  lr: 0.0001                
  device: "cuda"
  log_dir: "logs"
  checkpoint_dir: "checkpoints"
  checkpoint_interval: 10
  resume: "auto"
  patience: 50             
  min_delta: 0.0005
  loss_fn: "mse"
  use_amp: True
  grad_accum_steps: 1       
  ema_decay: 0.996
  use_swa: True
  swa_lr: 0.01
  swa_start_ratio: 0.5
  warmup_epochs: 5   

In [4]:
import pandas as pd

# Load the raw OHLC export from the Kaggle input dataset.
df = pd.read_csv('/kaggle/input/bnb-eth-ohlc-5m/BNB_OHLC_5m.csv')

# Normalise the timestamp column name to 'timestamp'.
# Aliases are tried in priority order and at most ONE is renamed: renaming
# every match (or renaming while a 'timestamp' column already exists) would
# leave several columns called 'timestamp' in the written CSV.
column_map = {
    'Timestamp': 'timestamp',
    'Date': 'timestamp',
    'time': 'timestamp'
}
for original_col, target_col in column_map.items():
    if target_col in df.columns:
        # Already normalised; nothing to rename.
        break
    if original_col in df.columns:
        # Non-mutating rename keeps the cell safe to re-run.
        df = df.rename(columns={original_col: target_col})
        break

# Write the normalised copy to the working directory used by the config.
df.to_csv("/kaggle/working/BNB_OHLC_5m.csv", index=False)

In [None]:
import sys
sys.path.append('/kaggle/working/RWKV-TS/src')  

import torch
import yaml
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from lstm_model import LSTMModel 
from data_loader import CryptoDataset, CryptoDataLoader
from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error, 
    r2_score,
    mean_absolute_percentage_error,
    median_absolute_error,
)

# 1. Load the experiment config from the YAML file.
config_path = "/kaggle/working/RWKV-TS/configs/train_config.yaml"
# Read as UTF-8 explicitly so any non-ASCII text in the config does not depend
# on the platform's default encoding.
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 2. Build the model from the config and move it to the chosen device.
model = LSTMModel(config).to(device)

# 3. Load the weights from the selected checkpoint (best_epoch_16.pt).
checkpoint_path = "/kaggle/input/lstm-model-training-swa/RWKV-TS/checkpoints/20250518_024522/best_epoch_16.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode: disables dropout

# 4. Build the training data loader only to obtain its scalers (no training here).
train_data_loader = CryptoDataLoader(config_path=config_path)
# Presumably these scalers were fitted on the training split - confirm in CryptoDataLoader.
scalers = train_data_loader.scalers

# 6. Build the test dataset, reusing the scalers taken from the training loader.
# NOTE(review): this is the same CSV the config's data.path points to (with
# train_ratio 1.0), so the metrics below may be in-sample - confirm the split.
test_dataset = CryptoDataset(
    data_path="/kaggle/working/BNB_OHLC_5m.csv",
    config=config,
    train=False,
    scalers=scalers,
    test_mode=True
)

# 7. Wrap the test dataset in a DataLoader; no shuffling so sample order is kept.
test_loader = DataLoader(
    test_dataset,
    batch_size=config['training']['batch_size'],
    shuffle=False,
    num_workers=4
)

# 8. Prediction helper
def predict(model, data_loader, device=None):
    """Run ``model`` over every batch of ``data_loader`` without gradients.

    Args:
        model: Callable applied to ``batch['x']``; normally a ``torch.nn.Module``.
        data_loader: Iterable of dict batches holding tensors under the keys
            ``'x'`` (inputs) and ``'y'`` (targets).
        device: Device the inputs are moved to before the forward pass. When
            omitted, the device of the model's first parameter is used, falling
            back to CPU for models without parameters. This removes the
            dependence on a module-level ``device`` variable.

    Returns:
        Tuple ``(predictions, actuals)`` of NumPy arrays, each the concatenation
        of all batches along the first axis.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if device is None:
        params = model.parameters() if hasattr(model, "parameters") else iter(())
        first_param = next(iter(params), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in data_loader:
            x = batch['x'].to(device)
            pred = model(x)
            predictions.append(pred.cpu().numpy())
            # Targets are only collected, never fed to the model, so they do
            # not need a round trip through the compute device.
            actuals.append(batch['y'].cpu().numpy())

    if not predictions:
        raise ValueError("data_loader yielded no batches; nothing to predict")
    return np.concatenate(predictions), np.concatenate(actuals)

# 9. Run inference over the whole test loader.
preds, true_values = predict(model, test_loader)
print("Predictions shape:", preds.shape)

# 10. Evaluation: flatten so every (sample, step) pair counts as one point.
y_true = true_values.flatten()
y_pred = preds.flatten()

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
# `mean_squared_error(..., squared=False)` is deprecated and has been removed
# from recent scikit-learn releases; the square root of the MSE is the same
# value and works on every version.
rmse = np.sqrt(mse)
# NOTE(review): the targets appear to be in scaled units; MAPE becomes unstable
# when true values are close to zero, so interpret it with care.
mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
medae = median_absolute_error(y_true, y_pred)

# Print the results.
print("\nEvaluation Metrics:")
print(f"MAE   : {mae:.4f}")
print(f"MSE   : {mse:.4f}")
print(f"RMSE  : {rmse:.4f}")
print(f"MAPE  : {mape*100:.2f}%")
print(f"R²    : {r2:.4f}")
print(f"MedAE : {medae:.4f}")

# 11. Visualization
import matplotlib.pyplot as plt

# Plot prediction vs ground truth for the first few samples. The count is
# capped at the number of available samples so a small test set cannot raise
# an IndexError.
num_plot = 5
for i in range(min(num_plot, len(preds))):
    plt.figure(figsize=(12, 5))
    plt.plot(true_values[i], label="Ground Truth", marker='o', markersize=4)
    plt.plot(preds[i], label="Prediction", marker='x', markersize=4)
    plt.title(f"Sample {i+1} - Prediction vs Ground Truth")
    plt.xlabel("Time Step")
    plt.ylabel("Scaled Value")
    plt.legend()
    plt.grid(True)
    plt.show()

# Scatter plot of predictions against true values with the y = x reference line.
# NumPy reductions are used for the line's end points: the builtin min()/max()
# iterate over the array element by element in Python, which is slow on large arrays.
lo, hi = y_true.min(), y_true.max()
plt.figure(figsize=(8, 8))
plt.scatter(y_true, y_pred, alpha=0.3, s=10)
plt.xlabel("True Values (scaled)")
plt.ylabel("Predictions (scaled)")
plt.title("Predictions vs True Values")
plt.plot([lo, hi], [lo, hi], 'r--')
plt.grid(True)
plt.show()

# Distribution of the signed prediction errors (prediction minus truth).
residuals = y_pred - y_true
plt.figure(figsize=(10, 5))
plt.hist(residuals, bins=50, edgecolor='black')
plt.title("Distribution of Prediction Errors")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Absolute error for every sample and prediction step.
# Assumes preds / true_values are (n_samples, pred_len, 1) - TODO confirm.
# Only the trailing singleton axis is dropped: a bare .squeeze() would also
# remove the sample or step axis whenever that axis happens to have length 1.
error_matrix = np.abs(preds - true_values).squeeze(-1)

plt.figure(figsize=(12, 6))
sns.heatmap(error_matrix, cmap="YlGnBu", cbar=True)
plt.title("Heatmap of Absolute Errors per Step")
plt.xlabel("Prediction Step")
plt.ylabel("Sample Index")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import numpy as np

# Drop the trailing singleton axis so both arrays are (n_samples, n_steps).
true_values_2d = true_values.squeeze(-1)
preds_2d = preds.squeeze(-1)

window_size = 100

# Rolling MSE over windows of `window_size` consecutive samples.
# Every sample has the same number of steps, so the MSE of a window equals the
# mean of its per-sample MSEs. A cumulative sum turns that into one vectorised
# pass instead of one metric call per window. Accumulation is done in float64
# to limit rounding error over long series.
per_sample_mse = np.mean(
    (true_values_2d.astype(np.float64) - preds_2d.astype(np.float64)) ** 2,
    axis=1,
)
cumulative = np.concatenate(([0.0], np.cumsum(per_sample_mse)))
# Slicing yields len - window_size + 1 windows, i.e. it includes the last full
# window, which a range(len - window_size) loop would skip. The result is empty
# when there are fewer samples than the window size.
rolling_mse = (
    (cumulative[window_size:] - cumulative[:-window_size]) / window_size
).tolist()

plt.plot(rolling_mse)
plt.title("Rolling MSE (window=100)")
plt.xlabel("Start Index")
plt.ylabel("MSE")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

# Drop the trailing singleton axis so both arrays are (n_samples, n_steps).
true_values_2d = true_values.squeeze(-1)
preds_2d = preds.squeeze(-1)

# MSE of each prediction step, computed across all samples.
stepwise_mse = [
    mean_squared_error(true_values_2d[:, step], preds_2d[:, step])
    for step in range(true_values_2d.shape[1])
]

plt.plot(stepwise_mse)
plt.title("Stepwise MSE")
# The x axis is the prediction step index, not a window start index.
plt.xlabel("Prediction Step")
plt.ylabel("MSE")
plt.show()

# Median absolute error per prediction step. The absolute errors are computed
# here from this cell's own arrays so the result does not depend on a variable
# left behind by another cell.
abs_errors = np.abs(preds_2d - true_values_2d)
step_errors = np.median(abs_errors, axis=0)

# argsort is ascending, so the last three indices are the steps with the
# largest median error (the worst one is printed last).
worst_steps = np.argsort(step_errors)[-3:]

print(f"Steps to improve: {worst_steps}")
for step in worst_steps:
    print(f"Step {step}: Median error = {step_errors[step]:.6f}")