<a href="https://colab.research.google.com/github/t1nh233/predict_vnindex_stacked_lstm/blob/main/notebooks/lstm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import sys
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import matplotlib.pyplot as plt
import numpy as np

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 404.7/404.7 kB 15.8 MB/s eta 0:00:00
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
## Load the raw VN-Index historical data
url = "https://raw.githubusercontent.com/t1nh233/predict_vnindex_stacked_lstm/refs/heads/main/data/raw/vn_index_historical_data_9_12.csv"
vnindex = load_and_process_data(url)

## Derive the engineered features
vnindex_feature = feature_extraction(vnindex)

## Features used for training
FEATURE_COLUMNS = ['Close', 'Volume', 'RSI', 'SMA', 'EMA', 'MACD_Hist', 'BB_Width', 'BB_Percentage']
TARGET_COLUMN = 'Label'

## Keep only the training features plus the target column
feature_data = vnindex_feature[FEATURE_COLUMNS + [TARGET_COLUMN]].copy()
# Positional index of the target column inside feature_data (last column here)
TARGET_INDEX = feature_data.columns.get_loc(TARGET_COLUMN)

## Split into train, validation and test sets with a 70:20:10 ratio.
## The frame being split is feature_data; train_df only exists after this call.
train_df, val_df, test_df = split_data(feature_data, 0.7, 0.2)

## Scale the data to the (0, 1) range
scaler, train_scaled_df, val_scaled_df, test_scaled_df = scale_data(train_df, val_df, test_df, FEATURE_COLUMNS)

## Build the LSTM inputs (sliding windows)
WINDOW_SIZE = 30
X_train, y_train = create_sliding_window(train_scaled_df, WINDOW_SIZE, TARGET_INDEX)
X_val, y_val = create_sliding_window(val_scaled_df, WINDOW_SIZE, TARGET_INDEX)
X_test, y_test = create_sliding_window(test_scaled_df, WINDOW_SIZE, TARGET_INDEX)


# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): TensorDataset needs torch tensors - assumes create_sliding_window
# already returns tensors; confirm against its definition.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)

## Directory for the tuned hyperparameters and the trained model
save_dir = os.path.join('..', 'models')
os.makedirs(save_dir, exist_ok=True)

## Dieu chinh tham so cua mo hinh (chon ra bo tham so tot nhat)

def hyper_tuning(trial):
  """Optuna objective: briefly train a candidate LSTM and return its validation loss.

  Samples model, optimizer and batch-size hyperparameters from ``trial``,
  trains for 10 epochs on the module-level ``train_dataset``, evaluates on
  ``val_dataset`` after every epoch and reports that value to the trial so
  the study's pruner can stop unpromising trials early.

  Args:
    trial: Optuna trial object used to suggest hyperparameter values.

  Returns:
    The validation Huber loss, averaged over validation batches, measured
    after the last epoch.

  Raises:
    optuna.exceptions.TrialPruned: If ``trial.should_prune()`` is true after
      an epoch's validation result has been reported.
  """
  ## Model hyperparameters
  hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128, 256])
  num_layers = trial.suggest_int("num_layers", 1, 3)
  dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)

  ## Optimizer hyperparameters (sampled on a log scale)
  learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
  weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

  batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

  # shuffle=False keeps the windows in their original order
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
  val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  ## Build the model
  model = LSTMModel(
      input_size = X_train.shape[2],
      hidden_size = hidden_size,
      num_layers = num_layers,
      dropout_rate = dropout_rate
  ).to(device)

  loss_func = nn.HuberLoss(delta=1.0)
  optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

  ## Short 10-epoch run, just long enough to compare trials
  for epoch in range(10):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      optimizer.zero_grad()
      y_pred = model(X_batch)
      # PyTorch losses take (input, target): prediction first, ground truth second
      loss = loss_func(y_pred, y_batch)
      loss.backward()

      # Clip the gradient norm before the optimizer step
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()

    ## Validate after every epoch so the pruner gets an intermediate value
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
      for X_batch_v, y_batch_v in val_loader:
        X_batch_v, y_batch_v = X_batch_v.to(device), y_batch_v.to(device)

        y_pred_v = model(X_batch_v)
        loss = loss_func(y_pred_v, y_batch_v)
        val_loss += loss.item()

    ## Report this epoch's result for the current hyperparameter set
    avg_val_loss = val_loss / len(val_loader)
    trial.report(avg_val_loss, epoch)

    if trial.should_prune():
      raise optuna.exceptions.TrialPruned()

  return avg_val_loss

## Run the hyperparameter search
print("Start tuning")
study = optuna.create_study(direction="minimize")
study.optimize(hyper_tuning, n_trials=20)

## Fetch the best hyperparameter set found by the study
best_params = study.best_params
print("Best parameter: ", best_params)

## Save the best parameters, together with the window size and feature list, to a file
best_params_config = dict(window_size=WINDOW_SIZE, feature_columns=FEATURE_COLUMNS)
best_params_config.update(best_params)

params_path = os.path.join(save_dir, 'best_params.json')
with open(params_path, 'w') as f:
    f.write(json.dumps(best_params_config, indent=4))

In [None]:
## Train the final model with the tuned hyperparameters
final_batch_size = best_params['batch_size']
loader_options = dict(batch_size=final_batch_size, shuffle=False)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

model = LSTMModel(
    input_size=X_train.shape[2],
    hidden_size=best_params['hidden_size'],
    num_layers=best_params['num_layers'],
    dropout_rate=best_params['dropout_rate'],
)
model = model.to(device)

loss_func = nn.HuberLoss(delta=1.0)
optimizer = optim.AdamW(
    model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
)

NUM_EPOCHS = 100
best_val_loss = float('inf')
# Per-epoch loss curves, plotted after training
history = {loss_name: [] for loss_name in ('train_loss', 'val_loss')}

print("Start training")
model_path = os.path.join(save_dir, 'best_vnindex_lstm.pth')

for epoch in range(NUM_EPOCHS):
  avg_train_loss = train_model(model, train_loader, loss_func, optimizer, device)
  avg_val_loss = validate_model(model, val_loader, loss_func, device)

  history['train_loss'].append(avg_train_loss)
  history['val_loss'].append(avg_val_loss)

  # Checkpoint only when the validation loss improves on the best seen so far
  improved = avg_val_loss < best_val_loss
  if improved:
    best_val_loss = avg_val_loss
    torch.save(model.state_dict(), model_path)

  # Log progress every fifth epoch
  epoch_number = epoch + 1
  if epoch_number % 5 == 0:
    print(f'Epoch [{epoch_number}/{NUM_EPOCHS}], Train Loss: {avg_train_loss:.5f}, Val Loss: {avg_val_loss:.5f}')

## Plot the training and validation loss curves
for history_key, curve_label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
  plt.plot(history[history_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
model_path = os.path.join('..', 'models', 'best_vnindex_lstm.pth')

# Load the trained weights (the checkpoint with the best validation loss) into the model
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=final_batch_size, shuffle=False)

## Predict on the test set
preds, targets = predict_model(model, test_loader, device)

## Map predictions and targets back to their original scale
# NOTE(review): scale_data is given FEATURE_COLUMNS only while TARGET_INDEX points
# at the 'Label' column - confirm the scaler actually covers that index.
final_preds = inverse_transform_target(preds, scaler, TARGET_INDEX)
final_targets = inverse_transform_target(targets, scaler, TARGET_INDEX)

## Compute the metrics
# The third value is bound to `r2` so it matches the name used in the print below.
# NOTE(review): assumes cal_metrics returns (MAE, RMSE, R2) in that order - confirm.
mae, rmse, r2 = cal_metrics(final_targets, final_preds)

print(f"MAE: {mae:.2f} điểm")
print(f"RMSE: {rmse:.2f} điểm")
print(f"R2 Score: {r2:.4f}")

# Plot actual values against predicted values for a visual comparison
plt.figure(figsize=(12,6))
plt.plot(final_targets, label='Thực tế (VN-Index)', color='blue')
plt.plot(final_preds, label='Dự báo (LSTM)', color='red', alpha=0.7)
plt.title('So sánh Giá trị thực tế và Giá trị mô hình dự báo trên Test set')
plt.legend()
plt.show()