In [1]:
%load_ext autoreload
%reload_ext autoreload
import sys
import os
from dotenv import load_dotenv
load_dotenv()
sys.path.insert(0, os.getenv('SRC_PATH'))

import numpy as np
import pandas as pd
import seaborn as sns
import sqlite3
from src.volsurface import GridInterpVolSurface, KernelVolSurface
from src.utils.data_helper import clean_data, VolSurfPointwiseDataset

import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms
from src.train import Trainer

from src.volsurface import VAEPWVolSurface
# from src.volsurface import TrainedDecoderVolSurface

import json

# Locations of the SQLite database and of the directory holding the exported
# data files, both read from the environment.
DB_PATH = os.getenv('DB_PATH')
CSV_PATH = os.getenv('CSV_PATH')

# Fail fast with a readable message: an unset variable would otherwise show up
# later as an obscure TypeError (sqlite3.connect(None) or None + str).
_missing_env = [
    name for name, value in (('DB_PATH', DB_PATH), ('CSV_PATH', CSV_PATH)) if not value
]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Shared connection used by the query cell; `cursor` is kept for ad-hoc use.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Opt in to the future pandas behaviour where replace()/fillna() no longer
# silently downcast object columns.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Notebook-level logger. It is registered under the name 'train', which is the
# name that appears on the "Epoch i/N" lines emitted by the training cells.
from src.utils.logger import setup_logger
logger = setup_logger('train')

In [3]:
# Target dtypes applied once the empty-string placeholders have been replaced.
# The three date columns are not listed here; read_sql_query parses them.
dtype = dict(
    symbol='string',
    cp_flag='string',
    strike_price='float64',
    best_bid='float64',
    best_offer='float64',
    volume='int64',
    open_interest='int64',
    impl_volatility='float64',
    delta='float64',
)

# Select the columns needed downstream from the `opprc` table (no row filter).
query = """
SELECT date, symbol, exdate, last_date, cp_flag, strike_price, best_bid, best_offer, volume, open_interest, impl_volatility, delta
FROM opprc
"""

# Load -> normalise missing values -> enforce dtypes, as one pipeline.
df_raw = (
    pd.read_sql_query(query, conn, parse_dates=['date', 'exdate', 'last_date'])
    .replace('', np.nan)  # sqlite returns empty strings for NULL values
    .astype(dtype)
)

# Project-side cleaning; per the log output it filters bad rows and adds the
# derived columns used by the dataset cell.
df = clean_data(df_raw)

[2025-04-20 14:13:40] [INFO] src.utils.data_helper (50) : Bad data - Filtered 1110238 rows, Retained sample 76.67%
[2025-04-20 14:13:51] [INFO] src.utils.data_helper (63) : Consecutive trading stats completed
[2025-04-20 14:13:52] [INFO] src.utils.data_helper (86) : Consecutive trading - Filtered 2341950 rows, Retained sample 35.81%
[2025-04-20 14:13:52] [INFO] src.utils.data_helper (91) : Moneyness calculation completed


In [4]:
# Run configuration for this notebook.
model_name = "vae_pw_ii_tune"
# NOTE(review): the three flags below are not read by any cell visible in this
# notebook (the save cell runs unconditionally) - confirm whether they are
# still needed.
train_model = True
load_model = False
save_model = False
# Build the path with os.path.join rather than string concatenation so a
# trailing separator in CSV_PATH does not matter.
data_dir = os.path.join(CSV_PATH, "predicted_vol_surfaces.json")  # Path to the volatility surfaces dataset
# NOTE(review): the DataLoader below uses trainer.batch_size, not this value.
batch_size = 32
# Number of passes made by the training loop cells.
epochs = 10

In [5]:
# Grid nodes for a maturity x delta surface: 8 maturities and 9 deltas.
# Presumably maturities are in days - TODO confirm against src.volsurface.
# NOTE(review): neither array is referenced by the other cells visible here.
maturity_grid = np.array([1, 7, 30, 60, 90, 180, 360, 720])
delta_grid = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [6]:
# Switch the working directory to the project root so that relative paths used
# later in the notebook (e.g. "params/<model_name>.pth" in the save cell)
# resolve against SRC_PATH. This is a process-wide side effect.
SRC_PATH = os.getenv('SRC_PATH')
os.chdir(SRC_PATH)
# Build the trainer for the configured model name, then instantiate its model.
# The device selection is logged by src.train while this cell runs.
trainer = Trainer(model_name)
trainer.create_model()

[2025-04-20 14:14:05] [INFO] src.train (60) : Using device: mps


In [7]:
# Load the surfaces file: a JSON object whose keys start with a YYYY-MM-DD
# date (only the first 10 characters are used) and whose values are grids
# that get flattened to 1D below.
with open(data_dir, "r") as f:
    data = json.load(f)

# One sorted key list drives both the row order of `data_tensor` and the
# date -> row-index lookup, so the two always agree.
surface_keys = sorted(data.keys())
date_to_surface_idx = {key[:10]: i for i, key in enumerate(surface_keys)}

# Row index of the surface belonging to each observation's date. Dates with
# no surface in the file come back as NaN from .map().
df['mapping_ids'] = df['date'].dt.strftime('%Y-%m-%d').map(date_to_surface_idx)

# Casting NaN to int64 is undefined and would silently attach those rows to an
# arbitrary surface, so rows without a surface are excluded from the dataset.
has_surface = df['mapping_ids'].notna()
n_unmatched = int((~has_surface).sum())
if n_unmatched:
    logger.warning(
        f"Dropping {n_unmatched} of {len(df)} rows whose date has no surface in {data_dir}"
    )
df_pw = df.loc[has_surface]
if df_pw.empty:
    raise ValueError(f"No row in df has a date matching a surface in {data_dir}")

vol_surfaces = [
    torch.tensor(data[key], dtype=torch.float32).flatten()  # Flatten 2D to 1D
    for key in surface_keys
]

data_tensor = torch.stack(vol_surfaces)
pw_grid_data = torch.tensor(df_pw[['ttm', 'moneyness']].values, dtype=torch.float32)
# !only for test run
# Rescales the first column (ttm) by 1/365 - presumably days to years.
pw_grid_data[:, 0] = pw_grid_data[:, 0] / 365.0
pw_vol_data = torch.tensor(df_pw['impl_volatility'].values, dtype=torch.float32)
mapping_ids = torch.tensor(df_pw['mapping_ids'].values).long()

# Consistency check: the largest index in use must be the last surface row.
# Tensor.max() is used because Python's max() iterates a tensor element by
# element, which is very slow on a large index tensor.
assert data_tensor.shape[0] == int(mapping_ids.max()) + 1, (
    f"{data_tensor.shape[0]} surfaces loaded but largest mapping id is {int(mapping_ids.max())}"
)

dataset = VolSurfPointwiseDataset(pw_grid_data, pw_vol_data, data_tensor, mapping_ids)
# Batch size comes from the trainer, not from the notebook-level `batch_size`.
train_loader = DataLoader(
    dataset,
    batch_size=trainer.batch_size,
    shuffle=True
)

In [8]:
# Train the model: call trainer.train() on the full loader `epochs` times,
# logging a 1-based "Epoch i/N" line before each pass. The per-pass loss line
# in the output is logged by src.train, not by this cell.
for epoch in range(epochs):
    logger.info(f"Epoch {epoch + 1}/{epochs}")
    trainer.train(train_loader)

[2025-04-20 14:14:14] [INFO] train (3) : Epoch 1/10
[2025-04-20 14:15:25] [INFO] src.train (179) : Loss: 0.0050
[2025-04-20 14:15:25] [INFO] train (3) : Epoch 2/10
[2025-04-20 14:16:35] [INFO] src.train (179) : Loss: 0.0041
[2025-04-20 14:16:35] [INFO] train (3) : Epoch 3/10
[2025-04-20 14:17:47] [INFO] src.train (179) : Loss: 0.0039
[2025-04-20 14:17:47] [INFO] train (3) : Epoch 4/10
[2025-04-20 14:18:56] [INFO] src.train (179) : Loss: 0.0039
[2025-04-20 14:18:56] [INFO] train (3) : Epoch 5/10
[2025-04-20 14:20:03] [INFO] src.train (179) : Loss: 0.0039
[2025-04-20 14:20:03] [INFO] train (3) : Epoch 6/10
[2025-04-20 14:21:10] [INFO] src.train (179) : Loss: 0.0039
[2025-04-20 14:21:10] [INFO] train (3) : Epoch 7/10
[2025-04-20 14:22:17] [INFO] src.train (179) : Loss: 0.0038
[2025-04-20 14:22:17] [INFO] train (3) : Epoch 8/10
[2025-04-20 14:23:25] [INFO] src.train (179) : Loss: 0.0038
[2025-04-20 14:23:25] [INFO] train (3) : Epoch 9/10
[2025-04-20 14:24:33] [INFO] src.train (179) : Loss:

In [10]:
# Persist the trained weights as params/<model_name>.pth. The path is relative
# to the current working directory (SRC_PATH after the os.chdir call above).
# Create the directory first so the save does not fail on a fresh checkout.
os.makedirs("params", exist_ok=True)
torch.save(trainer.model.state_dict(), f"params/{trainer.model_name}.pth")

In [12]:
# Hyperparameter search over the trainer's search space, using the current
# training loader.
# NOTE(review): in the recorded run both trials errored and the cell ended with
# TuneError('Trials did not complete'). The underlying exception is cut off in
# the saved output; check the Ray trial logs before relying on this cell.
trainer.hypertune(
    train_loader
)

Hypertune!! {'learning_rate': <ray.tune.search.sample.Categorical object at 0x3a03bbf70>, 'batch_size': <ray.tune.search.sample.Categorical object at 0x3a03bb3d0>, 'latent_dim': <ray.tune.search.sample.Categorical object at 0x3a0274b80>, 'hidden_dim': <ray.tune.search.sample.Categorical object at 0x3a0274670>}


2025-04-20 14:32:43,166	INFO worker.py:1812 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2025-04-20 14:32:43,722	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2025-04-20 14:32:43,723	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
E0000 00:00:1745173963.905839 2771397 tcp_posix.cc:596] recvmsg encountered uncommon error: Message too long


== Status ==
Current time: 2025-04-20 14:32:44 (running for 00:00:00.74)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-04-20_14-32-38_050488_51306/artifacts/2025-04-20_14-32-43/lambda_2025-04-20_14-32-43/driver_artifacts
Number of trials: 2/2 (2 PENDING)
+--------------------+----------+-------+--------------+--------------+--------------+-----------------+
| Trial name         | status   | loc   |   batch_size |   hidden_dim |   latent_dim |   learning_rate |
|--------------------+----------+-------+--------------+--------------+--------------+-----------------|
| lambda_d95d8_00000 | PENDING  |       |          128 |           50 |           20 |          0.0001 |
| lambda_d95d8_00001 | PENDING  |       |           64 |          100 |           10 |          0.001  |
+--------------------+----------+-------+--------------+-------

[36m(pid=51500)[0m E0000 00:00:1745173965.019356 2771477 tcp_posix.cc:596] recvmsg encountered uncommon error: Message too long
[33m(raylet)[0m I0000 00:00:1745173965.492042 2770972 chttp2_transport.cc:1182] ipv4:127.0.0.1:61579: Got goaway [2] err=UNAVAILABLE:GOAWAY received; Error code: 2; Debug Text: Cancelling all calls {grpc_status:14, http2_error:2, created_time:"2025-04-20T14:32:45.492041-04:00", file_line:1171, file:"external/com_github_grpc_grpc/src/core/ext/transport/chttp2/transport/chttp2_transport.cc"}


== Status ==
Current time: 2025-04-20 14:32:49 (running for 00:00:05.82)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-04-20_14-32-38_050488_51306/artifacts/2025-04-20_14-32-43/lambda_2025-04-20_14-32-43/driver_artifacts
Number of trials: 2/2 (2 RUNNING)
+--------------------+----------+-----------------+--------------+--------------+--------------+-----------------+
| Trial name         | status   | loc             |   batch_size |   hidden_dim |   latent_dim |   learning_rate |
|--------------------+----------+-----------------+--------------+--------------+--------------+-----------------|
| lambda_d95d8_00000 | RUNNING  | 127.0.0.1:51500 |          128 |           50 |           20 |          0.0001 |
| lambda_d95d8_00001 | RUNNING  | 127.0.0.1:51501 |           64 |          100 |           10 |          0.001  |
+------------

2025-04-20 14:34:03,898	ERROR tune_controller.py:1331 -- Trial task failed for trial lambda_d95d8_00000
Traceback (most recent call last):
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/worker.py", line 2755, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/worker.py", line 906, in get_objects
    raise value.as_instanceof_cause()


== Status ==
Current time: 2025-04-20 14:34:05 (running for 00:01:21.57)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Logical resource usage: 1.0/8 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-04-20_14-32-38_050488_51306/artifacts/2025-04-20_14-32-43/lambda_2025-04-20_14-32-43/driver_artifacts
Number of trials: 2/2 (1 ERROR, 1 RUNNING)
+--------------------+----------+-----------------+--------------+--------------+--------------+-----------------+
| Trial name         | status   | loc             |   batch_size |   hidden_dim |   latent_dim |   learning_rate |
|--------------------+----------+-----------------+--------------+--------------+--------------+-----------------|
| lambda_d95d8_00001 | RUNNING  | 127.0.0.1:51501 |           64 |          100 |           10 |          0.001  |
| lambda_d95d8_00000 | ERROR    | 127.0.0.1:51500 |          128 |           50 |           20 |          0.0001 |
+---

2025-04-20 14:34:06,707	ERROR tune_controller.py:1331 -- Trial task failed for trial lambda_d95d8_00001
Traceback (most recent call last):
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/worker.py", line 2755, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/anaconda3/envs/vae-volsurface/lib/python3.10/site-packages/ray/_private/worker.py", line 906, in get_objects
    raise value.as_instanceof_cause()


== Status ==
Current time: 2025-04-20 14:34:06 (running for 00:01:22.86)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Logical resource usage: 1.0/8 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-04-20_14-32-38_050488_51306/artifacts/2025-04-20_14-32-43/lambda_2025-04-20_14-32-43/driver_artifacts
Number of trials: 2/2 (2 ERROR)
+--------------------+----------+-----------------+--------------+--------------+--------------+-----------------+
| Trial name         | status   | loc             |   batch_size |   hidden_dim |   latent_dim |   learning_rate |
|--------------------+----------+-----------------+--------------+--------------+--------------+-----------------|
| lambda_d95d8_00000 | ERROR    | 127.0.0.1:51500 |          128 |           50 |           20 |          0.0001 |
| lambda_d95d8_00001 | ERROR    | 127.0.0.1:51501 |           64 |          100 |           10 |          0.001  |
+--------------

TuneError: ('Trials did not complete', [lambda_d95d8_00000, lambda_d95d8_00001])

In [14]:
# Scratch re-run: build a second, fresh trainer (`trainer_tune`) with the same
# configuration and train it again with the existing `dataset`.
# NOTE(review): the saved output of this cell is dated 2025-04-19, a day before
# the outputs of the cells above, so it was not re-run in the recorded session.

# Same values as the run-configuration cell near the top of the notebook;
# repeated here so the cell can be run on its own.
model_name = "vae_pw_ii_tune"
train_model = True
load_model = False
save_model = False
# os.path.join instead of string concatenation, so a trailing separator in
# CSV_PATH does not matter.
data_dir = os.path.join(CSV_PATH, "predicted_vol_surfaces.json")  # Path to the volatility surfaces dataset
batch_size = 32
epochs = 10

trainer_tune = Trainer(model_name)
trainer_tune.create_model()

# Rebinds the notebook-level `train_loader`; the batch size comes from
# `trainer_tune`, not from the `batch_size` variable above.
train_loader = DataLoader(
    dataset,
    batch_size=trainer_tune.batch_size,
    shuffle=True
)

[2025-04-19 23:47:24] [INFO] src.train (60) : Using device: mps


In [15]:
# Train the second trainer (`trainer_tune`): same loop as the first training
# cell, one trainer.train() pass over the loader per epoch with a 1-based
# "Epoch i/N" log line before each pass.
for epoch in range(epochs):
    logger.info(f"Epoch {epoch + 1}/{epochs}")
    trainer_tune.train(train_loader)

[2025-04-19 23:47:38] [INFO] train (3) : Epoch 1/10
[2025-04-19 23:48:49] [INFO] src.train (179) : Loss: 0.0047
[2025-04-19 23:48:49] [INFO] train (3) : Epoch 2/10
[2025-04-19 23:49:58] [INFO] src.train (179) : Loss: 0.0040
[2025-04-19 23:49:58] [INFO] train (3) : Epoch 3/10
[2025-04-19 23:51:08] [INFO] src.train (179) : Loss: 0.0039
[2025-04-19 23:51:08] [INFO] train (3) : Epoch 4/10
[2025-04-19 23:52:18] [INFO] src.train (179) : Loss: 0.0039
[2025-04-19 23:52:18] [INFO] train (3) : Epoch 5/10
[2025-04-19 23:53:28] [INFO] src.train (179) : Loss: 0.0039
[2025-04-19 23:53:28] [INFO] train (3) : Epoch 6/10
[2025-04-19 23:54:38] [INFO] src.train (179) : Loss: 0.0039
[2025-04-19 23:54:38] [INFO] train (3) : Epoch 7/10
[2025-04-19 23:55:48] [INFO] src.train (179) : Loss: 0.0038
[2025-04-19 23:55:48] [INFO] train (3) : Epoch 8/10
[2025-04-19 23:56:58] [INFO] src.train (179) : Loss: 0.0038
[2025-04-19 23:56:58] [INFO] train (3) : Epoch 9/10
[2025-04-19 23:58:07] [INFO] src.train (179) : Loss: