# Set up environment

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import pickle
import pandas_market_calendars as mcal
from tqdm import tqdm
import torch
torch.set_float32_matmul_precision('high')
from gymnasium import spaces
from mmd.env import GenLSTM, MMDSimulator, load_generator
from mmd.train import start_writer, get_params_from_events, get_params_dicts, get_robustq_params_dicts, train_robustdqn
from mmd.evaluation import simulate_agent_spx
from agent.q import QFunc
from agent.DQN import PORDQN, DQN, PORDQN_Nadam

In [None]:
# Experiment-wide constants reused by every section of this notebook.
# `seed` is passed to torch.manual_seed and to the agents' constructors.
seed = 0 # results in the paper used 0, 1, 2, 3, 4

# Sequence lengths; numerically total_length == burn_in + state_len
# (presumably by design — TODO confirm against the generator's data params).
total_length = 560
burn_in = 500
state_len = 60
# Exchange calendar and date range; the same three values are also passed to MMDSimulator.
cal_start_date = '1995-01-01'
cal_end_date = '2024-12-31'
trading_calendar = 'NYSE'
calendar = mcal.get_calendar(trading_calendar)
schedule = calendar.schedule(start_date=cal_start_date, end_date=cal_end_date)

# int_rate / trans_cost are forwarded to MMDSimulator and simulate_agent_spx.
int_rate = 0.024
trans_cost = 0.0005 # standard cost = 0.0005
# Batch size and torch seed used only by the evaluation cells.
eval_batch_size = 1000
eval_seed = 12345

In [None]:
# Load the pickled object that is later handed to MMDSimulator as `ma_model_params`.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open('./data/mmd_generator/ma_params.pkl', 'rb') as f:
    ma_model_params = pickle.load(f)

In [None]:
# Read the hyperparameters recorded under `events_path` and expose each one as
# a notebook-level variable, so later cells (e.g. the GenLSTM constructor call)
# can refer to them by name.
events_path = './data/mmd_generator/'
params = get_params_from_events(events_path)
# `params` is a mapping of mappings; only the inner name/value pairs are needed.
# Distinct loop names are used so the inner loop does not shadow the outer one.
for param_group in params.values():
    for param_name, param_value in param_group.items():
        # setdefault never overwrites a name the notebook has already defined.
        globals().setdefault(param_name, param_value)
# Hand a snapshot of the whole namespace to the helper that builds the three dicts.
data_params, model_params, train_params = get_params_dicts(vars().copy())

In [None]:
# Instantiate the generator from the hyperparameters injected into the
# namespace above, then pass it through load_generator with `events_path`.
generator = GenLSTM(
    noise_dim,
    seq_dim,
    sample_len,
    hidden_size=hidden_size,
    n_lstm_layers=n_lstm_layers,
    activation=activation,
)
generator = load_generator(generator, events_path)

# Custom DQN

In [None]:
# Seed torch's global RNG before the environment and network are created.
torch.manual_seed(seed)

# simulator params
batch_size = 8
device = 'cpu'
action_values = torch.linspace(-1., 1., 9)
action_space = spaces.Discrete(9)
env = MMDSimulator(
    generator,
    ma_model_params,
    trading_calendar,
    cal_start_date,
    cal_end_date,
    state_len,
    burn_in,
    batch_size=batch_size,
    action_space=action_space,
    action_values=action_values,
)
other_state_vars = ['log_wealth', 'positions', 'dt']

# model params
architecture = [64, 64]
obs_dim = state_len + len(other_state_vars)
num_actions = action_values.shape[0]
discount = 0.99
# Same sizes as obs_dim / num_actions computed just above.
qfunc = QFunc(obs_dim, architecture, num_actions)
eps_greedy = 0.1 # Epsilon-greedy exploration
buffer_max_length = int(1e5)
clone_steps = 50
train_steps = 1
agent_batch_size = 128
n_batches = 1
n_epochs = 1
lr = 1e-4
n_episodes = 5

dqn_agent = DQN(
    obs_dim,
    num_actions,
    discount,
    qfunc,
    eps_greedy,
    buffer_max_length,
    clone_steps,
    train_steps,
    agent_batch_size,
    n_batches,
    n_epochs,
    lr,
    device=device,
    seed=seed,
)

In [None]:
# Run `n_episodes` batched episodes, letting the agent learn as it goes.
for episode_idx in tqdm(range(n_episodes)):
    obs, _ = env.reset()
    cum_reward = torch.zeros(batch_size)
    action = dqn_agent.agent_start(obs)
    # The episode stops for the whole batch as soon as any element reports done.
    while True:
        obs, reward, done, _, _ = env.step(action)
        cum_reward += reward.squeeze()
        if done.any():
            dqn_agent.agent_end(reward, obs)
            break
        action = dqn_agent.agent_step(reward, obs)
    print(f'Cumulative reward: {cum_reward.mean().item()}')

## Evaluation

In [None]:
# Evaluate the trained DQN agent with training mode switched off, on a fresh
# logging environment of `eval_batch_size` paths seeded with `eval_seed`.
dqn_agent.training_mode = False
torch.manual_seed(eval_seed)
dqn_eval_env = MMDSimulator(generator, ma_model_params, trading_calendar, cal_start_date, cal_end_date, state_len, burn_in, batch_size=eval_batch_size, logging=True, action_space=action_space, action_values=action_values)
obs, reset_info = dqn_eval_env.reset()
# Act-then-step until any path reports done. No `done` placeholder is built up
# front, so nothing here depends on the (smaller) training `batch_size`.
while True:
    action = dqn_agent.get_action(obs)
    obs, reward, done, truncated, info = dqn_eval_env.step(action)
    if done.any():
        break
dqn_eval_env.print_metrics()

In [None]:
# Back-test the DQN's Q-function on the SPX data file, passing the data path
# first and the rate/cost by keyword exactly as the other simulate_agent_spx
# calls in this notebook do.
# NOTE(review): assumes the return value is a mapping of metric name -> value,
# which the loop below requires — confirm against mmd.evaluation.
metrics = simulate_agent_spx('data/spx.csv', dqn_agent.q, action_values, int_rate=int_rate, trans_cost=trans_cost)
for key, value in metrics.items():
    print(f'{key}: {value}')

# Robust DQN

## Set up environment

In [None]:
# Shared settings for the robust-DQN experiment. Torch's global RNG is seeded first.
torch.manual_seed(seed)
batch_size = 8
device = 'cpu'
# Nine discrete actions and nine evenly spaced values in [-1, 1]
# (presumably action i corresponds to action_values[i] — TODO confirm).
action_space = spaces.Discrete(9)
action_values = torch.linspace(-1., 1., 9)
num_actions = len(action_values)
# nu_scale and nu_df are passed to the robust agent's constructor. nu_dist is not
# referenced again in this notebook — presumably it is picked up from the vars()
# snapshot taken in the training cell (TODO confirm).
nu_dist = 't'
nu_scale = 0.03
nu_df = 2
# Observation width: state_len plus the number of extra state variables listed here.
other_state_vars = ['log_wealth', 'positions', 'dt']
obs_dim = state_len + len(other_state_vars)

## Training from scratch

In [None]:
# RUN IF TRAINING FROM SCRATCH
# Hyperparameters for the robust agent. The variable names matter: the whole
# namespace is snapshotted with vars().copy() at the end of this cell and
# handed to get_robustq_params_dicts, so do not rename them casually.
discount = 0.99
eps_greedy = 0.1 # epsilon greedy parameter
buffer_max_length = int(1e5)
clone_steps = 50
train_steps = 1
agent_batch_size = 128
n_batches = 1
n_epochs = 1
robustq_lr = 1e-4
architecture = [64, 64]
# Not referenced directly in this notebook — presumably read from the vars() snapshot (TODO confirm).
pre_train_Q = False
n_episodes = 5

# Fresh Q-network sized (state_len + extra state variables) -> number of action values.
robustq = QFunc(state_len+len(other_state_vars), architecture, action_values.shape[0]).to(device)

delta = 1e-4 # regularisation parameter for Sinkhorn distance
epsilon = 0.003 # Sinkhorn distance
norm_ord = 1
lamda_init = 0. # initial lambda
lamda_max_iter = 100
lamda_step_size = 10 # step size for learning rate scheduler
lamda_gamma = 10. # gamma for learning rate scheduler
lamda_lr = 0.02 # learning rate for lambda
n_outer = 1 # not used in this algorithm but used in logging by writer
n_inner = 1000 # number of samples from nu to calc inner expectations

# Split the namespace snapshot into simulator/model parameter dicts and start the writer with them.
simulator_params, model_params = get_robustq_params_dicts(vars().copy())
writer = start_writer(simulator_params, model_params, model_name='PORDQN')

In [None]:
# Training environment for the robust agent; every argument is positional,
# in the order shown.
env = MMDSimulator(
    generator, ma_model_params, trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)

robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma,
    norm_ord, robustq,
    eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

robustdqn_agent = train_robustdqn(
    robustdqn_agent, env, writer, simulator_params, model_params
)

# Persist only the agent's `q` attribute to disk (not the whole agent).
with open('robustdqn_agent_q.pkl', 'wb') as f:
    pickle.dump(robustdqn_agent.q, f)

In [None]:
# loading agent from non-volatile memory
# The file holds the `q` attribute dumped from the trained agent, not the
# PORDQN agent itself, so it is annotated as a Q-function
# (presumably the QFunc handed to the agent's constructor — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
robustdqn_agent_q: QFunc
with open('robustdqn_agent_q.pkl', 'rb') as f:
    robustdqn_agent_q = pickle.load(f)

## Evaluation

In [None]:
# Evaluate the robust agent with training mode switched off, on a fresh
# logging environment of `eval_batch_size` paths seeded with `eval_seed`.
robustdqn_agent.training_mode = False
torch.manual_seed(eval_seed)
dqn_eval_env = MMDSimulator(generator, ma_model_params, trading_calendar, cal_start_date, cal_end_date, state_len, burn_in, batch_size=eval_batch_size, logging=True, action_space=action_space, action_values=action_values, int_rate=int_rate, trans_cost=trans_cost)
obs, reset_info = dqn_eval_env.reset()
# Act-then-step until any path reports done. No `done` placeholder is built up
# front, so nothing here depends on the (smaller) training `batch_size`.
while True:
    action = robustdqn_agent.get_action(obs)
    obs, reward, done, truncated, info = dqn_eval_env.step(action)
    if done.any():
        break
dqn_eval_env.print_metrics()

In [None]:
# Back-test the reloaded robust Q-function on the SPX data file; as the last
# expression of the cell, the result is displayed by the notebook.
simulate_agent_spx(
    'data/spx.csv',
    robustdqn_agent_q,
    action_values,
    int_rate=int_rate,
    trans_cost=trans_cost,
)

## Robust DQN with NAdam

### Set up environment

In [None]:
# Shared settings for the NAdam variant of the robust agent. Torch's global RNG is seeded first.
torch.manual_seed(seed)
batch_size = 8
device = 'cpu'
# Nine discrete actions and nine evenly spaced values in [-1, 1]
# (presumably action i corresponds to action_values[i] — TODO confirm).
action_space = spaces.Discrete(9)
action_values = torch.linspace(-1., 1., 9)
num_actions = len(action_values)
# nu_scale and nu_df are passed to the agent's constructor. nu_dist is not
# referenced again in this notebook — presumably it is picked up from the vars()
# snapshot taken in the training cell (TODO confirm).
nu_dist = 't'
nu_scale = 0.03
nu_df = 2
# Observation width: state_len plus the number of extra state variables listed here.
other_state_vars = ['log_wealth', 'positions', 'dt']
obs_dim = state_len + len(other_state_vars)

### Training

In [None]:
# RUN IF TRAINING FROM SCRATCH
# Hyperparameters for the NAdam variant. The variable names matter: the whole
# namespace is snapshotted with vars().copy() at the end of this cell and
# handed to get_robustq_params_dicts, so do not rename them casually.
discount = 0.99
eps_greedy = 0.1 # epsilon greedy parameter
buffer_max_length = int(1e5)
clone_steps = 50
train_steps = 1
agent_batch_size = 128
n_batches = 1
n_epochs = 1
robustq_lr = 1e-4
architecture = [64, 64]
# Not referenced directly in this notebook — presumably read from the vars() snapshot (TODO confirm).
pre_train_Q = False
n_episodes = 5

# Fresh Q-network sized (state_len + extra state variables) -> number of action values.
robustq = QFunc(state_len+len(other_state_vars), architecture, action_values.shape[0]).to(device)

delta = 1e-4 # regularisation parameter for Sinkhorn distance
epsilon = 0.003 # Sinkhorn distance
norm_ord = 1
lamda_init = 0. # initial lambda
lamda_max_iter = 100
lamda_step_size = 10 # step size for learning rate scheduler
lamda_gamma = 10. # gamma for learning rate scheduler
lamda_lr = 0.02 # learning rate for lambda
n_outer = 1 # not used in this algorithm but used in logging by writer
n_inner = 1000 # number of samples from nu to calc inner expectations

# Split the namespace snapshot into simulator/model parameter dicts and start the writer with them.
# NOTE(review): model_name is still 'PORDQN' although this section trains PORDQN_Nadam;
# confirm whether the writer's runs should be labelled separately.
simulator_params, model_params = get_robustq_params_dicts(vars().copy())
writer = start_writer(simulator_params, model_params, model_name='PORDQN')

In [None]:
# Training environment for the NAdam agent; every argument is positional,
# in the order shown.
env = MMDSimulator(
    generator, ma_model_params, trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)

robustdqn_agent = PORDQN_Nadam(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma,
    norm_ord, robustq,
    eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

robustdqn_agent = train_robustdqn(
    robustdqn_agent, env, writer, simulator_params, model_params
)

# Persist only the agent's `q` attribute to disk (not the whole agent).
with open('robustdqn_NAdam_agent_q.pkl', 'wb') as f:
    pickle.dump(robustdqn_agent.q, f)

In [None]:
# loading agent from non-volatile memory
# The file holds the `q` attribute dumped from the trained NAdam agent, not
# the agent itself, so it is annotated as a Q-function
# (presumably the QFunc handed to the agent's constructor — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
robustdqn_agent_q: QFunc
with open('robustdqn_NAdam_agent_q.pkl', 'rb') as f:
    robustdqn_agent_q = pickle.load(f)

In [None]:
# Back-test the reloaded NAdam Q-function on the SPX data file; as the last
# expression of the cell, the result is displayed by the notebook.
simulate_agent_spx(
    'data/spx.csv',
    robustdqn_agent_q,
    action_values,
    int_rate=int_rate,
    trans_cost=trans_cost,
)

### Adjusted volatility

In [None]:
# Same back-test as above, but reading the volatility-adjusted SPX data file.
simulate_agent_spx(
    'data/spx_volatility_adjusted.csv',
    robustdqn_agent_q,
    action_values,
    int_rate=int_rate,
    trans_cost=trans_cost,
)