# Imports

In [None]:
import numpy as np
from tqdm import tqdm
import torch
device = 'cpu'

from gambling.env import Env
from gambling.train import get_robustq_params_dicts, start_writer
from agent.q import QFunc
from agent.DQN import DQN, GUSRDQN

# Parameters

In [None]:
# Experiment seed; the results in the paper use seeds 0, ..., 99.
seed = 0

# Parameters of the true sampling distribution (its mean is alpha/(alpha + beta)).
alpha, beta = 1.2, 2.0

# Number of samples drawn to estimate alpha and beta.
num_samples_given = 5
# Forwarded to Env as `neg_reward_factor`.
neg_reward_factor = 5.0

# Batch size of the training environments.
env_batch_size = 16
# Number of training iterations for the DQN and the robust DQN agent.
dqn_train_steps, rdqn_train_steps = 10000, 5000

# Uncertainty in Distribution

In [None]:
torch.manual_seed(seed)

# Estimate alpha and beta from the sample mean and variance (method of moments).
true_mean = alpha/(alpha + beta)
env = Env(int(num_samples_given), alpha=alpha, beta=beta, seed=seed)
obs = env.reset()
mean_hat = obs.mean().item()
# NOTE(review): assumes obs is a torch tensor, so .var() is the sample
# variance with Bessel's correction -- confirm against Env.reset().
var_hat = obs.var().item()

# With very few samples the estimator can be undefined (zero/NaN variance) or
# produce non-positive parameters; fail loudly here instead of handing invalid
# parameters to the training environments further down.
if not var_hat > 0:
    raise ValueError(
        f'Cannot estimate alpha/beta: sample variance is {var_hat} '
        f'(num_samples_given={num_samples_given}, seed={seed})'
    )
alpha_hat = mean_hat**2*(1 - mean_hat)/var_hat - mean_hat
if not alpha_hat > 0:
    raise ValueError(
        f'Method-of-moments estimate alpha_hat={alpha_hat} is not positive '
        f'(mean_hat={mean_hat}, var_hat={var_hat}, seed={seed})'
    )
beta_hat = alpha_hat*(1/mean_hat - 1)
if not beta_hat > 0:
    raise ValueError(
        f'Method-of-moments estimate beta_hat={beta_hat} is not positive '
        f'(mean_hat={mean_hat}, var_hat={var_hat}, seed={seed})'
    )

# Compare the true parameters/moments with their estimates.
print(f'alpha: {alpha}, beta: {beta}')
print(f'mean = {true_mean:.3f}, var = {alpha*beta/((alpha + beta)**2 * (alpha + beta + 1)):.3f}')
print(f'alpha_hat: {alpha_hat:.3f}, beta_hat: {beta_hat:.3f}')
print(f'mean_hat = {mean_hat:.3f}, var_hat = {var_hat:.3f}')

# DQN

In [None]:
# Build the (non-robust) DQN baseline agent.
torch.manual_seed(seed)
# In this cell only the number of entries of action_space is read.
action_space = torch.tensor([-1,0,1], dtype=torch.float32)
agent = DQN(
    obs_dim=1,
    num_actions=action_space.shape[0],
    # Place the Q-network on the notebook-wide device, as the robust DQN cell does.
    qfunc=QFunc(1, [64, 64], action_space.shape[0]).to(device),
    lr=0.001,
    batch_size=64,
    epsilon=0.1,
    discount=0.95,
    buffer_max_length=10000,
    clone_steps=32,
    train_steps=1,
    n_batches=1,
    n_epochs=1,
    clip_gradients=True,
    # Use the shared `device` setting instead of a second hard-coded 'cpu'.
    device=device,
    seed=seed
)

In [None]:
# Train the DQN agent on environments built from the *estimated* parameters.
print(f'alpha_hat: {alpha_hat}, beta_hat: {beta_hat}')
env = Env(
    env_batch_size,
    alpha=alpha_hat,
    beta=beta_hat,
    neg_reward_factor=neg_reward_factor,
    seed=seed,
)

# First transition: the agent picks its initial action from the reset observation.
obs = env.reset()
action = agent.agent_start(obs)
obs, reward, _, _ = env.step(action)
reward_list = [reward.mean()]

# Remaining transitions: feed back the last reward/observation, then step the env.
for i in tqdm(range(dqn_train_steps)):
    action = agent.agent_step(reward, obs)
    obs, reward, _, _ = env.step(action)
    reward_list += [reward.mean()]

# Hand the final reward/observation to the agent.
agent.agent_end(reward, obs)

In [None]:
# Evaluate the trained DQN agent on environments using the *true* alpha/beta.
print(f'alpha: {alpha}, beta: {beta}')
agent.training_mode = False
eval_seed = 12345
env = Env(100, alpha, beta, neg_reward_factor=neg_reward_factor, seed=eval_seed)
obs = env.reset()
reward_list = []
# Evaluation needs no gradients; disabling autograd avoids building graphs.
with torch.no_grad():
    for i in tqdm(range(int(1e4))):
        # NOTE(review): the action is passed to env.step as returned by the agent;
        # confirm get_action yields an environment action rather than an index.
        action = agent.get_action(obs)
        obs, reward, _, _ = env.step(action)
        # Store plain Python floats so np.mean does not have to convert a list
        # of 0-d tensors (slow, and it fails for non-CPU / grad-tracking tensors).
        reward_list.append(reward.mean().item())
print(f'Average reward: {np.mean(reward_list):.4f}')

# Robust DQN

In [None]:
# Configure and build the robust DQN agent (GUSRDQN).
torch.manual_seed(seed)

# --- Agent / replay settings ---
# NOTE(review): the DQN cell creates action_space with dtype=torch.float32,
# whereas this tensor has an integer dtype -- confirm that is intended.
action_space = torch.tensor([-1,0,1])
discount = 0.95
eps_greedy = 0.1  # epsilon greedy parameter
buffer_max_length = 10_000
clone_steps = 32
train_steps = 1
agent_batch_size = 64
n_batches = 1
n_epochs = 1
robustq_lr = 0.001
architecture = [64, 64]

robustq = QFunc(1, architecture, len(action_space)).to(device)

# --- Sinkhorn / lambda optimisation settings ---
delta = 1e-4  # regularisation parameter for Sinkhorn distance
epsilon = 0.1  # Sinkhorn distance
norm_ord = 1
lamda_init = 0.0  # initial lambda
lamda_max_iter = 100
lamda_step_size = 10  # step size for learning rate scheduler
lamda_gamma = 10.0  # gamma for learning rate scheduler
lamda_lr = 0.02  # learning rate for lambda
n_outer = 1  # not used in this algorithm but used in logging by writer
n_inner = 200  # number of samples from nu to calc inner expectations

# Snapshot every notebook variable defined so far for the parameter dictionaries.
simulator_params, model_params = get_robustq_params_dicts(dict(vars()))
writer = start_writer(simulator_params, model_params, model_name='GUSRDQN')

# Positional arguments are kept in exactly the order GUSRDQN is called with.
robustdqn_agent = GUSRDQN(
    1,
    len(action_space),
    discount,
    action_space,
    neg_reward_factor,
    epsilon,
    delta,
    n_inner,
    lamda_init,
    lamda_lr,
    lamda_max_iter,
    lamda_step_size,
    lamda_gamma,
    norm_ord,
    robustq,
    eps_greedy,
    buffer_max_length,
    clone_steps,
    train_steps,
    agent_batch_size,
    n_batches,
    n_epochs,
    robustq_lr,
    device=device,
    seed=seed,
    writer=writer,
)

In [None]:
# Train the robust DQN agent on environments built from the *estimated* parameters.
print(f'alpha_hat: {alpha_hat}, beta_hat: {beta_hat}')
env = Env(
    env_batch_size,
    alpha=alpha_hat,
    beta=beta_hat,
    neg_reward_factor=neg_reward_factor,
    seed=seed,
)

# First transition: the agent's output is used to index action_space
# before the resulting action is passed to the environment.
obs = env.reset()
action_idx = robustdqn_agent.agent_start(obs)
action = action_space[action_idx]
obs, reward, _, _ = env.step(action)

# Remaining transitions: feed back the last reward/observation, then step the env.
for i in tqdm(range(rdqn_train_steps)):
    action_idx = robustdqn_agent.agent_step(reward, obs)
    action = action_space[action_idx]
    obs, reward, _, _ = env.step(action)

# Hand the final reward/observation to the agent.
robustdqn_agent.agent_end(reward, obs)

In [None]:
# Evaluate the trained robust DQN agent on environments using the *true* alpha/beta.
print(f'alpha: {alpha}, beta: {beta}')
robustdqn_agent.training_mode = False
# Fixed evaluation seed (same value as in the DQN evaluation cell) so both
# agents are scored on the same random stream and the result is reproducible.
eval_seed = 12345
env = Env(100, alpha, beta, neg_reward_factor=neg_reward_factor, seed=eval_seed)
obs = env.reset()
reward_list = []
# Evaluation needs no gradients; disabling autograd avoids building graphs.
with torch.no_grad():
    for i in tqdm(range(int(1e4))):
        # NOTE(review): training maps the agent output through action_space[...],
        # here it is passed to env.step directly -- confirm get_action already
        # returns an environment action rather than an index.
        action = robustdqn_agent.get_action(obs)
        obs, reward, _, _ = env.step(action)
        # Store plain Python floats so np.mean does not have to convert a list
        # of 0-d tensors (slow, and it fails for non-CPU / grad-tracking tensors).
        reward_list.append(reward.mean().item())
print(f'Average reward: {np.mean(reward_list):.4f}')