In [1]:
"""
Configs:
    # Env
    env_id="NNWorld01-v01", 
     - once in the goal state, each action ends the episode and returns a reward of 5
     - no distinct episodes: train the agent continuously for num_steps_train steps;
       if the agent reaches a terminal state, just reset the environment and keep training
     - try to imitate the bandit experiments
    # Params
    eps_sched_fn=poly(0.5), lr_sched_fn=poly(0.8)
    # Algos
    haver2, action_sigma=adaptive(1), haver_delta=0.01, haver_const=varied
Status:
"""

from collections import defaultdict
import random
import numpy as np
np.set_printoptions(precision=5, suppress=True)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_palette("tab20")
colors = sns.color_palette("bright")

import time
from tqdm import tqdm 
import multiprocessing

# import gymnasium as gym
import gym
import gym_examples
from gym.wrappers import FlattenObservation

from algos import *
from bandit_problem import *
from utils import *

In [None]:

# ---------------------------------------------------------------------------
# Reproducibility: seed both RNGs before anything stochastic is constructed.
# ---------------------------------------------------------------------------
random.seed(123)
np.random.seed(123)
tdqm_disable = True  # forwarded as-is to the Q-learning routine in run_trial

# ---------------------------------------------------------------------------
# Experiment size
# ---------------------------------------------------------------------------
num_trials = 1
num_steps_train = 5000
num_episodes_eval = 100

# ---------------------------------------------------------------------------
# Learning-rate schedule
# ---------------------------------------------------------------------------
lr_sched_type = "linear"
lr_sched_fn = create_lr_sched_fn(lr_sched_type, lr=0.7)

# ---------------------------------------------------------------------------
# Exploration schedule
# NOTE(review): min_eps == max_eps, so epsilon presumably stays at 1.0 for the
# whole run -- confirm against create_eps_sched_fn.
# ---------------------------------------------------------------------------
max_eps = 1.0
min_eps = 1.0
decay_rate = 0.0001
eps_sched_type = "poly"
eps_sched_fn = create_eps_sched_fn(eps_sched_type, min_eps, max_eps, decay_rate)

# ---------------------------------------------------------------------------
# Environment description
# ---------------------------------------------------------------------------
env_id = "gym_examples/NNWorldEnv01-v1"
env_scheme = "two_island"
gamma = 0.95

num_depths = 4
num_widths = 16
num_actions = num_widths  # one action per width position
terminal_reward = -10.0

# ---------------------------------------------------------------------------
# Bandit problem that generates the per-action rewards
# ---------------------------------------------------------------------------
reward_dist = "normal"
problem_instance = "multi_gap_nonlinear"
action_max_mu = 0.0
action_sigma = 1.0
action_sigmas = np.full(num_actions, action_sigma)  # same sigma for every action
gap_splits = [0.5]
gap_deltas = [5.0]
bandit_problem = BanditProblem(
    problem_instance,
    reward_dist,
    num_actions,
    action_max_mu,
    action_sigmas=action_sigmas,
    gap_splits=gap_splits,
    gap_deltas=gap_deltas,
)
print(f"action_mus = {bandit_problem.action_mus}")
print(f"action_sigmas = {bandit_problem.action_sigmas}")

# ---------------------------------------------------------------------------
# Reference values: take the first action mean on each of the first
# (optimal_num_steps - 1) steps, then collect the terminal reward on the last.
# ---------------------------------------------------------------------------
action_max_mu = bandit_problem.action_mus[0]
optimal_num_steps = num_depths
optimal_vstar = (
    terminal_reward * gamma ** (optimal_num_steps - 1)
    + action_max_mu * np.sum([gamma ** k for k in range(optimal_num_steps - 1)])
)
optimal_reward_per_step = (
    terminal_reward + action_max_mu * (optimal_num_steps - 1)
) / optimal_num_steps
print(f"optimal_num_steps = {optimal_num_steps}")
print(f"optimal_reward_per_step = {optimal_reward_per_step}")
print(f"optimal_vstar = {optimal_vstar}")

# ---------------------------------------------------------------------------
# Build the environment once; run_trial reads env_wrapped from module scope.
# ---------------------------------------------------------------------------
env = gym.make(
    env_id,
    num_depths=num_depths,
    num_widths=num_widths,
    bandit_problem=bandit_problem,
    terminal_reward=terminal_reward,
)
env_wrapped = FlattenObservation(env)
cur_state, info = env_wrapped.reset()

# ---------------------------------------------------------------------------
# Manager-backed lists that run_trial appends its results to.
# ---------------------------------------------------------------------------
manager = multiprocessing.Manager()
episode_start_sigmahats_list = manager.list()
episode_rewards_list = manager.list()
episode_vstar_est_list = manager.list()
Q_table_list = manager.list()
Q_nvisits_list = manager.list()

def run_trial(i_trial, args):
    """Run one training trial and append its results to the shared lists.

    The environment (``env_wrapped``), both schedule functions and the other
    training settings are read from module scope rather than rebuilt here.

    Args:
        i_trial: Trial index; offsets the RNG seed so each trial differs.
        args: Dict of estimator settings. ``args["est_name"]`` selects the
            Q-learning variant via ``create_q_algo``; the whole dict is also
            forwarded to that routine.

    Returns:
        None. Results are appended to ``episode_start_sigmahats_list``,
        ``episode_rewards_list``, ``episode_vstar_est_list``, ``Q_table_list``
        and ``Q_nvisits_list``.
    """
    trial_seed = 10000 + i_trial
    random.seed(trial_seed)
    np.random.seed(trial_seed)

    train_fn = create_q_algo(args["est_name"])
    Q_table, Q_nvisits, stats = train_fn(
        env_wrapped, num_actions, num_steps_train,
        gamma, lr_sched_fn, eps_sched_fn, tdqm_disable, args)

    # stats is a sequence of 3-tuples; transpose it into one tuple per quantity.
    sigmahats, rewards, vstar_ests = zip(*stats)

    for shared_list, trial_result in (
        (episode_start_sigmahats_list, sigmahats),
        (episode_rewards_list, rewards),
        (episode_vstar_est_list, vstar_ests),
        (Q_table_list, Q_table),
        (Q_nvisits_list, Q_nvisits),
    ):
        shared_list.append(trial_result)

# Settings forwarded to every estimator / Q-learning variant through run_trial.
# "est_name" (and, for haver variants, "haver_const") is filled in per estimator
# inside the loop below.
args = dict()
args["action_sigma"] = action_sigma
args["haver_alpha"] = 2.0
args["haver_delta"] = 0.05
args["haver_const"] = 1.0
args["weightedms_num_data"] = 1000
args["num_depths"] = num_depths
args["env_scheme"] = env_scheme

# Per-estimator results, keyed by estimator name. Plain dicts: the original
# factory-less defaultdict() behaved exactly like a dict anyway.
episode_start_sigmahats_dict = {}
episode_rewards_dict = {}
episode_vstar_est_dict = {}
episode_vstar_est_bias_dict = {}
episode_vstar_est_var_dict = {}
episode_vstar_est_mse_dict = {}
Q_table_dict = {}
Q_nvisits_dict = {}

# Name lists for the haver variants, encoded as "<algo>_<haver_const>".
haver_const_ary = [1.0]
haver_name_ary = [f"haver_{x}" for x in haver_const_ary]
haver3_name_ary = [f"haver3_{x}" for x in haver_const_ary]

# Estimators compared in this run. Other candidates tried previously:
# "max", haver_name_ary, haver3_name_ary.
est_name_ary = ["haver3_1.0", "weightedms"]

# The context manager tears the worker processes down when the sweep finishes
# or raises, so re-running the cell no longer leaks a pool of workers.
with multiprocessing.Pool() as pool:
    for est_name in est_name_ary:
        start_time = time.time()
        print(f"\n-> est_name = {est_name}")
        if "haver" in est_name:
            # "<algo>_<const>" -> algorithm name + numeric haver constant
            elems = est_name.split("_")
            args["est_name"] = elems[0]
            args["haver_const"] = float(elems[-1])
            print(f"haver_const = {args['haver_const']}")
        else:
            args["est_name"] = est_name

        # Blocks until every trial has appended its results to the shared lists.
        pool.starmap(run_trial, [(i, args) for i in range(num_trials)])

        # One row per trial, one column per recorded step.
        episode_start_sigmahats_ary = np.hstack([episode_start_sigmahats_list])
        episode_rewards_ary = np.hstack([episode_rewards_list])
        episode_vstar_est_ary = np.hstack([episode_vstar_est_list])

        # Average over trials (axis 0).
        episode_start_sigmahats_dict[est_name] = np.mean(episode_start_sigmahats_ary, 0)
        episode_rewards_dict[est_name] = np.mean(episode_rewards_ary, 0)
        episode_vstar_est_dict[est_name] = np.mean(episode_vstar_est_ary, 0)
        print(f"last_episode_start_sigmahat = {episode_start_sigmahats_dict[est_name][-1]:.4f}")
        print(f"last_episode_reward_per_step = {episode_rewards_dict[est_name][-1]:.4f}")
        print(f"last_episode_estim_start_muhat = {episode_vstar_est_dict[est_name][-1]:.4f}")

        # Bias / variance / MSE of the value estimate relative to optimal_vstar.
        vstar_err = episode_vstar_est_ary - optimal_vstar
        episode_vstar_est_bias_dict[est_name] = np.mean(vstar_err, 0)
        if num_trials > 1:
            episode_vstar_est_var_dict[est_name] = np.var(vstar_err, 0, ddof=1)
        else:
            # With a single trial the ddof=1 sample variance is undefined:
            # np.var would return NaN (with a RuntimeWarning) and poison the MSE.
            # Report zero variance so the MSE reduces to the squared error.
            episode_vstar_est_var_dict[est_name] = np.zeros_like(
                episode_vstar_est_bias_dict[est_name])
        episode_vstar_est_mse_dict[est_name] = \
            episode_vstar_est_bias_dict[est_name]**2 \
            + episode_vstar_est_var_dict[est_name]

        # Keep the tables of the first stored trial only (no averaging).
        # NOTE(review): workers append as they finish, so with num_trials > 1
        # entry 0 is presumably whichever trial completed first -- confirm.
        Q_table_dict[est_name] = np.stack(Q_table_list)[0,:,:,:]
        Q_nvisits_dict[est_name] = np.stack(Q_nvisits_list)[0,:,:,:]

        # Empty the shared lists in place so the next estimator starts clean.
        episode_start_sigmahats_list[:] = []
        episode_rewards_list[:] = []
        episode_vstar_est_list[:] = []
        Q_table_list[:] = []
        Q_nvisits_list[:] = []

        end_time = time.time()
        print(f"it takes {end_time-start_time:0.4f}")

action_mus = [ 0.  0.  0.  0.  0.  0.  0.  0. -5. -5. -5. -5. -5. -5. -5. -5.]
action_sigmas = [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
optimal_num_steps = 4
optimal_reward_per_step = -2.5
optimal_vstar = -8.573749999999999

-> i_step = 0
cur_state = [0 0]
action = 5
reward = 0.24
new_state = [1 5]
haver3_estimator
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]
[-inf -inf -inf -inf -inf -inf -inf -inf]
[0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001]
Q_nvisits[cur_state][action] = 1.0
Q_table[cur_state][action], before = 0.00
Q_table[cur_state][action], after = 0.24
haver3_estimator
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ -inf  -inf  -inf  -inf  -inf 0.237  -inf  -inf  -inf  -inf  -inf  -inf
  -inf  -inf  -inf  -inf]
[0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 5
action_max

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



action = 1
reward = 0.92
new_state = [1 1]
haver3_estimator
[1. 1. 1. 3. 1. 1. 2. 1.]
[1. 1. 1. 3. 1. 1. 2. 1.]
[-0.36052  0.73431 -0.74553 -0.1237  -0.86917  1.1666  -8.10221 -0.87352]
[0.00001 0.00001 0.00001 0.30029 0.00001 0.00001 0.21649 0.00001]
haver3_estimator
action_maxlcb_idx = 5
action_maxlcb_muhat = 1.17
Bset_idxes = [5]
Bset_probs = [0. 0. 0. 0. 0. 1. 0. 0.]
Bset_nvisits = [0. 0. 0. 0. 0. 1. 0. 0.]
Q_est = 1.17
weightedms_estimator
idxes = [5]
probs = [0. 0. 0. 0. 0. 1. 0. 0.]
Q_est = 1.17
Q_nvisits[cur_state][action] = 12.0
Q_table[cur_state][action], before = 0.72
Q_table[cur_state][action], after = 0.83
haver3_estimator
[ 2. 12.  2.  1.  1.  1.  2.  1.  2.  1.  1.  1.  1.  1.  1.  1.]
[ 2. 12.  2.  1.  1.  1.  2.  1.  2.  1.  1.  1.  1.  1.  1.  1.]
[ 0.04131  0.82536 -0.73162 -3.06116 -0.06171  0.237   -0.13406 -1.40093
 -8.57193 -5.51234 -4.68091 -3.60479 -4.43462 -4.18285 -5.12814 -3.83091]
[0.98449 0.82754 0.22313 0.00001 0.00001 0.00001 1.25489 0.00001 2.53379
 0.0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Bset_probs = [0.06818 0.77273 0.      0.      0.04545 0.02273 0.04545 0.      0.
 0.      0.      0.      0.04545 0.      0.      0.     ]
Bset_nvisits = [ 3. 34.  0.  0.  2.  1.  2.  0.  0.  0.  0.  0.  2.  0.  0.  0.]
Q_est = -0.40
weightedms_estimator
idxes = [ 0  1  4  5  6 12]
probs = [0.166 0.361 0.    0.    0.02  0.158 0.239 0.    0.    0.    0.    0.
 0.056 0.    0.    0.   ]
Q_est = -0.63

-> i_step = 225
cur_state = [1 1]
action = 5
reward = 0.39
new_state = [2 5]
haver3_estimator
[2. 1. 2. 2. 2. 1. 2. 2.]
[2. 1. 2. 2. 2. 1. 2. 2.]
[ -9.45756 -10.67657  -5.91607  -9.09205  -9.58108  -8.24433  -5.4274
  -4.36854]
[0.51659 0.00001 5.56289 0.12657 0.40075 0.00001 4.75565 4.48865]
haver3_estimator
action_maxlcb_idx = 5
action_maxlcb_muhat = -8.24
Bset_idxes = [2, 5, 6, 7]
Bset_probs = [0.      0.      0.28571 0.      0.      0.14286 0.28571 0.28571]
Bset_nvisits = [0. 0. 2. 0. 0. 1. 2. 2.]
Q_est = -5.67
weightedms_estimator
idxes = [2 5 6 7]
probs = [0.    0.    0.31  0.    0.   

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Q_table[cur_state][action], after = 0.47
haver3_estimator
[ 3. 36.  2.  1.  3. 24.  2.  1.  3.  1.  2.  1.  2.  1.  1.  1.]
[ 3. 36.  2.  1.  3. 24.  2.  1.  3.  1.  2.  1.  2.  1.  1.  1.]
[ -0.07284   0.22812  -0.73162  -3.06116  -0.0511    0.47417  -0.13406
  -1.40093  -9.55206  -5.51234  -8.57006  -3.60479 -12.65496  -4.18285
  -5.12814  -3.83091]
[0.81988 1.03765 0.22313 0.00001 1.0608  1.63536 1.25489 0.00001 2.49026
 0.00001 3.88915 0.00001 8.22034 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = 0.23
Bset_idxes = [0, 1, 4, 5, 6, 12]
Bset_probs = [0.04286 0.51429 0.      0.      0.04286 0.34286 0.02857 0.      0.
 0.      0.      0.      0.02857 0.      0.      0.     ]
Bset_nvisits = [ 3. 36.  0.  0.  3. 24.  2.  0.  0.  0.  0.  0.  2.  0.  0.  0.]
Q_est = -0.09
weightedms_estimator
idxes = [ 0  1  2  4  5  6 10 12]
probs = [0.123 0.183 0.002 0.    0.119 0.373 0.142 0.    0.    0.    0.008 0.
 0.05  0.    0.    0.   ]
Q_est = -0.52

-> i_ste

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




haver3_estimator
[0. 0. 1. 0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 1. 0. 0.]
[    -inf     -inf -0.98913     -inf     -inf  0.4867      -inf     -inf]
[0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 5
action_maxlcb_muhat = 0.49
Bset_idxes = [5]
Bset_probs = [0. 0. 0. 0. 0. 1. 0. 0.]
Bset_nvisits = [0. 0. 0. 0. 0. 1. 0. 0.]
Q_est = 0.49
weightedms_estimator
idxes = [5]
probs = [0. 0. 0. 0. 0. 1. 0. 0.]
Q_est = 0.49
Q_nvisits[cur_state][action] = 3.0
Q_table[cur_state][action], before = -0.13
Q_table[cur_state][action], after = -0.05
haver3_estimator
[ 5. 43.  2.  2. 13. 29.  3.  1.  3.  1.  3.  1.  2.  1.  1.  1.]
[ 5. 43.  2.  2. 13. 29.  3.  1.  3.  1.  3.  1.  2.  1.  1.  1.]
[ -0.12865  -0.08078  -0.73162  -5.41544  -0.16178  -0.17412  -0.05472
  -1.40093  -9.55206  -5.51234  -9.67381  -3.60479 -12.65496  -4.18285
  -5.12814  -3.83091]
[0.67109 1.22693 0.22313 2.35427 2.46055 2.09772 1.03074 0.00001 2.49026
 0.00001 3.53839 0.00001 8.2

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Q_table[cur_state][action], before = -10.00
Q_table[cur_state][action], after = -10.00
haver3_estimator
[13. 46.  2.  2. 14. 30. 16.  1.  3.  1.  3.  1.  2.  1.  1.  1.]
[13. 46.  2.  2. 14. 30. 16.  1.  3.  1.  3.  1.  2.  1.  1.  1.]
[ -0.61342  -0.25913  -0.73162  -5.41544  -0.52507  -0.27508  -0.36143
  -1.40093  -9.55206  -5.51234  -9.67381  -3.60479 -12.65496  -4.18285
  -5.12814  -3.83091]
[1.91239 1.37711 0.22313 2.35427 2.7088  2.13292 1.49524 0.00001 2.49026
 0.00001 3.53839 0.00001 8.22034 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 7
action_maxlcb_muhat = -1.40
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 12]
Bset_probs = [0.10317 0.36508 0.01587 0.01587 0.11111 0.2381  0.12698 0.00794 0.
 0.      0.      0.      0.01587 0.      0.      0.     ]
Bset_nvisits = [13. 46.  2.  2. 14. 30. 16.  1.  0.  0.  0.  0.  2.  0.  0.  0.]
Q_est = -0.64
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6 12]
probs = [0.131 0.161 0.012 0.004 0.259 0.245 0.138 0.    0.    0.    0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1.99925 1.72421 2.87946 2.35427 2.82237 2.44962 1.99734 0.00001 2.49026
 0.00001 3.53839 0.00001 8.22034 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 7
action_maxlcb_muhat = -1.40
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 12]
Bset_probs = [0.08974 0.35897 0.08333 0.01282 0.09615 0.22436 0.11538 0.00641 0.
 0.      0.      0.      0.01282 0.      0.      0.     ]
Bset_nvisits = [14. 56. 13.  2. 15. 35. 18.  1.  0.  0.  0.  0.  2.  0.  0.  0.]
Q_est = -1.04
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7 10 12]
probs = [0.147 0.098 0.197 0.002 0.201 0.17  0.138 0.009 0.    0.    0.002 0.
 0.036 0.    0.    0.   ]
Q_est = -1.30

-> i_step = 666
cur_state = [2 3]
action = 5
reward = 0.33
new_state = [3 5]
Q_nvisits[cur_state][action] = 7.0
Q_table[cur_state][action], before = -7.50
Q_table[cur_state][action], after = -7.74
haver3_estimator
[14. 56. 13.  2. 15. 35. 18.  1.  3.  1.  3.  1.  2.  1.  1.  1.]
[14. 56. 13.  2. 15. 35. 18.  1.  3.  1.  3.  1.  2.  1.  1.  1.]


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1. 1. 5. 6. 1. 1. 1. 1.]
[1. 1. 5. 6. 1. 1. 1. 1.]
[-9.94733 -6.96133 -5.68323 -5.48165 -9.52243 -7.098   -7.7324  -7.08658]
[0.00001 0.00001 4.57637 2.76734 0.00001 0.00001 0.00001 0.00001]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -6.96
Bset_idxes = [1, 2, 3]
Bset_probs = [0.      0.08333 0.41667 0.5     0.      0.      0.      0.     ]
Bset_nvisits = [0. 1. 5. 6. 0. 0. 0. 0.]
Q_est = -5.69
weightedms_estimator
idxes = [1 2 3]
probs = [0.    0.126 0.427 0.447 0.    0.    0.    0.   ]
Q_est = -5.75
Q_nvisits[cur_state][action] = 18.0
Q_table[cur_state][action], before = -1.35
Q_table[cur_state][action], after = -1.62
haver3_estimator
[16. 67. 15.  2. 18. 41. 21.  1.  3.  1.  3.  1.  2.  1.  2.  1.]
[16. 67. 15.  2. 18. 41. 21.  1.  3.  1.  3.  1.  2.  1.  2.  1.]
[ -1.39301  -1.36308  -1.59633  -5.41544  -1.62317  -1.44443  -1.56345
  -1.40093  -9.55206  -5.51234  -9.67381  -3.60479 -12.65496  -4.18285
 -10.46398  -3.83091]
[2.39345 2.05777 3.21305 2.35427 3.16492 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -6.96
Bset_idxes = [1, 2, 3]
Bset_probs = [0.      0.07692 0.38462 0.53846 0.      0.      0.      0.     ]
Bset_nvisits = [0. 1. 5. 7. 0. 0. 0. 0.]
Q_est = -5.83
weightedms_estimator
idxes = [1 2 3]
probs = [0.    0.127 0.458 0.415 0.    0.    0.    0.   ]
Q_est = -5.88
Q_nvisits[cur_state][action] = 19.0
Q_table[cur_state][action], before = -1.62
Q_table[cur_state][action], after = -1.80
haver3_estimator
[18. 73. 16.  2. 19. 44. 22. 14.  3.  1.  3.  1.  2.  1.  2.  1.]
[18. 73. 16.  2. 19. 44. 22. 14.  3.  1.  3.  1.  2.  1.  2.  1.]
[ -1.84601  -1.62791  -1.86678  -5.41544  -1.79517  -1.68889  -1.82936
  -1.88815  -9.55206  -5.51234  -9.67381  -3.60479 -12.65496  -4.18285
 -10.46398  -3.83091]
[2.59592 2.17221 3.28262 2.35427 3.16577 2.75395 2.80968 3.35654 2.49026
 0.00001 3.53839 0.00001 8.22034 0.00001 5.33583 0.00001]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -1.63
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




[20. 86. 17.  2. 21. 50. 25. 15.  3.  1.  3.  1.  2.  1.  2.  1.]
[20. 86. 17.  2. 21. 50. 25. 15.  3.  1.  3.  1.  2.  1.  2.  1.]
[ -2.20607  -2.13778  -2.24319  -5.41544  -2.19491  -2.14771  -2.29735
  -2.27012  -9.55206  -5.51234  -9.67381  -3.60479 -12.65496  -4.18285
 -10.46398  -3.83091]
[2.69091 2.36043 3.52259 2.35427 3.26308 2.88244 2.98441 3.54371 2.49026
 0.00001 3.53839 0.00001 8.22034 0.00001 5.33583 0.00001]
haver3_estimator
action_maxlcb_idx = 11
action_maxlcb_muhat = -3.60
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 14]
Bset_probs = [0.08299 0.35685 0.07054 0.0083  0.08714 0.20747 0.10373 0.06224 0.
 0.      0.      0.00415 0.0083  0.      0.0083  0.     ]
Bset_nvisits = [20. 86. 17.  2. 21. 50. 25. 15.  0.  0.  0.  1.  2.  0.  2.  0.]
Q_est = -2.37
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7 11 12 14]
probs = [0.121 0.09  0.158 0.007 0.147 0.124 0.148 0.154 0.    0.    0.    0.001
 0.038 0.    0.012 0.   ]
Q_est = -2.74

-> i_step = 998
cur_state = [2 5]


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




[22. 99. 19.  3. 23. 54. 27. 16.  3.  1.  3.  1.  3.  1.  2.  1.]
[ -2.57166  -2.50437  -2.69444  -6.65196  -2.48979  -2.49743  -2.63132
  -2.53227  -9.55206  -5.51234  -9.67381  -3.60479 -11.30709  -4.18285
 -10.46398  -3.83091]
[2.81412 2.42382 3.58477 2.59867 3.26287 3.04467 3.129   3.57824 2.49026
 0.00001 3.53839 0.00001 6.97731 0.00001 5.33583 0.00001]
haver3_estimator
action_maxlcb_idx = 11
action_maxlcb_muhat = -3.60
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 14]
Bset_probs = [0.08178 0.36803 0.07063 0.01115 0.0855  0.20074 0.10037 0.05948 0.
 0.      0.      0.00372 0.01115 0.      0.00743 0.     ]
Bset_nvisits = [22. 99. 19.  3. 23. 54. 27. 16.  0.  0.  0.  1.  3.  0.  2.  0.]
Q_est = -2.74
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7 10 11 12 14]
probs = [0.115 0.094 0.156 0.005 0.164 0.13  0.118 0.162 0.    0.    0.002 0.001
 0.033 0.    0.02  0.   ]
Q_est = -3.04

-> i_step = 1112
cur_state = [0 0]
action = 4
reward = 0.64
new_state = [1 4]
haver3_estimator
[1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Bset_nvisits = [ 25. 109.  20.   3.  26.  61.  29.  18.   0.   0.   0.   1.   3.   0.
   2.   0.]
Q_est = -3.04
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7 10 11 12 14]
probs = [0.111 0.107 0.169 0.009 0.143 0.125 0.121 0.154 0.    0.    0.004 0.001
 0.039 0.    0.017 0.   ]
Q_est = -3.41

-> i_step = 1224
cur_state = [0 0]
action = 1
reward = -0.25
new_state = [1 1]
haver3_estimator
[ 4. 20.  4. 24.  5. 45.  4.  3.]
[ 4. 20.  4. 24.  5. 45.  4.  3.]
[-6.0131  -6.08771 -7.07673 -6.17327 -7.33219 -6.07335 -7.94631 -6.07502]
[3.38289 3.69367 3.9892  3.2861  3.43574 3.49961 0.43365 3.68045]
haver3_estimator
action_maxlcb_idx = 5
action_maxlcb_muhat = -6.07
Bset_idxes = [0, 1, 2, 3, 4, 5, 7]
Bset_probs = [0.0381  0.19048 0.0381  0.22857 0.04762 0.42857 0.      0.02857]
Bset_nvisits = [ 4. 20.  4. 24.  5. 45.  0.  3.]
Q_est = -6.19
weightedms_estimator
idxes = [0 1 2 3 4 5 7]
probs = [0.152 0.153 0.124 0.144 0.086 0.162 0.    0.179]
Q_est = -6.31
Q_nvisits[cur_state][action] = 110

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Q_table[cur_state][action], after = -9.28
haver3_estimator
[ 27. 120.  22.   3.  27.  66.  31.  19.   3.   1.   4.   1.   3.   1.
   2.   1.]
[ 27. 120.  22.   3.  27.  66.  31.  19.   3.   1.   4.   1.   3.   1.
   2.   1.]
[ -3.07891  -3.08614  -3.18845  -6.65196  -3.11395  -3.09467  -3.22235
  -3.17278  -9.55206  -5.51234 -10.03712  -3.60479 -11.30709  -4.18285
 -10.46398  -3.83091]
[2.79479 2.56487 3.56126 2.59867 3.37929 3.0554  3.30436 3.61374 2.49026
 0.00001 3.12828 0.00001 6.97731 0.00001 5.33583 0.00001]
haver3_estimator
action_maxlcb_idx = 11
action_maxlcb_muhat = -3.60
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 14]
Bset_probs = [0.08411 0.37383 0.06854 0.00935 0.08411 0.20561 0.09657 0.05919 0.
 0.      0.      0.00312 0.00935 0.      0.00623 0.     ]
Bset_nvisits = [ 27. 120.  22.   3.  27.  66.  31.  19.   0.   0.   0.   1.   3.   0.
   2.   0.]
Q_est = -3.27
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7 10 11 12 14]
probs = [0.112 0.09  0.163 0.008 0.154 0.118

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



haver3_estimator
[ 2.  3.  6.  2.  1. 14.  3.  1.]
[ 2.  3.  6.  2.  1. 14.  3.  1.]
[-8.08269 -8.50535 -7.3131  -7.12334 -7.19014 -7.07522 -8.87186 -8.42614]
[0.11748 1.58308 3.04349 1.24632 0.00001 2.56447 0.22561 0.00001]
haver3_estimator
action_maxlcb_idx = 4
action_maxlcb_muhat = -7.19
Bset_idxes = [1, 2, 3, 4, 5]
Bset_probs = [0.      0.11538 0.23077 0.07692 0.03846 0.53846 0.      0.     ]
Bset_nvisits = [ 0.  3.  6.  2.  1. 14.  0.  0.]
Q_est = -7.30
weightedms_estimator
idxes = [1 2 3 4 5]
probs = [0.    0.06  0.307 0.187 0.108 0.338 0.    0.   ]
Q_est = -7.26
Q_nvisits[cur_state][action] = 33.0
Q_table[cur_state][action], before = -3.33
Q_table[cur_state][action], after = -3.41
haver3_estimator
[ 30. 131.  23.   3.  29.  70.  33.  21.   3.   1.   4.   2.   3.   2.
   2.   2.]
[ 30. 131.  23.   3.  29.  70.  33.  21.   3.   1.   4.   2.   3.   2.
   2.   2.]
[ -3.33807  -3.35023  -3.33406  -6.65196  -3.37304  -3.38836  -3.40889
  -3.52648  -9.55206  -5.51234 -10.03712  -9.1367

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Bset_probs = [0. 1. 0. 0. 0. 0. 0. 0.]
Bset_nvisits = [0. 1. 0. 0. 0. 0. 0. 0.]
Q_est = -4.46
weightedms_estimator
idxes = [1]
probs = [0. 1. 0. 0. 0. 0. 0. 0.]
Q_est = -4.46
Q_nvisits[cur_state][action] = 4.0
Q_table[cur_state][action], before = -11.31
Q_table[cur_state][action], after = -10.37
haver3_estimator
[ 33. 144.  25.   3.  32.  74.  35.  22.   3.   1.   4.   2.   4.   2.
   2.   3.]
[ 33. 144.  25.   3.  32.  74.  35.  22.   3.   1.   4.   2.   4.   2.
   2.   3.]
[ -3.65496  -3.58257  -3.59626  -6.65196  -3.63359  -3.58744  -3.5959
  -3.67755  -9.55206  -5.51234 -10.03712  -9.13677 -10.36747  -7.13424
 -10.46398  -7.80633]
[2.83619 2.63287 3.51894 2.59867 3.34727 3.23727 3.28181 3.59055 2.49026
 0.00001 3.12828 5.53198 6.25785 2.95139 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -3.58
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15]
Bset_probs = [0.08661 0.37795 0.06562 0.00787 0.08399 0.19423 0.09186 0.05774 0.
 0.      0.      0.0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





-> i_step = 1651
cur_state = [ 3 13]
action = 0
reward = -10.00
new_state = [0 0]
terminated
Q_nvisits[cur_state][action] = 3.0
Q_table[cur_state][action], before = -10.00
Q_table[cur_state][action], after = -10.00
haver3_estimator
[ 35. 154.  27.   3.  34.  78.  37.  23.   3.   1.   4.   2.   4.   3.
   2.   3.]
[ 35. 154.  27.   3.  34.  78.  37.  23.   3.   1.   4.   2.   4.   3.
   2.   3.]
[ -3.85588  -3.79839  -3.85204  -6.65196  -3.83354  -3.78621  -3.8329
  -3.8455   -9.55206  -5.51234 -10.03712  -9.13677 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.8733  2.67991 3.50816 2.59867 3.34528 3.26992 3.34491 3.59891 2.49026
 0.00001 3.12828 5.53198 6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -3.80
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15]
Bset_probs = [0.08642 0.38025 0.06667 0.00741 0.08395 0.19259 0.09136 0.05679 0.
 0.      0.      0.00494 0.00988 0.00741 0.00494 0.00741]
Bset_nvisits = [ 35. 154.  27.   3.  34.  

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



action_maxlcb_idx = 1
action_maxlcb_muhat = -3.99
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08525 0.38018 0.06682 0.00691 0.08525 0.19355 0.08986 0.0553  0.
 0.00461 0.      0.00461 0.00922 0.00691 0.00461 0.00691]
Bset_nvisits = [ 37. 165.  29.   3.  37.  84.  39.  24.   0.   2.   0.   2.   4.   3.
   2.   3.]
Q_est = -4.22
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7  9 11 12 13 14 15]
probs = [0.116 0.094 0.13  0.007 0.135 0.122 0.122 0.141 0.    0.002 0.    0.046
 0.04  0.007 0.033 0.005]
Q_est = -4.82

-> i_step = 1761
cur_state = [1 1]
action = 4
reward = -0.77
new_state = [2 4]
haver3_estimator
[2. 1. 1. 2. 5. 2. 1. 2.]
[2. 1. 1. 2. 5. 2. 1. 2.]
[-10.7018  -11.1585   -9.71785  -8.86759  -9.6536   -9.49615 -10.59004
  -9.72722]
[0.42618 0.00001 0.00001 0.35784 1.04833 0.78622 0.00001 0.66772]
haver3_estimator
action_maxlcb_idx = 2
action_maxlcb_muhat = -9.72
Bset_idxes = [2, 3, 4, 5, 7]
Bset_probs = [0.      0.      0.08333 0.16667 0.41

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




idxes = [1 2 3 5 6 7]
probs = [0.    0.121 0.357 0.336 0.    0.079 0.028 0.079]
Q_est = -7.52
Q_nvisits[cur_state][action] = 39.0
Q_table[cur_state][action], before = -4.15
Q_table[cur_state][action], after = -4.21
haver3_estimator
[ 39. 176.  31.   3.  39.  89.  41.  26.   3.   2.   4.   2.   4.   3.
   2.   3.]
[ 39. 176.  31.   3.  39.  89.  41.  26.   3.   2.   4.   2.   4.   3.
   2.   3.]
[ -4.20087  -4.17072  -4.2387   -6.65196  -4.21179  -4.18111  -4.20922
  -4.29458  -9.55206  -8.05681 -10.03712  -9.13677 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.91022 2.71285 3.44163 2.59867 3.28636 3.24943 3.38007 3.61487 2.49026
 2.54447 3.12828 5.53198 6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -4.17
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08478 0.38261 0.06739 0.00652 0.08478 0.19348 0.08913 0.05652 0.
 0.00435 0.      0.00435 0.0087  0.00652 0.00435 0.00652]
Bset_nvisits = [ 39. 176.  31.   3.  39

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




terminated
Q_nvisits[cur_state][action] = 75.0
Q_table[cur_state][action], before = -10.00
Q_table[cur_state][action], after = -10.00
haver3_estimator
[ 41. 188.  32.   3.  42.  94.  43.  27.   3.   2.   4.   2.   4.   3.
   2.   3.]
[ 41. 188.  32.   3.  42.  94.  43.  27.   3.   2.   4.   2.   4.   3.
   2.   3.]
[ -4.36039  -4.33549  -4.33231  -6.65196  -4.37243  -4.32317  -4.36886
  -4.43659  -9.55206  -8.05681 -10.03712  -9.13677 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.92719 2.70857 3.4273  2.59867 3.2215  3.22123 3.37931 3.62046 2.49026
 2.54447 3.12828 5.53198 6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -4.34
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08436 0.38683 0.06584 0.00617 0.08642 0.19342 0.08848 0.05556 0.
 0.00412 0.      0.00412 0.00823 0.00617 0.00412 0.00617]
Bset_nvisits = [ 41. 188.  32.   3.  42.  94.  43.  27.   0.   2.   0.   2.   4.   3.
   2.   3.]
Q_est = -4.51
weighted

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08382 0.38986 0.06628 0.00585 0.08577 0.19298 0.08772 0.05458 0.
 0.0039  0.      0.00585 0.0078  0.00585 0.0039  0.00585]
Bset_nvisits = [ 43. 200.  34.   3.  44.  99.  45.  28.   0.   2.   0.   3.   4.   3.
   2.   3.]
Q_est = -4.68
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7  9 10 11 12 13 14 15]
probs = [0.107 0.092 0.127 0.027 0.117 0.136 0.103 0.129 0.    0.006 0.005 0.035
 0.062 0.009 0.04  0.005]
Q_est = -5.48

-> i_step = 2080
cur_state = [0 0]
action = 1
reward = -0.38
new_state = [1 1]
haver3_estimator
[ 9. 35.  6. 45.  8. 85.  6.  6.]
[ 9. 35.  6. 45.  8. 85.  6.  6.]
[-7.39071 -7.38494 -7.34966 -7.37487 -8.13658 -7.35236 -8.38097 -7.35074]
[2.77877 3.23536 3.28417 2.80579 2.91526 2.95105 0.71319 2.92932]
haver3_estimator
action_maxlcb_idx = 5
action_maxlcb_muhat = -7.35
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7]
Bset_probs = [0.045 0.175 0.03  0.225 0.04  0.425 0.03  0.03 ]
Bset_nvisits = [ 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Q_est = -7.56
Q_nvisits[cur_state][action] = 106.0
Q_table[cur_state][action], before = -4.63
Q_table[cur_state][action], after = -4.66
haver3_estimator
[ 46. 211.  36.   3.  47. 106.  47.  29.   3.   2.   4.   3.   4.   3.
   2.   3.]
[ 46. 211.  36.   3.  47. 106.  47.  29.   3.   2.   4.   3.   4.   3.
   2.   3.]
[ -4.65946  -4.63768  -4.68612  -6.65196  -4.66332  -4.65558  -4.69634
  -4.6895   -9.55206  -8.05681 -10.03712 -10.39171 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.90148 2.71563 3.38464 2.59867 3.16895 3.18397 3.40737 3.61881 2.49026
 2.54447 3.12828 4.853   6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -4.64
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08487 0.3893  0.06642 0.00554 0.08672 0.19557 0.08672 0.05351 0.
 0.00369 0.      0.00554 0.00738 0.00554 0.00369 0.00554]
Bset_nvisits = [ 46. 211.  36.   3.  47. 106.  47.  29.   0.   2.   0.   3.   4.   3.
   2.   3.]
Q_est = -4.81
weight

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



probs = [0.088 0.081 0.115 0.02  0.128 0.115 0.141 0.148 0.001 0.006 0.006 0.035
 0.07  0.007 0.025 0.014]
Q_est = -5.67

-> i_step = 2299
cur_state = [3 5]
action = 0
reward = -10.00
new_state = [0 0]
terminated
Q_nvisits[cur_state][action] = 101.0
Q_table[cur_state][action], before = -10.00
Q_table[cur_state][action], after = -10.00
haver3_estimator
[ 48. 225.  37.   3.  50. 109.  48.  31.   3.   2.   4.   3.   4.   3.
   2.   3.]
[ 48. 225.  37.   3.  50. 109.  48.  31.   3.   2.   4.   3.   4.   3.
   2.   3.]
[ -4.79328  -4.74486  -4.76572  -6.65196  -4.78017  -4.74765  -4.75783
  -4.86526  -9.55206  -8.05681 -10.03712 -10.39171 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.91252 2.68533 3.37258 2.59867 3.1285  3.19383 3.39794 3.58725 2.49026
 2.54447 3.12828 4.853   6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -4.74
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08451 0.39613 0.06514 0.00528 0.08803 0.1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




[2.89328 2.69976 3.34755 2.26166 3.12713 3.1582  3.37776 3.62323 2.49026
 2.54447 3.12828 4.853   6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -4.90
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08361 0.39632 0.06522 0.00669 0.08696 0.19398 0.08528 0.05351 0.
 0.00334 0.      0.00502 0.00669 0.00502 0.00334 0.00502]
Bset_nvisits = [ 50. 237.  39.   4.  52. 116.  51.  32.   0.   2.   0.   3.   4.   3.
   2.   3.]
Q_est = -5.04
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
probs = [0.112 0.098 0.111 0.011 0.113 0.12  0.134 0.128 0.001 0.011 0.002 0.03
 0.06  0.011 0.044 0.014]
Q_est = -5.80

-> i_step = 2417
cur_state = [1 1]
action = 3
reward = 0.92
new_state = [2 3]
haver3_estimator
[38.  2.  1. 25.  2. 42.  2. 13.]
[38.  2.  1. 25.  2. 42.  2. 13.]
[ -9.28823 -10.44369  -9.98502  -9.11794  -9.47179  -9.29145 -10.03886
  -9.28232]
[1.56002 0.13147 0.00001 2.15478 0.86812 1.

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Q_table[cur_state][action], before = -4.99
Q_table[cur_state][action], after = -5.00
haver3_estimator
[ 53. 248.  41.   4.  54. 122.  52.  32.   3.   2.   4.   4.   4.   3.
   2.   3.]
[ 53. 248.  41.   4.  54. 122.  52.  32.   3.   2.   4.   4.   4.   3.
   2.   3.]
[ -5.03909  -4.99832  -5.04559  -6.78148  -5.0107   -4.99354  -5.01421
  -5.01134  -9.55206  -8.05681 -10.03712 -11.14737 -10.36747  -7.91707
 -10.46398  -7.80633]
[2.87981 2.69468 3.32966 2.26166 3.11885 3.1143  3.3865  3.62323 2.49026
 2.54447 3.12828 4.4019  6.25785 2.65194 5.33583 2.81384]
haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -5.00
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
Bset_probs = [0.08494 0.39744 0.06571 0.00641 0.08654 0.19551 0.08333 0.05128 0.
 0.00321 0.      0.00641 0.00641 0.00481 0.00321 0.00481]
Bset_nvisits = [ 53. 248.  41.   4.  54. 122.  52.  32.   0.   2.   0.   4.   4.   3.
   2.   3.]
Q_est = -5.15
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7  8  

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



haver3_estimator
[ 2.  4. 25.  1. 41.  3.  4.  7.]
[ 2.  4. 25.  1. 41.  3.  4.  7.]
[ -9.4946   -9.48805  -9.27989  -9.77392  -9.27327 -10.31124  -9.85651
  -9.15394]
[0.32541 1.20779 1.10749 0.00001 1.81061 0.60657 1.0903  0.74445]
haver3_estimator
action_maxlcb_idx = 3
action_maxlcb_muhat = -9.77
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7]
Bset_probs = [0.02299 0.04598 0.28736 0.01149 0.47126 0.03448 0.04598 0.08046]
Bset_nvisits = [ 2.  4. 25.  1. 41.  3.  4.  7.]
Q_est = -9.35
weightedms_estimator
idxes = [0 1 2 4 5 6 7]
probs = [0.031 0.175 0.223 0.    0.299 0.004 0.083 0.185]
Q_est = -9.35
Q_nvisits[cur_state][action] = 45.0
Q_table[cur_state][action], before = -7.68
Q_table[cur_state][action], after = -7.73
haver3_estimator
[ 55. 259.  42.   5.  56. 127.  54.  34.   3.   2.   4.   4.   4.   3.
   2.   3.]
[ 55. 259.  42.   5.  56. 127.  54.  34.   3.   2.   4.   4.   4.   3.
   2.   3.]
[ -5.13083  -5.10941  -5.13291  -7.03459  -5.1442   -5.11313  -5.1045
  -5.17668  -9.55206  -8.056

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



action = 6
reward = -0.39
new_state = [1 6]
haver3_estimator
[ 3.  4.  8.  4.  3. 29.  4.  1.]
[ 3.  4.  8.  4.  3. 29.  4.  1.]
[-8.44868 -8.88437 -7.95518 -7.97753 -7.9255  -7.94987 -9.03995 -8.42614]
[0.5264  1.52006 2.86186 1.22995 1.13913 2.04362 0.35063 0.00001]
haver3_estimator
action_maxlcb_idx = 7
action_maxlcb_muhat = -8.43
Bset_idxes = [0, 1, 2, 3, 4, 5, 7]
Bset_probs = [0.05769 0.07692 0.15385 0.07692 0.05769 0.55769 0.      0.01923]
Bset_nvisits = [ 3.  4.  8.  4.  3. 29.  0.  1.]
Q_est = -8.06
weightedms_estimator
idxes = [0 1 2 3 4 5 7]
probs = [0.021 0.078 0.327 0.14  0.169 0.261 0.    0.004]
Q_est = -8.04
Q_nvisits[cur_state][action] = 57.0
Q_table[cur_state][action], before = -5.20
Q_table[cur_state][action], after = -5.25
haver3_estimator
[ 57. 272.  44.   5.  58. 131.  57.  35.   3.   2.   4.   5.   4.   3.
   2.   3.]
[ 57. 272.  44.   5.  58. 131.  57.  35.   3.   2.   4.   5.   4.   3.
   2.   3.]
[ -5.23066  -5.19841  -5.21334  -7.03459  -5.22719  -5.19989  -5.2

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



haver3_estimator
action_maxlcb_idx = 1
action_maxlcb_muhat = -5.29
Bset_idxes = [0, 1, 2, 3, 4, 5, 6, 7, 9, 12, 13, 14, 15]
Bset_probs = [0.08559 0.40228 0.06562 0.00713 0.08845 0.19401 0.08417 0.05136 0.
 0.00285 0.      0.      0.00571 0.00428 0.00428 0.00428]
Bset_nvisits = [ 60. 282.  46.   5.  62. 136.  59.  36.   0.   2.   0.   0.   4.   3.
   3.   3.]
Q_est = -5.40
weightedms_estimator
idxes = [ 0  1  2  3  4  5  6  7  9 10 11 12 13 14 15]
probs = [0.099 0.084 0.132 0.013 0.125 0.121 0.138 0.144 0.    0.005 0.004 0.005
 0.081 0.013 0.015 0.021]
Q_est = -5.99

-> i_step = 2850
cur_state = [2 2]
action = 6
reward = 0.79
new_state = [3 6]
Q_nvisits[cur_state][action] = 3.0
Q_table[cur_state][action], before = -9.57
Q_table[cur_state][action], after = -9.29
haver3_estimator
[ 60. 282.  46.   5.  62. 136.  59.  36.   3.   2.   4.   5.   4.   3.
   3.   3.]
[ 60. 282.  46.   5.  62. 136.  59.  36.   3.   2.   4.   5.   4.   3.
   3.   3.]
[ -5.33933  -5.29323  -5.30152  -7.03459  -5.3

In [None]:

# Five stacked panels sharing the x axis: smoothed reward on top, then the
# V* estimate and its MSE / bias / variance curves, one line per estimator.
fig, axes = plt.subplots(
    nrows=5, ncols=1, sharex=True, sharey=False, figsize=(8,12))

# 100 evenly spaced integer step indices covering [4000, num_steps_train-1].
x_ary = np.linspace(4000, num_steps_train-1, num=100, dtype=np.int32)

# Nothing is excluded in this cell: the first 8 estimator names are all plotted.
est_name_ary_unwanted = []
est_name_ary_wanted = [
    name for name in est_name_ary[:8] if name not in est_name_ary_unwanted]

# (axis, per-estimator series dict, panel title) for the four V*-related panels.
vstar_panels = (
    (axes[1], episode_vstar_est_dict, "vstar_est"),
    (axes[2], episode_vstar_est_mse_dict, "vstar_est_mse"),
    (axes[3], episode_vstar_est_bias_dict, "vstar_est_bias"),
    (axes[4], episode_vstar_est_var_dict, "vstar_est_var"),
)

for est_name in est_name_ary_wanted:
    # NOTE(review): running_avg receives the already-subsampled 100-point series
    # together with the argument 100 -- confirm it returns an array of the same
    # length as x_ary, otherwise the plot call below would fail.
    y_ary = running_avg(episode_rewards_dict[est_name][x_ary], 100)
    axes[0].plot(x_ary, y_ary, label=est_name)
    for panel_ax, series_dict, panel_title in vstar_panels:
        panel_ax.plot(x_ary, series_dict[est_name][x_ary], label=est_name)

# Black horizontal reference lines for the optimal values.
axes[0].axhline(y=optimal_reward_per_step, color="black")
axes[1].axhline(y=optimal_vstar, color="black")

axes[0].set_title(
    f"reward_per_step, K={num_actions}, num_depths={num_depths}, sigma={action_sigma},"
    f" delta={gap_deltas[0]}")
for panel_ax, series_dict, panel_title in vstar_panels:
    panel_ax.set_title(panel_title)

# A single legend, anchored outside the right edge of the second panel.
axes[1].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.show()

In [None]:

# Same five-panel figure as a full estimator comparison, but with the
# "weightedms" and "weightedms2" estimators left out of every panel.
fig, axes = plt.subplots(
    nrows=5, ncols=1, sharex=True, sharey=False, figsize=(8,12))

# 100 evenly spaced integer step indices covering [4000, num_steps_train-1].
x_ary = np.linspace(4000, num_steps_train-1, num=100, dtype=np.int32)

# Estimators to hide; the rest of the first 8 names are plotted.
est_name_ary_unwanted = ["weightedms", "weightedms2"]
est_name_ary_wanted = [
    name for name in est_name_ary[:8] if name not in est_name_ary_unwanted]

# (axis, per-estimator series dict, panel title) for the four V*-related panels.
vstar_panels = (
    (axes[1], episode_vstar_est_dict, "vstar_est"),
    (axes[2], episode_vstar_est_mse_dict, "vstar_est_mse"),
    (axes[3], episode_vstar_est_bias_dict, "vstar_est_bias"),
    (axes[4], episode_vstar_est_var_dict, "vstar_est_var"),
)

for est_name in est_name_ary_wanted:
    # NOTE(review): running_avg receives the already-subsampled 100-point series
    # together with the argument 100 -- confirm it returns an array of the same
    # length as x_ary, otherwise the plot call below would fail.
    y_ary = running_avg(episode_rewards_dict[est_name][x_ary], 100)
    axes[0].plot(x_ary, y_ary, label=est_name)
    for panel_ax, series_dict, panel_title in vstar_panels:
        panel_ax.plot(x_ary, series_dict[est_name][x_ary], label=est_name)

# Black horizontal reference lines for the optimal values.
axes[0].axhline(y=optimal_reward_per_step, color="black")
axes[1].axhline(y=optimal_vstar, color="black")

axes[0].set_title(
    f"reward_per_step, K={num_actions}, num_depths={num_depths}, sigma={action_sigma},"
    f" delta={gap_deltas[0]}")
for panel_ax, series_dict, panel_title in vstar_panels:
    panel_ax.set_title(panel_title)

# A single legend, anchored outside the right edge of the second panel.
axes[1].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.show()

In [None]:

# For every estimator, print the Q_table entry and the Q_nvisits entry at each
# (i_row, j_col) index pair, with i_row in range(num_depths) and j_col in
# range(num_actions), visited in row-major order.
for est_name in est_name_ary:
    print(f"\n-> est_name = {est_name}")
    Q_table, Q_nvisits = Q_table_dict[est_name], Q_nvisits_dict[est_name]
    for i_row, j_col in np.ndindex(num_depths, num_actions):
        print(f"{i_row} {j_col}: {Q_table[i_row,j_col]}, \n     {Q_nvisits[i_row,j_col]}")

 