In [1]:
from collections import defaultdict
import random
import numpy as np
np.set_printoptions(precision=2, suppress=True)

import time
import copy 
import multiprocess as mp

import gym
from env import FrozenLakeCustom, FrozenLakeSimulator

from mcts_haver import run_mcts_trial
from value_iteration import value_iteration

from config import parse_args

import logging
# Grab the root logger and raise its threshold to the highest standard level
# (logging.CRITICAL is the same numeric level as logging.FATAL), so only
# critical records get through.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [2]:
# Seed both random number generators used by the notebook process.
random.seed(0)
np.random.seed(0)

# Experiment parameters: start from the parsed configuration and pin the
# update / rollout methods for this notebook.
args = parse_args()
args["update_method"] = "haver"
args["rollout_method"] = ""

# Environment and the simulator built from its transition table `env.P`.
env_id = "FrozenLake-v1"
env = FrozenLakeCustom(
    map_name=args["map_name"],
    is_slippery=args["is_slippery"],
    render_mode=args["render_mode"],
)
simulator = FrozenLakeSimulator(env.P)

# Reference values from value iteration; later cells compare MCTS estimates
# against Q_vit.
V_vit, Q_vit = value_iteration(simulator, args["gamma"], args["vit_thres"])

# Dump the reference solution through the logging module at WARNING level.
# NOTE: the import cell sets the root logger to FATAL, so these records are
# filtered out unless that level is lowered.
for state in range(simulator.num_states):
    q_row = Q_vit[state]
    logging.warning(f"\n-> state = {state}")
    logging.warning(f"V[state] = {V_vit[state]:0.4f}")
    for action in range(simulator.num_actions):
        logging.warning(f"Q[state][action] = {q_row[action]:0.4f}")
    logging.warning(f"best_action={np.argmax(q_row)}")

# Manager-backed lists that `run_trial` appends to; they collect the
# per-trial episode rewards and Q tables.
manager = mp.Manager()
ep_reward_list = manager.list()
Q_mcts_list = manager.list()

def run_trial(i_trial, Q_vit, args):
    """Run one MCTS trial and record its outcome in the shared result lists.

    Args:
        i_trial: Index of the trial; also offsets the RNG seeds so each trial
            index gets its own fixed seed (10000 + i_trial).
        Q_vit: Reference Q values, forwarded unchanged to `run_mcts_trial`.
        args: Experiment configuration, forwarded unchanged to `run_mcts_trial`.

    Returns:
        None. The trial's episode reward is appended to `ep_reward_list` and
        its Q table to `Q_mcts_list`.

    Note:
        `env`, `simulator`, `ep_reward_list` and `Q_mcts_list` are read from
        the notebook's globals rather than passed in.
    """
    trial_seed = 10000 + i_trial
    random.seed(trial_seed)
    np.random.seed(trial_seed)

    trial_result = run_mcts_trial(env, simulator, Q_vit, i_trial, args)
    Q_mcts, ep_reward = trial_result

    ep_reward_list.append(ep_reward)
    Q_mcts_list.append(Q_mcts)

In [3]:
# Q_mcts_dict = defaultdict()

# args["hparam_ucb_scale"] = 30

# # hparam_haver_var_ary = np.arange(0.0, 10, 0.5)
# hparam_haver_var_ary = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05]
# hparam_haver_var_ary = [0.0001, 0.0005, 0.0015]
# best_param = None
# max_reward_mean = -np.inf
# for hparam_haver_var in hparam_haver_var_ary:
#     start_time = time.time()
    
#     print(f"\n-> hparam_haver_var = {hparam_haver_var}")
#     args["hparam_haver_var"] = hparam_haver_var
    
#     pool = mp.Pool()
#     pool.starmap(run_trial, [(i, Q_vit, args) for i in range(args["num_trials"])])

#     reward_mean = np.mean(ep_reward_list)
#     reward_std = np.std(ep_reward_list, ddof=1)
#     print(f"reward = {reward_mean:.2f} +/- {reward_std:.2f}")
    
#     Q_mcts_dict[f"{hparam_haver_var}"] = copy.deepcopy(Q_mcts_list)
    
#     if reward_mean > max_reward_mean:
#         max_reward_mean = reward_mean 
#         best_param = hparam_haver_var
    
#     ep_reward_list[:] = []
#     Q_mcts_list[:] = []
    
#     end_time = time.time()
#     print(f"it takes {end_time-start_time:0.4f}")

In [None]:
# Grid search over (hparam_haver_var, hparam_ucb_scale).
#
# For every pair, `num_trials` MCTS trials are run in a process pool and the
# mean / sample std of the episode rewards is printed. The pair with the
# highest mean reward is remembered in `best_param` (haver variance) and
# `best_ucb_scale`, and its per-trial Q tables are stored in `Q_mcts_dict`.
print(f"num_trials = {args['num_trials']}")
print(f"mcts_max_its = {args['mcts_max_iterations']}")

# Per-trial Q tables of the best configuration seen so far, keyed by
# f"{hparam_haver_var}" so that Q_mcts_dict[f"{best_param}"] resolves to the
# trials of the winning configuration. (A defaultdict without a factory
# behaves like a plain dict.)
Q_mcts_dict = defaultdict()

hparam_ucb_scale_ary = np.arange(20, 40, 2)

# Alternative grids that were tried:
# hparam_haver_var_ary = np.arange(0.0, 2, 0.5)
# hparam_haver_var_ary = [0.0001]
hparam_haver_var_ary = [0.1, 0.0001, 0.5, 1.0, 2.0]

best_param = None        # hparam_haver_var of the best configuration
best_ucb_scale = None    # hparam_ucb_scale of the best configuration
max_reward_mean = -np.inf
for hparam_haver_var in hparam_haver_var_ary:
    print(f"\n-> hparam_haver_var = {hparam_haver_var}")
    args["hparam_haver_var"] = hparam_haver_var
    for hparam_ucb_scale in hparam_ucb_scale_ary:
        print(f"hparam_ucb_scale = {hparam_ucb_scale}")
        args["hparam_ucb_scale"] = hparam_ucb_scale

        start_time = time.time()

        # Start from empty shared lists so results left behind by an
        # interrupted earlier run cannot leak into this configuration.
        ep_reward_list[:] = []
        Q_mcts_list[:] = []

        trial_args = [(i, Q_vit, args) for i in range(args["num_trials"])]
        # The context manager tears the worker processes down once the
        # (blocking) starmap call has returned; without it every grid point
        # would leave a pool of idle workers behind.
        with mp.Pool() as pool:
            pool.starmap(run_trial, trial_args)

        # Snapshot the shared lists into plain lists, then reset them.
        ep_rewards = ep_reward_list[:]
        Q_mcts_trials = Q_mcts_list[:]
        ep_reward_list[:] = []
        Q_mcts_list[:] = []

        reward_mean = np.mean(ep_rewards)
        reward_std = np.std(ep_rewards, ddof=1)
        print(f"reward = {reward_mean:.2f} +/- {reward_std:.2f}")

        if reward_mean > max_reward_mean:
            max_reward_mean = reward_mean
            best_param = hparam_haver_var
            best_ucb_scale = hparam_ucb_scale
            # Keep the Q tables of the new best configuration; a later,
            # better ucb scale for the same haver variance overwrites this.
            Q_mcts_dict[f"{best_param}"] = Q_mcts_trials

        end_time = time.time()
        print(f"it takes {end_time-start_time:0.4f}")

print(
    f"\nbest: hparam_haver_var = {best_param}, "
    f"hparam_ucb_scale = {best_ucb_scale}, "
    f"reward_mean = {max_reward_mean:.2f}"
)


-> hparam_haver_var = 0.1
hparam_ucb_scale = 20
reward = -4.55 +/- 1.28
it takes 27.5323
hparam_ucb_scale = 22
reward = -4.70 +/- 1.13
it takes 28.1025
hparam_ucb_scale = 24
reward = -5.85 +/- 3.50
it takes 43.7254
hparam_ucb_scale = 26
reward = -5.45 +/- 2.98
it takes 40.5863
hparam_ucb_scale = 28
reward = -4.75 +/- 1.37
it takes 29.1379
hparam_ucb_scale = 30
reward = -5.85 +/- 2.83
it takes 39.9605
hparam_ucb_scale = 32
reward = -5.45 +/- 2.82
it takes 36.3366
hparam_ucb_scale = 34
reward = -5.15 +/- 2.37
it takes 41.5318
hparam_ucb_scale = 36
reward = -5.05 +/- 2.04
it takes 33.9615
hparam_ucb_scale = 38
reward = -5.20 +/- 2.63
it takes 41.7741

-> hparam_haver_var = 0.0001
hparam_ucb_scale = 20
reward = -5.60 +/- 2.37
it takes 39.5919
hparam_ucb_scale = 22
reward = -5.55 +/- 2.70
it takes 45.0100
hparam_ucb_scale = 24
reward = -5.45 +/- 2.14
it takes 36.3881
hparam_ucb_scale = 26
reward = -5.55 +/- 4.31
it takes 58.8227
hparam_ucb_scale = 28
reward = -9.85 +/- 21.57
it takes 43.62

In [None]:
# Average the per-trial MCTS Q tables of the best configuration with an
# incremental mean, then print them next to the value-iteration reference
# (left of "|": MCTS average, right of "|": value iteration).
Q_mcts_avg = defaultdict(lambda: np.zeros(simulator.num_actions))
for i_trial, Q_mcts in enumerate(Q_mcts_dict[f"{best_param}"]):
    step = 1/(i_trial+1)  # weight of the newest trial in the running mean
    for s in range(simulator.num_states):
        Q_mcts_avg[s] = (1-step)*Q_mcts_avg[s] + step*Q_mcts[s]

for state in range(simulator.num_states):
    q_avg_row = Q_mcts_avg[state]
    q_vit_row = Q_vit[state]
    print(f"\n-> state = {state}")
    print(f"V[state] = {np.max(q_avg_row):0.4f} | {np.max(q_vit_row):0.4f}")
    for action in range(simulator.num_actions):
        print(f"Q[state][action] = {q_avg_row[action]:0.4f} | {q_vit_row[action]:0.4f}")
    print(f"best_action = {np.argmax(q_avg_row)} | {np.argmax(q_vit_row)}")