In [1]:
from collections import defaultdict
import random
import numpy as np
np.set_printoptions(precision=2, suppress=True)

import time
import copy 
import multiprocess as mp

import gym
from env import FrozenLakeCustom, FrozenLakeSimulator

from mcts_haver import run_mcts_trial
from value_iteration import value_iteration

from config import parse_args

import logging
# Raise the root logger's threshold to FATAL so that lower-severity records
# (including the logging.warning calls made in this notebook) are not emitted.
logger = logging.getLogger()
logger.setLevel(logging.FATAL)

In [2]:
# Seed both random number generators so this setup cell is repeatable.
random.seed(0)
np.random.seed(0)

# Experiment parameters: start from parse_args() and override two entries.
args = parse_args()
args["update_method"] = "avg"
args["rollout_method"] = ""

# Environment, plus a simulator built from the environment's `P` attribute.
env_id = "FrozenLake-v1"
env = FrozenLakeCustom(
    map_name=args["map_name"],
    is_slippery=args["is_slippery"],
    render_mode=args["render_mode"],
)
simulator = FrozenLakeSimulator(env.P)

# Value-iteration results; Q_vit is handed to every MCTS trial and is the
# reference the averaged MCTS estimates are compared against later on.
V_vit, Q_vit = value_iteration(
    simulator,
    args["gamma"],
    args["vit_thres"],
)
        
# Log the value-iteration result state by state: V, every Q entry, and the
# greedy action.
for state in range(simulator.num_states):
    q_vit_row = Q_vit[state]
    logging.warning(f"\n-> state = {state}")
    logging.warning(f"V[state] = {V_vit[state]:0.4f}")
    for action in range(simulator.num_actions):
        logging.warning(f"Q[state][action] = {q_vit_row[action]:0.4f}")
    logging.warning(f"best_action={np.argmax(q_vit_row)}")
    
# Manager-backed lists used to collect results from the pool workers:
# run_trial appends each trial's episode reward and MCTS Q table to them.
manager = mp.Manager()
ep_reward_list = manager.list()
Q_mcts_list = manager.list()

def run_trial(i_trial, Q_vit, args):
    """Run one MCTS trial and record its outcome in the shared result lists.

    Args:
        i_trial: Index of the trial; also offsets the RNG seed (10000 + i_trial).
        Q_vit: Value-iteration Q table, forwarded to run_mcts_trial.
        args: Parameter mapping; "map_name", "is_slippery" and "render_mode"
            are read here, and the whole mapping is forwarded to run_mcts_trial.

    Returns:
        None. The episode reward is appended to the module-level
        ep_reward_list and the MCTS Q table to the module-level Q_mcts_list.
    """
    # Give every trial its own seed for both RNGs.
    trial_seed = 10000+i_trial
    random.seed(trial_seed)
    np.random.seed(trial_seed)

    # Each trial builds its own environment and simulator.
    trial_env = FrozenLakeCustom(
        map_name=args["map_name"],
        is_slippery=args["is_slippery"],
        render_mode=args["render_mode"],
    )
    trial_simulator = FrozenLakeSimulator(trial_env.P)

    trial_result = run_mcts_trial(trial_env, trial_simulator, Q_vit, i_trial, args)
    Q_mcts, ep_reward = trial_result

    ep_reward_list.append(ep_reward)
    Q_mcts_list.append(Q_mcts)

In [3]:
print(f"num_trials = {args['num_trials']}")
print(f"mcts_max_its = {args['mcts_max_iterations']}")

# Snapshot of every trial's MCTS Q table for each swept value, keyed by
# str(hparam_ucb_scale). A plain dict suffices; no default factory is needed.
Q_mcts_dict = {}

# Values of hparam_ucb_scale to sweep.
hparam_ucb_scale_ary = [1, 2, 4, 8, 16, 32, 64, 128]
best_param = None
max_reward_mean = -np.inf
for hparam_ucb_scale in hparam_ucb_scale_ary:
    start_time = time.time()

    print(f"\n-> hparam_ucb_scale = {hparam_ucb_scale}")
    args["hparam_ucb_scale"] = hparam_ucb_scale

    # Start every setting from empty shared lists. If a previous run of this
    # cell was interrupted mid-sweep, its partial results would otherwise be
    # mixed into the statistics of the first setting of the next run.
    ep_reward_list[:] = []
    Q_mcts_list[:] = []

    # The context manager shuts the pool down once starmap has returned, so
    # worker processes do not pile up across sweep iterations.
    with mp.Pool() as pool:
        pool.starmap(run_trial, [(i, Q_vit, args) for i in range(args["num_trials"])])

    # Pull the results out of the manager once, as plain lists, instead of
    # letting numpy read the proxy element by element.
    ep_rewards = ep_reward_list[:]
    reward_mean = np.mean(ep_rewards)
    reward_std = np.std(ep_rewards, ddof=1)
    print(f"reward = {reward_mean:.2f} +/- {reward_std:.2f}")

    # The slice is a copy fetched from the manager, so clearing the shared
    # list below does not affect the stored snapshot.
    Q_mcts_dict[f"{hparam_ucb_scale}"] = Q_mcts_list[:]

    # Strict comparison: on ties the earliest swept value is kept.
    if reward_mean > max_reward_mean:
        max_reward_mean = reward_mean
        best_param = hparam_ucb_scale

    ep_reward_list[:] = []
    Q_mcts_list[:] = []

    end_time = time.time()
    print(f"it takes {end_time-start_time:0.4f}")

num_trials = 20
mcts_max_its = 2000

-> hparam_ucb_scale = 1
reward = -10.00 +/- 0.00
it takes 24.9561

-> hparam_ucb_scale = 2
reward = -10.00 +/- 0.00
it takes 24.7511

-> hparam_ucb_scale = 4
reward = -10.00 +/- 0.00
it takes 24.9273

-> hparam_ucb_scale = 8
reward = -10.00 +/- 0.00
it takes 28.0737

-> hparam_ucb_scale = 16
reward = -10.00 +/- 0.00
it takes 25.1813

-> hparam_ucb_scale = 32
reward = -10.00 +/- 0.00
it takes 24.6962

-> hparam_ucb_scale = 64


Process ForkPoolWorker-63:
Process ForkPoolWorker-41:
Process ForkPoolWorker-18:
Process ForkPoolWorker-53:
Process ForkPoolWorker-59:
Process ForkPoolWorker-48:
Process ForkPoolWorker-25:
Process ForkPoolWorker-76:
Process ForkPoolWorker-14:
Process ForkPoolWorker-37:
Process ForkPoolWorker-20:
Process ForkPoolWorker-82:
Process ForkPoolWorker-13:
Process ForkPoolWorker-22:
Process ForkPoolWorker-16:
Process ForkPoolWorker-50:
Process ForkPoolWorker-69:
Process ForkPoolWorker-60:
Process ForkPoolWorker-2:
Process ForkPoolWorker-24:
Process ForkPoolWorker-28:
Process ForkPoolWorker-12:
Process ForkPoolWorker-40:
Process ForkPoolWorker-55:
Process ForkPoolWorker-9:
Process ForkPoolWorker-43:
Process ForkPoolWorker-49:
Process ForkPoolWorker-21:
Process ForkPoolWorker-7:
Process ForkPoolWorker-38:
Process ForkPoolWorker-15:
Process ForkPoolWorker-27:
Process ForkPoolWorker-11:
Process ForkPoolWorker-47:
Process ForkPoolWorker-26:
Process ForkPoolWorker-51:
Process ForkPoolWorker-19:
Proc

KeyboardInterrupt: 

  File "/data/tnn/10_School/02_ResearchProjects/01_MaxEstimator/01_main_codes/31_mcts_v11/MCTS-Haver/mcts_haver.py", line 440, in run_mcts_trial
    action = mcts.run(state)
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
    return self._semlock.__enter__()
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/connection.py", line 382, in _recv
    chunk = read(handle, remaining)
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/data/tnn/10_School/02_ResearchProjects/01_MaxEstimator/01_main_codes/31_mcts_v11/MCTS-Haver/mcts_haver.py", line 440, in run_mcts_trial
    action = mcts.run(state)
  File "/data/tnn/10_School/02_ResearchProjects/01_MaxEstimator/01_main_codes/31_mcts_v11/MCTS-Haver/mcts_haver.py"

In [None]:
# Running mean of the per-trial MCTS Q tables recorded for the best swept
# value: after the n-th trial each row is (1 - 1/n) * previous + (1/n) * new.
Q_mcts_avg = defaultdict(lambda: np.zeros(simulator.num_actions))
for n_seen, trial_Q in enumerate(Q_mcts_dict[f"{best_param}"], start=1):
    weight = 1/n_seen
    for s in range(simulator.num_states):
        Q_mcts_avg[s] = (1-weight)*Q_mcts_avg[s] + weight*trial_Q[s]

# Print the averaged MCTS estimate next to the value-iteration reference
# (left | right) for every state.
for state in range(simulator.num_states):
    q_avg_row = Q_mcts_avg[state]
    q_vit_row = Q_vit[state]
    print(f"\n-> state = {state}")
    print(f"V[state] = {np.max(q_avg_row):0.4f} | {np.max(q_vit_row):0.4f}")
    for action in range(simulator.num_actions):
        print(f"Q[state][action] = {q_avg_row[action]:0.4f} | {q_vit_row[action]:0.4f}")
    print(f"best_action = {np.argmax(q_avg_row)} | {np.argmax(q_vit_row)}")