In [1]:
from collections import defaultdict
import random
import numpy as np
np.set_printoptions(precision=2, suppress=True)

import time
import copy 
import multiprocess as mp

import gym
from env import FrozenLakeCustom, FrozenLakeSimulator

from mcts_haver import run_mcts_trial
from value_iteration import value_iteration

from config import parse_args
from utils import MultiProcess

import logging
# Grab the root logger and raise its threshold to FATAL so that lower-severity
# records (including the logging.warning(...) diagnostics used later in this
# notebook) are not emitted.
logger = logging.getLogger()
logger.setLevel(logging.FATAL)

In [2]:
# Seed both global RNGs (numpy and the stdlib `random` module) for this process.
np.random.seed(0)
random.seed(0)

# Experiment parameters; indexed like a dict below (map_name, is_slippery,
# render_mode, gamma, vit_thres, ...).
args = parse_args()

# Build the environment and a simulator from the environment's `P` attribute.
# NOTE(review): env_id is not referenced anywhere else in the visible cells.
env_id = "FrozenLake-v1"
env = FrozenLakeCustom(
    map_name=args["map_name"], is_slippery=args["is_slippery"],
    render_mode=args["render_mode"])

simulator = FrozenLakeSimulator(env.P)

# Reference solution from value iteration. V_vit is indexed by state and
# Q_vit by [state][action] (see the logging loop below).
V_vit, Q_vit = value_iteration(
    simulator, args["gamma"], args["vit_thres"])
# Q_vit is handed to each trial explicitly as an argument of run_trial.
        
# Diagnostic dump of the value-iteration result, one record per state/action.
# These go through logging.warning, so they are only shown when the root
# logger level permits WARNING records.
for state in range(simulator.num_states):
    logging.warning(f"\n-> state = {state}")
    logging.warning(f"V[state] = {V_vit[state]:0.4f}")
    for action in range(simulator.num_actions):
        logging.warning(f"Q[state][action] = {Q_vit[state][action]:0.4f}")
    logging.warning(f"best_action={np.argmax(Q_vit[state])}")
    
# Manager-backed list proxies: run_trial appends each trial's reward and Q
# table to these, and the sweep cell reads and clears them between settings.
manager = mp.Manager()
ep_reward_list = manager.list()
Q_mcts_list = manager.list()

def run_trial(i_trial, Q_vit, args):
    """Run a single MCTS trial and record its outcome in the shared lists.

    Args:
        i_trial: Trial index; also used to derive the per-trial RNG seed
            (10000 + i_trial) for both `random` and `numpy.random`.
        Q_vit: Value-iteration Q table, forwarded unchanged to run_mcts_trial.
        args: Mapping of experiment parameters; this function reads
            "map_name", "is_slippery" and "render_mode" and forwards the
            whole mapping to run_mcts_trial.

    Returns:
        The episode reward returned by run_mcts_trial.

    Side effects:
        Appends the episode reward to the module-level `ep_reward_list` and
        the trial's Q table to the module-level `Q_mcts_list`.
    """
    # Each trial gets its own deterministic seed for both global RNGs.
    trial_seed = 10000 + i_trial
    random.seed(trial_seed)
    np.random.seed(trial_seed)

    # Fresh environment and simulator per trial, so trials share no env state.
    trial_env = FrozenLakeCustom(
        map_name=args["map_name"],
        is_slippery=args["is_slippery"],
        render_mode=args["render_mode"],
    )
    trial_simulator = FrozenLakeSimulator(trial_env.P)

    trial_result = run_mcts_trial(
        trial_env, trial_simulator, Q_vit, i_trial, args)
    Q_mcts, ep_reward = trial_result

    # Publish results through the shared lists read by the sweep loop.
    ep_reward_list.append(ep_reward)
    Q_mcts_list.append(Q_mcts)
    return ep_reward

In [3]:
# Hyper-parameter sweep: for every trajectory budget, run `num_trials` MCTS
# trials per UCB scale, report mean reward +/- standard error, remember the
# best scale, and build two LaTeX table fragments (scales <= 8 go to
# res_text1, larger scales to res_text2).

# Overrides applied on top of the parsed arguments for this sweep.
args["update_method"] = "avg"
args["rollout_method"] = ""
args["action_multi"] = 4

print(f"num_trials = {args['num_trials']}")

# Per-trial Q tables keyed by str(hparam_ucb_scale).
# NOTE(review): the key does not include num_trajectories, so every outer
# iteration overwrites the entries of the previous one; after the sweep only
# the last trajectory budget's tables remain. Confirm this is intended.
Q_mcts_dict = defaultdict()

# UCB exploration scales to sweep.
# Alternatives tried previously: np.arange(10, 100, 10), np.arange(20, 64, 4),
# [2**i for i in range(1, 9)].
hparam_ucb_scale_list = [1, 2, 4, 8, 16, 32, 64, 128]

# Number of MCTS trajectories per trial to sweep over.
num_trajectories_list = [200, 500, 1000, 1500, 2000, 2500, 3000]

best_param_list = []         # best UCB scale for each trajectory budget
max_reward_mean_list = []    # mean reward achieved by that best scale
res_text1 = ""               # LaTeX rows for scales <= 8
res_text2 = ""               # LaTeX rows for scales > 8
for num_trajectories in num_trajectories_list:
    print(f"\n-> num_trajectories = {num_trajectories}")
    args["mcts_num_trajectories"] = num_trajectories

    best_param = None
    max_reward_mean = -np.inf
    start_time = time.time()
    res_text1 += f"{num_trajectories} "
    res_text2 += f"{num_trajectories} "
    for hparam_ucb_scale in hparam_ucb_scale_list:
        args["hparam_ucb_scale"] = hparam_ucb_scale

        # Use the pool as a context manager so its worker processes are shut
        # down after every setting. Creating a new Pool per iteration without
        # closing it leaks a full set of workers each time.
        with mp.Pool() as pool:
            pool.starmap(
                run_trial,
                [(i, Q_vit, args) for i in range(args["num_trials"])])

        # run_trial has appended one reward per trial to the shared list.
        reward_mean = np.mean(ep_reward_list)
        # Sample standard deviation needs at least two observations.
        reward_std = np.std(ep_reward_list, ddof=1) if len(ep_reward_list) > 1 else 0
        # Standard error of the mean.
        reward_error = reward_std/np.sqrt(args["num_trials"])
        if hparam_ucb_scale <= 8:
            res_text1 += f"& {reward_mean:0.2f} (\u00B1{reward_error:0.2f}) "
        else:
            res_text2 += f"& {reward_mean:0.2f} (\u00B1{reward_error:0.2f}) "
        print(f"reward = {reward_mean:0.2f} +/- {reward_error:0.2f}")

        # Snapshot the Q tables before the shared list is cleared below.
        Q_mcts_dict[f"{hparam_ucb_scale}"] = copy.deepcopy(Q_mcts_list)

        # Strict '>' keeps the first (smallest) scale on ties.
        if reward_mean > max_reward_mean:
            max_reward_mean = reward_mean
            best_param = hparam_ucb_scale

        # Reset the shared accumulators for the next setting.
        ep_reward_list[:] = []
        Q_mcts_list[:] = []

    # Taken after the inner loop so it is defined even if the scale list is
    # empty; it measures the whole sweep for this trajectory budget.
    end_time = time.time()

    # Close the LaTeX table row. The backslash of \hline is escaped so the
    # literal no longer relies on an invalid escape sequence.
    res_text1 += "\\\\ \n \\hline \n"
    res_text2 += "\\\\ \n \\hline \n"

    print(f"it takes {end_time-start_time:0.4f}")

    max_reward_mean_list.append(max_reward_mean)
    best_param_list.append(best_param)

num_trials = 20

-> num_trajectories = 200
reward = -86.60 +/- 7.96
reward = -86.45 +/- 7.95
reward = -81.65 +/- 8.91
reward = -52.65 +/- 11.16
reward = -33.25 +/- 10.20
reward = -19.50 +/- 7.89
reward = -33.85 +/- 10.16
reward = -34.55 +/- 10.16
it takes 9.9672

-> num_trajectories = 500
reward = -86.50 +/- 7.95
reward = -96.20 +/- 4.85
reward = -71.95 +/- 10.21
reward = -23.40 +/- 8.90
reward = -23.45 +/- 8.92
reward = -38.40 +/- 10.56
reward = -38.20 +/- 10.64
reward = -19.05 +/- 8.03
it takes 20.2670

-> num_trajectories = 1000
reward = -101.00 +/- 0.00
reward = -86.45 +/- 7.95
reward = -71.90 +/- 10.20
reward = -33.10 +/- 10.20
reward = -13.70 +/- 6.68
reward = -13.70 +/- 6.68
reward = -18.80 +/- 7.92
reward = -23.60 +/- 8.96
it takes 38.1422

-> num_trajectories = 1500
reward = -101.05 +/- 0.05
reward = -86.50 +/- 7.95
reward = -52.50 +/- 11.13
reward = -23.40 +/- 8.90
reward = -8.90 +/- 4.85
reward = -8.85 +/- 4.85
reward = -18.60 +/- 7.94
reward = -47.75 +/- 11.05
it takes 61.0

Process ForkPoolWorker-36:
Process ForkPoolWorker-73:
Process ForkPoolWorker-415:
Process ForkPoolWorker-383:
Process ForkPoolWorker-37:
Process ForkPoolWorker-418:
Process ForkPoolWorker-313:
Process ForkPoolWorker-437:
Process ForkPoolWorker-359:
Process ForkPoolWorker-464:
Process ForkPoolWorker-209:
Process ForkPoolWorker-127:
Process ForkPoolWorker-223:
Process ForkPoolWorker-86:
Process ForkPoolWorker-417:
Process ForkPoolWorker-339:
Process ForkPoolWorker-293:
Traceback (most recent call last):
Process ForkPoolWorker-358:
Process ForkPoolWorker-260:
Process ForkPoolWorker-170:
Process ForkPoolWorker-195:
Traceback (most recent call last):
Process ForkPoolWorker-391:
Process ForkPoolWorker-365:
Process ForkPoolWorker-357:
Process ForkPoolWorker-332:
Process ForkPoolWorker-301:
Process ForkPoolWorker-92:
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tnn/miniconda3/envs/tnn1/lib/pyth

KeyboardInterrupt: 

Traceback (most recent call last):
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/queues.py", line 354, in get
    with self._rlock:
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/pool.py", line 110, in worker
    task = get()
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/tnn/miniconda3/envs/tnn1/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootst

In [None]:
# Emit both LaTeX table fragments, one after the other, each followed by a newline.
print(res_text1, res_text2, sep="\n")