In [1]:
from collections import defaultdict
import random
import numpy as np
np.set_printoptions(precision=2, suppress=True)

import time
import copy 
import multiprocess as mp

import gym
from env import FrozenLakeCustom, FrozenLakeSimulator

from mcts_haver import run_mcts_trial
from value_iteration import value_iteration

from config import parse_args
from utils import MultiProcess

import logging
# Fetch the root logger and raise its threshold to FATAL, so the
# logging.warning(...) diagnostics used further down in this notebook are
# not emitted. Lower the level (e.g. logging.WARNING) to see them.
logger = logging.getLogger()
logger.setLevel(logging.FATAL)

In [2]:
# Seed both global RNGs (NumPy and the stdlib `random` module) in the parent
# process before anything else runs.
np.random.seed(0)
random.seed(0)


# Experiment parameters; `args` is indexed by string keys throughout the notebook.
args = parse_args()

# Split the seeds loaded from disk into three consecutive slices:
# the first `m` for the environments, the next `m` for the simulators and
# everything after index 2*m for MCTS. Each slice is indexed by trial number
# later on, so the file must hold at least 3*m values.
m = args["num_trials"]
random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
env_seeds = random_seeds[:m]
simulator_seeds = random_seeds[m:2*m]
mcts_seeds = random_seeds[2*m:]

# Reference environment and simulator used only to compute the value-iteration
# baseline below. `env_id` is assigned but not read in the visible cells.
env_id = "FrozenLake-v1"
env = FrozenLakeCustom(
    map_name=args["map_name"], is_slippery=args["is_slippery"],
    render_mode=args["render_mode"])

simulator = FrozenLakeSimulator(env.P, simulator_seed=0)

# Value iteration on the simulator; the results are indexed as V_vit[state]
# and Q_vit[state][action] in the loop below.
V_vit, Q_vit = value_iteration(
    simulator, args["gamma"], args["vit_thres"])
# global Q_vit_g = Q_vit
        
# Per-state dump of the value-iteration solution. These go through
# logging.warning, so they are silent while the root logger is set to FATAL.
for state in range(simulator.num_states):
    logging.warning(f"\n-> state = {state}")
    logging.warning(f"V[state] = {V_vit[state]:0.4f}")
    for action in range(simulator.num_actions):
        logging.warning(f"Q[state][action] = {Q_vit[state][action]:0.4f}")
    logging.warning(f"best_action={np.argmax(Q_vit[state])}")
    
        
# Manager-backed lists that `run_trial` appends to from the worker processes.
manager = mp.Manager()
ep_reward_list = manager.list()
Q_mcts_list = manager.list()

def run_trial(i_trial, Q_vit, env_seed, simulator_seed, mcts_seed, args):
    """Run one MCTS trial on a freshly built environment and simulator.

    A new ``FrozenLakeCustom`` environment is created from ``args`` and a
    ``FrozenLakeSimulator`` is built from its transition table ``env.P`` and
    ``simulator_seed``. Both are handed to ``run_mcts_trial`` together with
    the remaining arguments.

    Side effects: the episode reward and the trial's ``Q_mcts`` result are
    appended (in that order) to the module-level ``ep_reward_list`` and
    ``Q_mcts_list``.

    Args:
        i_trial: Trial index, forwarded to ``run_mcts_trial``.
        Q_vit: Value-iteration Q table, forwarded to ``run_mcts_trial``.
        env_seed: Seed forwarded to ``run_mcts_trial`` for the environment.
        simulator_seed: Seed used to construct the simulator.
        mcts_seed: Seed forwarded to ``run_mcts_trial`` for MCTS.
        args: Experiment settings; ``map_name``, ``is_slippery`` and
            ``render_mode`` are read here, the whole object is forwarded.

    Returns:
        The episode reward reported by ``run_mcts_trial``.
    """
    trial_env = FrozenLakeCustom(
        map_name=args["map_name"],
        is_slippery=args["is_slippery"],
        render_mode=args["render_mode"],
    )
    trial_simulator = FrozenLakeSimulator(trial_env.P, simulator_seed)

    trial_result = run_mcts_trial(
        trial_env, trial_simulator, Q_vit, i_trial, env_seed, mcts_seed, args)
    Q_mcts, ep_reward = trial_result

    # Publish the outcome through the shared lists as well as the return value.
    ep_reward_list.append(ep_reward)
    Q_mcts_list.append(Q_mcts)

    return ep_reward

In [3]:
# Hyper-parameter sweep for MCTS with the "haver" update rule.
# For every number of trajectories and every UCB scale, all HAVER std values
# are tried; the best mean reward (with its standard error) is printed and
# appended to the LaTeX table rows res_text1 / res_text2.
args["update_method"] = "haver"
args["rollout_method"] = ""

print(f"num_trials = {args['num_trials']}")

# Outer grid: UCB exploration scale.
# NOTE(review): the output recorded under this cell lists scales
# 32, 64, 128, 256, ... so it was produced with the power-of-two grid
# below, not with the active one -- confirm which grid is intended.
hparam_ucb_scale_list = np.arange(10, 100, 10)
# hparam_ucb_scale_list = [32, 64, 128, 256, 512, 1024]
args["hparam_ucb_scale"] = 64  # placeholder; overwritten inside the sweep

# Inner grid: HAVER standard deviation (squared into a variance below).
hparam_haver_std_list = [1/16, 1/8, 1/4, 1, 4, 8, 16]

num_trajectories_list = [200]
best_param_list = []
max_reward_mean_list = []
res_text1 = ""
res_text2 = ""
for num_trajectories in num_trajectories_list:
    print(f"\n-> num_trajectories = {num_trajectories}")
    args["mcts_num_trajectories"] = num_trajectories

    start_time = time.time()
    res_text1 += f"{num_trajectories} "
    res_text2 += f"{num_trajectories} "
    for hparam_ucb_scale in hparam_ucb_scale_list:

        args["hparam_ucb_scale"] = hparam_ucb_scale
        print(f"\n-> hparam_ucb_scale = {hparam_ucb_scale}")

        max_reward_mean = -np.inf
        best_param = None
        # NaN (not None) so the ':0.2f' formats below cannot raise if no
        # configuration ever improves on -inf.
        max_reward_error = np.nan

        for hparam_haver_std in hparam_haver_std_list:
            args["hparam_haver_var"] = hparam_haver_std**2

            trial_inputs = [
                (i, Q_vit, env_seeds[i], simulator_seeds[i], mcts_seeds[i], args)
                for i in range(args["num_trials"])]
            try:
                # The context manager tears the pool down after every
                # configuration instead of leaking one set of workers per
                # grid point.
                with mp.Pool() as pool:
                    # Use the rewards returned by run_trial (in trial order)
                    # rather than the shared list, so leftovers from an
                    # interrupted run cannot leak into the statistics.
                    trial_rewards = pool.starmap(run_trial, trial_inputs)
            finally:
                # Always reset the shared result lists, even on interrupt.
                ep_reward_list[:] = []
                Q_mcts_list[:] = []

            reward_mean = np.mean(trial_rewards)
            reward_std = np.std(trial_rewards, ddof=1) if len(trial_rewards) > 1 else 0
            reward_error = reward_std/np.sqrt(args["num_trials"])
            print(f"reward = {reward_mean:0.2f} \u00B1 {reward_error:0.2f}")

            if reward_mean > max_reward_mean:
                max_reward_mean = reward_mean
                max_reward_error = reward_error
                best_param = hparam_haver_std

        # Table columns are split across two rows at a UCB scale of 128.
        if hparam_ucb_scale <= 128:
            res_text1 += f"& {max_reward_mean:0.2f} (\u00B1{max_reward_error:0.2f}) "
        else:
            res_text2 += f"& {max_reward_mean:0.2f} (\u00B1{max_reward_error:0.2f}) "

        print(f"max_reward = {max_reward_mean:0.2f} \u00B1 {max_reward_error:0.2f}")
        print(f"best_param = {best_param}")

    # '\\hline' yields the same text as the former '\hline' without relying on
    # an invalid escape sequence.
    res_text1 += "\\\\ \n \\hline \n"
    res_text2 += "\\\\ \n \\hline \n"

    end_time = time.time()
    print(f"it takes {end_time-start_time:0.4f}")

    # NOTE(review): these hold the values for the LAST UCB scale only, not the
    # best across all scales -- confirm that this is what is wanted.
    max_reward_mean_list.append(max_reward_mean)
    best_param_list.append(best_param)

num_trials = 500

-> num_trajectories = 200

-> hparam_ucb_scale = 32
reward = -33.87 ± 2.00
reward = -29.23 ± 1.90
reward = -30.23 ± 1.92
reward = -30.73 ± 1.93
reward = -33.46 ± 1.99
reward = -33.46 ± 1.99
reward = -38.48 ± 2.07
max_reward = -29.23 ± 1.90
best_param = 0.125

-> hparam_ucb_scale = 64
reward = -24.89 ± 1.78
reward = -30.92 ± 1.94
reward = -30.87 ± 1.94
reward = -25.44 ± 1.80
reward = -25.44 ± 1.80
reward = -33.24 ± 1.99
reward = -30.68 ± 1.94
max_reward = -24.89 ± 1.78
best_param = 0.0625

-> hparam_ucb_scale = 128
reward = -24.37 ± 1.76
reward = -26.84 ± 1.84
reward = -27.46 ± 1.86
reward = -24.50 ± 1.77
reward = -28.26 ± 1.88
reward = -31.28 ± 1.95
reward = -31.20 ± 1.95
max_reward = -24.37 ± 1.76
best_param = 0.0625

-> hparam_ucb_scale = 256
reward = -26.37 ± 1.82
reward = -27.68 ± 1.86
reward = -28.02 ± 1.87
reward = -26.28 ± 1.82
reward = -28.65 ± 1.89
reward = -29.83 ± 1.92
reward = -34.69 ± 2.01
max_reward = -26.28 ± 1.82
best_param = 1

-> hparam_ucb_scale = 5

Process ForkPoolWorker-288:
Process ForkPoolWorker-370:
Process ForkPoolWorker-400:
Process ForkPoolWorker-282:
Process ForkPoolWorker-396:
Process ForkPoolWorker-266:
Process ForkPoolWorker-92:
Process ForkPoolWorker-201:
Process ForkPoolWorker-91:
Process ForkPoolWorker-299:
Process ForkPoolWorker-272:
Process ForkPoolWorker-366:
Process ForkPoolWorker-167:
Process ForkPoolWorker-174:
Process ForkPoolWorker-163:
Process ForkPoolWorker-81:
Process ForkPoolWorker-281:
Process ForkPoolWorker-232:
Process ForkPoolWorker-135:
Process ForkPoolWorker-99:
Process ForkPoolWorker-145:
Process ForkPoolWorker-109:
Process ForkPoolWorker-290:
Process ForkPoolWorker-267:
Process ForkPoolWorker-180:
Process ForkPoolWorker-8:
Process ForkPoolWorker-284:
Process ForkPoolWorker-313:
Process ForkPoolWorker-76:
Process ForkPoolWorker-20:
Process ForkPoolWorker-94:
Process ForkPoolWorker-194:
Process ForkPoolWorker-196:
Process ForkPoolWorker-150:
Process ForkPoolWorker-72:
Process ForkPoolWorker-289:
Pr

In [4]:
# Emit both LaTeX table rows, one after the other.
print(res_text1, res_text2, sep="\n")

200 \\ 
 \hline 

200 & -29.23 (±1.90) & -24.89 (±1.78) & -24.37 (±1.76) & -26.28 (±1.82) & -21.49 (±1.66) \\ 
 \hline 



In [3]:
# Hyper-parameter sweep for MCTS with the "haver" update rule.
# For every number of trajectories and every UCB scale, all HAVER std values
# are tried; the best mean reward (with its standard error) is printed and
# appended to the LaTeX table rows res_text1 / res_text2.
args["update_method"] = "haver"
args["rollout_method"] = ""

print(f"num_trials = {args['num_trials']}")

# Outer grid: UCB exploration scale.
hparam_ucb_scale_list = [32, 64, 128, 256, 512, 1024]
args["hparam_ucb_scale"] = 64  # placeholder; overwritten inside the sweep

# Inner grid: HAVER standard deviation (squared into a variance below).
hparam_haver_std_list = [1/16, 1/8, 1/4, 1, 4, 8, 16]

num_trajectories_list = [500]
best_param_list = []
max_reward_mean_list = []
res_text1 = ""
res_text2 = ""
for num_trajectories in num_trajectories_list:
    print(f"\n-> num_trajectories = {num_trajectories}")
    args["mcts_num_trajectories"] = num_trajectories

    start_time = time.time()
    res_text1 += f"{num_trajectories} "
    res_text2 += f"{num_trajectories} "
    for hparam_ucb_scale in hparam_ucb_scale_list:

        args["hparam_ucb_scale"] = hparam_ucb_scale
        print(f"\n-> hparam_ucb_scale = {hparam_ucb_scale}")

        max_reward_mean = -np.inf
        best_param = None
        # NaN (not None) so the ':0.2f' formats below cannot raise if no
        # configuration ever improves on -inf.
        max_reward_error = np.nan

        for hparam_haver_std in hparam_haver_std_list:
            args["hparam_haver_var"] = hparam_haver_std**2

            trial_inputs = [
                (i, Q_vit, env_seeds[i], simulator_seeds[i], mcts_seeds[i], args)
                for i in range(args["num_trials"])]
            try:
                # The context manager tears the pool down after every
                # configuration instead of leaking one set of workers per
                # grid point.
                with mp.Pool() as pool:
                    # Use the rewards returned by run_trial (in trial order)
                    # rather than the shared list, so leftovers from an
                    # interrupted run cannot leak into the statistics.
                    trial_rewards = pool.starmap(run_trial, trial_inputs)
            finally:
                # Always reset the shared result lists, even on interrupt.
                ep_reward_list[:] = []
                Q_mcts_list[:] = []

            reward_mean = np.mean(trial_rewards)
            reward_std = np.std(trial_rewards, ddof=1) if len(trial_rewards) > 1 else 0
            reward_error = reward_std/np.sqrt(args["num_trials"])
            print(f"reward = {reward_mean:0.2f} \u00B1 {reward_error:0.2f}")

            if reward_mean > max_reward_mean:
                max_reward_mean = reward_mean
                max_reward_error = reward_error
                best_param = hparam_haver_std

        # Table columns are split across two rows at a UCB scale of 128.
        if hparam_ucb_scale <= 128:
            res_text1 += f"& {max_reward_mean:0.2f} (\u00B1{max_reward_error:0.2f}) "
        else:
            res_text2 += f"& {max_reward_mean:0.2f} (\u00B1{max_reward_error:0.2f}) "

        print(f"max_reward = {max_reward_mean:0.2f} \u00B1 {max_reward_error:0.2f}")
        print(f"best_param = {best_param}")

    # '\\hline' yields the same text as the former '\hline' without relying on
    # an invalid escape sequence.
    res_text1 += "\\\\ \n \\hline \n"
    res_text2 += "\\\\ \n \\hline \n"

    end_time = time.time()
    print(f"it takes {end_time-start_time:0.4f}")

    # NOTE(review): these hold the values for the LAST UCB scale only, not the
    # best across all scales -- confirm that this is what is wanted.
    max_reward_mean_list.append(max_reward_mean)
    best_param_list.append(best_param)

num_trials = 500

-> num_trajectories = 500

-> hparam_ucb_scale = 32
reward = -19.00 ± 1.57
reward = -20.75 ± 1.64
reward = -18.44 ± 1.54
reward = -20.74 ± 1.64
reward = -21.36 ± 1.66
reward = -17.84 ± 1.51
reward = -23.30 ± 1.73
max_reward = -17.84 ± 1.51
best_param = 8

-> hparam_ucb_scale = 64
reward = -20.69 ± 1.64
reward = -19.14 ± 1.58
reward = -17.20 ± 1.49
reward = -18.76 ± 1.56
reward = -18.75 ± 1.56
reward = -19.92 ± 1.61
reward = -20.70 ± 1.64
max_reward = -17.20 ± 1.49
best_param = 0.25

-> hparam_ucb_scale = 128
reward = -13.14 ± 1.27
reward = -15.65 ± 1.41
reward = -17.00 ± 1.48
reward = -17.59 ± 1.51
reward = -17.98 ± 1.52
reward = -18.17 ± 1.53
reward = -21.09 ± 1.65
max_reward = -13.14 ± 1.27
best_param = 0.0625

-> hparam_ucb_scale = 256
reward = -21.29 ± 1.66
reward = -20.70 ± 1.64
reward = -23.60 ± 1.74
reward = -22.25 ± 1.70
reward = -22.05 ± 1.69
reward = -24.58 ± 1.78
reward = -28.07 ± 1.87
max_reward = -20.70 ± 1.64
best_param = 0.125

-> hparam_ucb_scale = 512

In [4]:
# Emit both LaTeX table rows, one after the other.
print(res_text1, res_text2, sep="\n")

500 & -17.84 (±1.51) & -17.20 (±1.49) & -13.14 (±1.27) \\ 
 \hline 

500 & -20.70 (±1.64) & -17.58 (±1.51) & -14.10 (±1.33) \\ 
 \hline 

