In [1]:
import gym
import numpy as np
import time
from prettytable import PrettyTable
from IPython import display

In [2]:
def play(env, policy, render=False):
    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    while not done:
        action = policy[state]
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            if not done:
                display.clear_output(wait=True)
        state = next_state

    return (total_reward, steps)

In [3]:
def play_multiple_times(env, policy, max_episodes):
    num_of_success = 0
    succeed_steps = []
    succeed_rewards = [] 
    for i in range(max_episodes):
        total_reward, steps = play(env, policy)
        if total_reward > 0:
            succeed_rewards.append(total_reward)
            num_of_success += 1
            succeed_steps.append(steps)

    success = f'{num_of_success}/{max_episodes}'
    mean_succeed_steps = None
    mean_succeed_reward = None
    if len(succeed_steps) > 0:
      mean_succeed_reward = np.mean(succeed_rewards)
      mean_succeed_steps = np.mean(succeed_steps)

    return success, mean_succeed_reward, mean_succeed_steps

In [4]:
def policy_extraction(env, v_values, gamma):
    policy = np.zeros(env.observation_space.n)

    for state in range(env.observation_space.n):
        q_values = []
        for action in range(env.action_space.n):
            q_value = 0
            for prob, next_state, reward, done in env.P[state][action]:
                q_value += prob * (reward + gamma * v_values[next_state])
            
            q_values.append(q_value)
        
        policy[state] = np.argmax(q_values)
    
    return policy

In [5]:
def value_iteration(env, max_iters, gamma):
    # v_values = np.zeros(env.observation_space.n)
    v_values = np.random.rand(env.observation_space.n)

    for i in range(max_iters):
        prev_v_values = np.copy(v_values)

        for state in range(env.observation_space.n):
            q_values = []
            
            for action in range(env.action_space.n):
                q_value = 0
                for prob, next_state, reward, done in env.P[state][action]:
                    q_value += prob * (reward + gamma * prev_v_values[next_state])
                
                q_values.append(q_value)
            
            v_values[state] = max(q_values)
        
        if np.all(np.isclose(v_values, prev_v_values)):
            converge_at = i
            break
    
    policy = policy_extraction(env, v_values, gamma)
    return policy, converge_at

In [6]:
def policy_iteration(env, max_iters, gamma):
    # policy = np.zeros(env.observation_space.n)
    policy = np.random.randint(env.action_space.n, size=env.observation_space.n)
    # v_values = np.zeros(env.observation_space.n)
    v_values = np.random.rand(env.observation_space.n)

    equal_before = False
    converge_at = None

    for i in range(max_iters):
        prev_v_values = np.copy(v_values)
        prev_policy = policy

        for state in range(env.observation_space.n):
            action = prev_policy[state]
            v_values[state] = 0
            for prob, next_state, reward, done in env.P[state][action]:
                v_values[state] += prob * (reward + gamma * prev_v_values[next_state])
            
        policy = policy_extraction(env, v_values, gamma)
        
        if np.array_equal(policy, prev_policy):
          if equal_before:
            converge_at = i
            break
          else:
            equal_before = True
    
    return policy, converge_at

In [7]:
def compare(toy_game, max_episodes, max_iters, gamma):
  def run_algo(algo):
    nonlocal env, table, row
    
    start = time.time()
    policy, converge_at = algo(env, max_iters, gamma)
    end = time.time()
    learning_seconds = end - start

    row += [learning_seconds, converge_at] + list(play_multiple_times(env, policy, max_episodes))
    for i in range(len(row)):
      if type(row[i]) is float or type(row[i]) is np.float64:
        row[i] = round(row[i], 4)
    table.add_row(row)

  env = gym.make(toy_game)
  table = PrettyTable(['algo', 'learning_seconds', 'converge_at', 'success', 'mean_succeed_reward', 'mean_succeed_steps'])

  row = ['Value Iteration']
  run_algo(value_iteration)

  row = ['Policy Iteration']
  run_algo(policy_iteration)

  print(f'{toy_game}: {env.observation_space.n}x{env.action_space.n}')
  print(table)

In [15]:
# Hyperparameters
max_iters = 500
gamma = 0.9

max_episodes = 1000

toy_games = ["FrozenLake-v0", "FrozenLake8x8-v0", "Taxi-v3"]

for toy_game in toy_games:
  compare(toy_game, max_episodes, max_iters, gamma)
  print()

FrozenLake-v0: 16x4
+------------------+------------------+-------------+----------+---------------------+--------------------+
|       algo       | learning_seconds | converge_at | success  | mean_succeed_reward | mean_succeed_steps |
+------------------+------------------+-------------+----------+---------------------+--------------------+
| Value Iteration  |      0.0324      |     152     | 751/1000 |         1.0         |      37.1039       |
| Policy Iteration |      0.0011      |      2      | 369/1000 |         1.0         |      29.7263       |
+------------------+------------------+-------------+----------+---------------------+--------------------+

FrozenLake8x8-v0: 64x4
+------------------+------------------+-------------+----------+---------------------+--------------------+
|       algo       | learning_seconds | converge_at | success  | mean_succeed_reward | mean_succeed_steps |
+------------------+------------------+-------------+----------+---------------------+------

In [10]:
# Nhận xét
  # Policy Iteration hội tụ sớm hơn Value Iteration
  # Ở các ván thành công, Policy Iteration THƯỜNG cho số bước ít hơn Value Iteration
  # Khả năng hoàn thành ván chơi thành công cửa Policy Iteration ít hơn Value Iteration

In [None]:
# FrozenLake-v0: 16x4
# +------------------+------------------+-------------+----------+---------------------+--------------------+
# |       algo       | learning_seconds | converge_at | success  | mean_succeed_reward | mean_succeed_steps |
# +------------------+------------------+-------------+----------+---------------------+--------------------+
# | Value Iteration  |      0.0324      |     152     | 751/1000 |         1.0         |      37.1039       |
# | Policy Iteration |      0.0011      |      2      | 369/1000 |         1.0         |      29.7263       |
# +------------------+------------------+-------------+----------+---------------------+--------------------+

# FrozenLake8x8-v0: 64x4
# +------------------+------------------+-------------+----------+---------------------+--------------------+
# |       algo       | learning_seconds | converge_at | success  | mean_succeed_reward | mean_succeed_steps |
# +------------------+------------------+-------------+----------+---------------------+--------------------+
# | Value Iteration  |      0.1263      |     153     | 743/1000 |         1.0         |      71.5007       |
# | Policy Iteration |       0.05       |      24     | 625/1000 |         1.0         |      69.3872       |
# +------------------+------------------+-------------+----------+---------------------+--------------------+

# Taxi-v3: 500x6
# +------------------+------------------+-------------+-----------+---------------------+--------------------+
# |       algo       | learning_seconds | converge_at |  success  | mean_succeed_reward | mean_succeed_steps |
# +------------------+------------------+-------------+-----------+---------------------+--------------------+
# | Value Iteration  |      0.6865      |     116     | 1000/1000 |         7.95        |       13.05        |
# | Policy Iteration |      0.2682      |      19     | 1000/1000 |        8.029        |       12.971       |
# +------------------+------------------+-------------+-----------+---------------------+--------------------+
