In [0]:
import gym
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## Value Iteration

### 初始化环境

In [0]:
environment = gym.make('FrozenLake-v0')

In [3]:
# 打印环境
environment.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
# 查看从一个state到new state的概率
# 从state=6开始(也就是从起点开始), 进行action=1(向下走), 会有可能向左或右走.
# LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3

state = 6
action = 1
environment.P[state][action]

[(0.3333333333333333, 5, 0.0, True),
 (0.3333333333333333, 10, 0.0, False),
 (0.3333333333333333, 7, 0.0, True)]

### Value Iteration

In [0]:
def one_step_lookahead(environment, state, V, discount_factor):
    # Create a vector of dimensionality same as the number of actions
    action_values = np.zeros(environment.nA)

    for action in range(environment.nA):
        # 采取同一个action, 达到不同的state
        for probability, next_state, reward, terminated in environment.P[state][action]:
            if reward == 0:
                reward = -1
            elif reward == 1:
                reward = 10
            action_values[action] += probability * (reward + discount_factor * V[next_state])
    return action_values

In [0]:
def value_iteration(environment, discount_factor=1.0, max_iterations=1e9, theta=1e-9):
    # delta用来判断两次更新之间V的变化
    delta = 0
    # 初始化value
    V = np.zeros(environment.nS)
    # Store the number of policies evaluated
    evaluated_policies = 1
	
    for i in range(int(max_iterations)):	
        # 每一次迭代对所有的state进行更新
        for state in range(environment.nS):
            # 计算这个从这个state出发的每一个action最后可以获得的收益
            action_values = one_step_lookahead(environment, state, V, discount_factor)
            # 获得最好的动作
            best_value = np.max(action_values)
            # 保存一个更新前后最大的差距
            delta = max(delta, abs(V[state] - best_value))
            # 更新V
            V[state] = best_value
		
        evaluated_policies += 1
        if (i+1) % 100 == 0:
            print('='*10)
            print('Steps:{}'.format(evaluated_policies))
            print(V.reshape(4,4))

        # 是否要早停止
        if delta < theta:
            print('Early Stop. Steps:{}'.format(evaluated_policies))
            return V

    print('Steps:{}'.format(evaluated_policies))
    return V

In [0]:
def generate_policy(environment, V, discount_factor):
    """给定V, 生成对应的policy
    """
    policy = np.zeros((environment.nS, environment.nA))
    # 找出每一个state的最佳步骤
    for state in range(environment.nS):
        action_values = one_step_lookahead(environment, state, V, discount_factor)
        # 获得收益最大的那个action
        best_action = np.argmax(action_values)
        # 更新policy
        policy[state][best_action] = 1
    
    return policy

In [57]:
V = value_iteration(environment=environment, discount_factor=0.9, max_iterations=500, theta=1e-1)
print(V.reshape(4,4))

Steps:101
[[-9.24207816 -9.32430612 -9.18133513 -9.38597772]
 [-8.98947819 -9.99973439 -8.765515   -9.99973439]
 [-8.40006607 -7.27737513 -6.70402963 -9.99973439]
 [-9.99973439 -5.82052745 -2.97056562 -9.99973439]]
Steps:201
[[-9.24220004 -9.32443971 -9.18149261 -9.38611946]
 [-8.98960006 -9.99999999 -8.76570972 -9.99999999]
 [-8.40020009 -7.2775335  -6.70420648 -9.99999999]
 [-9.99999999 -5.82070508 -2.97077837 -9.99999999]]
Steps:301
[[ -9.24220005  -9.32443971  -9.18149262  -9.38611946]
 [ -8.98960006 -10.          -8.76570973 -10.        ]
 [ -8.4002001   -7.2775335   -6.70420648 -10.        ]
 [-10.          -5.82070509  -2.97077837 -10.        ]]
Steps:401
[[ -9.24220005  -9.32443971  -9.18149262  -9.38611946]
 [ -8.98960006 -10.          -8.76570973 -10.        ]
 [ -8.4002001   -7.2775335   -6.70420648 -10.        ]
 [-10.          -5.82070509  -2.97077837 -10.        ]]
Steps:501
[[ -9.24220005  -9.32443971  -9.18149262  -9.38611946]
 [ -8.98960006 -10.          -8.76570973 -1

In [58]:
policy= generate_policy(environment, V, discount_factor=0.9)
print(policy)

[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
