### Gridworld

1. 문제: 4x4 격자로 이루어진 판 위에서 규칙에 따라 이동할 때 위치(state) 별로 value function 값을 구해보자.
2. 조건
    - state: 말의 위치
    - action: 상, 하, 좌, 우 (grid를 벗어나는 방향으로 움직이면 말은 제자리에 머문다.)
    - reward: 좌측 상단, 우측 하단에 있을 경우 reward +0(terminal), 나머지 경우 움직임마다 reward -1
3. 학습 목표
    - policy evaluation 
        - Bellman equation
            - 반복 횟수(k, 코드에서는 num_episodes) 별로 value function 확인
            - 무한히 돌린 경우 value function 수렴 여부 확인

In [1]:
from collections import defaultdict
import numpy as np

In [3]:
WORLD_SIZE = 4
UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
L_CORNER = [0, 0]
R_CORNER = [3, 3]
# Misspelled legacy name, kept so any existing reference to it still resolves.
R_CORNET = R_CORNER


class Gridworld:
    """Deterministic 2-D grid MDP with two terminal corner states.

    States are numbered row by row (state ``s`` sits at row ``s // n_cols``,
    column ``s % n_cols``). State ``0`` (top-left) and state ``nS - 1``
    (bottom-right) are terminal.

    Attributes:
        shape: ``(n_rows, n_cols)`` of the grid.
        nA: Number of actions (always 4: UP, DOWN, LEFT, RIGHT).
        nS: Number of states (``n_rows * n_cols``).
        P: Transition table. ``P[s][a]`` is the list
            ``[prob, next_state, reward, done]`` for taking action ``a`` in
            state ``s``. Every transition is deterministic (``prob == 1.0``).
    """

    def __init__(self, shape=(4, 4)):
        """Build the transition table for a grid of the given shape.

        Args:
            shape: Two-element sequence ``(n_rows, n_cols)``.
        """
        n_rows, n_cols = (int(dim) for dim in shape)
        self.shape = (n_rows, n_cols)
        self.nA = 4
        self.nS = n_rows * n_cols

        last_state = self.nS - 1

        P = defaultdict(lambda: [[] for _ in range(self.nA)])
        for s in range(self.nS):
            # Row-major numbering: recover (row, column) from the flat index.
            y, x = divmod(s, n_cols)

            if s == 0 or s == last_state:
                # Terminal states absorb: every action stays put at no cost.
                for action in (UP, DOWN, LEFT, RIGHT):
                    P[s][action] = [1.0, s, 0, True]
                continue

            # One row up/down is a step of n_cols in the flat index; a move
            # that would leave the grid keeps the agent in the same state.
            s_up = s - n_cols if y != 0 else s
            s_down = s + n_cols if y != n_rows - 1 else s
            s_left = s - 1 if x != 0 else s
            s_right = s + 1 if x != n_cols - 1 else s

            P[s][UP] = [1.0, s_up, -1, False]
            P[s][DOWN] = [1.0, s_down, -1, False]
            P[s][LEFT] = [1.0, s_left, -1, False]
            P[s][RIGHT] = [1.0, s_right, -1, False]

        self.P = P

In [4]:
def random_policy():
    """Return the uniform action distribution over the four moves.

    Returns:
        A fresh length-4 ``numpy.ndarray`` whose entries are all 0.25.
    """
    n_actions = 4
    uniform_prob = 1.0 / n_actions
    return np.array([uniform_prob] * n_actions)

### Iterative policy evaluation
아래 식은 policy가 주어졌을 때 value function의 값을 구하는 **Bellman equation** 이다. random policy를 집어넣은 뒤에 문제를 풀어보자.

$$v_{\pi}(s) = \displaystyle \sum_{a} \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a) \big[ r + \gamma v_{\pi}(s') \big], \quad \text{for all } s \in S \quad \text{(Bellman equation for } v_{\pi}\text{)}$$

In [5]:
def policy_evaluation(env, num_episodes, policy, discount_factor=1.0):
    """Run a fixed number of synchronous Bellman expectation sweeps.

    Args:
        env: Environment exposing ``nS`` (number of states) and ``P``, where
            ``P[s][a]`` unpacks to ``(prob, next_state, reward, done)``. If it
            also has a ``shape`` attribute, the result is reshaped to it.
        num_episodes: Number of full sweeps over all states (the ``k`` of
            iterative policy evaluation). ``0`` returns all zeros.
        policy: Sequence of action probabilities indexed by action; the same
            distribution is used in every state.
        discount_factor: Discount applied to the successor state's value.

    Returns:
        ``numpy.ndarray`` of state values, reshaped to ``env.shape`` when
        available and to ``(WORLD_SIZE, WORLD_SIZE)`` otherwise.
    """
    V = np.zeros(env.nS)
    for _ in range(num_episodes):
        # Synchronous backup: every state is updated from the previous sweep.
        old_V = V.copy()
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy):
                prob, next_state, reward, _ = env.P[s][a]
                v += action_prob * prob * (reward + discount_factor * old_V[next_state])
            V[s] = v

    # Prefer the environment's own grid shape; the module constant is only a
    # fallback for environments that do not expose one.
    grid_shape = getattr(env, "shape", None)
    if grid_shape is None:
        grid_shape = (WORLD_SIZE, WORLD_SIZE)
    return V.reshape(tuple(grid_shape))

In [6]:
def check_value_iteration(policy, episodes_list=(0, 1, 2, 3, 10)):
    """Print the value table obtained after each requested number of sweeps.

    Relies on the module-level ``env`` and on ``policy_evaluation`` being the
    variant that takes ``(env, num_episodes, policy)``; call it before that
    name is rebound to the convergence-based version.

    Args:
        policy: Action-probability sequence passed through to
            ``policy_evaluation``.
        episodes_list: Iterable of sweep counts ``k`` to evaluate, in order.
    """
    separator = '=' * 50
    for k in episodes_list:
        print(separator)
        print('k={}'.format(k))
        value = policy_evaluation(env, k, policy)
        print(value)
    print(separator)

In [7]:
# Build the default 4x4 gridworld and the uniform random policy, then print
# the value table after 0, 1, 2, 3 and 10 evaluation sweeps.
# `env` must be a module-level name: check_value_iteration reads it as a global.
env = Gridworld()
policy = random_policy()
check_value_iteration(policy)

k=0
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
k=1
[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]
k=2
[[ 0.   -1.75 -2.   -2.  ]
 [-1.75 -2.   -2.   -2.  ]
 [-2.   -2.   -2.   -1.75]
 [-2.   -2.   -1.75  0.  ]]
k=3
[[ 0.     -2.4375 -2.9375 -3.    ]
 [-2.4375 -2.875  -3.     -2.9375]
 [-2.9375 -3.     -2.875  -2.4375]
 [-3.     -2.9375 -2.4375  0.    ]]
k=10
[[ 0.         -6.13796997 -8.35235596 -8.96731567]
 [-6.13796997 -7.73739624 -8.42782593 -8.35235596]
 [-8.35235596 -8.42782593 -7.73739624 -6.13796997]
 [-8.96731567 -8.35235596 -6.13796997  0.        ]]


아래 코드는 무한히(*는 불가능하므로 한 번의 sweep에서 value function의 최대 변화량이 theta보다 작아질 때까지*) 반복했을 때 true value function 값으로 수렴하는지 확인한다.

In [8]:
def policy_evaluation(env, policy, discount_factor=1.0, theta=1e-5):
    """Sweep the Bellman expectation backup until the values stop changing.

    Args:
        env: Environment exposing ``nS`` (number of states) and ``P``, where
            ``P[s][a]`` unpacks to ``(prob, next_state, reward, done)``. If it
            also has a ``shape`` attribute, the result is reshaped to it.
        policy: Sequence of action probabilities indexed by action; the same
            distribution is used in every state.
        discount_factor: Discount applied to the successor state's value.
        theta: Stopping threshold; iteration ends after the first sweep whose
            largest per-state change is below this value.

    Returns:
        ``numpy.ndarray`` of state values, reshaped to ``env.shape`` when
        available and to ``(WORLD_SIZE, WORLD_SIZE)`` otherwise.
    """
    V = np.zeros(env.nS)
    while True:
        delta = 0
        # Synchronous backup: every state is updated from the previous sweep.
        old_V = V.copy()
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy):
                prob, next_state, reward, _ = env.P[s][a]
                v += action_prob * prob * (reward + discount_factor * old_V[next_state])
            V[s] = v
            # Track the largest change seen in this sweep.
            delta = max(delta, np.abs(old_V[s] - v))
        if delta < theta:
            break

    # Prefer the environment's own grid shape; the module constant is only a
    # fallback for environments that do not expose one.
    grid_shape = getattr(env, "shape", None)
    if grid_shape is None:
        grid_shape = (WORLD_SIZE, WORLD_SIZE)
    return V.reshape(tuple(grid_shape))

In [9]:
# Re-create the default gridworld and uniform random policy, then evaluate
# until the largest per-state change in a sweep drops below the default theta.
env = Gridworld()
policy = random_policy()
value = policy_evaluation(env, policy)

In [10]:
# Display the value table returned by the convergence-based evaluation.
print(value)

[[  0.         -13.99989315 -19.99984167 -21.99982282]
 [-13.99989315 -17.99986052 -19.99984273 -19.99984167]
 [-19.99984167 -19.99984273 -17.99986052 -13.99989315]
 [-21.99982282 -19.99984167 -13.99989315   0.        ]]
