In [34]:
import numpy as np

# Policy Evaluation in grid world

In [35]:
## The grid world is a 4x4 array. Each move is a (row, column) offset: up/down are (-1, 0)/(1, 0) and left/right are (0, -1)/(0, 1).
## A move that would run into a wall leaves the state unchanged.

def get_state(state, action, grid_height=4, grid_width=4):
    """Return the cell reached by taking `action` from `state`.

    Parameters
    ----------
    state : sequence of two ints
        Current cell as (row, column). It is only read; the caller's
        object is never modified.
    action : int
        Index into the move table: 0 = up, 1 = down, 2 = left, 3 = right.
    grid_height, grid_width : int, optional
        Number of rows / columns used for clamping. Both default to 4,
        which reproduces the original fixed 4x4 behaviour.

    Returns
    -------
    tuple
        (row, column) of the next cell. A move that would leave the grid
        is clamped, so the coordinate that hit the wall stays unchanged.
    """
    # (row offset, column offset) for up, down, left, right.
    action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    d_row, d_col = action_grid[action]

    # Work on local values instead of writing back into `state`, so a
    # caller that reuses its state list does not see it silently change.
    row = min(max(state[0] + d_row, 0), grid_height - 1)
    col = min(max(state[1] + d_col, 0), grid_width - 1)

    return row, col

In [36]:
## 벨만 기대 방정식을 토대로 reward는 (-1), 감가율은 1로 기본설정하여 해놓은 함수
## 가치는 소수점 셋째 자리에서 반올림 하였고 10회 itr마다 프린트하였다

def policy_evaluation(grid_width, grid_height, action, policy, iter_num, reward=-1, dis=1, rand=False):
    """Evaluate `policy` on the grid world with repeated Bellman expectation sweeps.

    Each sweep rebuilds the whole value table from the previous one:
    for every non-terminal cell, the new value is the policy-weighted sum of
    `reward + dis * previous_value[next_cell]` over all actions. Values are
    rounded to 3 decimals after every sweep.

    Parameters
    ----------
    grid_width, grid_height : int
        Shape of the value table. Note that `get_state` is called without
        any grid size and the terminal cells are fixed at (0, 0) and (3, 3),
        so only a 4x4 grid is meaningful here.
    action : iterable of int
        Action indices; each one is passed to `get_state` and used to index
        the last axis of `policy`.
    policy : array-like
        `policy[i][j][act]` is the probability of taking `act` in cell (i, j).
    iter_num : int
        Number of sweeps. With 0 the initial table is printed and returned;
        a negative value returns the initial table without printing.
    reward : number, optional
        Reward received on every transition (default -1).
    dis : number, optional
        Discount factor (default 1).
    rand : bool, optional
        If true, the initial table is drawn from a standard normal
        distribution instead of being all zeros.

    Returns
    -------
    numpy.ndarray
        Value table of shape (grid_height, grid_width) after the last sweep.

    Side effects
    ------------
    Prints the table every 10 sweeps up to sweep 100, every 20 sweeps after
    that, and always after the final sweep.
    """
    # table initialize
    if rand:
        post_value_table = np.random.randn(grid_height, grid_width)
    else:
        post_value_table = np.zeros([grid_height, grid_width], dtype=float)

    if iter_num == 0:
        print('Iteration: {} \n{}\n'.format(iter_num, post_value_table))
        return post_value_table

    # Sweeps are numbered from 1 so the counter can be printed directly.
    for sweep in range(1, iter_num + 1):
        # Every cell is assigned below, so the starting contents do not matter.
        next_value_table = np.zeros([grid_height, grid_width], dtype=float)

        for i in range(grid_height):
            for j in range(grid_width):
                value_t = 0
                # Terminal corners (0, 0) and (3, 3) keep a value of 0.
                if not (i == j and i in (0, 3)):
                    for act in action:
                        i_, j_ = get_state([i, j], act)
                        value_t += policy[i][j][act] * (reward + dis * post_value_table[i_][j_])
                next_value_table[i][j] = round(value_t, 3)

        # Print on the schedule, and always for the last sweep. The original
        # guard compared `iteration % 10` with `iter_num`, which dropped the
        # final table whenever iter_num >= 10 was off the schedule.
        print_every = 20 if sweep > 100 else 10
        if sweep % print_every == 0 or sweep == iter_num:
            print('Iteration: {} \n{}\n'.format(sweep, next_value_table))

        post_value_table = next_value_table

    return post_value_table

In [37]:
## 4*4의 grid world에서 랜덤정책으로 가치가 평가가 되는걸 볼 수 있습니다

grid_width = 4
grid_height = grid_width
action = [0, 1, 2, 3] # up, down, left, right

# Uniform random policy: every action is equally likely in every cell
# (0.25 each for the four actions).
policy = np.full([grid_height, grid_width, len(action)], 1.0 / len(action), dtype=float)

# The two terminal corners take no action. The zero vector is sized by the
# number of actions (the policy's last axis), not by the grid width.
policy[0][0] = [0] * len(action)
policy[grid_height - 1][grid_width - 1] = [0] * len(action)

In [38]:
# Evaluate the policy for 100 sweeps, starting from an all-zero value table.
value = policy_evaluation(
    grid_width=grid_width,
    grid_height=grid_height,
    action=action,
    policy=policy,
    iter_num=100,
    rand=False,
)

Iteration: 10 
[[ 0.    -6.138 -8.352 -8.968]
 [-6.138 -7.737 -8.428 -8.352]
 [-8.352 -8.428 -7.737 -6.138]
 [-8.968 -8.352 -6.138  0.   ]]

Iteration: 20 
[[  0.     -9.45  -13.257 -14.454]
 [ -9.45  -12.06  -13.302 -13.257]
 [-13.257 -13.302 -12.06   -9.45 ]
 [-14.454 -13.257  -9.45    0.   ]]

Iteration: 30 
[[  0.    -11.366 -16.096 -17.632]
 [-11.366 -14.562 -16.123 -16.097]
 [-16.096 -16.123 -14.562 -11.366]
 [-17.632 -16.097 -11.366   0.   ]]

Iteration: 40 
[[  0.    -12.475 -17.74  -19.471]
 [-12.475 -16.01  -17.755 -17.74 ]
 [-17.74  -17.755 -16.01  -12.475]
 [-19.471 -17.74  -12.475   0.   ]]

Iteration: 50 
[[  0.    -13.117 -18.691 -20.536]
 [-13.117 -16.847 -18.7   -18.691]
 [-18.691 -18.7   -16.847 -13.117]
 [-20.536 -18.691 -13.117   0.   ]]

Iteration: 60 
[[  0.    -13.489 -19.242 -21.152]
 [-13.489 -17.333 -19.248 -19.242]
 [-19.242 -19.248 -17.333 -13.489]
 [-21.152 -19.242 -13.489   0.   ]]

Iteration: 70 
[[  0.    -13.704 -19.562 -21.51 ]
 [-13.704 -17.614 -19.56

# Policy Improvement in grid world

greedy policy improvement

In [39]:
def policy_improvement(value, action, policy, reward = -1, grid_width = 4):
    """Make `policy` greedy with respect to `value` and report the chosen actions.

    For every non-terminal cell the value of each neighbouring cell (as given
    by `get_state`) is looked up, and the probability mass is split evenly
    among the actions that lead to the highest-valued neighbour.

    Parameters
    ----------
    value : array-like
        State-value table, indexed as `value[row][col]`.
    action : sequence
        Only its length is used; actions are addressed by index 0..len-1
        (Up, Down, Left, Right).
    policy : array-like
        Policy table indexed as `policy[row][col][action]`. It is
        overwritten IN PLACE for every non-terminal cell.
    reward : number, optional
        Unused; kept so existing calls keep working.
    grid_width : int, optional
        Side length of the (square) grid. The terminal cells are fixed at
        (0, 0) and (3, 3).

    Returns
    -------
    The same `policy` object that was passed in, after the update.

    Side effects
    ------------
    Prints the updated policy and a grid of action names, where 'T' marks a
    terminal cell. On ties the first greedy action in index order is shown.
    """
    grid_height = grid_width

    action_match = ['Up', 'Down', 'Left', 'Right']
    action_table = []

    for i in range(grid_height):
        for j in range(grid_width):
            # Terminal corners have no action to choose.
            if i == j and i in (0, 3):
                action_table.append('T')
                continue

            # Value of the cell each action leads to.
            q_func_list = []
            for k in range(len(action)):
                i_, j_ = get_state([i, j], k)
                q_func_list.append(value[i_][j_])

            # Compute the maximum once instead of once per compared element.
            best_q = max(q_func_list)
            max_actions = [k for k, q in enumerate(q_func_list) if q == best_q]

            # update policy: equal probability for every greedy action
            policy[i][j] = [0] * len(action)
            for y in max_actions:
                policy[i][j][y] = (1 / len(max_actions))

            # The first greedy action is what argmax over the new row yields.
            action_table.append(action_match[max_actions[0]])
    action_table = np.asarray(action_table).reshape((grid_height, grid_width))

    print('Updated policy is :\n{}\n'.format(policy))
    print('at each state, chosen action is :\n{}'.format(action_table))

    return policy

In [40]:
# policy_improvement writes into `policy` and returns that same object, so
# `updated_policy` and `policy` refer to the same array after this call.
updated_policy = policy_improvement(value=value, action=action, policy=policy)

Updated policy is :
[[[0.  0.  0.  0. ]
  [0.  0.  1.  0. ]
  [0.  0.  1.  0. ]
  [0.  0.5 0.5 0. ]]

 [[1.  0.  0.  0. ]
  [0.5 0.  0.5 0. ]
  [0.  0.5 0.5 0. ]
  [0.  1.  0.  0. ]]

 [[1.  0.  0.  0. ]
  [0.5 0.  0.  0.5]
  [0.  0.5 0.  0.5]
  [0.  1.  0.  0. ]]

 [[0.5 0.  0.  0.5]
  [0.  0.  0.  1. ]
  [0.  0.  0.  1. ]
  [0.  0.  0.  0. ]]]

at each state, chosen action is :
[['T' 'Left' 'Left' 'Down']
 ['Up' 'Up' 'Down' 'Down']
 ['Up' 'Up' 'Down' 'Down']
 ['Up' 'Right' 'Right' 'T']]


# Value Iteration

In [41]:
def policy_evaluation(grid_width, grid_height, action, policy, iter_num, reward=-1, dis=1):
    """Run value iteration on the grid world (Bellman optimality sweeps).

    Each sweep rebuilds the whole value table from the previous one:
    for every non-terminal cell, the new value is the maximum over all
    actions of `reward + dis * previous_value[next_cell]`.

    NOTE: this definition reuses the name `policy_evaluation`; once its cell
    has run it replaces any earlier function of that name in the kernel.

    Parameters
    ----------
    grid_width, grid_height : int
        Shape of the value table. Note that `get_state` is called without
        any grid size and the terminal cells are fixed at (0, 0) and (3, 3),
        so only a 4x4 grid is meaningful here.
    action : iterable of int
        Action indices; each one is passed to `get_state`.
    policy : any
        Unused (the max over actions replaces the policy); kept so existing
        calls keep working.
    iter_num : int
        Number of sweeps. With 0 the all-zero table is printed and returned;
        a negative value returns the all-zero table without printing.
    reward : number, optional
        Reward received on every transition (default -1).
    dis : number, optional
        Discount factor (default 1).

    Returns
    -------
    numpy.ndarray
        Value table of shape (grid_height, grid_width) after the last sweep.

    Side effects
    ------------
    Prints the table every 10 sweeps up to sweep 100, every 20 sweeps after
    that, and always after the final sweep.
    """
    # table initialize
    post_value_table = np.zeros([grid_height, grid_width], dtype=float)

    if iter_num == 0:
        print('Iteration: {} \n{}\n'.format(iter_num, post_value_table))
        return post_value_table

    # Sweeps are numbered from 1 so the counter can be printed directly.
    for sweep in range(1, iter_num + 1):
        next_value_table = np.zeros([grid_height, grid_width], dtype=float)

        for i in range(grid_height):
            for j in range(grid_width):
                # Terminal corners (0, 0) and (3, 3) keep their initial 0.
                if i == j and i in (0, 3):
                    continue
                value_t_list = []
                for act in action:
                    i_, j_ = get_state([i, j], act)
                    value_t_list.append(reward + dis * post_value_table[i_][j_])
                next_value_table[i][j] = max(value_t_list)

        # Print on the schedule, and always for the last sweep. The original
        # guard compared `iteration % 10` with `iter_num`, which dropped the
        # final table whenever iter_num >= 10 was off the schedule.
        print_every = 20 if sweep > 100 else 10
        if sweep % print_every == 0 or sweep == iter_num:
            print('Iteration: {} \n{}\n'.format(sweep, next_value_table))

        post_value_table = next_value_table

    return post_value_table

In [42]:
grid_width = 4
grid_height = grid_width
action = [0, 1, 2, 3] # up, down, left, right

# Rebuild the uniform random policy from scratch: the earlier
# policy_improvement call overwrote `policy` in place.
policy = np.full([grid_height, grid_width, len(action)], 1.0 / len(action), dtype=float)

# The two terminal corners take no action. The zero vector is sized by the
# number of actions (the policy's last axis), not by the grid width.
policy[0][0] = [0] * len(action)
policy[grid_height - 1][grid_width - 1] = [0] * len(action)

In [43]:
# Run 1, 2, 3 and 10 sweeps, each starting again from the zero table, to show
# how the values develop. Only the last (10-sweep) result is kept in `value`;
# the comprehension keeps its loop variable out of the notebook namespace.
value = [
    policy_evaluation(grid_width, grid_height, action, policy, iter_num=n_sweeps)
    for n_sweeps in (1, 2, 3, 10)
][-1]

Iteration: 1 
[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]

Iteration: 2 
[[ 0. -1. -2. -2.]
 [-1. -2. -2. -2.]
 [-2. -2. -2. -1.]
 [-2. -2. -1.  0.]]

Iteration: 3 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 10 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]



In [44]:
# Run 100 sweeps from the zero table; the table is printed every 10 sweeps.
value = policy_evaluation(
    grid_width=grid_width,
    grid_height=grid_height,
    action=action,
    policy=policy,
    iter_num=100,
)

Iteration: 10 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 20 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 30 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 40 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 50 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 60 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 70 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 80 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 90 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Iteration: 100 
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

