In [1]:
import numpy as np

In [2]:
'''
####################### GRIDWORLD ###########################
    GridWorld like:

    T o o ..........o
    o o o ..........o
    . .             .        
    . .             .  
    . .             o
    . .             o
    o o o ......o o T
Actions: 
    UP (0)
    DOWN (1)
    RIGHT (2)
    LEFT (3)

Rewards: 
     0 for any action whose next state is a Terminal state
    -1 for all other actions in any state

Note: An action that would move out of the grid leaves the state unchanged (the -1 reward is still given)
'''
def env(state, action, *, n_rows=None, n_cols=None):
    """Deterministic one-step transition model of the gridworld.

    States are numbered row by row from 0 to ``n_rows * n_cols - 1``; the
    first and the last state are terminal. Taking any action in a terminal
    state keeps the agent there.

    Args:
        state: Index of the current state.
        action: 0 = UP, 1 = DOWN, 2 = RIGHT, 3 = LEFT.
        n_rows: Number of grid rows. Defaults to the notebook-level ``rows``.
        n_cols: Number of grid columns. Defaults to the notebook-level
            ``columns``.

    Returns:
        list: ``[prob, next_state, reward, isdone]``. ``prob`` is always 1
        because the environment is deterministic, ``reward`` is 0 when
        ``next_state`` is terminal and -1 otherwise, and ``isdone`` tells
        whether ``next_state`` is terminal.

    Raises:
        ValueError: If ``state`` is not terminal and ``action`` is not one of
            0, 1, 2, 3.
    """
    # Fall back to the notebook configuration when no explicit size is given.
    if n_rows is None:
        n_rows = rows
    if n_cols is None:
        n_cols = columns
    num_states = n_rows * n_cols

    def isdone(s):
        return s == 0 or s == (num_states - 1)

    if isdone(state):
        # Terminal states are absorbing; the action is ignored.
        next_state = state
    elif action == 0:
        # UP: blocked on the first row.
        next_state = state - n_cols if state - n_cols >= 0 else state
    elif action == 1:
        # DOWN: blocked on the last row.
        next_state = state + n_cols if state + n_cols < num_states else state
    elif action == 2:
        # RIGHT: blocked when the state sits in the last column.
        next_state = state + 1 if (state + 1) % n_cols else state
    elif action == 3:
        # LEFT: blocked when the state sits in the first column.
        next_state = state - 1 if state % n_cols else state
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            f"action must be 0 (UP), 1 (DOWN), 2 (RIGHT) or 3 (LEFT), got {action!r}"
        )

    done = isdone(next_state)
    reward = 0 if done else -1
    # State transition probability is 1 because the environment is deterministic.
    return [1, next_state, reward, done]

In [3]:
def td_pred(policy, flag):
    """Estimate the state-value function with tabular TD(0).

    Reads the notebook-level names ``num_states``, ``num_actions``,
    ``episodes``, ``alpha`` and ``gamma`` and steps through ``env``.

    Args:
        policy: Per-state action preferences, indexed by state. It is only
            consulted when ``flag`` is truthy.
        flag: Truthy -> take the argmax action of ``policy[state]``.
            Falsy -> draw an action uniformly from ``range(num_actions)``;
            ``policy`` is not read in that case.

    Returns:
        numpy.ndarray: One value estimate per state (length ``num_states``).
    """
    values = np.zeros(num_states)
    for _ in range(episodes):
        # Each episode starts in a uniformly drawn state (possibly terminal).
        state = np.random.randint(0, num_states)
        finished = False
        while not finished:
            if flag:
                action = np.argmax(policy[state])
            else:
                action = np.random.randint(0, num_actions)
            # env returns [transition probability, next state, reward, done];
            # the probability is not needed for the update.
            _prob, successor, reward, finished = env(state, action)
            # TD(0): move the estimate a fraction alpha towards the bootstrapped target.
            td_target = reward + gamma * values[successor]
            values[state] += alpha * (td_target - values[state])
            state = successor
    return values

In [4]:
# --- Grid layout ---
rows = 5
columns = 7
num_states = rows * columns
num_actions = 4

# --- TD(0) hyper-parameters ---
alpha = 0.1         # Learning rate
gamma = 0.99        # Discount factor
episodes = 100_000  # Number of games played

# UNIFORM RANDOM POLICY: every action has probability 1 / num_actions in every state
rand_policy = np.full((num_states, num_actions), 1 / num_actions)

# GREEDY DETERMINISTIC POLICY, stored as one one-hot row per state.
# The tuple below lists the chosen action for each state, one grid row per line
# (0 = UP, 1 = DOWN, 2 = RIGHT, 3 = LEFT).
deter_policy = [
    [int(slot == chosen) for slot in range(num_actions)]
    for chosen in (
        0, 3, 3, 3, 3, 1, 1,
        0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 1, 1, 1, 1,
        0, 0, 2, 2, 2, 2, 0,
    )
]

In [5]:
# Evaluate the uniform random policy with TD(0).
# Flag is 0 for random policy and 1 for deterministic policy:
# with flag = 0 td_pred draws every action uniformly at random,
# with flag = 1 it follows the argmax action of the policy passed in.
VF = td_pred(rand_policy, flag = 0)
# Show the estimates laid out like the grid (one printed row per grid row).
print(f'Value Function for Uniform Random policy:\n {VF.reshape(rows, columns)}')

Value Function for Uniform Random policy:
 [[  0.         -26.70110685 -33.72970448 -37.31734068 -38.96098171
  -39.49996922 -39.46662963]
 [-18.63437237 -29.20014148 -35.36728293 -37.96337221 -38.22157014
  -37.24253909 -36.6876679 ]
 [-27.70288602 -34.23803591 -36.69153357 -37.53295002 -37.06745825
  -33.57396437 -29.4488588 ]
 [-35.63237252 -37.25021585 -38.13534024 -36.12940037 -33.878469
  -23.91097296 -21.03887742]
 [-38.24304362 -38.56195133 -38.35172435 -35.86120631 -27.5404141
  -20.5893415    0.        ]]


In [6]:
# Evaluate the deterministic policy: flag = 1 makes td_pred take the argmax
# action of deter_policy in every state. Note that this rebinds VF.
VF = td_pred(deter_policy, flag = 1)
# Show the estimates laid out like the grid (one printed row per grid row).
print(f'Value Function for deterministic greedy policy:\n {VF.reshape(rows, columns)}')

Value Function for deterministic greedy policy:
 [[ 0.        0.       -1.       -1.99     -2.9701   -3.940399 -2.9701  ]
 [ 0.       -1.       -1.99     -2.9701   -3.940399 -2.9701   -1.99    ]
 [-1.       -1.99     -2.9701   -3.940399 -2.9701   -1.99     -1.      ]
 [-1.99     -2.9701   -3.940399 -2.9701   -1.99     -1.        0.      ]
 [-2.9701   -3.940399 -2.9701   -1.99     -1.        0.        0.      ]]
