In [1]:
import numpy as np

In [2]:
''' 
####################CLIFF WALKING ENVIRONMENT#########################

A schematic view of the environment-

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
S  x  x  x  x  x  x  x  x  x  x  T

Actions: 
    UP (0)
    DOWN (1)
    RIGHT (2)
    LEFT (3)

Rewards: 
     0 for entering the Terminal state
    -100 for falling into the cliff
    -1 for all other actions in any state

Note: A move that would leave the grid keeps the agent in the same state (a reward of -1 is still given).
      Falling into the cliff ends the episode; the next episode begins again from the start state.

'''
# Row-major cell indices. With the 4 x 12 grid configured in this notebook the
# start and the goal are the two ends of the bottom row; every index strictly
# between them is a cliff cell.
START_STATE = 36
TERMINAL_STATE = 47


def reward(state):
    """Return the immediate reward earned by arriving in ``state``.

    * 0 when ``state`` is the terminal state,
    * -100 when ``state`` is a cliff cell (strictly between ``START_STATE``
      and ``TERMINAL_STATE``),
    * -1 for every other state.
    """
    if state == TERMINAL_STATE:
        return 0
    if START_STATE < state < TERMINAL_STATE:
        return -100
    return -1

def env(state, action):
    """Simulate one deterministic step of the cliff-walking grid world.

    Reads the module-level ``rows``, ``columns``, ``START_STATE`` and
    ``TERMINAL_STATE`` and delegates the reward computation to ``reward``.

    Parameters
    ----------
    state : int
        Row-major index of the cell the agent currently occupies.
    action : int
        0 = UP, 1 = DOWN, 2 = RIGHT, 3 = LEFT.

    Returns
    -------
    list
        ``[prob, next_state, reward, isdone]`` where ``prob`` is always 1
        (the environment is deterministic), ``reward`` is
        ``reward(next_state)`` and ``isdone`` tells whether ``next_state``
        is a cliff cell or the terminal cell.

    Raises
    ------
    ValueError
        If ``state`` is not already final and ``action`` is not 0, 1, 2 or 3.
    """
    num_states = rows * columns

    def is_final(cell):
        # Cliff cells and the terminal cell both end the episode.
        return cell > START_STATE and cell <= TERMINAL_STATE

    if is_final(state):
        # Final states are absorbing: the action is ignored.
        next_state = state
    elif action == 0:
        # UP: stay put when already in the top row.
        next_state = state - columns if state - columns >= 0 else state
    elif action == 1:
        # DOWN: stay put when already in the bottom row.
        next_state = state + columns if state + columns < num_states else state
    elif action == 2:
        # RIGHT: stay put when in the last column of a row.
        next_state = state + 1 if (state + 1) % columns else state
    elif action == 3:
        # LEFT: stay put when in the first column of a row.
        next_state = state - 1 if state % columns else state
    else:
        # Previously this fell through and failed with an opaque
        # UnboundLocalError on ``next_state``.
        raise ValueError(
            f'Unknown action {action!r}; expected 0 (UP), 1 (DOWN), 2 (RIGHT) or 3 (LEFT)'
        )

    # State transition probability is 1 because the environment is deterministic.
    return [1, next_state, reward(next_state), is_final(next_state)]

In [3]:
# --- Grid geometry ---
rows = 4
columns = 12
num_states = columns * rows  # One state per grid cell
num_actions = 4  # UP, DOWN, RIGHT, LEFT

# --- Learning settings ---
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration probability of the epsilon-greedy policy
gamma = 1  # Discount factor
episodes = 100_000  # Number of games played

In [4]:
def qlearning(seed=None):
    """Learn an action-value table for the cliff-walking task with Q-learning.

    Reads the module-level ``num_states``, ``num_actions``, ``episodes``,
    ``epsilon``, ``alpha``, ``gamma`` and ``START_STATE`` and steps the
    environment through ``env``.

    Parameters
    ----------
    seed : int or None, optional
        When given, all exploration draws come from a private
        ``np.random.RandomState(seed)`` so the run is reproducible.  When
        ``None`` (the default) NumPy's global generator is used, exactly as
        before this parameter existed.

    Returns
    -------
    numpy.ndarray
        The learned Q table, shape ``(num_states, num_actions)``.
    """
    # ``np.random`` and ``RandomState`` both expose random_sample/randint, so
    # the loop below is agnostic to which one it is handed.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Initialize the action value function
    Q = np.zeros((num_states, num_actions))
    for _ in range(episodes):
        # Every episode starts from the start state
        curr_state = START_STATE
        while True:
            # Epsilon-greedy: exploit with probability 1 - epsilon, else explore
            if rng.random_sample() > epsilon:
                curr_action = np.argmax(Q[curr_state])
            else:
                curr_action = rng.randint(0, num_actions)

            # The transition probability (first item) is not needed by the update
            _, next_state, step_reward, isdone = env(curr_state, curr_action)

            # Off-policy TD update towards reward + discounted best next value
            td_target = step_reward + gamma * np.max(Q[next_state])
            Q[curr_state, curr_action] += alpha * (td_target - Q[curr_state, curr_action])

            curr_state = next_state
            if isdone:
                break
    return Q

In [5]:
# Train the agent; Q holds one row of action values per state.
Q = qlearning()
# Greedy (deterministic) policy: index of the best-valued action in every state.
policy = Q.argmax(axis=1)
# Show the learned action values, then the policy laid out on the grid.
print(f'Value Function:\n {Q.reshape(num_states, num_actions)}')
print('\n')
print(f'Deterministic Policy:\n {policy.reshape(rows, columns)}')

Value Function:
 [[ -12.94362168  -12.92297404  -12.91765322  -12.98086948]
 [ -12.20518607  -11.97353105  -11.97340154  -12.39101783]
 [ -11.15738487  -10.9896074   -10.99008776  -11.10067077]
 [ -10.23340067   -9.99517922   -9.99562576  -10.88078715]
 [  -9.22534248   -8.99806412   -8.99791942   -9.8341118 ]
 [  -8.30784849   -7.99930593   -7.99928418   -8.59131066]
 [  -7.49242829   -6.99967207   -6.9996564    -7.74176099]
 [  -6.49474385   -5.99983169   -5.99983218   -6.67940256]
 [  -5.43802304   -4.99992311   -4.99992005   -6.04972726]
 [  -4.33752367   -3.99997412   -3.99997054   -4.72241688]
 [  -3.3308109    -2.99999181   -2.99999096   -3.79557165]
 [  -2.46100471   -2.           -2.17834858   -2.79399297]
 [ -13.85899652  -12.          -12.          -12.99939911]
 [ -12.92766845  -11.          -11.          -12.99983799]
 [ -11.95720726  -10.          -10.          -11.9977193 ]
 [ -10.9855592    -9.           -9.          -10.99748233]
 [  -9.9858338    -8.           -8.    

In [6]:
# Human-readable label for each action index understood by the environment.
map_dict = {
    0: 'Up',
    1: 'Down',
    2: 'Right',
    3: 'Left',
}