In [1]:
import numpy as np

In [2]:
''' 
####################CLIFF WALKING ENVIRONMENT#########################

A schematic view of the environment-

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
S  x  x  x  x  x  x  x  x  x  x  T

Actions: 
    UP (0)
    DOWN (1)
    RIGHT (2)
    LEFT (3)

Rewards: 
     0 for entering the terminal state
    -100 for stepping into the cliff
    -1 for every other transition

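Note: States are numbered row by row from the top-left (0) to the bottom-right (47), so S = 36 and T = 47.
      A move that would leave the grid keeps the agent in its current state (the -1 reward is still given).
      Falling into the cliff ends the episode; the next episode begins again at the start state.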

'''
START_STATE = 36     # index of S, the bottom-left cell of the 4x12 grid
TERMINAL_STATE = 47  # index of T, the bottom-right cell of the 4x12 grid
def reward(state):
    """Return the immediate reward for arriving in `state`.

    Args:
        state: index of the cell the agent has just moved into.

    Returns:
        0 for the terminal cell, -100 for a cliff cell (any index strictly
        between START_STATE and TERMINAL_STATE), and -1 for every other cell.
    """
    if state == TERMINAL_STATE:
        return 0
    # Cliff cells sit between the start and terminal cells on the bottom row
    if START_STATE < state < TERMINAL_STATE:
        return -100
    return -1

def env(state, action):
    """Deterministic one-step transition model of the cliff-walking grid.

    States are row-major cell indices and actions are 0 (up), 1 (down),
    2 (right) and 3 (left). A move that would leave the grid keeps the agent
    in its current cell. Once the agent is in an episode-ending state (any
    index after START_STATE up to and including TERMINAL_STATE, i.e. the
    cliff cells and the goal) every action leaves it where it is.

    Args:
        state: index of the current cell.
        action: action code in {0, 1, 2, 3}.

    Returns:
        A list [prob, next_state, reward, isdone]: the transition probability
        (always 1), the resulting state, the reward for arriving there, and
        whether the resulting state ends the episode.

    Raises:
        ValueError: if `state` is not episode-ending and `action` is not one
            of the four known action codes.
    """
    num_states = rows * columns

    def isdone(s):
        # Episode-ending states: the cliff cells and the terminal cell
        return START_STATE < s <= TERMINAL_STATE

    if isdone(state):
        next_state = state
    elif action == 0:
        # Up: blocked on the top row
        next_state = state - columns if state - columns >= 0 else state
    elif action == 1:
        # Down: blocked on the bottom row
        next_state = state + columns if state + columns < num_states else state
    elif action == 2:
        # Right: blocked in the last column, where (state + 1) is a multiple of columns
        next_state = state + 1 if (state + 1) % columns else state
    elif action == 3:
        # Left: blocked in the first column, where state is a multiple of columns
        next_state = state - 1 if state % columns else state
    else:
        # Previously this fell through and failed later with UnboundLocalError
        raise ValueError(
            f'Unknown action {action!r}; expected 0 (UP), 1 (DOWN), 2 (RIGHT) or 3 (LEFT)'
        )
    # State Transition Probability is 1 because the environment is deterministic
    return [1, next_state, reward(next_state), isdone(next_state)]

In [3]:
# --- Grid geometry ---
rows = 4
columns = 12
num_states = rows * columns  # one state per grid cell
num_actions = 4  # up, down, right, left

# --- Q-learning hyperparameters ---
alpha = 0.1  # learning rate (step size of each Q update)
epsilon = 0.1  # probability of taking a random exploratory action (epsilon-greedy)
gamma = 1  # discount factor
episodes = 100_000  # number of episodes (games) to play during training

In [4]:
def qlearning(seed=None):
    """Learn action values for the cliff-walking task with tabular Q-learning.

    Plays `episodes` episodes from START_STATE using an epsilon-greedy
    behaviour policy and applies the Q-learning update after every step.

    Args:
        seed: optional seed for reproducible training. When None (the
            default) random numbers come from NumPy's global generator,
            as they always have; otherwise a private generator seeded with
            this value is used and the global generator is left untouched.

    Returns:
        A (num_states, num_actions) array of learned action values.
    """
    # np.random (module) and RandomState expose the same random_sample/randint API
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Initialize the action value function
    Q = np.zeros((num_states, num_actions))
    for _ in range(episodes):
        # Initialize S
        curr_state = START_STATE
        while True:
            # Epsilon-greedy: exploit with probability 1 - epsilon, otherwise explore
            if rng.random_sample() > epsilon:
                # Pick the greedy action
                curr_action = np.argmax(Q[curr_state])
            else:
                # Pick a random action to explore
                curr_action = rng.randint(0, num_actions)
            # env returns the transition probability (unused: the environment is
            # deterministic), the next state, the immediate reward, and whether
            # the next state ends the episode.
            # The reward is named step_reward so it does not shadow reward().
            _prob, next_state, step_reward, isdone = env(curr_state, curr_action)
            # Bootstrap from the best action value available in the next state
            td_target = step_reward + gamma * np.max(Q[next_state])
            # Update the current state-action value
            Q[curr_state, curr_action] += alpha * (td_target - Q[curr_state, curr_action])
            curr_state = next_state
            if isdone:
                break
    return Q

In [5]:
# Train the agent and keep the learned action values
Q = qlearning()
# Deterministic policy: the highest-valued action index for every state
policy = Q.argmax(axis=1)
# One print call with a newline separator emits the same text as three separate prints
print(
    f'Value Function:\n {Q.reshape(num_states, num_actions)}',
    '\n',
    f'Deterministic Policy:\n {policy.reshape(rows, columns)}',
    sep='\n',
)

Value Function:
 [[ -12.96438022  -12.89352392  -12.89723806  -12.95246411]
 [ -12.06702773  -11.97597529  -11.97464032  -12.3063454 ]
 [ -11.28379205  -10.99074828  -10.99051255  -11.60347591]
 [ -10.13001207   -9.99630553   -9.99589533  -10.96812538]
 [  -9.22874498   -8.99824401   -8.99829562   -9.74899718]
 [  -8.5260799    -7.99919384   -7.9991563    -8.58422317]
 [  -7.37981913   -6.99960567   -6.99958835   -7.78854913]
 [  -6.36248133   -5.9998284    -5.99981456   -7.06538996]
 [  -5.36731915   -4.99994338   -4.99994586   -6.35806051]
 [  -4.38089186   -3.9999833    -3.99998309   -5.23763811]
 [  -3.51467045   -2.99999647   -2.99999678   -3.650619  ]
 [  -2.26462869   -2.           -2.09981      -2.50448295]
 [ -13.81426218  -12.          -12.          -12.99968731]
 [ -12.93203483  -11.          -11.          -12.99928642]
 [ -11.96700495  -10.          -10.          -11.9968391 ]
 [ -10.98624184   -9.           -9.          -10.99954981]
 [  -9.98854161   -8.           -8.    

In [6]:
# Human-readable label for each action index
map_dict = {
    0: 'Up',
    1: 'Down',
    2: 'Right',
    3: 'Left',
}