In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

In [None]:
# Create the CliffWalking environment and show its observation and action spaces.
env = gym.make('CliffWalking-v0')

print(env.observation_space, env.action_space, sep="\n")

# Number of discrete actions; the cells below use it to size each Q-table row.
nactions = env.action_space.n

In [None]:
# Theoretical optimal state-value function, laid out as a 4x12 grid.
# The top row runs from -14 (left) to -3 (right); each row below it is one
# larger than the row above.
V_opt = np.zeros((4, 12))
top_row = -np.arange(3, 15)[::-1]  # -14, -13, ..., -3
for row in range(3):
    # Index the row directly instead of going through a chained slice.
    V_opt[row] = top_row + row
# In the bottom row only the first cell is set; the other entries keep their initial 0.
V_opt[3, 0] = -13

In [None]:
def eps_greedy(Q,state,eps):
    """Pick an action with an epsilon-greedy strategy (exploration/exploitation trade-off).

    Parameters
    ----------
    Q : mapping from a state to a 1-D array of action values.
    state : key used to look up the action values in ``Q``.
    eps : exploration rate; expected to lie in [0, 1] so the sampling
        probabilities are valid.

    Returns
    -------
    Index of the sampled action. The greedy action (first maximum of
    ``Q[state]``) is drawn with probability ``1 - eps + eps/n`` and every
    other action with probability ``eps/n``, where ``n = len(Q[state])``.
    """
    action_values = Q[state]
    # Take the action count from the Q-table row itself so the function does
    # not depend on a module-level variable defined in another cell.
    n = len(action_values)
    prob = np.ones((n,)) * eps/n
    prob[np.argmax(action_values)] = 1 - eps + eps/n
    return np.random.choice(np.arange(n), p=prob)

In [None]:
# SARSA: the update bootstraps from the action value of the action that is
# actually taken next (sampled with the same epsilon-greedy rule).
# NOTE(review): assumes the gym API where reset() returns the observation and
# step() returns a 4-tuple -- confirm against the installed gym version.
Q_sarsa = defaultdict(lambda: np.zeros((nactions,)))
num_eps = 3000
alpha = 0.2
gamma = 1

eps_init=1
eps_decay = 0.99
eps_fin = 0.1

his=[]  # total reward of each finished episode
for i in range(num_eps):
    state = env.reset()
    done = False
    # Geometric decay per episode, floored at eps_fin. Without the exponent
    # epsilon would stay fixed at eps_init*eps_decay for the whole run.
    eps = max(eps_fin,eps_init*eps_decay**i)
    t = 0
    action = eps_greedy(Q_sarsa,state,eps)
    r=0
    while not done:
        next_state, reward, done, _ = env.step(action)
        r+=reward  # accumulate the episode return
        next_action = eps_greedy(Q_sarsa,next_state,eps)
        tmp = Q_sarsa[state][action]
        Q_sarsa[state][action] = tmp + alpha*(reward + gamma* Q_sarsa[next_state][next_action]-tmp)
        state,action = next_state, next_action
        t+=1
    # Record the return once per episode so the running average below is over episodes.
    his.append(r)

    if (i+1)%100==0:
        print(f"Episode: {i+1}, Reward:{r}, Avg: {np.mean(his[-100:]):.3f}")

In [None]:
# Greedy action for each of the 48 states, arranged as a 4x12 grid.
# States with no entry in the Q-table are marked with -1.
policy_sarsa = np.reshape(
    [-1 if cell not in Q_sarsa else np.argmax(Q_sarsa[cell]) for cell in range(48)],
    (4, 12),
)

In [None]:
# Q-learning: the update bootstraps from the maximum action value of the next
# state, while behaviour actions are sampled with the epsilon-greedy rule.
# NOTE(review): assumes the gym API where reset() returns the observation and
# step() returns a 4-tuple -- confirm against the installed gym version.
Q_QL = defaultdict(lambda: np.zeros((nactions,)))
num_eps = 3000
alpha = 0.2
gamma = 1

eps_init=1
eps_decay = 0.99
eps_fin = 0.1

his=[]  # total reward of each finished episode
for i in range(num_eps):
    state = env.reset()
    done = False
    # Geometric decay per episode, floored at eps_fin. Without the exponent
    # epsilon would stay fixed at eps_init*eps_decay for the whole run.
    eps = max(eps_fin,eps_init*eps_decay**i)
    t = 0
    r=0
    while not done:
        action = eps_greedy(Q_QL,state,eps)
        next_state, reward, done, _ = env.step(action)
        r+=reward
        tmp = Q_QL[state][action]
        Q_QL[state][action] = tmp + alpha*(reward + gamma* np.max(Q_QL[next_state])-tmp)
        # Only the state carries over; the action is re-sampled at the top of
        # the loop, so no `next_action` is needed (or defined) in this cell.
        state = next_state
        t+=1
    his.append(r)
    if (i+1)%100==0:
        print(f"Episode: {i+1}, Reward:{r}, Avg: {np.mean(his[-100:]):.3f}")

In [None]:
# Greedy action for each of the 48 states, arranged as a 4x12 grid.
# States with no entry in the Q-table are marked with -1.
policy_QL = np.reshape(
    [-1 if cell not in Q_QL else np.argmax(Q_QL[cell]) for cell in range(48)],
    (4, 12),
)

In [None]:
# Expected SARSA: the update bootstraps from the expectation of the next
# state's action values under the epsilon-greedy action probabilities.
# NOTE(review): assumes the gym API where reset() returns the observation and
# step() returns a 4-tuple -- confirm against the installed gym version.
Q_exp_sarsa = defaultdict(lambda: np.zeros((nactions,)))
num_eps = 3000
alpha = 0.2
gamma = 1

eps_init=1
eps_decay = 0.99
eps_fin = 0.1

his=[]  # total reward of each finished episode
for i in range(num_eps):
    state = env.reset()
    done = False
    # Geometric decay per episode, floored at eps_fin. Without the exponent
    # epsilon would stay fixed at eps_init*eps_decay for the whole run.
    eps = max(eps_fin,eps_init*eps_decay**i)
    t = 0
    r=0
    while not done:
        action = eps_greedy(Q_exp_sarsa,state,eps)
        next_state, reward, done, _ = env.step(action)
        r+=reward
        # Epsilon-greedy action probabilities in the next state
        prob = np.ones((nactions,)) * eps/nactions
        prob[np.argmax(Q_exp_sarsa[next_state])] = 1 - eps + eps/nactions

        tmp = Q_exp_sarsa[state][action]
        Q_exp_sarsa[state][action] = tmp + alpha*(reward + gamma* np.dot(prob,Q_exp_sarsa[next_state])-tmp)
        state = next_state
        t+=1
    his.append(r)

    if (i+1)%100==0:
        print(f"Episode: {i+1}, Reward:{r}, Avg: {np.mean(his[-100:]):.3f}")

In [None]:
# Greedy action for each of the 48 states, arranged as a 4x12 grid.
# States with no entry in the Q-table are marked with -1.
policy_exp_sarsa = np.reshape(
    [-1 if cell not in Q_exp_sarsa else np.argmax(Q_exp_sarsa[cell]) for cell in range(48)],
    (4, 12),
)