In [1]:
import numpy as np
import pandas as pd

from environments import *

# Functions

In [15]:
def epsilon_greedy(qlist, epsilon):
    '''
        Choose an action index based on q(s,*) values using epsilon greedy.

        One uniform draw u in [0, 1) is split into bands of width epsilon:
        if u < num_actions*epsilon the action int(u/epsilon) is returned, so every
        action (the greedy one included) is explored with probability epsilon;
        otherwise the greedy action argmax(qlist) is returned (ties go to the lowest
        index).  As long as num_actions*epsilon <= 1 the greedy action therefore has
        total probability 1 - (num_actions-1)*epsilon.
        params:
            qlist: 1-D sequence of action values for the current state
            epsilon: per-action exploration probability; 0 means purely greedy
        returns:
            index of the chosen action as a plain Python int
    '''
    num_actions = len(qlist)
    # Scalar draw (still one random number per call): avoids calling int() on a
    # 1-element array, which NumPy has deprecated since 1.25.
    num = np.random.uniform(0, 1)

    if num >= num_actions*epsilon:
        return int(np.argmax(qlist))

    # Clamp so floating-point rounding in num/epsilon can never index past the last action.
    return min(int(num/epsilon), num_actions - 1)

def SARSA(env, alpha, gamma=1, epsilon = 0.1, num_iters = 100, max_steps=1000, random_seed=None, policy_epsilon=0.05):
    '''
        Implement SARSA on-policy alg using epsilon-greedy as both behavior and target policy
        Q(s,a) <-- Q(s,a) + alpha * (R + gamma * Q(s_next, a_next) - Q(s,a))
        params:
            env: environment exposing seed(), reset(), step(action) -> reward and the
                 attributes state, terminated, state_dict (state -> row index), action_list
            alpha: learning rate
            gamma: discount factor
            epsilon: exploration probability handed to epsilon_greedy during training
            num_iters: number of training episodes
            max_steps: maximum number of steps for each policy rollout
            random_seed: seed passed to both env.seed and np.random.seed
            policy_epsilon: probability given to every non-greedy action in the returned
                            policy (default 0.05, the value that was previously hard-coded)
        returns:
            q: array of shape (num_states, num_actions) holding the learned action values
            policy: dict state -> probability vector over actions; the greedy action gets
                    1 - policy_epsilon*(num_actions-1), every other action policy_epsilon
    '''
    env.seed(random_seed)
    np.random.seed(random_seed)

    num_actions = len(env.action_list)
    q = np.zeros((len(env.state_dict), num_actions))
    for _ in range(num_iters):
        env.reset()
        state_index = env.state_dict[env.state]
        action_index = epsilon_greedy(q[state_index], epsilon)
        num_steps = 0

        while (not env.terminated) and (num_steps < max_steps):
            reward = env.step(env.action_list[action_index])
            num_steps += 1

            # On-policy: the next action is sampled *before* the update and is the one
            # actually executed on the following step.
            s_next_index = env.state_dict[env.state]
            a_next_index = epsilon_greedy(q[s_next_index], epsilon)

            td_error = reward + gamma*q[s_next_index, a_next_index] - q[state_index, action_index]
            q[state_index, action_index] += alpha*td_error

            state_index = s_next_index
            action_index = a_next_index

    # Soft-greedy policy derived from the learned q values.
    policy = {}
    for state, index in env.state_dict.items():
        probs = np.full(num_actions, policy_epsilon, dtype=float)
        probs[np.argmax(q[index])] = 1 - policy_epsilon*(num_actions - 1)
        policy[state] = probs

    return q, policy

In [16]:
def eval(env,policy, num_iter = 100, max_steps=1000, random_seed=None):
    '''
        Evaluate a given policy, where policy[s] is a probability distribution over p(a|s), using an average over num_iter rollouts.

        Note: the name shadows Python's builtin eval for every cell executed after this one.
        params:
            env: environment exposing seed(), reset(), step(action) -> reward and the
                 attributes state, terminated, action_list
            policy: dict keyed by env.state whose values are probability vectors aligned
                    with env.action_list
            num_iter: number of rollouts
            max_steps: maximum number of steps for each rollout
            random_seed: seed passed to both env.seed and np.random.seed
        returns:
            array with the total (undiscounted) reward of each rollout; the mean, std,
            max and min are printed as well.  With num_iter = 0 an empty array is
            returned and only a short notice is printed.
    '''
    env.seed(random_seed)
    np.random.seed(random_seed)

    rewards_list = []
    for _ in range(num_iter):
        env.reset()
        state = env.state
        num_steps = 0
        rewards = 0
        while (not env.terminated) and (num_steps<max_steps):
            probs = policy[state]
            # Sample an action index directly; no need to materialise the index list each step.
            action_index = np.random.choice(len(probs), p=probs)
            action = env.action_list[action_index]

            r = env.step(action)
            state = env.state
            num_steps += 1
            rewards += r

        rewards_list.append(rewards)

    r = np.array(rewards_list)

    # max()/min() raise on an empty array, so bail out before the summary.
    if r.size == 0:
        print('no rollouts were run')
        return r

    print('mean rewards:', r.mean())
    print('std:', r.std())
    print('max rewards:', r.max())
    print('min rewards:', r.min())
    return r

In [17]:
def Qlearning(env, alpha, gamma=1, nsteps=1, epsilon = 0.1, num_iters = 100, max_steps=1000, random_seed=None, policy_epsilon=0.05):
    '''
        Implement Q-learning, off-policy alg using epsilon-greedy as behavior policy and deterministic policy (based on Q values) as target policy
        Q(s,a) <-- Q(s,a) + alpha * (R + gamma * max_a{Q(s_next, a)} - Q(s,a))
        params:
            env: environment exposing seed(), reset(), step(action) -> reward and the
                 attributes state, terminated, state_dict (state -> row index), action_list
            alpha: learning rate
            gamma: discount factor
            nsteps: currently unused (only the one-step update above is implemented);
                    kept so existing callers keep working
            epsilon: exploration probability handed to epsilon_greedy during training
            num_iters: number of training episodes
            max_steps: maximum number of steps for each policy rollout
            random_seed: seed passed to both env.seed and np.random.seed
            policy_epsilon: probability given to every non-greedy action in the returned
                            policy (default 0.05, the value that was previously hard-coded)
        returns:
            q: array of shape (num_states, num_actions) holding the learned action values
            policy: dict state -> probability vector over actions; the greedy action gets
                    1 - policy_epsilon*(num_actions-1), every other action policy_epsilon
    '''
    env.seed(random_seed)
    np.random.seed(random_seed)

    num_actions = len(env.action_list)
    q = np.zeros((len(env.state_dict), num_actions))
    for _ in range(num_iters):
        env.reset()
        state_index = env.state_dict[env.state]
        num_steps = 0

        while (not env.terminated) and (num_steps < max_steps):
            # Behavior policy: epsilon-greedy on the current estimates.
            action_index = epsilon_greedy(q[state_index], epsilon)
            reward = env.step(env.action_list[action_index])
            num_steps += 1

            s_next_index = env.state_dict[env.state]

            # Target policy: greedy, hence the max over the next state's action values.
            td_error = reward + gamma*q[s_next_index].max() - q[state_index, action_index]
            q[state_index, action_index] += alpha*td_error

            state_index = s_next_index

    # Soft-greedy policy derived from the learned q values.
    policy = {}
    for state, index in env.state_dict.items():
        probs = np.full(num_actions, policy_epsilon, dtype=float)
        probs[np.argmax(q[index])] = 1 - policy_epsilon*(num_actions - 1)
        policy[state] = probs

    return q, policy

# WindyGrid

In [2]:
# Grid extent and the wind value attached to each column x
xmax, ymax = 10, 7
upwind = [0,0,0,1,1,1,2,2,1,0]

# every cell (x, y) takes the wind value of its column x
grid_wind = {(x, y): upwind[x] for x in range(xmax) for y in range(ymax)}

start_state, goal_state = (0,3), (7,3)

In [3]:
# Build the windy gridworld from the wind map, goal and start defined above.
# NOTE(review): prob=0 presumably switches off the random wind component — confirm in environments.
env = windy_gridworld(grid_wind, goal_state, start_state, prob=0)

In [9]:
# Train SARSA for 10000 episodes with alpha=0.5 and seed 1 (gamma and epsilon keep their defaults)
q, policy = SARSA(env, alpha=0.5, num_iters=10000, random_seed=1)

In [10]:
# Collapse each state's probability vector to the name of its most probable action
p = {state: env.action_list[np.argmax(probs)] for state, probs in policy.items()}

In [11]:
# Learned action values for state (3,5), one entry per action in env.action_list order
q[env.state_dict[(3,5)]]

array([-31.14742522, -31.9457328 , -27.82297286, -32.35990984])

In [12]:
# Roll out the greedy action map `p` once from a fresh reset, printing every visited state
# and finally the total reward.
env.reset()
rewards = 0
num_steps = 0
max_rollout_steps = 1000  # a deterministic greedy policy can cycle forever, so cap the rollout

while (not env.terminated) and (num_steps < max_rollout_steps):
    print(env.state)
    action = p[env.state]
    r = env.step(action)
    rewards += r
    num_steps += 1

if not env.terminated:
    print('rollout stopped after', max_rollout_steps, 'steps without terminating')
print(rewards)

(0, 3)
(1, 3)
(2, 3)
(3, 3)
(4, 4)
(5, 5)
(6, 6)
(7, 6)
(8, 6)
(9, 6)
(9, 5)
(9, 4)
(9, 3)
(9, 2)
(8, 2)
-15


In [14]:
# Score the soft-greedy SARSA policy over the default 100 rollouts.
# `eval` here is the notebook's own function, which shadows the Python builtin.
r = eval(env, policy, random_seed=1)

mean rewards: -20.49
std: 5.828370269637989
max rewards: -15
min rewards: -46


In [19]:
# Q-learning for 8000 episodes on the current env (seed 1), then score the resulting policy
q, policy = Qlearning(env, alpha=0.5, num_iters=8000, random_seed=1)
r = eval(env, policy, random_seed=1)

mean rewards: -19.91
std: 4.310672801315359
max rewards: -15
min rewards: -35


In [22]:
# SARSA for 8000 episodes on the current env (seed 1), then score the resulting policy.
# NOTE(review): the recorded mean reward (about -116) is far worse than the 10000-episode
# SARSA run earlier in the notebook (about -20); execution counts are non-sequential, so
# re-run from a fresh kernel before trusting this comparison.
q, policy = SARSA(env, alpha=0.5, num_iters=8000, random_seed=1)
r = eval(env, policy, random_seed=1)

mean rewards: -116.16
std: 53.930088077065115
max rewards: -30
min rewards: -325


In [22]:
# Rebuild the windy gridworld with prob=1/3.
# NOTE(review): prob is presumably the probability of the random wind effect — confirm in environments.
env = windy_gridworld(grid_wind, goal_state, start_state, prob=1.0/3)

In [27]:
# Q-learning for 8000 episodes on the prob=1/3 environment; no random_seed is passed,
# so np.random.seed(None) is used and the run is not reproducible
q, policy = Qlearning(env, alpha=0.5, num_iters=8000)

In [28]:
# Score the current policy over the default 100 rollouts (unseeded, so the numbers vary run to run)
r = eval(env, policy)

mean rewards: -73.44
std: 57.46169506723587
max rewards: -8
min rewards: -291


In [29]:
# Display the full state -> action-probability table of the current policy
policy

{(0, 0): array([0.05, 0.05, 0.85, 0.05]),
 (0, 1): array([0.05, 0.05, 0.85, 0.05]),
 (0, 2): array([0.05, 0.05, 0.85, 0.05]),
 (0, 3): array([0.05, 0.05, 0.85, 0.05]),
 (0, 4): array([0.05, 0.05, 0.85, 0.05]),
 (0, 5): array([0.05, 0.05, 0.85, 0.05]),
 (0, 6): array([0.05, 0.85, 0.05, 0.05]),
 (1, 0): array([0.05, 0.05, 0.85, 0.05]),
 (1, 1): array([0.05, 0.05, 0.85, 0.05]),
 (1, 2): array([0.05, 0.05, 0.85, 0.05]),
 (1, 3): array([0.05, 0.05, 0.85, 0.05]),
 (1, 4): array([0.05, 0.05, 0.85, 0.05]),
 (1, 5): array([0.05, 0.05, 0.85, 0.05]),
 (1, 6): array([0.05, 0.85, 0.05, 0.05]),
 (2, 0): array([0.05, 0.05, 0.85, 0.05]),
 (2, 1): array([0.05, 0.05, 0.85, 0.05]),
 (2, 2): array([0.05, 0.05, 0.85, 0.05]),
 (2, 3): array([0.05, 0.05, 0.85, 0.05]),
 (2, 4): array([0.05, 0.85, 0.05, 0.05]),
 (2, 5): array([0.05, 0.05, 0.85, 0.05]),
 (2, 6): array([0.05, 0.05, 0.85, 0.05]),
 (3, 0): array([0.05, 0.05, 0.85, 0.05]),
 (3, 1): array([0.05, 0.05, 0.85, 0.05]),
 (3, 2): array([0.05, 0.05, 0.85, 

In [25]:
# SARSA for 8000 episodes on the current env; no random_seed is passed, so the run is not reproducible
q, policy = SARSA(env, alpha=0.5, num_iters=8000)

In [26]:
# Score the current policy over the default 100 rollouts (unseeded, so the numbers vary run to run)
r = eval(env, policy)

mean rewards: -45.0
std: 28.852382917187274
max rewards: -11
min rewards: -233


# Cliff walking

In [18]:
# Switch to the cliff-walking environment.
# NOTE(review): size=(12,4) is presumably (width, height), matching the (0..11, 0..3) states shown below — confirm in environments.
env = cliff_walking(size=(12,4))

In [19]:
# SARSA for 8000 episodes on the cliff-walking env; no random_seed is passed, so the run is not reproducible
q, policy = SARSA(env, alpha=0.5, num_iters=8000)

In [21]:
# Collapse each state's probability vector to the name of its most probable action
p = {state: env.action_list[np.argmax(probs)] for state, probs in policy.items()}

In [22]:
# Display the greedy action chosen for every state
p

{(0, 0): 'up',
 (0, 1): 'up',
 (0, 2): 'up',
 (0, 3): 'right',
 (1, 0): 'up',
 (1, 1): 'up',
 (1, 2): 'right',
 (1, 3): 'right',
 (2, 0): 'up',
 (2, 1): 'left',
 (2, 2): 'up',
 (2, 3): 'up',
 (3, 0): 'up',
 (3, 1): 'up',
 (3, 2): 'up',
 (3, 3): 'right',
 (4, 0): 'up',
 (4, 1): 'left',
 (4, 2): 'right',
 (4, 3): 'right',
 (5, 0): 'up',
 (5, 1): 'up',
 (5, 2): 'right',
 (5, 3): 'right',
 (6, 0): 'up',
 (6, 1): 'up',
 (6, 2): 'up',
 (6, 3): 'right',
 (7, 0): 'up',
 (7, 1): 'up',
 (7, 2): 'right',
 (7, 3): 'right',
 (8, 0): 'up',
 (8, 1): 'up',
 (8, 2): 'right',
 (8, 3): 'right',
 (9, 0): 'up',
 (9, 1): 'up',
 (9, 2): 'up',
 (9, 3): 'right',
 (10, 0): 'up',
 (10, 1): 'right',
 (10, 2): 'right',
 (10, 3): 'down',
 (11, 0): 'up',
 (11, 1): 'down',
 (11, 2): 'down',
 (11, 3): 'down'}