In [1]:
width, height = 6, 6

In [2]:
from maze import MazeEnv, ACTIONS
import numpy as np
import matplotlib.pylab as plt
from PIL import Image
from IPython import display
import numpy as np
from tqdm import tqdm

filename = 'imagedraw.gif'
images = []
env = MazeEnv(width, height)
state = env.reset()
done = False
reward_sum = 0.0

img = env.render()
im = Image.fromarray(img)
im = im.resize((400, 400), resample=0)
images.append(im)

while not done:
    action = np.random.choice(len(ACTIONS))
    state, reward, done, _ = env.step(action)
    # print(state, reward, done)
    reward_sum += reward
    img = env.render()
    im = Image.fromarray(img)
    im = im.resize((400, 400), resample=0)
    images.append(im)
    
images[0].save(filename,
               save_all=True, append_images=images[1:],
               optimize=False, duration=40, loop=0)

In [3]:
display.Image(filename)

<IPython.core.display.Image object>

In [4]:
def get_probs(Q_s, epsilon, nA):
    """ obtains the action probabilities corresponding to epsilon-greedy policy """
    policy_s = np.ones(nA) * epsilon / nA
    best_a = np.argmax(Q_s)
    policy_s[best_a] = 1 - epsilon + (epsilon / nA)
    return policy_s

def get_episode(env, Q=None):
    episode = []
    state = env.reset()
    done = False
    while not done:
        if Q is None:
            action = np.random.choice(len(ACTIONS))
        else:
           action = np.random.choice(len(ACTIONS), p=get_probs(Q[state], 0.01, len(ACTIONS)))
        new_state, reward, done, _ = env.step(action)
        episode.append((tuple(state), action, reward))
        state = new_state
    return episode

In [5]:
from collections import defaultdict
from pprint import pprint

def approximate_q_function(num_episodes = 10000):
    env = MazeEnv(width, height)
    returns_sum = defaultdict(lambda: np.zeros(env.action_space.n))
    N = defaultdict(lambda: np.zeros(env.action_space.n))
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    for e_id in tqdm(range(num_episodes)):
        episode = get_episode(env)
        states, actions, rewards = zip(*episode)

        for i, state in enumerate(states):
            returns_sum[state][actions[i]] += sum(rewards[i:])
            N[state][actions[i]] += 1.0
            Q[state][actions[i]] = returns_sum[state][actions[i]] / N[state][actions[i]]

    return Q, N

In [6]:
Q, N = approximate_q_function(1000)

100%|██████████| 1000/1000 [00:12<00:00, 81.93it/s]


In [7]:
env = MazeEnv(width, height, (0,0))
episode = get_episode(env, Q)
pprint(episode)

[((0, 0), 0, -1.0),
 ((0, 1), 0, -1.0),
 ((0, 2), 0, -1.0),
 ((0, 3), 0, -1.0),
 ((0, 4), 2, -1.0),
 ((1, 4), 2, -1.0),
 ((2, 4), 2, -1.0),
 ((3, 4), 2, -1.0),
 ((4, 4), 2, -1.0),
 ((5, 4), 0, 20.0)]


In [8]:
def mc_policy(state, eps=0.01):
    policy = dict((k,np.argmax(v)) for k, v in Q.items())
    action = policy[state] if np.random.random() > eps else env.action_space.sample()
    return action

In [9]:
def eval_policy(env, policy, num_episodes=100):
    reward_list = []
    for i in tqdm(range(num_episodes)):
        state = env.reset()
        done = False
        reward_sum = 0.0
        while not done:
            action = policy(state)
            new_state, reward, done, _ = env.step(action)
            # print(state, action, new_state, reward)
            reward_sum += reward
            state = new_state

        reward_list.append(reward_sum)

    avg = np.mean(reward_list)
    print("")
    print("The average cumulative reward {} while {} episodes".format(avg, num_episodes))
    return avg, reward_list

In [10]:
%%time
env = MazeEnv(width, height, (0,0))
avg, _ = eval_policy(env, lambda s: mc_policy(s, eps=1.0), 1000)

100%|██████████| 1000/1000 [00:26<00:00, 37.78it/s]
The average cumulative reward -471.298 while 1000 episodes
Wall time: 26.5 s



In [11]:
%%time
env = MazeEnv(width, height, (0,0))
avg, _ = eval_policy(env, mc_policy, 1000)

100%|██████████| 1000/1000 [00:01<00:00, 822.49it/s]
The average cumulative reward 10.761 while 1000 episodes
Wall time: 1.22 s

