In [1]:
from maze_2d.maze_2d_q_learning import simulate

In [1]:
import numpy as np

In [5]:
# Probability of drawing True below; with a value of 1 the False outcome has
# zero weight, so the draw always yields True.
make_action_proba = 1
np.random.choice(
    [True, False],
    p=[make_action_proba, 1 - make_action_proba],
)

True

In [9]:
# Repeat the weighted True/False draw using the probability defined earlier.
np.random.choice(
    [True, False],
    p=[make_action_proba, 1 - make_action_proba],
)

True

In [10]:
# Element-wise difference of two coordinate pairs.
np.subtract(np.array((10, 10)), np.array((1, 1)))

array([9, 9])

In [6]:
import sys
import numpy as np
import math
import random

import gym
from gym_maze.envs import maze_env

ModuleNotFoundError: No module named 'gym_maze'

In [6]:
# Build a 12x12 maze environment by instantiating MazeEnv directly
# (alternative that was tried: gym.make("maze-random-12x12-plus-v0")).
env = maze_env.MazeEnv(maze_size=(12, 12))

In [7]:

# ---------------------------------------------------------------------------
# Environment related constants
# ---------------------------------------------------------------------------
# Number of discrete states (buckets) per state dimension: the upper bound of
# the observation space plus one along every axis.
MAZE_SIZE = tuple(
    (env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)
)
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid

# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]

# (low, high) bounds for each state dimension
STATE_BOUNDS = [
    (low, high)
    for low, high in zip(env.observation_space.low, env.observation_space.high)
]

# ---------------------------------------------------------------------------
# Learning related constants
# ---------------------------------------------------------------------------
MIN_EXPLORE_RATE = 0.001
MIN_LEARNING_RATE = 0.2
# Scales the episode index inside the logarithmic decay of both rates.
DECAY_FACTOR = np.prod(MAZE_SIZE, dtype=float) / 10.0

# ---------------------------------------------------------------------------
# Simulation related constants
# ---------------------------------------------------------------------------
NUM_EPISODES = 50000
STREAK_TO_END = 100
# An episode counts towards the streak when it ends within one step per cell.
SOLVED_T = np.prod(MAZE_SIZE, dtype=int)
# Hard step limit per episode: one hundred steps per cell.
MAX_T = SOLVED_T * 100
DEBUG_MODE = 1
RENDER_MAZE = True
ENABLE_RECORDING = True


In [8]:
def select_action(env, q_table, state, explore_rate):
    """Pick an action epsilon-greedily.

    Args:
        env: Environment whose ``action_space.sample()`` supplies random actions.
        q_table: Array of Q-values indexed by state, then action.
        state: Index (tuple) of the current state in ``q_table``.
        explore_rate: Probability of taking a random action.

    Returns:
        A sampled action when exploring, otherwise the index (as ``int``) of
        the highest Q-value stored for ``state``.
    """
    exploit = random.random() >= explore_rate
    if exploit:
        # Greedy choice: action with the highest q for this state
        return int(np.argmax(q_table[state]))
    # Exploration: let the environment pick a random action
    return env.action_space.sample()


def get_explore_rate(t):
    """Return the exploration rate for index ``t``.

    The rate is ``1 - log10((t + 1) / DECAY_FACTOR)``, capped at 0.8 from
    above and at ``MIN_EXPLORE_RATE`` from below.
    """
    decayed = 1.0 - math.log10((t+1)/DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_EXPLORE_RATE, capped)


def get_learning_rate(t):
    """Return the learning rate for index ``t``.

    The rate is ``1 - log10((t + 1) / DECAY_FACTOR)``, capped at 0.8 from
    above and at ``MIN_LEARNING_RATE`` from below.
    """
    decayed = 1.0 - math.log10((t+1)/DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_LEARNING_RATE, capped)


def state_to_bucket(state):
    """Convert an observation into a tuple of discrete bucket indices.

    For every dimension the value is clamped to ``STATE_BOUNDS``: values at or
    below the lower bound map to bucket 0, values at or above the upper bound
    map to the last bucket, and values in between are scaled linearly onto
    ``0 .. NUM_BUCKETS[i] - 1`` and rounded.

    Args:
        state: Sequence with one value per state dimension.

    Returns:
        Tuple of bucket indices, one per dimension.
    """
    buckets = []
    for dim, value in enumerate(state):
        low, high = STATE_BOUNDS[dim]
        if value <= low:
            index = 0
        elif value >= high:
            index = NUM_BUCKETS[dim] - 1
        else:
            # Linear map of [low, high] onto [0, last bucket]
            last_bucket = NUM_BUCKETS[dim] - 1
            span = high - low
            offset = last_bucket*low/span
            scaling = last_bucket/span
            index = int(round(scaling*value - offset))
        buckets.append(index)
    return tuple(buckets)

In [9]:
def simulate(q_table, env):
    """Train ``q_table`` in place with tabular Q-learning on ``env``.

    Runs at most ``NUM_EPISODES`` episodes of at most ``MAX_T`` steps each.
    Every step picks an action with ``select_action``, applies it through
    ``env.step`` and moves the visited Q-value towards
    ``reward + discount_factor * max(Q[next_state])``.  An episode that
    finishes within ``SOLVED_T`` steps extends the streak; a slower finish or
    a timeout resets it.  Training stops once the streak exceeds
    ``STREAK_TO_END``.

    Args:
        q_table: NumPy array indexed by ``bucket + (action,)``; updated in place.
        env: Environment providing ``reset()``, ``step(action)``, ``render()``,
            ``is_game_over()`` and ``action_space.sample()``.

    Returns:
        None.  Returns early, without raising, as soon as
        ``env.is_game_over()`` is true, so that the caller can still close
        the environment and finalise any recording.
    """

    # Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99

    num_streaks = 0

    # Render the maze
    env.render()

    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)
        total_reward = 0

        for t in range(MAX_T):

            # Select an action
            action = select_action(env, q_table, state_0, explore_rate)

            # execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)
            total_reward += reward

            # Update the Q based on the result
            best_q = np.amax(q_table[state])
            q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state

            # True on the last step allowed for this episode
            timed_out = t >= MAX_T - 1

            # Print data
            if DEBUG_MODE == 2:
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)
                print("")

            elif DEBUG_MODE == 1:
                if done or timed_out:
                    print("\nEpisode = %d" % episode)
                    print("t = %d" % t)
                    print("Explore rate: %f" % explore_rate)
                    print("Learning rate: %f" % learning_rate)
                    print("Streaks: %d" % num_streaks)
                    print("Total reward: %f" % total_reward)
                    print("")

            # Render the maze
            if RENDER_MAZE:
                env.render()

            if env.is_game_over():
                # Stop training without sys.exit(): raising SystemExit here
                # would skip the caller's cleanup (e.g. env.close()).
                return

            if done:
                # t is an integer step count, so format it with %d
                print("Episode %d finished after %d time steps with total reward = %f (streak %d)."
                      % (episode, t, total_reward, num_streaks))

                if t <= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

            elif timed_out:
                print("Episode %d timed out at %d with total reward = %f."
                      % (episode, t, total_reward))
                # A timed-out episode was not solved, so the run of
                # consecutive solves is broken.
                num_streaks = 0

        # It's considered done when it's solved more than STREAK_TO_END times
        # consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

In [10]:
from gym.wrappers import Monitor

In [11]:
# Directory that receives the recorded episode videos.  Kept relative to the
# notebook's working directory (instead of a user-specific absolute path) so
# it matches the "videos/..." paths read back by the later cells.
recording_folder = "videos"

In [12]:
# Wrap the environment so that every episode is recorded into
# recording_folder; force=True is passed through to the Monitor wrapper.
env = Monitor(
    env,
    recording_folder,
    video_callable=lambda episode: True,
    force=True,
)



In [13]:

# Creating a Q-Table for each state-action pair, initialised to zero.
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

# Begin simulation
if ENABLE_RECORDING:
    # NOTE(review): _start is a private method of the Monitor wrapper and is
    # called here with the same arguments the wrapper was constructed with --
    # presumably redundant; confirm before removing.
    env._start(recording_folder, video_callable=lambda episode: True,force=True)

try:
    simulate(q_table, env)
finally:
    # Close the recorder even when training stops with an exception, so the
    # videos written so far are finalised.
    if ENABLE_RECORDING:
        env.close()


Episode = 0
t = 2343
Explore rate: 0.800000
Learning rate: 0.800000
Streaks: 0
Total reward: -0.627083

Episode 0 finished after 2343.000000 time steps with total reward = -0.627083 (streak 0).

Episode = 1
t = 3105
Explore rate: 0.800000
Learning rate: 0.800000
Streaks: 0
Total reward: -1.156250

Episode 1 finished after 3105.000000 time steps with total reward = -1.156250 (streak 0).

Episode = 2
t = 1177
Explore rate: 0.800000
Learning rate: 0.800000
Streaks: 0
Total reward: 0.182639

Episode 2 finished after 1177.000000 time steps with total reward = 0.182639 (streak 0).

Episode = 3
t = 2143
Explore rate: 0.800000
Learning rate: 0.800000
Streaks: 0
Total reward: -0.488194

Episode 3 finished after 2143.000000 time steps with total reward = -0.488194 (streak 0).

Episode = 4
t = 538
Explore rate: 0.800000
Learning rate: 0.800000
Streaks: 0
Total reward: 0.626389

Episode 4 finished after 538.000000 time steps with total reward = 0.626389 (streak 0).

Episode = 5
t = 477
Explore ra


Episode = 43
t = 80
Explore rate: 0.524894
Learning rate: 0.524894
Streaks: 14
Total reward: 0.944444

Episode 43 finished after 80.000000 time steps with total reward = 0.944444 (streak 14).

Episode = 44
t = 80
Explore rate: 0.514910
Learning rate: 0.514910
Streaks: 15
Total reward: 0.944444

Episode 44 finished after 80.000000 time steps with total reward = 0.944444 (streak 15).

Episode = 45
t = 91
Explore rate: 0.505150
Learning rate: 0.505150
Streaks: 16
Total reward: 0.936806

Episode 45 finished after 91.000000 time steps with total reward = 0.936806 (streak 16).

Episode = 46
t = 106
Explore rate: 0.495605
Learning rate: 0.495605
Streaks: 17
Total reward: 0.926389

Episode 46 finished after 106.000000 time steps with total reward = 0.926389 (streak 17).

Episode = 47
t = 82
Explore rate: 0.486265
Learning rate: 0.486265
Streaks: 18
Total reward: 0.943056

Episode 47 finished after 82.000000 time steps with total reward = 0.943056 (streak 18).

Episode = 48
t = 77
Explore rate


Episode = 86
t = 49
Explore rate: 0.223864
Learning rate: 0.223864
Streaks: 57
Total reward: 0.965972

Episode 86 finished after 49.000000 time steps with total reward = 0.965972 (streak 57).

Episode = 87
t = 50
Explore rate: 0.218843
Learning rate: 0.218843
Streaks: 58
Total reward: 0.965278

Episode 87 finished after 50.000000 time steps with total reward = 0.965278 (streak 58).

Episode = 88
t = 53
Explore rate: 0.213880
Learning rate: 0.213880
Streaks: 59
Total reward: 0.963194

Episode 88 finished after 53.000000 time steps with total reward = 0.963194 (streak 59).

Episode = 89
t = 47
Explore rate: 0.208972
Learning rate: 0.208972
Streaks: 60
Total reward: 0.967361

Episode 89 finished after 47.000000 time steps with total reward = 0.967361 (streak 60).

Episode = 90
t = 51
Explore rate: 0.204120
Learning rate: 0.204120
Streaks: 61
Total reward: 0.964583

Episode 90 finished after 51.000000 time steps with total reward = 0.964583 (streak 61).

Episode = 91
t = 55
Explore rate: 


Episode = 129
t = 41
Explore rate: 0.047773
Learning rate: 0.200000
Streaks: 100
Total reward: 0.971528

Episode 129 finished after 41.000000 time steps with total reward = 0.971528 (streak 100).


TypeError: 'NoneType' object is not callable

In [17]:
from IPython.display import HTML, Video
from base64 import b64encode
# Read one recorded episode and base64-encode it for inline display.  The
# context manager closes the file handle once the bytes are read.
with open("videos/openaigym.video.1.26037.video000000.mp4", "rb") as video_file:
    video = video_file.read()
video_encoded = b64encode(video).decode('ascii')

In [18]:
# The data is passed as an already base64-encoded string, so there is no file
# name to guess the MIME type from; state it explicitly for the <video> tag.
Video(video_encoded, embed=True, mimetype="video/mp4")

In [19]:
from glob import glob
import os

In [21]:
# Collect every recorded video and order the list by modification time,
# oldest first.
filepaths = glob('videos/*.mp4')
filepaths.sort(key=os.path.getmtime)
filepaths

['videos/openaigym.video.1.23139.video000000.mp4',
 'videos/openaigym.video.1.23139.video000001.mp4',
 'videos/openaigym.video.1.23139.video000002.mp4',
 'videos/openaigym.video.1.23139.video000003.mp4',
 'videos/openaigym.video.1.23139.video000004.mp4',
 'videos/openaigym.video.1.23139.video000005.mp4',
 'videos/openaigym.video.1.23139.video000006.mp4',
 'videos/openaigym.video.1.23139.video000007.mp4',
 'videos/openaigym.video.1.23139.video000008.mp4',
 'videos/openaigym.video.1.23139.video000009.mp4',
 'videos/openaigym.video.1.23139.video000010.mp4',
 'videos/openaigym.video.1.23139.video000011.mp4',
 'videos/openaigym.video.1.23139.video000012.mp4',
 'videos/openaigym.video.1.23139.video000013.mp4',
 'videos/openaigym.video.1.23139.video000014.mp4',
 'videos/openaigym.video.1.23139.video000015.mp4',
 'videos/openaigym.video.1.23139.video000016.mp4',
 'videos/openaigym.video.1.23139.video000017.mp4',
 'videos/openaigym.video.1.23139.video000018.mp4',
 'videos/openaigym.video.1.2313

In [24]:
from ipywidgets import Output, GridspecLayout
from IPython import display

# Single-row grid with one column per recorded video.
grid = GridspecLayout(1, len(filepaths))

for i, filepath in enumerate(filepaths):
    out = Output()
    with out:
        # Close each file as soon as its bytes are read.
        with open(filepath, "rb") as video_file:
            video = video_file.read()
        video_encoded = b64encode(video).decode('ascii')
        # Embedded base64 data carries no file name, so give the MIME type
        # explicitly.
        display.display(display.Video(video_encoded, embed=True, mimetype="video/mp4"))
    grid[0, i] = out

grid

GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…