In [1]:
%load_ext autoreload

In [2]:
import tensorflow as tf
import numpy as np
from retro_contest.local import make
import sys
from functools import reduce

In [27]:
from deep_q_agent import flatten, as_binary_array

def pretrain(env, pretrain_length):
    action_size = 2**env.action_space.n
    memory = []
    state = flatten(env.reset())
    for i in range(pretrain_length):
        action = np.random.randint(action_size)
        next_state, reward, done, _ = env.step(as_binary_array(action, length=env.action_space.n))
        next_state = flatten(next_state)
        memory.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            state = flatten(env.reset())
    return memory

try:
    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    pretrain_data = pretrain(env, 10000)
finally:
    env.close()

In [28]:
%autoreload
from deep_q_agent import *

train_episodes = 10            # max number of episodes to learn from
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 1000000            # memory capacity
batch_size = 10000                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.WARN)

try:
    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')

    state_size = reduce(lambda x,y: x*y, env.observation_space.shape, 1)
    action_size = 2**env.action_space.n

    agent = DeepQAgent(
        num_actions=action_size,
        state_shape=state_size,
        explore_start=explore_start,
        explore_stop=explore_stop,
        decay_rate=decay_rate,
        hidden=hidden_size,
        learning_rate=learning_rate,
        memory_size=memory_size,
        memory_prepop=pretrain_data,
        batch_size=batch_size,
        gamma=gamma,
    )

    # agent.pretrain(env, pretrain_length)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        agent.learn(sess, gamma=gamma)
        
        for episode in range(train_episodes):
            state = flatten(env.reset())
            done = False
            while not done:
                action = agent.act(sess, state, train=True)
                next_state, reward, done, info = env.step(as_binary_array(action, length=env.action_space.n))
                next_state = flatten(next_state)
                agent.step(sess, state, action, reward, next_state, done)
                state = next_state
                if done:
                    print("\rEpisode = {:4d}, Total Reward = {:.3f}".format(
                        episode, agent.total_rewards[-1]), end="")
            sys.stdout.flush()
        saver.save(sess, "checkpoints/deep-q-agent.ckpt")
finally:
    env.close()

Episode =    9, Total Reward = 0.000377

In [29]:
agent.losses

[40605.656,
 25573.062,
 8467.266,
 7354.199,
 6160.559,
 4886.541,
 4302.5034,
 3316.8113,
 2895.5972,
 2948.9966]

In [18]:
env.close()

In [25]:
x = (10000, 4096)
c = tuple([True]*5000 + [False]*5000)
y = np.zeros(x)
y[c] = np.ones(x[1:])

IndexError: too many indices for array

In [24]:
y[4999]

array([1., 1., 1., ..., 1., 1., 1.])

In [15]:
x = np.array(range(2**4)).reshape([4, 2, 2])
[np.squeeze(a) for a in np.split(x, x.shape[0])]

[array([[0, 1],
        [2, 3]]), array([[4, 5],
        [6, 7]]), array([[ 8,  9],
        [10, 11]]), array([[12, 13],
        [14, 15]])]