In [1]:
IMG_HEIGHT=100
IMG_WIDTH=100

epsilon = 1
gamma = 0.99
n_episodes = 1

#### Display related

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

def show_state(observation, env_id, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(observation, cmap='gray')
    plt.title("%s | Step: %d %s" % (env_id, step, info))
    plt.axis('off')
    
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.close()

#### Preprocessing

In [3]:
import cv2
import numpy as np

def downsize(img_arry):
    return cv2.resize(img_arry, dsize=(IMG_WIDTH, IMG_HEIGHT), interpolation=cv2.INTER_CUBIC)

def rgb2gray(img_arr):
    return np.dot(img_arr[...,:3], [0.299, 0.587, 0.114])

def normalize(img_arr):
    return np.divide(img_arr, 255.0)

def preprocess_image(img_arr):
    downsized = downsize(img_arr)
    gray = rgb2gray(downsized)
    normalized = normalize(gray)
    extra_dim = normalized[..., np.newaxis]
    for i in range(2):
        extra_dim = np.append(extra_dim, extra_dim, axis=2)
    return np.stack(extra_dim)
    

#### Memory

In [4]:
import random

class Memory:

    # wants the following input
    # [(state, action, reward, next_state, done)...]
    def __init__(self):
        self.memory = np.zeros(shape=(0, 5))

    # @params
    # state = previous 4 images
    # action = what action we took
    # reward = reward
    # next_state = what happened
    # done = done
    def add(self, state, action, reward, next_state, done):
        new_row = np.array([[state, action, reward, next_state, done]])
        self.memory = np.append(self.memory, new_row, axis=0)

    def sample(self, batch_size):
        #return np.random.choice(self.memory, size=batch_size, replace=False)
        return self.memory[np.random.choice(self.memory.shape[0], batch_size, replace=False)]

#### Model

In [5]:
from keras.layers.core import Flatten
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.models import Sequential

def atari_model(state_shape, n_actions):
    model = Sequential()
    
    model.add(Conv2D(16,
        kernel_size=(4, 4),
        strides=(2, 2),
        activation="relu",
        input_shape=state_shape))
    
    #model.add(MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(32,
        kernel_size=(4, 4),
        strides=(2, 2),
        activation="relu"))
    
    #model.add(MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Flatten())
    model.add(Dense(512, activation="relu"))
    model.add(Dense(n_actions))
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

Using TensorFlow backend.


#### Train model

In [6]:
import numpy as np

def stack(array):
    stacked_array = np.stack(array[0])
    for i in range(1, array.shape[0]):
        stacked_array = np.append(stacked_array, np.stack(array[i]), axis=0)
    return stacked_array

In [18]:
def fit(batch, gamma):
    states, actions, rewards, next_states, done = np.hsplit(batch, batch.shape[1])
    
    states = stack(states)
    next_states = stack(next_states)
    actions = stack(actions) # TODO: Needs one hot encoding
    
    next_Q_values = model.predict(np.stack(next_states))
    Q_values = rewards + gamma * np.max(next_Q_values, axis=1)
    
    model.fit(
        states, 
        actions * Q_values[:, None],
        epochs=1, 
        batch_size=len(states), 
        verbose=0
    )

#### Main loop

In [17]:
import gym
env = gym.make("BreakoutDeterministic-v4")

state_shape = (IMG_WIDTH, IMG_HEIGHT, 4)
n_actions = env.action_space.n
model = atari_model(state_shape, n_actions)
done = False

for episode in range(n_episodes):
    state = env.reset()
    memory = Memory()
    current_state, _, _, _ = env.step(env.action_space.sample())
    current_state = preprocess_image(current_state)
    
    # Fill memory
    for t in range(32):
        action = env.action_space.sample()
        next_state_raw, reward, done, _ = env.step(action)
        next_state = preprocess_image(next_state_raw)
        memory.add(current_state, action, reward, next_state, done)
        current_state = next_state

    while not done:
        frame = preprocess_image(env.render(mode='rgb_array'))
        #show_state(frame, env.spec.id, t)

        if random.random() < epsilon:
            action = env.action_space.sample() 
        else:
            # TODO: Select optimal actions
            action = env.action_space.sample()

        next_state_raw, reward, done, info = env.step(action)
        next_state = preprocess_image(next_state_raw)
        memory.add(current_state, action, reward, next_state, done)
        current_state = next_state
        
        batch = memory.sample(32)
        fit(batch, gamma)
            
    print("Finished")
    

[[3]
 [0]
 [2]
 [3]
 [1]
 [3]
 [2]
 [2]
 [2]
 [1]
 [3]
 [0]
 [2]
 [2]
 [1]
 [1]
 [2]
 [1]
 [3]
 [3]
 [0]
 [2]
 [2]
 [1]
 [1]
 [2]
 [0]
 [2]
 [2]
 [3]
 [2]
 [3]]
(32,)


ValueError: Error when checking target: expected dense_12 to have 2 dimensions, but got array with shape (32, 1, 32)