In [1]:
IMG_HEIGHT=100
IMG_WIDTH=100

epsilon = 0.4
gamma = 0.99
n_episodes = 1

#### Display related

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

def show_state(observation, env_id, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(observation, cmap='gray')
    plt.title("%s | Step: %d %s" % (env_id, step, info))
    plt.axis('off')
    
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.close()

#### Preprocessing

In [3]:
import cv2
import numpy as np

def downsize(img_arry):
    return cv2.resize(img_arry, dsize=(IMG_WIDTH, IMG_HEIGHT), interpolation=cv2.INTER_CUBIC)

def rgb2gray(img_arr):
    return np.dot(img_arr[...,:3], [0.299, 0.587, 0.114])

def normalize(img_arr):
    return np.divide(img_arr, 255.0)

def preprocess_image(img_arr):
    downsized = downsize(img_arr)
    gray = rgb2gray(downsized)
    normalized = normalize(gray)
    extra_dim = normalized[..., np.newaxis]
    for i in range(2):
        extra_dim = np.append(extra_dim, extra_dim, axis=2)
    return np.stack(extra_dim)
    

#### Memory

In [4]:
import random

class Memory:

    def __init__(self):
        self.memory = np.zeros(shape=(0, 5))

    def add(self, state, action, reward, next_state, done):
        new_row = np.array([[state, action, reward, next_state, done]])
        self.memory = np.append(self.memory, new_row, axis=0)

    def sample(self, batch_size):
        return self.memory[np.random.choice(self.memory.shape[0], batch_size, replace=False)]

#### Model

In [5]:
from keras.layers.core import Flatten
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.models import Sequential

def atari_model(state_shape, n_actions):
    model = Sequential()
    
    model.add(Conv2D(16,
        kernel_size=(4, 4),
        strides=(2, 2),
        activation="relu",
        input_shape=state_shape))
    
    model.add(Conv2D(32,
        kernel_size=(4, 4),
        strides=(2, 2),
        activation="relu"))
    
    model.add(Flatten())
    model.add(Dense(512, activation="relu"))
    model.add(Dense(n_actions))
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

Using TensorFlow backend.


#### Train model

In [6]:
import numpy as np

def stack(array):
    stacked_array = np.stack(array[0])
    for i in range(1, array.shape[0]):
        stacked_array = np.append(stacked_array, np.stack(array[i]), axis=0)
    return stacked_array

def one_hot_encode(size, values):
    vector = np.array(values, dtype=int).reshape(-1)
    return np.eye(size, dtype=int)[vector]

In [16]:
def predict(state):
    input_state = np.stack([state])
    print(input_state.shape)
    Q_values = model.predict(input_state)[0]
    Q_values = stack(Q_values)
    print(Q_values.shape)
    action = np.where(Q_values == np.amax(Q_values))
    print(action)
    return action

In [8]:
def fit(batch, gamma):
    states, actions, rewards, next_states, done = np.hsplit(batch, batch.shape[1])
    
    states = stack(states)
    next_states = stack(next_states)
    actions = one_hot_encode(4, actions) # TODO: Needs one hot encoding
    
    next_Q_values = model.predict(next_states)
    Q_values = rewards + gamma * np.max(next_Q_values, axis=0)
    
    print((actions * Q_values).shape)
    
    model.fit(
        states, 
        actions * Q_values,
        epochs=1, 
        batch_size=len(states), 
        verbose=1
    )

#### Main loop

In [17]:
import gym
env = gym.make("BreakoutDeterministic-v4")

state_shape = (IMG_WIDTH, IMG_HEIGHT, 4)
n_actions = env.action_space.n
model = atari_model(state_shape, n_actions)
done = False

for episode in range(n_episodes):
    state = env.reset()
    memory = Memory()
    
    # Fill memory
    for t in range(32):
        state = preprocess_image(env.render(mode='rgb_array'))
        action = env.action_space.sample()
        next_state_raw, reward, done, _ = env.step(action)
        next_state = preprocess_image(next_state_raw)
        memory.add(state, action, reward, next_state, done)

    while not done:
        state = preprocess_image(env.render(mode='rgb_array'))
        #show_state(frame, env.spec.id, t)

        if random.random() < epsilon:
            action = env.action_space.sample() 
        else:
            action = predict(state)
            # TODO: Select optimal actions
            #action = env.action_space.sample()

        next_state_raw, reward, done, info = env.step(action)
        next_state = preprocess_image(next_state_raw)
        memory.add(state, action, reward, next_state, done)
        
        batch = memory.sample(32)
        fit(batch, gamma)
            
    print("Finished")
    

(1, 100, 100, 4)


TypeError: 'numpy.float32' object is not iterable