In [1]:
pip install ale-py gym gym[Atari] tensorflow matplotlib pyglet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym
import random
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
from collections import deque
from itertools import islice

In [3]:
# Create the Space Invaders environment; render_mode='human' is passed through to gym.make.
env = gym.make('ALE/SpaceInvaders-v5', render_mode='human')
# Reset to an initial state. As the last expression in the cell, the returned
# value (a uint8 image array, per the output below) is displayed.
env.reset()

array([[[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       ...,

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]]], dtype=uint8)

In [4]:
# Shape of the environment's observation space.
state_shape = env.observation_space.shape
# Number of discrete actions (the `n` attribute of the action space).
action_shape = env.action_space.n

In [5]:
# Number of transitions sampled from replay memory per training call (also the fit batch size).
BATCH_SIZE = 64
# Training is skipped until replay memory holds more than this many transitions.
REPLAY_SIZE = 1000
# Number of episodes the training loop runs.
EPISODES = 300
# Threshold compared against the target-model update counter in the training loop.
TARGET_MODEL_UPDATE = 100
# Maximum number of transitions kept in the replay deque (its maxlen).
REPLAY_MEMORY = 100_000

# CNN model params
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 0.001
# Kernel size for every Conv2D layer.
KERNEL_SIZE = 3
# Pool size for every AveragePooling2D layer.
POOL_SIZE = 2

# Q-learning params
# Step size (alpha) intended for the Q-value update.
Q_LEARNING_RATE = 0.5
# Discount factor (gamma) applied to the best next-state Q-value.
DISCOUNT_FACTOR = 0.5

In [6]:
# Model - Create a convolutional neural network with Keras

def network(state_shape, action_shape):
    """Build and compile the convolutional Q-network.

    The network takes a single preprocessed frame of shape (85, 80, 1) and
    outputs one Q-value estimate per action.

    Args:
        state_shape: Accepted for interface compatibility but not used; the
            input shape is hard-coded to (85, 80, 1), the shape returned by
            ``preprocess``.
        action_shape: Number of output units (one per discrete action).

    Returns:
        A compiled ``keras.Sequential`` model using Huber loss and Adam with
        ``LEARNING_RATE``.
    """
    initializer = keras.initializers.HeUniform()
    model = keras.Sequential()

    # Input layer
    model.add(keras.layers.Conv2D(32, kernel_size=KERNEL_SIZE, input_shape=(85, 80, 1), activation='relu',
        padding='same', kernel_initializer=initializer))
    model.add(keras.layers.AveragePooling2D(pool_size=POOL_SIZE))

    # Hidden convolutional layers
    model.add(keras.layers.Conv2D(64, kernel_size=KERNEL_SIZE, activation='relu', padding='same',
        kernel_initializer=initializer))
    model.add(keras.layers.AveragePooling2D(pool_size=POOL_SIZE))
    model.add(keras.layers.Conv2D(64, kernel_size=KERNEL_SIZE, activation='relu', padding='same',
        kernel_initializer=initializer))
    model.add(keras.layers.AveragePooling2D(pool_size=POOL_SIZE))

    # Flatten and use fully connected network
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(1024, activation='relu', kernel_initializer=initializer))

    # Output layer: Q-values are unbounded regression targets
    # (reward + gamma * max Q), so the head must be linear. A softmax would
    # squash the outputs into [0, 1] summing to 1 and make the Bellman targets
    # unreachable.
    model.add(keras.layers.Dense(action_shape, activation='linear', kernel_initializer=initializer))

    # NOTE: the 'accuracy' metric is kept for backward compatibility of the
    # training logs, but it is not meaningful for this regression objective.
    model.compile(loss=keras.losses.Huber(), optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE), metrics=['accuracy'])

    return model

In [7]:
# Online network: used for action selection and updated by train().
model = network(state_shape, action_shape)
# Target network: same architecture, used to evaluate successor states in train().
target_model = network(state_shape, action_shape)
# Print the layer-by-layer summary of the online network.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 85, 80, 32)        320       
                                                                 
 average_pooling2d (AverageP  (None, 42, 40, 32)       0         
 ooling2D)                                                       
                                                                 
 conv2d_1 (Conv2D)           (None, 42, 40, 64)        18496     
                                                                 
 average_pooling2d_1 (Averag  (None, 21, 20, 64)       0         
 ePooling2D)                                                     
                                                                 
 conv2d_2 (Conv2D)           (None, 21, 20, 64)        36928     
                                                                 
 average_pooling2d_2 (Averag  (None, 10, 10, 64)       0

In [8]:
#################################################################################
# Preprocessing

def preprocess(obs, normalize=False):
    """Convert a raw RGB frame into a binarised single-channel image.

    The frame is cropped to rows 25..194 (dropping the score and floor),
    downsampled by taking every second row and column, converted to greyscale
    by averaging the colour channels, and binarised so that every non-black
    pixel becomes 1.

    Args:
        obs: Image array indexed as (row, column, channel). It must be large
            enough that the crop and downsample yield 85 x 80 pixels (for
            example a 210 x 160 x 3 frame).
        normalize: When True, rescale the binary image from {0, 1} to
            {-1, +1}.

    Returns:
        A float array of shape (85, 80, 1).
    """
    # Crop out score and floor
    img = obs[25:195]

    # Downsize by keeping every second row and column
    img = img[::2, ::2]

    # Take greyscale; mean() returns a new float array, so `obs` is not modified
    img = img.mean(axis=2)

    # Binarise: any non-black pixel becomes 1
    img[img != 0] = 1

    if normalize:
        # The image is already in {0, 1} here, so map it to {-1, +1}.
        # (The previous formula assumed a 0..255 range and subtracted an extra
        # 1, yielding values near -2.)
        img = img * 2.0 - 1.0

    # Add a trailing channel axis: (height, width, 1)
    return img.reshape(85, 80, 1)

In [9]:
# Training agent

def train(env, replay_memory, model, target_model):
    """Run one fitting step of the online network on a sampled minibatch.

    Does nothing until ``replay_memory`` holds more than ``REPLAY_SIZE``
    transitions. Otherwise samples ``BATCH_SIZE`` transitions, builds Q-value
    targets using ``target_model`` for the successor states, and fits
    ``model`` on them.

    Args:
        env: Unused; kept so existing callers do not need to change.
        replay_memory: Sequence of ``[state, action, reward, new_state, done]``
            transitions, with states already preprocessed.
        model: Online network; predicts current Q-values and is fitted.
        target_model: Network used to evaluate successor states.

    Returns:
        None.
    """
    # Wait until enough experience has been collected.
    if len(replay_memory) <= REPLAY_SIZE:
        return

    batch = random.sample(replay_memory, BATCH_SIZE)
    states = np.array([step[0] for step in batch])
    # verbose=0 avoids printing a progress bar on every training call.
    q_values = model.predict(states, verbose=0)
    succesive_states = np.array([step[3] for step in batch])
    succesive_q_values = target_model.predict(succesive_states, verbose=0)

    X_train = []
    Y_train = []

    for i, (state, action, reward, new_state, done) in enumerate(batch):
        if not done:
            # Bellman equation: r(s) + gamma * max_a'(Q(s', a'))
            target = reward + DISCOUNT_FACTOR * np.max(succesive_q_values[i])
        else:
            # Episode ended: there is no successor state, so the target is the reward.
            target = reward

        # Current Q-value estimates for every action in this state.
        q_value_arr = q_values[i]
        # Temporal-difference update for the action taken:
        #   Q(s, a) <- Q(s, a) + alpha * (target - Q(s, a))
        # alpha is the Q-learning step size, not the optimizer's learning rate.
        q_value_arr[action] += Q_LEARNING_RATE * (target - q_value_arr[action])

        X_train.append(state)
        Y_train.append(q_value_arr)

    model.fit(np.array(X_train), np.array(Y_train), batch_size=BATCH_SIZE)

In [None]:
# Deep Q-Learning agent

# Exploration probability and its per-episode exponential decay rate.
epsilon = 1
decay = 0.01

# Start with the target network identical to the online network.
target_model.set_weights(model.get_weights())

# Memory buffer to store the last N experiences
replay_memory = deque(maxlen=REPLAY_MEMORY)

# Steps since the target network was last synchronised, and total steps taken.
update_target_counter = 0
step_counter = 0

for episode in range(EPISODES):
    state = env.reset()
    score = 0
    done = False

    while not done:
        step_counter += 1
        # Count every environment step; without this increment the target
        # network would never be refreshed after the initial copy.
        update_target_counter += 1

        # Preprocess once and reuse for both action selection and replay storage.
        preprocessed_state = preprocess(state)

        # Epsilon-greedy strategy with explore probability epsilon
        if np.random.rand() <= epsilon:
            # Explore
            action = env.action_space.sample()
        else:
            # Exploit the best action according to the online network
            predictions = model.predict(np.array([preprocessed_state,]), verbose=0).flatten()
            action = np.argmax(predictions)

        new_state, reward, done, info = env.step(action)
        replay_memory.append([preprocessed_state, action, reward, preprocess(new_state), done])

        # Train every fourth step, and always on the final step of an episode.
        if step_counter % 4 == 0 or done:
            train(env, replay_memory, model, target_model)

        score += reward
        state = new_state

        # Periodically copy the online weights into the target network.
        if update_target_counter >= TARGET_MODEL_UPDATE:
            update_target_counter = 0
            target_model.set_weights(model.get_weights())

        if done:
            print('Score: {} after epsidoe = {} and final reward = {}'.format(score, episode, reward))

    # Exponential decay for epsilon (explore with at least 0.01, i.e. 1%, probability)
    epsilon = 0.01 + 0.99 * np.exp(-decay*episode)

In [None]:
# Close the environment once training has finished.
env.close()

In [None]:
# Scratch check: value of the epsilon schedule formula at episode 150 with decay = 0.01.
0.01 + 0.99 * np.exp(-0.01*150)