source: https://towardsdatascience.com/getting-an-ai-to-play-atari-pong-with-deep-reinforcement-learning-47b0c56e78ae

STRATEGY:
1. Take a tensor of pixel values from the 4 most recent frames to be the __current state__.
2. Using the epsilon-greedy strategy, take a random action or input __current state__ into the __CNN__ to get the next action.
3. Perform the action, receive a reward, and arrive at the next state. Store values in memory for training.
4. After each action, randomly sample data from the agent's memory and train the agent against a __loss__ function.

PREPROCESS FRAMES

In [1]:
import cv2
import numpy as np


def resize_frame(frame):
    """Convert a raw game frame into an 84x84 single-channel uint8 image.

    The frame is indexed as (rows, columns, channels): the first two axes
    are cropped and the third one is averaged away.
    """
    # Crop: drop 30 rows at the top, 12 at the bottom, 5 columns on the
    # left and 4 on the right.
    cropped = frame[30:-12, 5:-4]
    # Grayscale: unweighted mean over the channel axis.
    gray = np.mean(cropped, axis=2)
    # Nearest-neighbour interpolation copies the closest source pixel
    # instead of blending neighbours.
    shrunk = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_NEAREST)
    # Store as unsigned 8-bit integers.
    return shrunk.astype(np.uint8)


MEMORY

In [2]:
from collections import deque

# "deque" stands for double-ended queue: a sequence with fast appends and pops at both ends. With maxlen set, the oldest item is dropped automatically once the deque is full.
# In this Memory class, we have 4 separate deques that contain frames, actions, rewards, and done-flags.
class Memory():
    """Bounded replay memory made of four parallel deques.

    Entry i of `frames`, `actions`, `rewards` and `done_flags` all belong
    to the same experience. Every deque is capped at `max_len`, so the
    oldest experience falls out once the memory is full.
    """

    def __init__(self, max_len):
        self.max_len = max_len
        # One independent deque per field, all sharing the same capacity.
        self.frames, self.actions, self.rewards, self.done_flags = (
            deque(maxlen=max_len) for _ in range(4))

    def add_experience(self, next_frame, next_frames_reward, next_action, next_frame_terminal):
        """Append one experience, keeping the four deques aligned."""
        pairs = (
            (self.frames, next_frame),
            (self.actions, next_action),
            (self.rewards, next_frames_reward),
            (self.done_flags, next_frame_terminal),
        )
        for store, value in pairs:
            store.append(value)


ENVIRONMENT

In [3]:
import gym
#import preprocess_frame as ppf
import numpy as np

# This function resets the environment, gets the starting frame, and declares a dummy action + reward.
def initialize_new_game(name, env, agent):
    """Reset the environment and seed the agent's memory for a new episode.

    Three identical placeholder experiences are stored so that the next
    call to take_step() has three previous frames to stack with the frame
    it observes. `name` is accepted but not used here.
    """
    env.reset()
    # Step once with action 0 and keep only the observation (element 0).
    first_frame = resize_frame(env.step(0)[0])  # refer to OG doc

    # Placeholder experience: (frame, reward=0, action=0, done=False).
    placeholder = (first_frame, 0, 0, False)
    for _ in range(3):
        agent.memory.add_experience(*placeholder)

# This function gets the environment from Open AI Gym.
def make_env(name, agent):
    """Create the Gym environment `name` with 'human' render mode.

    `agent` is accepted but not used here.
    """
    return gym.make(name, render_mode='human')  # added render_mode

# This function is where the actual playing of the game occurs.
# The agent performs some action, the weights and scores are documented and updated, we get new frames for the next action, and we add the experience to the memory until the game is over.
def take_step(name, env, agent, score, debug):
    """Advance the game by one environment step and, if possible, train.

    The action stored last in the agent's memory is executed, the resulting
    frame is stacked with the three previous frames to form the new state,
    the agent picks its next action from that state, and the experience is
    stored in memory.

    Args:
        name: Environment name (not used here).
        env: Gym environment; `step()` may return either the 4-tuple
            (obs, reward, done, info) or the 5-tuple
            (obs, reward, terminated, truncated, info).
        agent: Agent providing `memory`, `model`, `get_action()`, `learn()`,
            `total_timesteps` and `starting_mem_len`.
        score: Score accumulated so far in this episode.
        debug: If true, render the environment and pass the flag to learn().

    Returns:
        Tuple (score + reward of this step, True if the episode ended).
    """
    # 1 and 2: Update timesteps and save weights
    agent.total_timesteps += 1
    if agent.total_timesteps % 50000 == 0:
        agent.model.save_weights('recent_weights.hdf5')
        print('\nWeights saved!')

    # 3: Take action
    # Gym releases that return five values split the old done flag into
    # "terminated" and "truncated"; either one ends the episode.
    step_result = env.step(agent.memory.actions[-1])
    if len(step_result) == 5:
        next_frame, next_frames_reward, terminated, truncated, _info = step_result
        next_frame_terminal = terminated or truncated
    else:
        next_frame, next_frames_reward, next_frame_terminal, _info = step_result

    # 4: Get next state
    next_frame = resize_frame(next_frame)  # refer to OG doc
    new_state = [agent.memory.frames[-3], agent.memory.frames[-2],
                 agent.memory.frames[-1], next_frame]
    # Keras expects [batch_size, rows, columns, channels]: move the frame
    # axis to the end, scale pixels to [0, 1] and add a batch axis.
    new_state = np.moveaxis(new_state, 0, 2)/255
    new_state = np.expand_dims(new_state, 0)

    # 5: Get next action, using next state
    next_action = agent.get_action(new_state)

    # 6: Store the experience (also for the terminal frame)
    agent.memory.add_experience(
        next_frame, next_frames_reward, next_action, next_frame_terminal)
    new_score = score + next_frames_reward

    # 7: If game is over, return the score without rendering or learning
    if next_frame_terminal:
        return new_score, True

    # 8: If we are trying to debug this then render
    if debug:
        env.render()

    # 9: If the threshold memory is satisfied, make the agent learn from memory
    if len(agent.memory.frames) > agent.starting_mem_len:
        agent.learn(debug)

    return new_score, False

# This function calls the take_step() function until an episode is completed.
def play_episode(name, env, agent, debug=False):
    """Play one full episode and return its final score.

    The game is initialised, then take_step() is called repeatedly, feeding
    the running score back in, until it reports that the episode is over.
    """
    initialize_new_game(name, env, agent)
    total, finished = 0, False
    while not finished:
        total, finished = take_step(name, env, agent, total, debug)
    return total


AGENT CLASS

- In this class, we describe the actions of the agent and create the CNN.
- For this CNN, we are using the __Huber loss__ function which tends to be less sensitive to outliers in data than the mean squared error loss function.
- We are also using __Adam__ for our optimiser.
    - __Stochastic gradient descent (SGD)__ is the usual optimiser that we've used previously.
        - In SGD, we have a single __learning rate (alpha)__ for all weight updates, and the learning rate does NOT change during training.
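    - Adam (_adaptive moment estimation_) instead keeps running estimates of the mean and variance of each weight's gradient and uses them to scale that weight's step size individually, which usually makes training converge faster and need less learning-rate tuning.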

In [4]:
from tensorflow.python.keras.models import Sequential, clone_model
# changed from Input to InputLayer
from tensorflow.python.keras.layers import Dense, Flatten, Conv2D, InputLayer
from tensorflow.python.keras.optimizer_v1 import Adam  # refer to OG doc
import keras.backend as K
import tensorflow as tf
#from agent_memory import Memory
import numpy as np
import random

tf.compat.v1.disable_eager_execution()


class Agent():
    """Epsilon-greedy deep Q-learning agent.

    The agent owns a replay `Memory`, an online CNN (`model`) that is trained
    and used to pick actions, and a target CNN (`model_target`) that is only
    used to compute training targets and is re-synchronised with the online
    CNN every 10000 calls to learn().
    """

    def __init__(self, possible_actions, starting_mem_len, max_mem_len, starting_epsilon, learn_rate, starting_lives=5, debug=False):
        """Create the memory and both networks.

        Args:
            possible_actions: List of environment action ids the agent may use.
            starting_mem_len: Stored on the agent as `starting_mem_len`.
            max_mem_len: Capacity of the replay memory.
            starting_epsilon: Initial probability of taking a random action.
            learn_rate: Learning rate passed to the Adam optimiser.
            starting_lives: Stored as `lives`; does not apply to Pong.
            debug: Accepted but not used here.
        """
        self.memory = Memory(max_mem_len)
        self.possible_actions = possible_actions
        self.epsilon = starting_epsilon
        # Amount subtracted from epsilon on every learn() call.
        self.epsilon_decay = .9/100000
        self.epsilon_min = .05
        # Discount factor applied to the estimated value of the next state.
        self.gamma = .95
        self.learn_rate = learn_rate
        self.model = self._build_model()
        # clone_model() copies the architecture only and creates freshly
        # initialised weights, so copy the online weights over explicitly to
        # start with two identical networks.
        self.model_target = clone_model(self.model)
        self.model_target.set_weights(self.model.get_weights())
        self.total_timesteps = 0
        self.lives = starting_lives  # this parameter does not apply to pong
        self.starting_mem_len = starting_mem_len
        self.learns = 0

    # This is where the CNN gets constructed.
    def _build_model(self):
        """Build and compile the CNN that maps an (84, 84, 4) state to one
        value per possible action."""
        def _scaled_init():
            # A fresh initializer instance per layer, as in the original code.
            return tf.keras.initializers.VarianceScaling(scale=2)

        model = Sequential()
        model.add(InputLayer((84, 84, 4)))  # changed from Input to InputLayer
        # (filters, square kernel size, stride) of the three conv layers.
        for filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
            model.add(Conv2D(filters=filters, kernel_size=(kernel, kernel), strides=stride,
                             data_format="channels_last", activation='relu',
                             kernel_initializer=_scaled_init()))
        model.add(Flatten())
        model.add(Dense(512, activation='relu',
                        kernel_initializer=_scaled_init()))
        # Linear output: one unbounded value per action.
        model.add(Dense(len(self.possible_actions), activation='linear'))
        optimizer = Adam(self.learn_rate)
        model.compile(optimizer, loss=tf.keras.losses.Huber())
        model.summary()
        print('\nAgent Initialized\n')
        return model

    # This is where we implement the epsilon-greedy strategy.
    def get_action(self, state):
        """Return a random action with probability epsilon, otherwise the
        action whose predicted value for `state` is highest."""
        # Explore
        if np.random.rand() < self.epsilon:
            return random.choice(self.possible_actions)

        # Do best action
        a_index = np.argmax(self.model.predict(state))
        return self.possible_actions[a_index]

    # This function makes sure that the frames collected are not from 2 different games
    def _index_valid(self, index):
        """Return True if none of the four frames ending at `index` is
        flagged as the last frame of a game."""
        flags = self.memory.done_flags
        return not any(flags[index - back] for back in (3, 2, 1, 0))

    # Why a separate target network: the training target is itself computed
    # by a CNN from the next state. If the network being trained produced it,
    # the target would shift after every weight update. Using a clone whose
    # weights are only refreshed every so often keeps the targets stable and
    # reduces training noise.
    def learn(self, debug=False, batch_size=32):
        """Train the online network on one random batch from memory.

        For each sampled transition the label for the action that was taken
        is set to R_(t+1) + gamma * Qmax_(t+1) (just R_(t+1) when the next
        frame is terminal); the labels for the other actions keep the
        network's current output, so they contribute no error.

        Args:
            debug: Accepted but not used here.
            batch_size: Number of transitions to sample and train on.
        """
        frames = self.memory.frames
        states, next_states = [], []
        actions_taken, next_rewards, next_done_flags = [], [], []

        # First we need `batch_size` random valid indices. The range leaves
        # room for the three frames before `index` and the one after it.
        while len(states) < batch_size:
            index = np.random.randint(4, len(frames) - 1)
            if not self._index_valid(index):
                continue
            # Five consecutive frames: the first four form the state, the
            # last four the next state.
            window = [frames[index + offset] for offset in range(-3, 2)]
            states.append(np.moveaxis(window[:4], 0, 2)/255)
            next_states.append(np.moveaxis(window[1:], 0, 2)/255)
            actions_taken.append(self.memory.actions[index])
            next_rewards.append(self.memory.rewards[index+1])
            next_done_flags.append(self.memory.done_flags[index+1])

        # Outputs of the online model (base for the labels) and of the
        # target model (value of the next state).
        states = np.array(states)
        labels = self.model.predict(states)
        next_state_values = self.model_target.predict(np.array(next_states))

        # Overwrite the label of the action actually taken with the target.
        for i, taken in enumerate(actions_taken):
            action = self.possible_actions.index(taken)
            future = 0 if next_done_flags[i] else self.gamma * np.max(next_state_values[i])
            labels[i][action] = next_rewards[i] + future

        # Train our model using the states and labels generated
        self.model.fit(states, labels,
                       batch_size=batch_size, epochs=1, verbose=0)

        # Decrease epsilon, but never step below epsilon_min. An epsilon that
        # was deliberately set below the minimum (e.g. 0 for testing) is left
        # untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decay)
        self.learns += 1

        # Every 10000 learns, copy our model weights to our target model
        if self.learns % 10000 == 0:
            self.model_target.set_weights(self.model.get_weights())
            print('\nTarget model updated')


MAIN

In [5]:
import matplotlib.pyplot as plt
import time
from collections import deque
import numpy as np

#name = 'PongDeterministic-v4'
name = 'Pong-v4'
agent = Agent(
    possible_actions=[0, 2, 3],
    starting_mem_len=50000,
    max_mem_len=750000,
    starting_epsilon=1,
    learn_rate=.00025,
)
env = make_env(name, agent)

# Running statistics: the last 100 episode scores, one average per 100
# episodes for plotting (seeded with -21), and the best score seen so far.
scores = deque(maxlen=100)
last_100_avg = [-21]
max_score = -21

# If testing, load the saved weights and switch exploration off:
# agent.model.load_weights('recent_weights.hdf5')
# agent.model_target.load_weights('recent_weights.hdf5')
# agent.epsilon = 0.0

env.reset()

for i in range(10000):  # changed from 1000000 to 10000
    # Remember where this episode started so its length and duration can
    # be reported afterwards.
    timesteps = agent.total_timesteps
    timee = time.time()
    # set debug to true for rendering
    score = play_episode(name, env, agent, debug=False)
    scores.append(score)
    max_score = max(max_score, score)

    print(f'\nEpisode: {i}')
    print(f'Steps: {agent.total_timesteps - timesteps}')
    print(f'Duration: {time.time() - timee}')
    print(f'Score: {score}')
    print(f'Max Score: {max_score}')
    print(f'Epsilon: {agent.epsilon}')

    # Every 100 episodes (except the very first), record the running
    # average and plot one point per 100 episodes.
    if i and i % 100 == 0:
        last_100_avg.append(sum(scores)/len(scores))
        plt.plot(np.arange(0, i+1, 100), last_100_avg)
        plt.show()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3136)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1606144   
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 1539      
Total params: 1,685,667
Trainable params: 1,685,667
Non-trainable params: 0
______________________________________________

KeyboardInterrupt: 