In [2]:
import numpy as np
import gymnasium as gym
import ale_py
import cv2
import time
import matplotlib.pyplot as plt
from IPython.display import HTML, display, clear_output
from matplotlib import animation
from collections import deque, defaultdict
import pickle
from tqdm import tqdm
import pandas as pd
from random import random
import os
from datetime import datetime
import warnings
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Input
from tensorflow.keras.optimizers import Adam
warnings.filterwarnings("ignore")

# To plot pretty figures
#%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

  File "/usr/local/lib/python3.10/dist-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.10/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.10/dist-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/usr/local/lib/python3.10/dist-packages/ale_py/__init__.py", line 68, in <module>
    register_v0_v4_envs()
  File "/usr/local/lib/python3.10/dist-packages/ale_py/registration.py", line 178, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/usr/local/lib/python3.10/dist-packages/ale_py/registration.py", line 63, in _register_rom_configs
    gymnasium.register(
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")


In [13]:
# Device string used by the `tf.device(...)` blocks below: the first GPU when
# TensorFlow reports at least one, otherwise the CPU.
if tf.config.experimental.list_physical_devices('GPU'):
    device = '/GPU:0'
else:
    device = '/CPU:0'

def resize_frame(frame):
    """Turn a raw frame into an 84x84 8-bit grayscale image.

    The channels on axis 2 are averaged (grayscale), the result is resized to
    84x84 with nearest-neighbour interpolation, and the values are cast to
    uint8. The whole frame is used; nothing is cropped.
    """
    grayscale = np.average(frame, axis=2)
    shrunk = cv2.resize(grayscale, (84, 84), interpolation=cv2.INTER_NEAREST)
    return np.array(shrunk, dtype=np.uint8)


class Memory():
    """Bounded replay memory kept as four parallel deques.

    Entry ``i`` of ``frames``, ``actions``, ``rewards`` and ``done_flags``
    together describe one recorded step. Every deque is created with the same
    ``maxlen``, so once ``max_len`` entries are stored the oldest entry of each
    deque is dropped when a new one is appended.
    """

    def __init__(self,max_len):
        self.max_len = max_len
        self.frames = deque(maxlen = max_len)
        self.actions = deque(maxlen = max_len)
        self.rewards = deque(maxlen = max_len)
        self.done_flags = deque(maxlen = max_len)

    def add_experience(self,next_frame, next_frames_reward, next_action, next_frame_terminal, next_lives=None):
        """Append one step to the memory.

        ``next_lives`` is accepted for callers that track the number of lives
        but is not stored. It defaults to None so that callers which pass only
        the four stored values keep working instead of raising TypeError.
        """
        self.frames.append(next_frame)
        self.actions.append(next_action)
        self.rewards.append(next_frames_reward)
        self.done_flags.append(next_frame_terminal)


def initialize_new_game(name, env, agent):
    """Reset the environment and seed the agent's memory for a new game.

    Three copies of the first frame (with a dummy action, reward and done flag)
    are appended to the memory. take_step() builds its state from the last
    three stored frames and replays the last stored action, so these entries
    keep the previous game's frames out of the new game's first state.

    `name` is accepted for symmetry with the other helpers and is not used.
    """
    env.reset()
    first_step = env.step(0)
    starting_frame = resize_frame(first_step[0])

    # The info mapping is the last element of the step tuple; use its 'lives'
    # entry (when present) so add_experience receives all five arguments.
    info = first_step[-1]
    starting_lives = info.get('lives') if isinstance(info, dict) else None

    dummy_action = 0
    dummy_reward = 0
    dummy_done = False
    for _ in range(3):
        agent.memory.add_experience(starting_frame, dummy_reward, dummy_action, dummy_done, starting_lives)


def make_env(name, agent):
    """Create the environment `name` with 'rgb_array' rendering.

    The ale_py environments are registered with gymnasium first. `agent` is
    accepted but not used.
    """
    gym.register_envs(ale_py)
    return gym.make(name, render_mode='rgb_array')


def take_step(name, env, agent, score, lives, debug):
    """Advance the environment by one step and let the agent record and learn.

    The action executed is the last action stored in the agent's memory. The
    resulting frame, reward, freshly chosen next action and done flag are then
    appended to the memory.

    Parameters:
        name:  unused here.
        env:   environment whose step() returns
               (frame, reward, terminated, truncated, info) with info['lives'].
        agent: object providing memory, model, get_action(), learn(),
               total_timesteps and starting_mem_len.
        score: score accumulated so far in this episode.
        lives: number of lives before this step (None disables the
               life-loss check).
        debug: when true, the rendered image is appended to the global
               `frames` list and the flag is forwarded to agent.learn().

    Returns:
        (updated score, game_over, lives after this step).
    """
    global frames

    #1 and 2: Update timesteps and save weights
    agent.total_timesteps += 1
    if agent.total_timesteps % 50000 == 0:
        agent.model.save_weights('recent.weights.h5')
        print('\nWeights saved!')

    #3: Take action
    next_frame, next_frames_reward, terminated, truncated, info = env.step(agent.memory.actions[-1])
    # A truncated episode has to be reset just like a terminated one, so both end the game.
    game_over = bool(terminated or truncated)

    #4: Get next state: the three most recent stored frames plus the new one,
    #   scaled to [0, 1] and shaped [batch_size, rows, columns, channels] for Keras
    next_frame = resize_frame(next_frame)
    new_state = [agent.memory.frames[-3], agent.memory.frames[-2], agent.memory.frames[-1], next_frame]
    new_state = np.moveaxis(new_state,0,2)/255
    new_state = np.expand_dims(new_state,0)

    # Each life is treated as a separate game: play_episode() stops (and the
    # environment is reset) as soon as the number of lives changes. The step on
    # which a life is lost is therefore stored as terminal, so that learning
    # does not bootstrap from a frame that belongs to the next game.
    new_lives = info['lives']
    max_lives = 4
    life_lost = lives is not None and new_lives < lives

    #5: Get next action, using next state
    next_action = agent.get_action(new_state, new_lives / max_lives)

    #6: Now we add the next experience to memory
    agent.memory.add_experience(next_frame, next_frames_reward, next_action, game_over or life_lost, new_lives)

    #7: If game is over, return the score
    if game_over:
        return (score + next_frames_reward),True, new_lives

    #8: If we are trying to debug this then render
    if debug:
        frames.append(env.render())

    #9: If the threshold memory is satisfied, make the agent learn from memory
    if len(agent.memory.frames) > agent.starting_mem_len:
        agent.learn(debug)

    return (score + next_frames_reward),False, new_lives


def play_episode(name, env, agent, debug = False):
    """Play until the game ends or the number of lives changes; return the score.

    A new game is initialised first. Steps are then taken one at a time, and
    the loop stops after the first step that reports the game as done or
    returns a lives count different from the starting value of 4.
    """
    initialize_new_game(name, env, agent)
    starting_lives = 4
    score, lives = 0, starting_lives
    finished = False
    while not finished and lives == starting_lives:
        score, finished, lives = take_step(name, env, agent, score, lives, debug)
    return score


class Agent():
    """Deep Q-learning agent with a replay memory and a target network.

    Attributes set in __init__:
        memory:           Memory holding frames, actions, rewards and done flags.
        possible_actions: list of the action values the agent may return.
        epsilon:          current probability of choosing a random action.
        epsilon_decay:    amount subtracted from epsilon after each learn() call.
        epsilon_min:      value at which the decay stops.
        gamma:            discount factor applied to the next state's best value.
        model:            network that is trained and used to pick actions.
        model_target:     copy of `model` used to evaluate next states; it is
                          synchronised with `model` every 10000 learn() calls.
        total_timesteps:  counter that callers increment per environment step.
        starting_mem_len: memory size callers wait for before calling learn().
        learns:           number of completed learn() calls.
    """

    def __init__(self,possible_actions,starting_mem_len,max_mem_len,starting_epsilon,learn_rate, starting_lives = 5, debug = False):
        self.memory = Memory(max_mem_len)
        # learn() uses list.index() on this, so keep it as a list.
        self.possible_actions = list(possible_actions)
        self.epsilon = starting_epsilon
        self.epsilon_decay = .9/10000
        self.epsilon_min = .05
        self.gamma = .95
        self.learn_rate = learn_rate
        self.model = self._build_model()
        # clone_model() re-creates the layers with newly initialised weights,
        # so copy the online weights to make both networks start identical.
        self.model_target = clone_model(self.model)
        self.model_target.set_weights(self.model.get_weights())
        self.total_timesteps = 0
        self.lives = starting_lives
        self.starting_mem_len = starting_mem_len
        self.learns = 0


    def _build_model(self):
        """Build and compile the Q-network.

        Input is an 84x84 image with 4 channels (four stacked frames); the
        output layer has one linear unit per possible action. The model is
        compiled with Adam and a Huber loss, and its summary is printed.
        """
        model = Sequential()
        model.add(Input((84,84,4)))
        model.add(Conv2D(filters = 32,kernel_size = (8,8),strides = 4,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters = 64,kernel_size = (4,4),strides = 2,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters = 64,kernel_size = (3,3),strides = 1,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Flatten())
        model.add(Dense(512,activation = 'relu', kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Dense(len(self.possible_actions), activation = 'linear'))
        optimizer = Adam(self.learn_rate)
        model.compile(optimizer, loss=tf.keras.losses.Huber())
        model.summary()
        print('\nAgent Initialized\n')
        return model

    def get_action(self,state, normalized_lives):
        """Choose an action epsilon-greedily.

        With probability `epsilon` a random possible action is returned;
        otherwise the action whose predicted value for `state` is highest.
        `state` must already carry a batch dimension. `normalized_lives` is
        accepted but not used.
        """
        # Explore
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.possible_actions)

        # Exploit: take the action with the highest predicted value
        with tf.device(device):
            a_index = np.argmax(self.model.predict(state, verbose=0))
        return self.possible_actions[a_index]

    def _index_valid(self,index):
        """Return True when none of the four entries ending at `index` is flagged done."""
        return not any(self.memory.done_flags[index - offset] for offset in range(4))

    def learn(self,debug = False):
        """Train the model on one batch of 32 transitions sampled from memory.

        For every sampled transition the label for the action that was taken is
        set to R_(t+1) + gamma * Qmax_(t+1), where Qmax comes from the target
        model and is dropped when the next entry is flagged done. The labels of
        all other actions keep the model's own predictions, so only the taken
        action contributes to the loss. `debug` is accepted but not used.
        """
        batch_size = 32
        states = []
        next_states = []
        actions_taken = []
        next_rewards = []
        next_done_flags = []

        # Sample random indices until the batch is full, skipping windows that
        # contain an entry flagged done.
        while len(states) < batch_size:
            index = np.random.randint(4,len(self.memory.frames) - 1)
            if not self._index_valid(index):
                continue
            # Five consecutive frames: the first four form the state, the last four the next state.
            window = [self.memory.frames[index + offset] for offset in range(-3, 2)]
            states.append(np.stack(window[:4], axis=-1)/255)
            next_states.append(np.stack(window[1:], axis=-1)/255)
            actions_taken.append(self.memory.actions[index])
            next_rewards.append(self.memory.rewards[index+1])
            next_done_flags.append(self.memory.done_flags[index+1])

        # Keras treats a Python list as "one array per model input", so the
        # samples are stacked into a single batch array before predict()/fit().
        states = np.array(states)
        next_states = np.array(next_states)

        # Outputs of the model and of the target model, needed to build the labels
        with tf.device(device):
            labels = self.model.predict(states, verbose=0)
            next_state_values = self.model_target.predict(next_states, verbose=0)

        # Overwrite the label of the action that was actually taken
        for i in range(batch_size):
            action = self.possible_actions.index(actions_taken[i])
            labels[i][action] = next_rewards[i] + (not next_done_flags[i]) * self.gamma * np.max(next_state_values[i])

        # Train the model on the sampled states and the labels built above
        with tf.device(device):
            self.model.fit(states, labels, batch_size = batch_size, epochs = 1, verbose = 0)

        # Decay epsilon without stepping below epsilon_min. An epsilon that was
        # set below the minimum on purpose is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        self.learns += 1

        # Every 10000 learn() calls, copy the model weights to the target model
        if self.learns % 10000 == 0:
            self.model_target.set_weights(self.model.get_weights())
            print('\nTarget model updated')

def update_scene(num, frames, patch):
    """Animation callback: display frame number `num` on `patch`.

    Returns a one-element tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays `frames` one after another.

    The first frame is drawn without axes, and update_scene() swaps in each
    following frame. `repeat` and `interval` are forwarded to FuncAnimation.
    The figure is closed before returning, and the animation object is
    returned to the caller.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    anim = animation.FuncAnimation(figure, update_scene, **animation_options)
    plt.close()
    return anim


In [14]:
# Id of the environment to train on.
name = 'ALE/Pacman-v5'

# Agent hyper-parameters, then the environment the agent will play in.
agent = Agent(
    possible_actions=[0, 1, 2, 3, 4],
    starting_mem_len=50000,
    max_mem_len=750000,
    starting_epsilon=1,
    learn_rate=.0005,
)
env = make_env(name, agent)

# Bookkeeping for the training loop: best score so far, the score history used
# for the running average, and the list of averages that gets plotted.
# NOTE(review): -21 is used as the "worst possible score" baseline; that looks
# like a value carried over from another game. Confirm it suits Pacman.
max_score = -21
scores = deque(maxlen=100)
last_100_avg = [max_score]


Agent Initialized



In [None]:
env.reset()

# Resume from the checkpoint written by take_step() and by the end of this
# cell. The file name has to match the one passed to save_weights(), otherwise
# a previous run is never picked up.
weights_path = 'recent.weights.h5'
if os.path.exists(weights_path):
    agent.model.load_weights(weights_path)
    agent.model_target.load_weights(weights_path)
    print('\nWeights loaded!')
else:
    print('\nNo weights found')
# only when previously trained!
#agent.epsilon = 0 # Set the epsilon at the value you had when you stopped training


for i in tqdm(range(1000)):
    frames = [] # Frames of this episode; take_step() appends to this global when debug is on
    timesteps = agent.total_timesteps
    timee = time.time()
    score = play_episode(name, env, agent, debug = True) #set debug to true for rendering
    scores.append(score)
    max_score = max(max_score, score)

    # Progress report every 50 episodes
    if i%50==0:
        print('\nEpisode: ' + str(i))
        print('Steps: ' + str(agent.total_timesteps - timesteps))
        print('Duration: ' + str(time.time() - timee))
        print('Score: ' + str(score))
        print('Max Score: ' + str(max_score))
        print('Epsilon: ' + str(agent.epsilon))

        # Mean of the first weight tensor, printed as a quick check that training changes the weights
        print('Avg Weights: ' + str(np.mean(agent.model.get_weights()[0])))

    # Save the rendered episode as a gif every 50 episodes
    if i%50==0 and i!=0:
        anim = plot_animation(frames)
        anim.save("pacman{}.gif".format(i), dpi=100, writer= animation.PillowWriter(fps=20))

    # Every 100 episodes, record and plot the average of the last (up to) 100 scores
    if i%100==0 and i!=0:
        last_100_avg.append(sum(scores)/len(scores))
        plt.plot(np.arange(0,i+1,100),last_100_avg)
        plt.xlabel('Episode')
        plt.ylabel('Average score (last 100 episodes)')
        plt.show()


agent.model.save_weights(weights_path)
agent.model_target.save_weights('recent_target.weights.h5')
print('\nWeights saved!')


No weights found


  0%|          | 1/1000 [00:00<16:25,  1.01it/s]


Episode: 0
Steps: 505
Duration: 0.9780395030975342
Score: 3.0
Max Score: 3.0
Epsilon: 1
Avg Weights: 0.00022293274


  4%|▍         | 39/1000 [00:33<12:17,  1.30it/s]