In [145]:
import numpy as np
import gymnasium as gym
!pip install ale_py
import ale_py
import sys
import pylab
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from gymnasium import wrappers
from tqdm import tqdm


[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: C:\Users\v\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip




In [146]:
class DQN:
    def __init__(self, state_size, action_size, load_model=False):
        self.state_size = state_size
        self.action_size = action_size
        
        if load_model: # loading the model will disable training
            self.discount = 0.99
            self.learning_rate = 0.00001
            self.epsilon = 0.2 # exploring at 1 and exploiting at 0
            self.epsilon_decay = 0.99999
            self.epsilon_min = 0.2
        else:
            self.discount = 0.99
            self.learning_rate = 0.001
            self.epsilon = 1.0
            self.epsilon_decay = 0.9999
            self.epsilon_min = 0.1
            
        self.batch_size = 64
        self.train_start = 1000
        
        self.memory = deque(maxlen=2000)
        self.model = self.build_model()
        
        if load_model:
            self.model.load_weights('./training-dqn/pacman.h5')
            
    def build_model(self):
        model = Sequential()
        #model.add(Input(shape=(self.state_size,)))
        model.add(Input(shape=(84, 84, 1)))
        
        #model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
        #model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        
        """model.add(Conv2D(32, (8, 8), strides=(4,4), activation='relu', kernel_initializer='he_uniform'))
        model.add(Conv2D(64, (4, 4), strides=(2,2), activation='relu', kernel_initializer='he_uniform'))
        model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu', kernel_initializer='he_uniform'))
        
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))"""
        
        """model.add(Conv2D(filters = 32,kernel_size = (8,8),strides = 4,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters = 64,kernel_size = (4,4),strides = 2,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters = 64,kernel_size = (3,3),strides = 1,data_format="channels_last", activation = 'relu',kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Flatten())
        model.add(Dense(512,activation = 'relu', kernel_initializer = tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Dense(len(self.action_size), activation = 'linear'))"""
        
        model.add(Conv2D(32, (8, 8), strides=4, activation='relu', kernel_initializer='he_uniform', ))
        model.add(Conv2D(64, (4, 4), strides=2, activation='relu', kernel_initializer='he_uniform', ))
        model.add(Conv2D(64, (3, 3), strides=1, activation='relu', kernel_initializer='he_uniform', ))
        model.add(Flatten())
        model.add(Dense(512, activation='relu', kernel_initializer='he_uniform'))
        
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()
        return model
    
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            with tf.device(device):
                act_values = self.model.predict(state, verbose=0) # get the Q-values for the state
            return np.argmax(act_values[0]) # return the action with the highest Q-value
        
    def append_memory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        # if self.epsilon > self.epsilon_min:
        #     self.epsilon *= self.epsilon_decay
            
    def experience_replay(self): # experience replay
        # if len(self.memory) < self.train_start:
        #     return
        if len(self.memory) < self.batch_size:
            return
        
        #batch_size = min(len(self.memory), self.batch_size)
        batch_size = self.batch_size
        minibatch = random.sample(self.memory, batch_size)
        
        #state = np.zeros((batch_size, self.state_size))
        #next_state = np.zeros((batch_size, self.state_size))
        state = np.zeros((batch_size, 84, 84, 1))
        next_state = np.zeros((batch_size, 84, 84, 1))
        
        action, reward, done = [], [], []
        
        for i in range(batch_size):
            state[i] = minibatch[i][0].reshape((84, 84, 1))
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3].reshape((84, 84, 1))
            done.append(minibatch[i][4])
            
        with tf.device(device):
            target = self.model.predict(state, verbose=0)
            target_next = self.model.predict(next_state, verbose=0)
        
        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount * np.amax(target_next[i])
        with tf.device(device):
            self.model.fit(state, target, batch_size=self.batch_size, epochs=1, verbose=0)
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [147]:
n_episodes = 1200
environment = 'ALE/Pacman-v5'
path_training = './DQN/training-dqn/'
path_training = '/kaggle/working/'
gym.register_envs(ale_py)
device = '/GPU:0' if tf.config.experimental.list_physical_devices('GPU') else '/CPU:0'

In [148]:
class Pacman:
    def __init__(self, mode, view):
        self.env = gym.make(environment, frameskip=1, render_mode=view) # rgb_array or human
        self.env.reset()
        size_state = self.env.observation_space.shape[0]
        size_action = self.env.action_space.n
        
        # network construction
        if mode.lower() == 'test':
            load_model = True
        else:
            load_model = False
            
        #self.agent = DQN(size_state, size_action, load_model)
        self.agent = DQN((84, 84, 1), size_action, load_model)
        
    def train(self, path, statistics, mode):
        if path:
            train_path = path
        else:
            train_path = path_training
            
        if statistics:
            print('Scores will be plotted')
        else:
            print('Scores will not be plotted')
            
        env = self.env
        env = wrappers.AtariPreprocessing(env, frame_skip=4, grayscale_obs=True, screen_size=84, )
        
        agent = self.agent
        
        size_state = self.env.observation_space.shape[0]
        scores, episodes = [], []
        
        for e in tqdm(range(n_episodes)):
            done = False
            score = 0
            state, _ = env.reset()
            #state = np.reshape(state, [1, size_state])
            state = np.expand_dims(state, axis=-1) # 84x84x1 now
            state = np.moveaxis(state, 2, 0)
            
            lives = 4
            while not done:
                dead = False
                while not dead:
                    action = agent.get_action(state)
                    next_state, reward, done, _, info = env.step(action)
                    #next_state = np.reshape(next_state, [1, size_state])
                    next_state = np.expand_dims(next_state, axis=-1)
                    next_state = np.moveaxis(next_state, 2, 0)

                    
                    reward = reward if not dead else -10 # penalize deaths
                    
                    agent.append_memory(state, action, reward, next_state, done)
                    agent.experience_replay()
                    
                    
                    state = next_state
                    score += reward
                    dead = info['lives'] < lives
                    lives = info['lives']
                    #reward = reward if not dead else -10 # penalize deaths
                    
                if done:
                    scores.append(score)
                    episodes.append(e)
                    #agent.update_target_model()
                    
                    if statistics:
                        pylab.plot(episodes, scores, 'b')
                        pylab.savefig(train_path + 'pacman.png')
                    print('episode: {}/{}, score: {}, epsilon: {:.2}'.format(e, episodes, score, agent.epsilon))
         
            if e%50 == 0 and mode.lower() == 'train': # save the model every 50 episodes
                agent.model.save_weights(train_path + 'pacman.weights.h5')
                print('Model saved')

In [149]:
# run the code
pacman = Pacman('train', 'rgb_array')

In [150]:
pacman.train(path=path_training, statistics=True, mode='train')

Scores will be plotted


  0%|          | 0/1200 [00:00<?, ?it/s]

(1, 84, 84)


  0%|          | 0/1200 [00:43<?, ?it/s]


ValueError: Exception encountered when calling Sequential.call().

[1mCannot take the length of shape with unknown rank.[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=<unknown>, dtype=float32)
  • training=False
  • mask=None