In [8]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D, Dropout
from keras.optimizers import Adam

Using TensorFlow backend.


# Agent

In [13]:
class DQN_Agent:
    """Deep Q-learning agent with a replay memory and a separate target network.

    The agent owns two identically structured CNNs: ``model`` (trained on
    every replay step) and ``target_model`` (a copy of ``model`` that is only
    refreshed when ``update_target_model`` is called and is used to compute
    the bootstrap targets in ``replay``).
    """

    def __init__(self, state_size, action_size):
        """Set hyperparameters and build the online and target networks.

        Args:
            state_size: Input shape handed to the first Conv2D layer
                (without the batch dimension).
            action_size: Number of discrete actions, i.e. the width of the
                network's output layer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=5000)  # Replay memory; oldest entries are evicted first

        # Hyperparameters
        self.gamma = 0.9            # Discount rate
        self.epsilon = 1.0          # Exploration rate
        self.epsilon_min = 0.1      # Minimal exploration rate (epsilon-greedy)
        self.epsilon_decay = 0.995  # Decay rate for epsilon
        self.update_rate = 1000     # Number of steps until updating the target network

        # Construct DQN models; the target network starts as an exact copy
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()
        self.model.summary()

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with one linear output per
            action, trained with MSE loss and the Adam optimizer.
        """
        model = Sequential()

        # Conv Layers
        model.add(Conv2D(32, (8, 8), strides=4, padding='same', input_shape=self.state_size))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (4, 4), strides=2, padding='same'))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(Conv2D(128, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Conv2D(128, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(Flatten())

        # FC Layers
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Network input for the current step, including the batch
                dimension (only row 0 of the prediction is used).

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        # Random exploration
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(state)

        return np.argmax(act_values[0])  # Returns action using policy

    def replay(self, batch_size):
        """Train the online model on randomly sampled stored transitions.

        Each sampled transition is fitted individually (one ``fit`` call per
        sample). Afterwards epsilon is decayed once, never dropping below
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample. If the memory holds
                fewer transitions than this, the call is a no-op.
        """
        # random.sample raises ValueError when asked for more items than exist
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            # Terminal transitions have no future value to bootstrap from
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.target_model.predict(next_state)))

            # Construct the target vector as follows:
            # 1. Use the current model to output the Q-value predictions
            target_f = self.model.predict(state)

            # 2. Rewrite the chosen action value with the computed target
            target_f[0][action] = target

            # 3. Use vectors in the objective computation
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Decay exploration, clamping so the last step cannot undershoot the floor
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name):
        """Load saved weights into the online model and sync the target model.

        Args:
            name: Path of the weights file to load.
        """
        self.model.load_weights(name)
        # Without this the target network would keep its stale weights until
        # the next scheduled update and produce meaningless bootstrap targets.
        self.update_target_model()

    def save(self, name):
        """Save the online model's weights to ``name``."""
        self.model.save_weights(name)

# Preprocessing

In [14]:
# Helpful preprocessing taken from github.com/ageron/tiny-dqn
def process_frame(frame):
    """Crop, downsample, greyscale and normalize one raw game frame.

    Args:
        frame: Array indexed as (row, column, channel). Assumes the crop
            ``frame[1:176:2, ::2]`` yields 88 x 80 pixels (e.g. a
            210 x 160 x 3 frame) -- TODO confirm against the environment.

    Returns:
        Float array of shape (1, 88, 80, 1) with values in [-1, 1).
    """
    mspacman_color = np.array([210, 164, 74]).mean()
    img = frame[1:176:2, ::2]      # Crop and downsize
    img = img.mean(axis=2)         # Convert to greyscale
    # Improve contrast: pixels matching the Ms. Pac-Man colour are forced to
    # 0, which becomes the darkest value (-1) after normalization.
    img[img == mspacman_color] = 0
    # Normalize [0, 255] to [-1, 1). The previous extra "- 1" shifted the
    # range to roughly [-2, 0), contradicting the documented intent.
    img = (img - 128) / 128

    return np.expand_dims(img.reshape(88, 80, 1), axis=0)

In [15]:
def blend_images(images, blend):
    """Average the most recent ``blend`` frames into a single image.

    Args:
        images: Iterable of frames (list, deque, ...), oldest first. Each
            frame must broadcast against shape (1, 88, 80, 1).
        blend: Maximum number of trailing frames to average; must be >= 1.

    Returns:
        float64 array of shape (1, 88, 80, 1) holding the element-wise mean
        of the last ``min(len(images), blend)`` frames.

    Raises:
        ValueError: If ``blend`` is less than 1 or ``images`` is empty.
    """
    if blend < 1:
        raise ValueError("blend must be at least 1")

    # Only the trailing `blend` frames take part, so passing a longer history
    # still yields a true average instead of sum(all frames) / blend.
    recent = list(images)[-blend:]
    if not recent:
        raise ValueError("images must contain at least one frame")

    avg_image = np.zeros((1, 88, 80, 1), np.float64)
    for image in recent:
        avg_image += image

    return avg_image / len(recent)

# Environment

In [16]:
# --- Environment and agent --------------------------------------------------
env = gym.make('MsPacman-v0')
state_size = (88, 80, 1)           # Shape of one processed frame (no batch dim)
action_size = env.action_space.n   # Number of discrete actions the env exposes
agent = DQN_Agent(state_size, action_size)

# --- Training settings ------------------------------------------------------
episodes = 800    # Number of episodes to run
batch_size = 8    # Experiences passed to agent.replay per call
skip_start = 90   # MsPacman-v0 waits for 90 actions before the episode begins
blend = 4         # Number of images to blend

# --- Running state ----------------------------------------------------------
total_time = 0    # Counter for total number of steps taken
all_rewards = 0   # Used to compute avg reward over time
done = False

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 22, 20, 32)        2080      
_________________________________________________________________
activation_9 (Activation)    (None, 22, 20, 32)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 11, 10, 64)        32832     
_________________________________________________________________
activation_10 (Activation)   (None, 11, 10, 64)        0         
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 5, 5, 128)         73856     
_________________________________________________________________
activation_11 (Activation)   (None, 5, 5, 128)        

In [17]:
# Main training loop: play `episodes` games, storing every transition in the
# agent's replay memory and training on a random minibatch after each step.
for e in range(episodes):
    total_reward = 0   # Sum of shaped (penalized) rewards for this episode
    game_score = 0     # Sum of raw environment rewards for this episode
    state = process_frame(env.reset())
    images = deque(maxlen=blend)  # Array of images to be blended
    images.append(state)

    for skip in range(skip_start):  # skip the start of each game
        env.step(0)

    for time in range(20000):
        env.render()
        total_time += 1

        # Every update_rate timesteps we update the target network parameters
        if total_time % agent.update_rate == 0:
            agent.update_target_model()

        # Return the avg of the last 4 frames
        state = blend_images(images, blend)

        # Transition Dynamics
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Return the avg of the last 4 frames
        next_state = process_frame(next_state)
        images.append(next_state)
        next_state = blend_images(images, blend)

        game_score += reward
        reward -= 1  # Punish behavior which does not accumulate reward
        total_reward += reward

        # Store sequence in replay memory. The shaped reward is stored so the
        # per-step penalty actually reaches training; previously it was
        # applied after storing and only affected the printed total.
        agent.remember(state, action, reward, next_state, done)

        state = next_state

        if done:
            all_rewards += game_score

            print("episode: {}/{}, game score: {}, reward: {}, avg reward: {}, time: {}, total time: {}"
                  .format(e+1, episodes, game_score, total_reward, all_rewards/(e+1), time, total_time))

            break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    # Checkpoint once per episode rather than on every timestep: writing the
    # weights to disk each step dominated the loop and was skipped on the
    # terminal step.
    agent.save('model.h5')


episode: 1/800, game score: 240.0, reward: -206.0, avg reward: 240.0, time: 445, total time: 446
episode: 2/800, game score: 310.0, reward: -343.0, avg reward: 275.0, time: 652, total time: 1099
episode: 3/800, game score: 250.0, reward: -325.0, avg reward: 266.6666666666667, time: 574, total time: 1674
episode: 4/800, game score: 760.0, reward: -291.0, avg reward: 390.0, time: 1050, total time: 2725
episode: 5/800, game score: 380.0, reward: -161.0, avg reward: 388.0, time: 540, total time: 3266
episode: 6/800, game score: 400.0, reward: -374.0, avg reward: 390.0, time: 773, total time: 4040
episode: 7/800, game score: 690.0, reward: -383.0, avg reward: 432.85714285714283, time: 1072, total time: 5113
episode: 8/800, game score: 650.0, reward: -42.0, avg reward: 460.0, time: 691, total time: 5805
episode: 9/800, game score: 380.0, reward: -333.0, avg reward: 451.1111111111111, time: 712, total time: 6518
episode: 10/800, game score: 450.0, reward: -421.0, avg reward: 451.0, time: 870

episode: 76/800, game score: 620.0, reward: -5.0, avg reward: 634.2105263157895, time: 624, total time: 54550
episode: 77/800, game score: 200.0, reward: -110.0, avg reward: 628.5714285714286, time: 309, total time: 54860
episode: 78/800, game score: 310.0, reward: -45.0, avg reward: 624.4871794871794, time: 354, total time: 55215
episode: 79/800, game score: 270.0, reward: -63.0, avg reward: 620.0, time: 332, total time: 55548
episode: 80/800, game score: 880.0, reward: 64.0, avg reward: 623.25, time: 815, total time: 56364
episode: 81/800, game score: 360.0, reward: -186.0, avg reward: 620.0, time: 545, total time: 56910
episode: 82/800, game score: 740.0, reward: -104.0, avg reward: 621.4634146341464, time: 843, total time: 57754
episode: 83/800, game score: 380.0, reward: -123.0, avg reward: 618.5542168674699, time: 502, total time: 58257
episode: 84/800, game score: 230.0, reward: -169.0, avg reward: 613.9285714285714, time: 398, total time: 58656
episode: 85/800, game score: 720.

episode: 150/800, game score: 380.0, reward: -219.0, avg reward: 602.5333333333333, time: 598, total time: 105534
episode: 151/800, game score: 420.0, reward: -440.0, avg reward: 601.3245033112582, time: 859, total time: 106394
episode: 152/800, game score: 310.0, reward: -283.0, avg reward: 599.4078947368421, time: 592, total time: 106987
episode: 153/800, game score: 140.0, reward: -657.0, avg reward: 596.40522875817, time: 796, total time: 107784
episode: 154/800, game score: 670.0, reward: 96.0, avg reward: 596.8831168831168, time: 573, total time: 108358
episode: 155/800, game score: 460.0, reward: -206.0, avg reward: 596.0, time: 665, total time: 109024
episode: 156/800, game score: 510.0, reward: -132.0, avg reward: 595.4487179487179, time: 641, total time: 109666
episode: 157/800, game score: 410.0, reward: -406.0, avg reward: 594.2675159235669, time: 815, total time: 110482
episode: 158/800, game score: 790.0, reward: 20.0, avg reward: 595.506329113924, time: 769, total time: 

episode: 223/800, game score: 550.0, reward: -195.0, avg reward: 562.9147982062781, time: 744, total time: 154118
episode: 224/800, game score: 780.0, reward: -43.0, avg reward: 563.8839285714286, time: 822, total time: 154941
episode: 225/800, game score: 210.0, reward: -101.0, avg reward: 562.3111111111111, time: 310, total time: 155252
episode: 226/800, game score: 370.0, reward: -305.0, avg reward: 561.4601769911504, time: 674, total time: 155927
episode: 227/800, game score: 1150.0, reward: 414.0, avg reward: 564.0528634361234, time: 735, total time: 156663
episode: 228/800, game score: 290.0, reward: -194.0, avg reward: 562.8508771929825, time: 483, total time: 157147
episode: 229/800, game score: 260.0, reward: -386.0, avg reward: 561.528384279476, time: 645, total time: 157793
episode: 230/800, game score: 540.0, reward: -173.0, avg reward: 561.4347826086956, time: 712, total time: 158506
episode: 231/800, game score: 480.0, reward: -191.0, avg reward: 561.0822510822511, time: 

episode: 296/800, game score: 280.0, reward: -179.0, avg reward: 547.331081081081, time: 458, total time: 201861
episode: 297/800, game score: 1230.0, reward: 352.0, avg reward: 549.6296296296297, time: 877, total time: 202739
episode: 298/800, game score: 260.0, reward: -180.0, avg reward: 548.6577181208054, time: 439, total time: 203179
episode: 299/800, game score: 440.0, reward: -195.0, avg reward: 548.2943143812709, time: 634, total time: 203814
episode: 300/800, game score: 1040.0, reward: 297.0, avg reward: 549.9333333333333, time: 742, total time: 204557
episode: 301/800, game score: 470.0, reward: -104.0, avg reward: 549.6677740863787, time: 573, total time: 205131
episode: 302/800, game score: 320.0, reward: -301.0, avg reward: 548.9072847682119, time: 620, total time: 205752
episode: 303/800, game score: 350.0, reward: -480.0, avg reward: 548.2508250825083, time: 829, total time: 206582
episode: 304/800, game score: 1090.0, reward: 439.0, avg reward: 550.0328947368421, time:

KeyboardInterrupt: 