In [2]:
%matplotlib inline

import tensorflow as tf
import collections
import gym
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from baselines.common.atari_wrappers import make_atari, wrap_deepmind


ModuleNotFoundError: No module named 'baselines'

In [10]:
# --- Reproducibility and discounting ---
seed = 42  # shared seed for the environment and the random number generators
gamma = 0.99  # discount factor applied to future rewards

# --- Epsilon-greedy exploration schedule ---
epsilon_min = 0.1  # lowest exploration rate epsilon is allowed to reach
epsilon_max = 1.0  # initial exploration rate
epsilon = epsilon_max  # current exploration rate; reduced as training proceeds
epsilon_interval = (epsilon_max - epsilon_min)  # total range epsilon can shrink by

batch_size = 32  # number of transitions sampled per training update
max_step_per_episode = 10000  # upper bound on steps taken within one episode

# Seed NumPy and TensorFlow as well as the environment. The training loop draws
# exploration actions and replay samples from np.random and the network weights
# are initialised by TensorFlow, so seeding only the environment does not make
# a run repeatable.
np.random.seed(seed)
tf.random.set_seed(seed)

# Use the Baselines Atari environment because of the DeepMind helper functions
env = make_atari("BreakoutNoFrameskip-v4")
# Warp the frames, grey scale, stack four frames and scale to a smaller ratio
env = wrap_deepmind(env, frame_stack=True, scale=True)
env.seed(seed)

num_actions = env.action_space.n

NameError: name 'make_atari' is not defined

### Implement the Deep Q-Network

In [7]:
def create_q_model():
    """Build the convolutional Q-network (architecture from the DeepMind paper).

    The input is a tensor of shape (84, 84, 4) per sample. It is passed
    through three ReLU convolutions, flattened, fed to a 512-unit ReLU dense
    layer and finally to a linear layer with ``num_actions`` outputs.
    ``num_actions`` is read from the notebook's global scope.

    Returns:
        keras.Model: model mapping the input frames to one Q-value per action.
    """
    # (filters, kernel size, stride) of each convolution, applied in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    frames = layers.Input(shape=(84, 84, 4,))

    # Convolutions on the frames on the screen
    features = frames
    for filters, kernel_size, stride in conv_specs:
        features = layers.Conv2D(
            filters, kernel_size, strides=stride, activation="relu"
        )(features)

    flattened = layers.Flatten()(features)
    hidden = layers.Dense(512, activation="relu")(flattened)
    q_values = layers.Dense(num_actions, activation="linear")(hidden)

    return keras.Model(inputs=frames, outputs=q_values)


# The first model makes the predictions for Q-values which are used to
# choose an action.
Q_network_model = create_q_model()
# Build a target model for the prediction of future rewards.
# The weights of the target model are only refreshed every 10000 steps, so the
# target Q-value stays stable while the loss between the Q-values is calculated.
target_network_model = create_q_model()
# Start the target network as an exact copy of the online network. Each call to
# create_q_model() builds a separately initialised network, so without this the
# first updates would bootstrap from an unrelated set of weights until the
# first scheduled synchronisation.
target_network_model.set_weights(Q_network_model.get_weights())


### Train

In [8]:
# Optimiser and loss used for every gradient step on the Q-network.
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

# Replay buffer, stored as five parallel lists: index i in every list belongs
# to the same environment step.
state_history, action_history, rewards_history = [], [], []
next_state_history, done_history = [], []

# Per-episode returns used to compute the running reward.
episode_reward_history = []

# Progress counters.
running_reward = episode_count = frame_count = 0

# Schedule, expressed in frames.
epsilon_random_frames = 50_000  # frames during which every action is random
epsilon_greedy_frames = 1_000_000  # frames over which epsilon is reduced
max_memory_length = 100_000  # maximum number of steps kept in the replay buffer
update_after_actions = 4  # run a training update every this many frames
update_target_network = 10_000  # sync the target network every this many frames

# Main training loop: one iteration of the outer loop is one episode. The loop
# ends once the mean reward over the most recent (up to 100) episodes exceeds 40.
while True:
    state = np.array(env.reset())
    episode_reward = 0

    for timestep in range(1, max_step_per_episode):
        frame_count += 1

        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action for exploration
            action = np.random.choice(num_actions)
        else:
            # Greedy action: add a leading batch dimension so the single state
            # matches the (batch, ...) input the network expects, then pick the
            # action with the highest predicted Q-value.
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)

            action_probs = Q_network_model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        # Reduce the exploration rate by a fixed amount per frame, never going
        # below epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the chosen action in the environment.
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state)
        episode_reward += reward

        # Save actions and states in replay buffer
        state_history.append(state)
        action_history.append(action)
        rewards_history.append(reward)
        next_state_history.append(next_state)
        done_history.append(done)
        state = next_state

        # Update every fourth frame and once the buffer holds more than one batch
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Sample [batch_size] buffer indices uniformly, with replacement.
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            next_state_sample = np.array([next_state_history[i] for i in indices])
            rewards_sample = np.array([rewards_history[i] for i in indices])
            action_sample = np.array([action_history[i] for i in indices])
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            # Q-values of the next states come from the target network.
            # verbose=0 keeps predict() from printing a progress bar on every
            # update, which would flood the notebook output.
            future_rewards = target_network_model.predict(next_state_sample, verbose=0)
            # Target: reward + gamma * max over actions of the target Q-value.
            updated_q_values = rewards_sample + gamma * tf.math.reduce_max(future_rewards, axis=1)

            # Where done_sample is 1 the target becomes -1; where it is 0 the
            # target is left unchanged.
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask so the loss only involves the Q-value of the action
            # that was actually taken.
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = Q_network_model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, Q_network_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, Q_network_model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Update the target network with new weights
            target_network_model.set_weights(Q_network_model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        # Drop the oldest step from every buffer list once the limit is
        # exceeded, keeping the five lists aligned.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del action_history[:1]
            del next_state_history[:1]
            del state_history[:1]
            del done_history[:1]

        if done:
            break

    # Update running reward to check condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 40:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break


ValueError: in user code:

    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\37103\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "model_3" is incompatible with the layer: expected shape=(None, 84, 84, 4), found shape=(32, 210, 160, 3)


In [None]:
# Reset the environment and store what reset() returns, converted to a NumPy
# array, as the current state.
state = np.array(env.reset())