In [1]:
import gym
import itertools
import os
import random
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple

%matplotlib inline

In [2]:
env = gym.envs.make("Breakout-v0")

[2017-02-28 17:10:18,230] Making new env: Breakout-v0


In [3]:
# Atari actions" 0 (no op), 1 (fire), 2 (left), 3 (right)
VALID_ACTIONS = [0, 1, 2, 3]

In [4]:
print(len(VALID_ACTIONS))

4


In [5]:
class StateProcessor(object):
    """A class to process an atari image for input into the nn"""
    def __init__(self):
        # Build the Tensorflow graph.
        with tf.variable_scope('state_processor'):
            # Input is a 210 x 160 x 3 array describing the screen
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            # Transform to grayscale
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            # Crop top and bottom of image to make it square
            self.output = tf.image.crop_to_bounding_box(self.output,
                                                        34, 0, 160, 160)
            # resize image to 84 x 84
            self.output = tf.image.resize_images(self.output, [84, 84],
                                               method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
    
    def process(self, sess, state):
        """
        process an atari image
        
        Args:
            sess: A TensorFlow session object
            state: A [210, 160, 3] Atari RGB state
            
        Returns:
            A processed [84, 84, 1] state representing grayscale values
        """
        return sess.run(self.output, {self.input_state: state})
    

In [6]:
class Estimator(object):
    """Neural Network to estimate Q-Value
    
    This network is used for both the Q-Network and Target Network
    """
    
    def __init__(self, scope='estimator', summaries_dir=None):
        self.scope = scope
        # Write Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
    
    def _build_model(self):
        """
        Builds the Tensorflow graph
        """
        
        # Placeholders for input
        # Input are 4 RGB frames of shape 160, 160 each
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], 
                                   dtype=tf.uint8,
                                   name='X')
        # TD Target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name='y')
        # Integer if of which action was selected
        self.actions_pl = tf.placeholder(shape=[None],
                                         dtype=tf.int32,
                                         name='actions')
        # Variables
        layer1_weights = tf.Variable(tf.truncated_normal(
                [8, 8, 4, 32], stddev=0.1))
        layer1_biases = tf.Variable(tf.zeros([32]))
        
        layer2_weights = tf.Variable(tf.truncated_normal(
                [4, 4, 32, 64], stddev=0.1))
        layer2_biases = tf.Variable(tf.zeros([64]))
        
        layer3_weights = tf.Variable(tf.truncated_normal(
                [3, 3, 64, 64], stddev=0.1))
        layer3_biases = tf.Variable(tf.zeros([64]))
        
        layer4_weights = tf.Variable(tf.truncated_normal(
                [7*7*64, 512], stddev=0.1))
        layer4_biases = tf.Variable(tf.zeros([512]))
        
        layer5_weights = tf.Variable(tf.truncated_normal(
                [512, len(VALID_ACTIONS)], stddev=0.1))
        layer5_biases = tf.Variable(tf.zeros([len(VALID_ACTIONS)]))
        
        
        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Three convolutional layers
        conv = tf.nn.conv2d(X, layer1_weights, [1, 4, 4, 1],  padding='VALID', name='conv1')
        relu = tf.nn.relu(conv + layer1_biases, name='relu1')
        conv = tf.nn.conv2d(relu, layer2_weights, [1, 2, 2, 1], padding='VALID', name='conv2')
        relu = tf.nn.relu(conv + layer2_biases, name='relu2')
        conv = tf.nn.conv2d(relu, layer3_weights, [1, 1, 1, 1], padding='VALID', name='conv3')
        relu = tf.nn.relu(conv + layer3_biases, name='relu3')
        shape = relu.get_shape().as_list()
        
        reshape = tf.reshape(relu, [-1, shape[1]*shape[2]*shape[3]])
        fc1 = tf.matmul(reshape, layer4_weights) + layer4_biases
        self.predictions = tf.matmul(fc1, layer5_weights) + layer5_biases
        
        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1]
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        
        # Loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)
        
        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, 
                                                global_step=tf.contrib.framework.get_global_step())
        
        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.histogram('loss_hist', self.losses),
                tf.summary.histogram('q_values_hist', self.predictions),
                tf.summary.scalar('max_q_value', tf.reduce_max(self.predictions))
            ])
        
    def predict(self, sess, s):
        """predictions action values.
        
        Args:
            sess: Tensorflow session
            s: State input of shape [batch_size, 4, 160, 160, 3]
                
        Returns:
            Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing
            the action values.
        """
        A = sess.run(self.predictions, {self.X_pl: s})
        
        return A
        #return sess.run(self.predictions, {self.X_pl: s})
        
    def update(self, sess, s, a, y):
        """
        UPdate the estimator toward the given targets
        
        Args:
            sess: Tensorflow session object
            s: State input of size [batch_size, 4, 160, 160, 3]
            a: Chosen action of size [batch_size]
            y: Targets of shape [batch_size]
            
        Returns:
            The calculated loss of each batch
        """
        feed_dict = {self.X_pl: s, self.y_pl: y, self.actions_pl: a}
        summaries, global_step, _, loss = sess.run(
            [self.summaries, 
             tf.contrib.framework.get_global_step(), 
             self.train_op,
             self.loss], feed_dict=feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [7]:
# For Testing

tf.reset_default_graph()
global_step = tf.Variable(0, name='global_step', trainable=False)

e = Estimator(scope='test')
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    observation = env.reset()
    
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 2)
    
    print(e.predict(sess, observations))
    
    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    print(e.update(sess, observations, a, y))

[[ -2.5180614   -0.19548371  10.95501137   0.25467342]
 [ -2.5180614   -0.19548371  10.95501137   0.25467342]]
156.702


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [8]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model params of one estimator to another
    
    Args:
        sess: Tensorflow sesion instance
        estimator1: Estimator to copy the parameters from
        estimator2: Estimator to copy the parameters to
    """
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)
    
    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        op = e2_v.assign(e1_v)
        update_ops.append(op)
    sess.run(update_ops)

In [9]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximation
    
    Args:
        esimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment
        
    Returns:
        A function that takes the (sess, observation, epsilon) as an argument
        and returns the probabilites for each action in the form of a 
        numpy array of length nA
    """
    
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
       
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [15]:
def deep_q_learning(sess, 
                    env, 
                    q_estimator, 
                    target_estimator, 
                    state_processor, 
                    num_episodes, 
                    experiment_dir, 
                    replay_memory_size=500000, 
                    replay_memory_init_size=50000, 
                    update_target_estimator_every=10000, 
                    discount_factor=0.99, 
                    epsilon_start=1.0, 
                    epsilon_end=0.1, 
                    epsilon_decay_steps=500000, 
                    batch_size=32, 
                    record_video_every=50):
    """
    Q-learning algorithim for off-policy TD control using function approximation
    Finds the optimal greedy policy while following epsilon-greedy one
    
    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sampel when initializing 
          the reply memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Lambda time discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    
    """
    Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])
    
    # The replay memory
    replay_memory = deque(maxlen=replay_memory_size)
    
    # Keeps track of useful stats
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    
    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model')
    monitor_path = os.path.join(experiment_dir, 'monitor')
    
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
        
    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())
    
    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))
    
    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        if i % 500 == 0:
            print("Step {}".format(i))
        action_probs = policy(sess, state, epsilons[total_t])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state
    
    # Record videos
    env.monitor.start(monitor_path,
                     resume=True,
                     video_callable=lambda count: count % record_video_every)
    
    for i_episode in xrange(num_episodes):
        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)
        
        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state]*4, axis=2)
        loss = None
        
        # One step in the environment.
        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]
            
            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag='epsilon')
            q_estimator.summary_writer.add_summary(episode_summary, total_t)
            
            if total_t % update_target_estimator_every == 0:
                print("Copying parameters from q_estimator to target...\n")
                copy_model_parameters(sess, q_estimator, target_estimator)
            
            
            # Print step we're on, useful for debugging
            if t % 100 == 0:
                print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode+1, num_episodes, loss))
                sys.stdout.flush()
            
            # Take a step in the environment
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(len(action_probs), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
            
            transition = Transition(state, action, reward, next_state, done)
            # If our replay_memory is full, pop the first element
            replay_memory.append(transition)
            
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward
            
            #Sample minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_state_batch, done_batch = map(
            np.array, zip(*samples))
            
            # Calculate q-values and targets
            q_values_next = target_estimator.predict(sess, next_state_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)
            
            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)
            
            if done:
                break
            
            state = next_state
            total_t += 1
            
            
        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                     node_name='episode_reward', tag='episode_reward')
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                     node_name='episode_length', tag='episode_length')
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()
            
        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])
        
    env.monitor.close()
    return


In [16]:
tf.reset_default_graph()

# Where we save our checkpoint and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Create a global_step_variable
global_step = tf.Variable(0, name='global_step', trainable=False)

# Create estimators
q_estimator = Estimator(scope='q', summaries_dir=experiment_dir)
target_estimator = Estimator(scope='target_q')

state_processor = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                   env,
                                   q_estimator=q_estimator,
                                   target_estimator=target_estimator,
                                   state_processor=state_processor,
                                    num_episodes=10,
                                   experiment_dir=experiment_dir,
                                   replay_memory_size=50000,
                                   replay_memory_init_size=5000):
        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

Loading model checkpoint /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/checkpoints/model...

Populating replay memory...
Step 0
Step 500
Step 1000
Step 1500
Step 2000
Step 2500
Step 3000
Step 3500
Step 4000
Step 4500
Step 0 (4162) @ Episode 1/10, loss: None
Step 100 (4262) @ Episode 1/10, loss: 0.0784096419811


[2017-02-28 17:11:48,037] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000001.mp4



Episode Reward: 0.0
Step 0 (4337) @ Episode 2/10, loss: None
Step 100 (4437) @ Episode 2/10, loss: 0.0693007782102
Step 200 (4537) @ Episode 2/10, loss: 0.0229906775057

Episode Reward: 2.0


[2017-02-28 17:12:06,445] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000002.mp4


Step 0 (4631) @ Episode 3/10, loss: None
Step 100 (4731) @ Episode 3/10, loss: 0.00740694673732
Step 200 (4831) @ Episode 3/10, loss: 0.0385140702128


[2017-02-28 17:12:22,754] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000003.mp4



Episode Reward: 2.0
Step 0 (4895) @ Episode 4/10, loss: None
Step 100 (4995) @ Episode 4/10, loss: 0.0294174849987
Step 200 (5095) @ Episode 4/10, loss: 0.0427033752203
Step 300 (5195) @ Episode 4/10, loss: 1.06958556175
Step 400 (5295) @ Episode 4/10, loss: 0.013988243416

Episode Reward: 7.0


[2017-02-28 17:12:47,383] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000004.mp4


Step 0 (5297) @ Episode 5/10, loss: None
Step 100 (5397) @ Episode 5/10, loss: 0.0237196218222
Step 200 (5497) @ Episode 5/10, loss: 0.0254188049585
Step 300 (5597) @ Episode 5/10, loss: 0.0137845370919


[2017-02-28 17:13:07,848] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000005.mp4



Episode Reward: 3.0
Step 0 (5631) @ Episode 6/10, loss: None
Step 100 (5731) @ Episode 6/10, loss: 0.0465595908463
Step 200 (5831) @ Episode 6/10, loss: 1.44431376457


[2017-02-28 17:13:22,763] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000006.mp4



Episode Reward: 1.0
Step 0 (5868) @ Episode 7/10, loss: None
Step 100 (5968) @ Episode 7/10, loss: 0.0165252201259
Step 200 (6068) @ Episode 7/10, loss: 0.0186451822519
Step 300 (6168) @ Episode 7/10, loss: 0.0284429229796

Episode Reward: 2.0


[2017-02-28 17:13:41,368] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000007.mp4


Step 0 (6169) @ Episode 8/10, loss: None
Step 100 (6269) @ Episode 8/10, loss: 0.963227391243
Step 200 (6369) @ Episode 8/10, loss: 0.0580120682716


[2017-02-28 17:13:54,408] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000008.mp4



Episode Reward: 1.0
Step 0 (6379) @ Episode 9/10, loss: None
Step 100 (6479) @ Episode 9/10, loss: 0.928222060204
Step 200 (6579) @ Episode 9/10, loss: 0.0113477241248


[2017-02-28 17:14:08,265] Starting new video recorder writing to /home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor/openaigym.video.0.16383.video000009.mp4



Episode Reward: 1.0
Step 0 (6602) @ Episode 10/10, loss: None
Step 100 (6702) @ Episode 10/10, loss: 0.0150168975815
Step 200 (6802) @ Episode 10/10, loss: 0.0425222814083
Step 300 (6902) @ Episode 10/10, loss: 0.0137986261398


[2017-02-28 17:14:27,163] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/cully/git/reinforcement-learning/DQN/experiments/Breakout-v0/monitor')



Episode Reward: 2.0


AttributeError: 'numpy.ndarray' object has no attribute 'episode_rewards'

In [16]:
stats.

array([ 1.,  0.,  1.,  3.,  0.,  0.,  0.,  1.,  1.,  1.])

In [104]:
state = env.reset()

In [121]:
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])
state = env.reset()
print state.shape
with tf.Session() as sess:
    state = sp.process(sess, state)
    #print(state)
    print(state.shape)
    state = np.stack([state]*4, axis=2)
    print(state.shape)
    action = np.random.randint(4)
    next_state, reward, done, _ = env.step(action)
    print(next_state.shape)
    next_state = sp.process(sess, next_state)
    print(next_state.shape)
    print(state[:,:,1:].shape)
    next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
    print(next_state.shape)
    trans = Transition(state, action, reward, next_state, done)
    print(trans)

(210, 160, 3)
(84, 84)
(84, 84, 4)
(210, 160, 3)
(84, 84)
(84, 84, 3)
(84, 84, 4)
Transition(state=array([[[142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        ..., 
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142]],

       [[142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        ..., 
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142]],

       [[142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        ..., 
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142]],

       ..., 
       [[142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        ..., 
        [142, 142, 142, 142],
        [142, 142, 142, 142],
        [142, 142, 142, 142]],

       [[127, 127, 127, 127],
        [127, 127, 127, 127],
        [127, 127, 127, 127],
        ..., 
