In [0]:
# import necessary libraries
import os
import numpy as np
import tensorflow as tf
import gym
from gym import wrappers
import itertools
import time
import logging
import argparse

## Construct a Model
* a fully connected policy network (one hidden layer by default)
* model checkpoint save / load



In [0]:
# define a class to construct the model

class Model:
    """Fully connected policy network for policy-gradient training (TF1 graph API).

    The constructor builds the whole graph in the current default graph:
    placeholders, the dense policy network, the surrogate loss, an Adam
    training op, TensorBoard summaries and a checkpoint saver.

    Args:
        num_observations: length of one flattened observation vector
            (80 * 80 for the Pong frames used in this notebook).
        num_actions: number of discrete actions the policy chooses from.
        num_layers: number of hidden dense layers.
        layer_size: number of units in each hidden layer.
        logger: object with an ``info`` method, used for status messages.
        learning_rate: learning rate handed to the Adam optimizer.
        checkpoint_dir: directory checkpoints are written to / restored from.
    """

    def __init__(self, num_observations, num_actions, num_layers, layer_size, logger, learning_rate, checkpoint_dir):
        self.logger = logger

        self.num_observations = num_observations
        self.num_actions = num_actions

        self.observations = tf.placeholder(shape=[None, num_observations], name="observations", dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], name="actions", dtype=tf.int32)
        self.advantages = tf.placeholder(shape=[None], name="advantages", dtype=tf.float32)

        # Not wired into the network built below; kept because update() feeds it.
        self.keep_prob = tf.placeholder(name='keep_prob', dtype=tf.float32)

        self.num_layers = num_layers
        self.layer_size = layer_size

        self.logprob_n, self.sampled_ac = self.build_model()

        # logprob_n is a cross-entropy, i.e. the *negative* log-probability of the
        # action that was taken, so minimizing (logprob_n * advantage) raises the
        # probability of actions with a positive advantage.
        self.loss = tf.reduce_mean(tf.multiply(self.logprob_n, self.advantages))

        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss,
                global_step=self.global_step)

        # Tensorboard summaries. The histograms are merged explicitly rather than
        # with merge_all(), which would also pull in training_loss (and any other
        # summary living in the default graph) and make callers that write both
        # training_scalar and histogram_merged record the loss twice per step.
        self.training_scalar = tf.summary.scalar("training_loss", self.loss)
        logprob_histogram = tf.summary.histogram("logprob_n", self.logprob_n)
        sampled_ac_histogram = tf.summary.histogram("sampled_ac", self.sampled_ac)
        self.histogram_merged = tf.summary.merge([logprob_histogram, sampled_ac_histogram])

        self.checkpoint_dir = checkpoint_dir
        self.saver = tf.train.Saver(var_list=tf.global_variables())

    def save(self, sess):
        """Write a checkpoint named ``model-<global_step>`` into ``checkpoint_dir``.

        The directory is created first if it does not exist yet.
        """
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.saver.save(sess, os.path.join(self.checkpoint_dir, 'model'), global_step=self.global_step)
        self.logger.info("Model saved")

    def load(self, session):
        """Restore the latest checkpoint found in ``checkpoint_dir``.

        Returns:
            True if a checkpoint was found and restored, False otherwise (the
            variables are left untouched and still need to be initialized).
        """
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            self.logger.info("Loading model checkpoint {} ...\n".format(latest_checkpoint))
            self.saver.restore(session, latest_checkpoint)
            return True
        else:
            self.logger.info("Checkpoint not found")
            return False

    def build_model(self):
        """Build the policy network inside the ``policy_network`` variable scope.

        ``num_layers`` ReLU dense layers of ``layer_size`` units are stacked on
        the observations, followed by a linear layer with ``num_actions`` outputs
        (the action logits).

        Returns:
            (logprob_n, sampled_ac): the per-sample cross-entropy between the
            logits and the fed ``actions`` (the negative log-probability of each
            taken action), and one action index per sample drawn from the
            categorical distribution defined by the logits.
        """
        with tf.variable_scope("policy_network"):
            dense = self.observations
            for _ in range(self.num_layers):
                dense = tf.layers.dense(inputs=dense, units=self.layer_size, activation=tf.nn.relu)
            logits_na = tf.layers.dense(inputs=dense, units=self.num_actions, activation=None)
            sampled_ac = tf.squeeze(tf.multinomial(logits_na, 1), axis=[1])
            logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.actions, logits=logits_na)
            return logprob_n, sampled_ac

    def update(self, sess, batch_x, batch_y, advantages, keep_prob):
        """Run one optimizer step on a batch and collect its summaries.

        Args:
            sess: session the graph lives in.
            batch_x: observations, one row per time step.
            batch_y: index of the action taken at each time step.
            advantages: one weight per time step, same length as ``batch_y``.
            keep_prob: value fed to the ``keep_prob`` placeholder.

        Returns:
            (loss, training_scalar, histogram_merged): the scalar loss and the two
            serialized summaries, ready for ``FileWriter.add_summary``.
        """
        # logprob_n is not fetched separately: the loss already depends on it and
        # its value was thrown away.
        loss, training_scalar, histogram_merged, _ = sess.run(
            [self.loss, self.training_scalar, self.histogram_merged, self.optimizer],
            feed_dict={self.observations: batch_x,
                       self.actions: batch_y,
                       self.advantages: advantages,
                       self.keep_prob: keep_prob})
        return loss, training_scalar, histogram_merged

In [0]:
def config_logging(log_file):
    """Configure the root logger to write INFO-level records to a fresh ``log_file``.

    Safe to call repeatedly (e.g. when a notebook cell is re-run): a file handler
    that a previous call attached for the same file is detached and closed first,
    so records are not written several times. The parent directory of
    ``log_file`` is created when missing and an existing log file is deleted.

    Args:
        log_file: path of the log file to (re)create.

    Returns:
        The root logger.
    """
    log_path = os.path.abspath(log_file)  # FileHandler.baseFilename is absolute too
    logger = logging.getLogger()

    # Detach handlers left over from an earlier call before touching the file
    # they hold open.
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path:
            logger.removeHandler(handler)
            handler.close()

    if os.path.exists(log_file):
        os.remove(log_file)

    # FileHandler opens the file immediately and fails if the directory is missing.
    log_dir = os.path.dirname(log_path)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')

    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    return logger

def pathlength(path):
    """Return the number of steps in a trajectory, i.e. the size of ``path['rewards']``."""
    step_rewards = path['rewards']
    return len(step_rewards)

def discounted_rewards_to_go(rewards, gamma):
    """Compute the discounted reward-to-go for every step of a trajectory.

    Entry ``t`` of the result is ``rewards[t] + gamma * rewards[t+1] +
    gamma**2 * rewards[t+2] + ...`` up to the end of ``rewards``.

    Args:
        rewards: per-step rewards in chronological order.
        gamma: discount factor.

    Returns:
        A list with one discounted return per input reward, in the same order.
    """
    def _returns_from_the_end():
        # Walk backwards so each return is built from the one that follows it.
        tail = 0
        for reward in reversed(rewards):
            tail = reward + tail * gamma
            yield tail

    return list(_returns_from_the_end())[::-1]

In [0]:
def create_model(session, num_observations, num_actions, num_layers,
                    layer_size, logger, learning_rate, checkpoint_dir, restore):
    """Build a ``Model`` and give its variables values.

    When ``restore`` is true the latest checkpoint in ``checkpoint_dir`` is
    loaded; if ``restore`` is false, or no checkpoint can be found, the
    variables are initialized from scratch instead.

    Returns:
        The constructed ``Model``.
    """
    model = Model(num_observations=num_observations,
                  num_actions=num_actions,
                  num_layers=num_layers,
                  layer_size=layer_size,
                  logger=logger,
                  learning_rate=learning_rate,
                  checkpoint_dir=checkpoint_dir)

    # A checkpoint is only looked for when the caller asked for a restore.
    restored = model.load(session) if restore else False
    if not restored:
        logger.info("Created model with fresh parameters")
        session.run(tf.global_variables_initializer())

    return model

def preprocess_frame(image):
    """Turn a 210x160x3 uint8 frame into a flat 6400-element (80x80) float vector.

    The frame is cropped to rows 35..194, downsampled by 2 in both directions
    using the first colour channel only, and binarized: the two background
    values (144 and 109) become 0, every other pixel becomes 1.

    The input array is left untouched; the work is done on a copy.

    Args:
        image: frame array indexed as [row, column, channel].

    Returns:
        1-D ``float64`` array containing only 0.0 and 1.0.
    """
    # Cropping and strided slicing return views, so copy before writing into
    # the result; otherwise the masks below would overwrite the caller's frame.
    frame = image[35:195:2, ::2, 0].copy()
    frame[frame == 144] = 0  # erase background (background type 1)
    frame[frame == 109] = 0  # erase background (background type 2)
    frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
    # np.float was only an alias of the builtin float and was removed in NumPy 1.24.
    return frame.astype(np.float64).ravel()

## Set up logging directory

In [0]:
# All outputs of a run live under ./results:
#   results/train_out.log              - training log
#   results/Pong-v0/                   - model checkpoints
#   results/Pong-v0/pg_<timestamp>/    - files written for this particular run
log_file = os.path.join(os.getcwd(), 'results', 'train_out.log')
checkpoint_dir = os.path.join(os.getcwd(), 'results',  'Pong-v0')
results_dir = os.path.join(os.getcwd(), 'results', 'Pong-v0', 'pg' + '_' + time.strftime("%d-%m-%Y_%H-%M-%S"))

# Create the directory tree first: the log file handler opens its file right
# away and fails on a fresh checkout where ./results does not exist yet.
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

logger = config_logging(log_file)

In [0]:
# Display this run's results directory; the training cell writes its TensorBoard
# event files there, so this is the path to give TensorBoard as its log directory.
results_dir

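## Set up parameters in Policy Gradient Network

* discount factor $\gamma = 0.99$
* batch size = 10 (episodes per policy update)
* learning rate = 0.001
* number of hidden layers = 1 (the output layer is added on top, giving 2 dense layers in total)
* size of the hidden layer = 200
* restore defaults to False (start from fresh parameters)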

In [0]:
# Hyper-parameters are declared through argparse so the same definitions work
# in a script. In a notebook sys.argv belongs to the kernel, so the parser is
# given an explicit empty argument list below (every option takes its default)
# instead of overwriting sys.argv for the whole process.
parser = argparse.ArgumentParser(description='Policy gradient training on Pong')
parser.add_argument('--render', action='store_true',
                    help='render the environment while training')
parser.add_argument('--gamma', type=float, default=.99,
                    help='discount factor')
parser.add_argument('--batch_size', '-b', type=int, default=10,
                    help='number of episodes per policy update')
parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3,
                    help='learning rate of the optimizer')
parser.add_argument('--n_layers', '-l', type=int, default=1,
                    help='number of hidden layers')
parser.add_argument('--layer_size', '-s', type=int, default=200,
                    help='number of units per hidden layer')
parser.add_argument('--restore', '-restore', action='store_true',
                    help='restore the latest checkpoint instead of starting fresh')
args = parser.parse_args(args=[])

In [0]:
# hyper parameters, copied out of the parsed arguments under the names the
# training cell uses
gamma = args.gamma
learning_rate = args.learning_rate
render = args.render
num_layers = args.n_layers
layer_size = args.layer_size
batch_size = args.batch_size
restore = args.restore
# directory folders: results_dir and checkpoint_dir are already defined by the
# logging-directory cell and are used as they are (the former self-assignments
# were no-ops)

## Training Policy Gradient Network

In [0]:
tf.reset_default_graph()

with tf.Session() as session:
    # set up game environment
    env = gym.make('Pong-v0')
    env = wrappers.Monitor(env, results_dir, force=True)
    # define frame size and action space
    num_observations = 80 * 80
    NOOP, UP, DOWN = 0, 2, 5
    # the policy outputs an index into this list, which maps it to the action id
    # passed to env.step
    pong_actions = [NOOP, UP, DOWN]
    num_actions = len(pong_actions)
    keep_prob = 1

    model = create_model(session, num_observations, num_actions, num_layers,
        layer_size, logger, learning_rate, checkpoint_dir, restore)

    # write a file for tensorboard use
    file_writer = tf.summary.FileWriter(results_dir, session.graph)

    observation = env.reset()
    # initialize training process
    prev_frame = None
    # observations / actions / batch_returns cover every episode since the last
    # policy update; rewards only covers the episode currently being played
    observations, actions, batch_returns = [], [], []
    rewards = []
    episode_number = 0
    reward_sum = 0
    running_reward = None
    step = 0
    try:
        # Runs until the kernel is interrupted.
        while True:
            if render:
                env.render()

            ## Observation
            # the network input is the difference between consecutive processed frames
            curr_frame = preprocess_frame(observation)
            difference_frame = curr_frame - prev_frame if prev_frame is not None else np.zeros(num_observations)
            prev_frame = curr_frame
            observations.append(difference_frame)
            # action
            action = session.run(model.sampled_ac, feed_dict={model.observations : [difference_frame]})
            action = action[0]
            actions.append(action)
            pong_action = pong_actions[action]
            # take action
            observation, reward, done, _ = env.step(pong_action)
            logger.debug('step:{} action:{} pong_action:{} reward:{}'.format(step, action, pong_action, reward))
            # record current reward/score
            reward_sum += reward
            rewards.append(reward)

            # if an episode is done
            if done:
                episode_number += 1
                # Discount within this episode only, then start a fresh reward
                # list so returns never leak across an episode boundary.
                batch_returns.append(discounted_rewards_to_go(rewards, gamma))
                rewards = []

                # record smoothed running reward
                running_reward = reward_sum if running_reward is None else 0.99*running_reward + 0.01*reward_sum
                logger.info('Episode: %d reward: %.4f smoothed reward %.4f' %(episode_number, reward_sum, running_reward))
                running_reward_summary = tf.Summary(value=[tf.Summary.Value(tag="running_reward", simple_value=running_reward)])
                file_writer.add_summary(running_reward_summary, global_step=episode_number)
                print('Finished {} episodes and achieved {} rewards'.format(episode_number, running_reward))

                # update model after every batch
                if episode_number % batch_size == 0:
                    step += 1
                    # one return per recorded step, in the same order as observations/actions
                    q_n = np.concatenate(batch_returns)
                    # normalize advantages over the whole batch
                    advantages = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
                    logger.debug('advantages: {}'.format(advantages))

                    loss, training_scalar, histogram_merged = model.update(session, observations, actions, advantages, keep_prob)

                    file_writer.add_summary(training_scalar, step)
                    file_writer.add_summary(histogram_merged, step)

                    logger.info("Epoch %3d Loss %f" %(step, loss))

                    observations, actions, batch_returns = [], [], []
                # save the model every 100 episodes
                if episode_number % 100 == 0:
                    model.save(session)

                # reset for a new episode; dropping prev_frame keeps the first
                # difference frame from mixing two different episodes
                observation = env.reset()
                prev_frame = None
                reward_sum = 0
    finally:
        # Reached when training is interrupted or fails: flush the TensorBoard
        # events and let the monitor wrapper finish its files.
        file_writer.close()
        env.close()