# DQN2 for Pong
### Version 12.16
### Two Observations and Two actions

## Prepare Environment

Please uncomment and run this chunk if you have not installed gym.

In [0]:
# install OpenAI gym per https://gym.openai.com/docs/
#!pip install gym
#!pip install gym[atari]
#!apt-get install python-opengl



### Import necessary modules and make environment

In [0]:
import gym
import random
import numpy as np
import pandas as pd
import tensorflow as tf
env = gym.make('Pong-v0')

import os
# Directory that holds the checkpoint files written during training.
LOG_DIR = './tmp_DQN2'
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(LOG_DIR, exist_ok=True)

### Create Memory Buffer

In [0]:
# Modified from code which professor provided in scratch06
class Replay:
  """Fixed-capacity experience replay buffer.

  Stores arbitrary experience records (the agent writes (s, a, r, s') tuples)
  and hands back uniformly random batches of them.  Once the buffer holds
  ``max_length`` records, each new record overwrites the oldest one in place,
  so a write stays O(1) regardless of capacity (popping the head of a list
  would shift every remaining element).

  Args:
      max_length: Maximum number of records kept; must be at least 1.

  Raises:
      ValueError: If ``max_length`` is smaller than 1.
  """

  def __init__(self, max_length=300000):
    if max_length < 1:
      raise ValueError("max_length must be at least 1")
    self.buffer = []
    self.length = 0
    self.max_length = max_length
    # index in self.buffer of the oldest record, i.e. the next slot to overwrite
    self._oldest = 0

  def write(self, data):
    """Store one record, replacing the oldest record when the buffer is full."""
    if self.length >= self.max_length:
      # full: overwrite the oldest slot instead of shifting the whole list
      self.buffer[self._oldest] = data
      self._oldest = (self._oldest + 1) % self.length
    else:
      self.buffer.append(data)
      self.length += 1

  def read(self, batch_size):
    """Return a list of up to ``batch_size`` records sampled without replacement.

    Fewer records are returned when the buffer holds less than ``batch_size``.
    """
    return random.sample(self.buffer, min(batch_size, self.length))

### Network to abstract all tensorflow away from agent

In [0]:
# Based on https://github.com/fg91/Deep-Q-Learning/blob/master/DQN.ipynb
class Network:
  """Convolutional Q-network built with the TensorFlow 1.x graph API.

  The graph takes flattened frames, reshapes them to single-channel images,
  runs four convolutional layers and a dense layer, and outputs one Q-value
  per action.  All TensorFlow calls are kept inside this class so that the
  agent only deals with plain arrays.
  """

  def __init__(self, session, hidden=64, learning_rate = 0.00025):
    '''
    Args:
        session: tf.Session used to evaluate and train the graph.
        hidden: Integer, Number of filters in the final convolutional layer. 
                This is different from the DeepMind implementation
        learning_rate: Float, Learning rate for the Adam optimizer
    '''
    self.session = session
    self.n_actions = 2 # number of possible actions 
    self.hidden = hidden
    self.learning_rate = learning_rate
    self.frame_height = 80 # Height of a preprocessed frame of Pong
    self.frame_width = 80 # Width of a preprocessed frame of Pong
    # training targets: one value per action for every sample
    self.y = tf.placeholder(tf.float32, [None, self.n_actions])
    # flattened input frames, one row per sample
    self.observations = tf.placeholder(shape=[None, self.frame_height * self.frame_width], 
                                dtype=tf.float32)
    # back to image layout [batch, height, width, channels]; uses the frame
    # attributes so the reshape always agrees with the placeholder width
    self.input = tf.reshape(
        self.observations,
        [tf.shape(self.observations)[0], self.frame_height, self.frame_width, 1])


    # Convolutional layers ("valid" padding; with 80x80 input the spatial
    # size shrinks 80 -> 19 -> 8 -> 6 -> 4)
    self.conv1 = tf.layers.conv2d(
        inputs=self.input, filters=32, kernel_size=[8, 8], strides=4,
        kernel_initializer=tf.variance_scaling_initializer(scale=2),
        padding="valid", activation=tf.nn.relu, use_bias=False, name='conv1')
    self.conv2 = tf.layers.conv2d(
        inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2, 
        kernel_initializer=tf.variance_scaling_initializer(scale=2),
        padding="valid", activation=tf.nn.relu, use_bias=False, name='conv2')
    self.conv3 = tf.layers.conv2d(
        inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1, 
        kernel_initializer=tf.variance_scaling_initializer(scale=2),
        padding="valid", activation=tf.nn.relu, use_bias=False, name='conv3')
    self.conv4 = tf.layers.conv2d(
        inputs=self.conv3, filters=self.hidden, kernel_size=[3, 3], strides=1, 
        kernel_initializer=tf.variance_scaling_initializer(scale=2),
        padding="valid", activation=tf.nn.relu, use_bias=False, name='conv4')

    # linear read-out: one Q-value per action
    self.q_values = tf.layers.dense(
        inputs=tf.layers.flatten(self.conv4), units=self.n_actions,
        kernel_initializer=tf.variance_scaling_initializer(scale=2), name="q_values")
    # index of the highest Q-value for every sample
    self.best_action = tf.argmax(self.q_values, 1)

    # loss, train_step
    # per-sample squared error summed over the action axis (one entry per sample)
    # NOTE(review): minimize() receives this non-scalar tensor; the gradient
    # computation presumably sums over its elements, i.e. a batch-summed loss --
    # confirm that this (rather than a batch mean) is intended.
    self.loss = tf.reduce_sum(tf.square(self.y - self.q_values),1)
    self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

  def compute(self, state):
    """Evaluate the network on a batch of flattened frames.

    Returns the action values [q(s,a=0), q(s,a=1)] for every row of ``state``.
    """
    return self.session.run(self.q_values, feed_dict={self.observations: state})

  def get_best_action(self, state):
    """Return, for every row of ``state``, the index of the action with the highest Q-value."""
    return self.session.run(self.best_action, feed_dict={self.observations: state})


  def train(self, x_batch, y_batch):
    """Take one optimizer step that fits the Q-values of ``x_batch`` to the targets ``y_batch``."""
    _ = self.session.run(self.train_step, feed_dict={self.observations: x_batch, self.y: y_batch})

### Pong deep q network agent

In [0]:
# Modified from code which professor provided in scratch06
class Agent:
  """Epsilon-greedy deep Q-learning agent for two-action Pong.

  Owns the replay buffer, the Q-network, the exploration rate, the running
  episode reward and the checkpoint saver.
  """

  def __init__(self, tf_session, checkpoint_dir='./tmp_DQN2'):
      """Build the replay buffer, the Q-network and the checkpoint saver.

      Args:
          tf_session: tf.Session in which the network is evaluated and trained.
          checkpoint_dir: Directory where the checkpoint file is stored.
      """
      self.n_actions = 2
      self.frame_height = 80
      self.frame_width = 80
      # first what reward has the agent accrued so far
      self.total_reward = 0 
      # discount, exploration rates, batch size
      self.gamma = 0.99
      self.epsilon = 1.0
      self.batch_size = 32
      # make an experience replay buffer
      self.replay_buffer = Replay()
      # make the network that will be the q function
      self.q = Network(tf_session)  
      self.sess = tf_session
      self.checkpoint_file = os.path.join(checkpoint_dir, 'deep_q_network.ckpt')
      # created after the network so the saver picks up the network variables
      self.saver = tf.train.Saver()


  def gather_experience(self, state, action, reward, new_state):
      """Push one (state, action, reward, new_state) transition onto the replay buffer.

      ``new_state`` is None for a transition that ended the episode.
      """
      self.replay_buffer.write(( state, action, reward, new_state))

  def choose_action(self, state):
      """Pick an action index with an epsilon-greedy policy.

      With probability ``epsilon`` a uniformly random action is returned,
      otherwise the network's best action for ``state`` (a batch of one frame).
      """
      if np.random.rand() < self.epsilon:
          return np.random.randint(0, self.n_actions)
      return self.q.get_best_action(state)[0]


  def q_update(self, step):
      """Sample a batch from the replay buffer and take one Q-learning training step.

      For the action taken in each transition the target is
      ``reward + gamma * max_a q(new_state, a)``, where the bootstrap term is
      zero for terminal transitions (``new_state is None``).  For every other
      action the target equals the current prediction, so it contributes no
      error.  Nothing happens when the buffer is empty.

      Args:
          step: Global step counter; accepted for interface compatibility and
              not used by the update itself.
      """
      # pull a batch from the buffer
      sars_batch = self.replay_buffer.read(self.batch_size)
      # len() instead of np.shape(): the batch is a ragged list of tuples that
      # must not be converted to an array
      n_samples = len(sars_batch)
      if n_samples == 0:
          return

      # current predictions for the states the actions were taken in
      q_last = np.asarray(self.q.compute([s[0] for s in sars_batch]))

      # q-values of the successor states; rows of terminal transitions stay zero
      q_this = np.zeros_like(q_last)
      ind_not_none = [i for i, s in enumerate(sars_batch) if s[3] is not None]
      if ind_not_none:
          # skip the network call when every sampled transition is terminal
          q_this[ind_not_none, :] = self.q.compute([sars_batch[i][3] for i in ind_not_none])

      # now chunk this up as the train_step expects
      x_batch = np.zeros([n_samples, self.frame_height * self.frame_width])
      for i, sample in enumerate(sars_batch):
          x_batch[i, :] = sample[0]

      actions = np.array([s[1] for s in sars_batch], dtype=int)
      rewards = np.array([s[2] for s in sars_batch], dtype=np.float64)
      # start from the current predictions, then overwrite only the entry of
      # the action actually taken with the q-learning target
      y_batch = np.array(q_last, dtype=np.float64)
      y_batch[np.arange(n_samples), actions] = rewards + self.gamma * q_this.max(axis=1)

      # now run the train step
      self.q.train(x_batch,y_batch)

  def set_epsilon(self, step):
      """Anneal the exploration rate according to the global step count.

      Epsilon is left unchanged up to step 300000, is then lowered by a fixed
      amount per call down to a floor of 0.1 until step 1000000, and from
      there on is lowered by a smaller amount per call down to a floor of 0.01.
      """
      if step > 300000 and step <1000000: 
          self.epsilon = max(0.1, self.epsilon - ((1-0.1)/(1000000-300000)))
      if step >= 1000000: 
          self.epsilon = max(0.01, self.epsilon - ((0.1-0.01)/ (10**6)))

  def reset_epsilon(self):
      """Switch to a purely greedy policy (epsilon = 0), e.g. for running after training."""
      self.epsilon = 0.0

  def gather_reward(self, reward):
      """Add ``reward`` to the running total."""
      self.total_reward += reward

  def get_total_reward(self):
      """Return the reward accumulated since the total was last set."""
      return self.total_reward

  def set_total_reward(self, new_total):
      """Overwrite the running reward total (used to reset it per episode)."""
      self.total_reward = new_total

  def load_checkpoint(self):
      """Restore the session's variables from the checkpoint file."""
      print("Loading checkpoint...")
      self.saver.restore(self.sess, self.checkpoint_file)

  def save_checkpoint(self):
      """Write the session's variables to the checkpoint file."""
      print("Saving checkpoint...")
      self.saver.save(self.sess, self.checkpoint_file)

### Preprocess frames

In [0]:
# From Andrej's code
def prepro(I):
  """Turn a 210x160x3 uint8 frame into a 6400-element (80x80) 1D float vector.

  The frame is cropped to rows 35..194, downsampled by a factor of 2 in both
  directions using the first colour channel, and binarized: the two background
  values (144 and 109) and 0 become 0.0, every other pixel (paddles, ball)
  becomes 1.0.  The input frame is not modified.

  Args:
      I: Frame array indexed as [row, column, channel].

  Returns:
      A new flat float64 array with one entry per downsampled pixel.
  """
  # crop, then keep every second row/column of channel 0 (a view, no copy)
  field = I[35:195][::2, ::2, 0]
  # build the mask instead of assigning into the view, which would write
  # through to the caller's frame
  foreground = (field != 0) & (field != 144) & (field != 109)
  # np.float64 replaces the np.float alias that newer NumPy releases removed
  return foreground.astype(np.float64).ravel()

### Run it...

In [0]:
# map actions from 0, 1 to 2, 3 to match the environment's response
action_map = {0:2, 1:3}

with tf.Graph().as_default():
    load_checkpoint = False  # set to True to resume from the saved checkpoint
    ep_rewards = []
    ep_smooth_rewards = []
    smoothed_reward = None
    with tf.Session() as sess:
        # create an agent
        agent = Agent(sess)
        # usual tf initialization; this must run BEFORE restoring a checkpoint,
        # otherwise the initializer overwrites the restored weights
        sess.run(tf.global_variables_initializer())
        if load_checkpoint:
            agent.load_checkpoint()

        #################################
        ## Q-learn (train) DQN on Pong ##
        #################################
        episode_n = 0
        n_steps = 1

        while True:
            episode_n += 1
            # reset environment and agent
            observation1 = env.reset()
            # take one random action to obtain a second frame, so that the
            # first state (a difference of two frames) can be formed
            action = action_map[np.random.randint(0, 2)]
            observation2, _, _, _ = env.step(action)

            observation1 = prepro(observation1)
            observation2 = prepro(observation2)

            agent.set_total_reward(0)

            episode_done = False
            round_n = 1

            while not episode_done:
                # use the difference of two consecutive frames(observations) as a state
                last_state = observation2 - observation1
                # agent chooses an action
                action01 = agent.choose_action([last_state])
                action = action_map[action01]
                # agent takes the action, and the environment responds
                observation3, reward, episode_done, info = env.step(action)
                observation3 = prepro(observation3)
                # update rates
                agent.set_epsilon(n_steps)
                n_steps += 1
                # update agent with reward and data
                agent.gather_reward(reward)
                # update state
                state = observation3 - observation2
                observation1 = observation2
                observation2 = observation3

                # a terminal transition is stored with None as its successor
                # state so that the update uses a zero bootstrap value for it
                if episode_done:
                    agent.gather_experience(last_state, action01, reward, None)
                else:
                    agent.gather_experience(last_state, action01, reward, state)

                # only start training after 300000 steps of experience have
                # been collected (the same step count at which epsilon starts
                # to decay)
                if n_steps > 300000:
                    agent.q_update(n_steps)

                # a non-zero reward marks the end of a round (a point was scored)
                if reward != 0:
                    round_n += 1

            print("Episode %d finished after %d rounds" % (episode_n, round_n))
            print('Step is ', n_steps, ', epsilon is', agent.epsilon)

            episode_reward_sum = agent.get_total_reward()
            # exponentially smoothed version of reward
            if smoothed_reward is None:
                smoothed_reward = episode_reward_sum
            else:
                smoothed_reward = smoothed_reward * 0.99 + episode_reward_sum * 0.01

            print("Reward total was %.3f; discounted moving average of reward is %.3f" % (episode_reward_sum, smoothed_reward))

            ep_rewards.append(episode_reward_sum)
            ep_smooth_rewards.append(smoothed_reward)


            # Set termination rule
            if smoothed_reward > 5:
                # keep the final weights; without this the last (up to 19)
                # episodes of training would be lost on exit
                agent.save_checkpoint()
                break

            if episode_n % 20 == 0:
                agent.save_checkpoint()
                load_checkpoint = True

### Write out rewards and smoothed rewards to csv file

In [0]:
# one row per episode: raw total reward
df = pd.DataFrame(ep_rewards, columns=["ep_rewards"])
df.to_csv('ep_rewards.csv', index=False)

# one row per episode: exponentially smoothed reward.  The per-episode list
# ep_smooth_rewards is written here; smoothed_reward is only the latest scalar
# and cannot be turned into a DataFrame.
df = pd.DataFrame(ep_smooth_rewards, columns=["smoothed_reward"])
df.to_csv('smoothed_reward.csv', index=False)