## Policy Gradient for Pong

In [1]:
import numpy as np
import gym 
from gym import wrappers
import matplotlib.pyplot as plt
import tensorflow as tf
import time

In [2]:
class PolicyNetwork(object):
    """Two-layer softmax policy network trained with a policy-gradient style update.

    Built with the TensorFlow 1.x graph API (placeholders, ``tf.get_variable``,
    ``tf.train.RMSPropOptimizer``). The constructor builds the graph, opens an
    ``InteractiveSession``, initialises all variables and then tries to restore
    the latest checkpoint found next to ``save_path``.

    NOTE(review): variables are created with fixed names ("W1", "W2") in the
    default graph, so constructing a second instance in the same graph will
    presumably fail unless the graph is reset first -- confirm.
    """

    def __init__(self, N_SIZE, h=200, gamma=0.99, eta=1e-3,decay=0.99, save_path = 'models2/pong.ckpt',
                 num_actions=None):
        """Build the graph, start a session and try to load a checkpoint.

        Args:
            N_SIZE: side length of the square input; inputs are flattened to
                ``N_SIZE * N_SIZE`` features.
            h: number of hidden units.
            gamma: discount factor used by ``tf_discount_rewards``.
            eta: learning rate passed to RMSProp.
            decay: decay passed to RMSProp.
            save_path: checkpoint path prefix used by ``save`` / ``load``.
            num_actions: size of the action space. When omitted, falls back to
                the module-level global ``n_actions`` (the original behaviour).
        """
        if num_actions is None:
            # Backward-compatible fallback to the notebook-level global.
            num_actions = n_actions
        self.n_actions = num_actions
        self.gamma = gamma
        self.save_path = save_path
        # Episode index parsed from a restored checkpoint name; 0 when nothing was loaded.
        self.episode_number = 0

        # Placeholders: flattened inputs, one-hot chosen actions, per-step rewards.
        self.tf_x = tf.placeholder(dtype=tf.float32, shape=[None, N_SIZE*N_SIZE], name = "tf_x")
        self.tf_y = tf.placeholder(dtype=tf.float32, shape=[None, num_actions], name = "tf_y")
        self.tf_epr = tf.placeholder(dtype=tf.float32, shape=[None, 1], name = "tf_epr")

        # Weights, initialised with stddev = 1/sqrt(fan_in) (1/N_SIZE == 1/sqrt(N_SIZE*N_SIZE)).
        xavier_l1 = tf.truncated_normal_initializer(mean=0,stddev=1. / N_SIZE, dtype=tf.float32)
        self.W1 = tf.get_variable("W1", [N_SIZE * N_SIZE, h], initializer=xavier_l1)
        xavier_l2 = tf.truncated_normal_initializer(mean=0,stddev=1. / np.sqrt(h), dtype=tf.float32)
        self.W2 = tf.get_variable("W2", [h, num_actions], initializer=xavier_l2)

        # Discount the fed rewards, then standardise them (zero mean, unit variance).
        tf_discounted_epr = self.tf_discount_rewards(self.tf_epr)
        tf_mean, tf_variance = tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
        tf_discounted_epr -= tf_mean
        tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)

        # Optimizer: the standardised discounted rewards are passed as `grad_loss`
        # to compute_gradients, i.e. they weight the gradient of the loss.
        self.tf_aprob = self.tf_policy_forward(self.tf_x)
        loss = tf.nn.l2_loss(self.tf_y - self.tf_aprob)
        optimizer = tf.train.RMSPropOptimizer(eta, decay=decay)
        tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
        self.train_op = optimizer.apply_gradients(tf_grads)

        # Initialise variables first; a successful load() then overwrites them.
        init = tf.global_variables_initializer()
        self.session = tf.InteractiveSession()
        self.session.run(init)
        self.load()

    def tf_discount_rewards(self, tf_r): # tf_r ~ [game_steps, 1]
        """Return the discounted cumulative sum of ``tf_r`` along axis 0.

        Scans the time-reversed rewards with ``acc * gamma + r`` and reverses the
        result back, so entry ``t`` is ``sum_k gamma**k * r[t + k]``. The running
        sum is never reset inside the fed batch.
        """
        discount_f = lambda a, v: a * self.gamma + v
        tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r,[0]))
        tf_discounted_r = tf.reverse(tf_r_reverse, [0])
        return tf_discounted_r

    def tf_policy_forward(self, x): # x ~ [batch, N_SIZE*N_SIZE]
        """Forward pass: ``softmax(relu(x @ W1) @ W2)``; returns action probabilities."""
        h = tf.matmul(x, self.W1)
        h = tf.nn.relu(h)
        logp = tf.matmul(h, self.W2)
        p = tf.nn.softmax(logp)
        return p

    def predict_UP(self,x):
        """Run the policy on a single input.

        ``x`` is reshaped to one row; the result is the network's softmax output
        with shape ``(1, number_of_actions)`` (all action probabilities, despite
        the method name).
        """
        feed = {self.tf_x: np.reshape(x, (1,-1))}
        aprob = self.session.run(self.tf_aprob, feed)
        return aprob

    def update(self, feed):
        """Run one training step; ``feed`` must supply ``tf_x``, ``tf_y`` and ``tf_epr``."""
        return self.session.run(self.train_op, feed)

    def load(self):
        """Create ``self.saver`` and try to restore the latest checkpoint.

        The checkpoint directory is everything before the last ``/`` of
        ``save_path``. On success ``self.episode_number`` is set from the numeric
        suffix of the checkpoint path (``...-<step>``).

        Returns:
            True if a checkpoint was restored, False otherwise. (Previously this
            method always returned None and, because of a misspelled variable
            name hidden by a bare ``except``, never restored anything.)
        """
        self.saver = tf.train.Saver(tf.global_variables())
        save_dir = '/'.join(self.save_path.split('/')[:-1])
        ckpt = tf.train.get_checkpoint_state(save_dir)
        load_path = ckpt.model_checkpoint_path if ckpt is not None else None
        if not load_path:
            print("no saved model to load. starting new session")
            return False
        try:
            self.saver.restore(self.session, load_path)
        except Exception as err:
            # Stay best-effort (training starts fresh) but report why the restore failed.
            print("no saved model to load. starting new session")
            print("(restoring {} failed: {})".format(load_path, err))
            return False
        print("loaded model: {}".format(load_path))
        try:
            self.episode_number = int(load_path.split('-')[-1])
        except ValueError:
            # Checkpoint name carries no numeric step suffix.
            self.episode_number = 0
        return True

    def save(self, global_step=None):
        """Write a checkpoint to ``save_path``.

        Args:
            global_step: step number appended to the checkpoint name. When
                omitted, falls back to the module-level global ``n`` (the
                original behaviour).
        """
        step = n if global_step is None else global_step
        self.saver.save(self.session, self.save_path, global_step=step)
        print("SAVED MODEL #{}".format(step))

In [3]:
# downsampling
def preprocess(I):
    """Turn a 210x160x3 uint8 frame into a flat 80*80 float vector of 0/1 values.

    Rows 35..194 are kept, every second row and column is taken, and only
    channel 0 is used. Pixels equal to 144 or 109 (the two background values)
    become 0; every other non-zero pixel becomes 1.

    The input frame is not modified: the result is computed on a copy (the
    previous version wrote through NumPy views into the caller's array).

    Args:
        I: image array indexed as [row, column, channel].

    Returns:
        1-D ``np.float64`` array with 6400 elements for a 210x160 input.
    """
    # Crop + downsample in one slice; astype() copies, so the caller's frame stays intact.
    # np.float64 replaces the `np.float` alias, which newer NumPy releases no longer provide.
    frame = I[35:195:2, ::2, 0].astype(np.float64)
    frame[(frame == 144) | (frame == 109)] = 0  # erase both background types
    frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
    return frame.ravel()

# testing for the above function
def test_preprocess():
    """Visual smoke test: show one raw Pong frame next to its preprocessed version.

    Creates a fresh "Pong-v0" environment, preprocesses the first observation
    and displays both images with their shapes. The environment is closed even
    if plotting fails.
    """
    env = gym.make("Pong-v0")
    try:
        observation = env.reset()
        # Preprocess a copy so the raw frame shown below is guaranteed to be untouched.
        img = preprocess(observation.copy())

        plt.imshow(observation)
        plt.show()
        print(observation.shape)

        # preprocess() returns a flat vector; imshow needs a 2-D image.
        side = int(np.sqrt(img.size))
        plt.imshow(img.reshape(side, side))
        plt.show()
        print(img.shape)
    finally:
        env.close()

# test_preprocess()

In [4]:
# Create Game Environment
# Defines the module-level globals `env` and `n_actions`, which the PolicyNetwork
# class and the training loop read directly.
env_name = "Pong-v0"
env = gym.make(env_name)
# Wrap the environment in gym's Monitor, writing to 'tmp/pong'.
# NOTE(review): the recorded output near the end of this notebook shows
# DependencyNotInstalled (ffmpeg/avconv) raised by a cell that calls env.close();
# presumably this comes from the Monitor wrapper's video recording -- confirm.
env = wrappers.Monitor(env, 'tmp/pong', force=True)
n_actions = env.action_space.n # number of possible actions

In [5]:
# Initializing Game and State(t-1), action, reward, state(t)
# Per-episode history buffers: network inputs (xs), rewards (rs), one-hot action labels (ys).
xs, rs, ys = [], [], []
obs = env.reset()
# Previous preprocessed frame; None until the first frame has been processed.
prev_x = None

# Exponentially smoothed episode reward and its history (one entry per finished episode).
running_reward = None
running_rewards = []
# Sum of rewards collected in the current episode.
reward_sum = 0
# Episode counter; PolicyNetwork.save() also reads this global as the checkpoint step.
n = 0
done= False
# Side length of the preprocessed square frame (inputs are n_size*n_size long).
n_size = 80
# Training stops once this many episodes have finished.
num_episodes = 500

# Create agent
# The constructor builds the TF graph, opens a session and attempts to load a checkpoint.
agent = PolicyNetwork(n_size)

Instructions for updating:
Colocations handled automatically by placer.
no saved model to load. starting new session


In [6]:
# training loop
# Plays episodes with the current policy, recording (input, action label, reward)
# for every step, and performs one policy update at the end of each episode.
while not done and n < num_episodes:
    # Preprocess the observation; the network input is the difference between
    # consecutive frames (all zeros for the first frame of an episode).
    cur_x = preprocess(obs)
    x = cur_x - prev_x if prev_x is not None else np.zeros(n_size*n_size)
    prev_x = cur_x

    # Sample an action from the policy's probabilities and build its one-hot label
    aprob = agent.predict_UP(x)[0, :]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob)
    label[action] = 1

    # Step the environment and get new measurements
    obs, reward, done, info = env.step(action)
    env.render()
    reward_sum += reward

    # record game history
    xs.append(x)
    ys.append(label)
    rs.append(reward)

    if done:
        # update running reward
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)

        # Train on the finished episode. The feed was previously built but never
        # passed to the agent, so the policy weights were never updated.
        feed = {agent.tf_x: np.vstack(xs),
                agent.tf_epr: np.vstack(rs),
                agent.tf_y: np.vstack(ys)}
        agent.update(feed)

        # print progress console
        if n % 10 == 0:
            print('ep {}: reward: {}, mean reward: {:3f}'.format(n, reward_sum, running_reward))
        else:
            print('\tep {}: reward: {}'.format(n, reward_sum))

        # Start next episode and save model
        xs, rs, ys = [], [], []
        obs = env.reset()
        # Forget the last frame of the finished episode so the first difference
        # frame of the new episode is not computed across the episode boundary.
        prev_x = None
        n += 1 # the Next Episode

        reward_sum = 0
        if n % 50 == 0:
            agent.save()
        done = False

5
3
4
0
1
2
4
4
1
1
5
3
3
1
2
4
2
2
5
2
4
1
0
0
4
2
0
3
1
3
3
0
0
5
5
5
1
1
4
0
2
1
2
4
3
4
1
4
0
5
4
5
0
5
4
5
0
0
0
0
1
5
5
2
1
0
0
5
4
2
2
1
2
4
0
2
1
5
1
1
4
4
1
5
1
4
4
4
0
4
4
4
5
0
0
4
0
1
5
3
2
4
1
3
5
4
3
3
0
5
5
4
5
1
3
1
0
4
5
5
0
0
1
3
4
4
1
3
5
3
2
2
3
3
5
3
2
2
4
3
3
4
4
2
2
1
2
1
3
2
1
1
5
1
2
4
5
0
4
0
5
5
3
0
3
5
4
3
2
4
2
1
1
4
3
1
1
1
3
0
0
2
5
1
3
0
1
2
4
0
2
0
4
2
0
1
2
2
2
3
2
4
3
2
0
4
5
1
5
0
5
5
0
3
1
0
2
3
1
3
2
5
1
5
5
4
1
2
3
1
1
1
4
4
2
3
4
4
3
5
0
5
4
1
2
0
5
3
1
5
0
4
0
0
0
5
4
5
3
0
0
1
4
1
2
4
1
0
4
0
5
3
0
4
3
1
4
4
2
3
1
1
1
1
3
5
4
5
5
2
2
0
5
3
4
4
1
3
4
1
3
1
2
0
5
5
3
5
5
3
2
2
5
3
5
3
4
5
5
4
5
4
3
3
3
4
1
2
4
3
3
0
0
4
3
3
1
2
3
4
1
2
5
3
0
2
2
5
2
0
3
5
5
1
3
4
3
4
4
5
4
2
2
3
0
4
0
4
1
3
2
4
5
5
2
3
1
0
4
0
2
0
0
4
5
5
3
0
0
3
2
3
4
5
1
0
4
0
4
1
4
2
1
3
4
0
5
5
3
1
3
1
4
2
3
0
1
5
0
5
1
1
1
2
0
1
5
1
4
2
1
4
1
5
5
4
2
5
2
1
3
1
2
1
2
1
4
0
0
5
4
0
4
3
3
3
2
1
5
1
2
3
5
1
4
0
3
3
5
1
2
1
2
1
5
4
5
5
3
4
1
1
3
3
2
0
5
0
4
0
3
5
3
2
1
1
4
0
2
4


1
2
0
0
3
1
1
3
3
4
3
3
1
2
2
1
3
1
0
5
3
0
5
4
0
0
3
5
1
1
2
0
3
1
3
4
1
3
0
3
5
1
3
2
3
1
3
1
5
0
3
2
0
4
2
0
0
0
2
5
1
2
3
0
5
5
4
3
0
4
4
2
0
5
5
3
0
1
0
3
3
0
0
1
5
1
1
0
1
3
3
4
5
2
3
2
3
5
1
4
2
5
5
4
3
0
3
4
3
0
4
0
3
0
4
0
2
1
4
0
4
5
5
0
2
0
0
5
3
0
4
0
4
1
1
5
0
1
3
4
5
1
3
5
2
3
4
0
2
5
4
4
3
4
1
2
1
1
1
4
4
3
4
0
5
2
0
4
1
1
3
4
1
2
3
1
3
0
0
4
4
2
4
0
3
1
3
3
4
1
3
2
5
1
1
5
2
1
3
3
3
4
1
0
2
4
4
4
0
2
3
2
2
3
4
0
4
2
4
3
1
1
1
0
3
4
1
3
5
3
3
1
5
3
5
0
4
5
3
5
1
0
0
4
5
5
1
5
2
1
3
5
0
2
4
1
3
4
2
5
4
2
4
5
1
5
1
0
4
2
2
4
5
5
3
5
2
5
4
2
3
1
0
1
1
0
4
4
4
2
0
3
3
5
4
5
1
0
2
3
0
2
5
5
1
3
4
3
2
1
1
2
1
1
2
4
1
4
2
5
1
1
0
0
4
0
2
4
2
4
0
2
4
3
1
1
1
0
1
2
2
5
3
5
1
3
5
2
4
4
2
0
3
2
5
0
3
5
0
3
0
0
0
4
5
3
3
5
4
4
1
0
4
3
5
2
5
2
3
3
2
0
3
2
0
3
3
4
5
4
3
2
4
3
3
2
2
4
1
1
4
4
2
5
5
3
4
3
5
2
2
4
5
5
3
1
0
2
0
5
0
0
3
0
2
0
5
2
4
4
4
5
4
2
3
0
5
5
0
3
3
1
0
3
0
5
4
0
1
5
4
1
0
5
4
3
3
5
4
1
3
4
3
3
5
1
1
1
4
3
4
3
1
2
5
2
0
0
3
4
1
0
4
5
4
3
1
4
1
0
5
0
5
3
3
5
1
2
1
2


KeyboardInterrupt: 

In [32]:
# Plot the smoothed reward recorded at the end of every finished episode,
# then close the environment.
fig, ax = plt.subplots()
ax.plot(running_rewards)
ax.set_xlabel('episodes')
ax.set_ylabel('Running Average')
plt.show()
env.close()

DependencyNotInstalled: Found neither the ffmpeg nor avconv executables. On OS X, you can install ffmpeg via `brew install ffmpeg`. On most Ubuntu variants, `sudo apt-get install ffmpeg` should do it. On Ubuntu 14.04, however, you'll need to install avconv with `sudo apt-get install libav-tools`.

In [7]:
# Close the Gym environment. The plotting cell above also calls env.close(),
# so on a top-to-bottom run this is a second call on the same env.
env.close()