# PG for Pong 
### Version 12.05

## Prepare Environment
Please uncomment and run this cell if you have not installed gym.

In [0]:
#!pip install gym
#!pip install gym[atari]
#!apt-get install python-opengl

## Import necessary modules and make directory

In [0]:
import os
import os.path

# Ensure target log dir exists.
# exist_ok=True makes the call a no-op when the directory is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
LOG_DIR = './tmp_PG'
os.makedirs(LOG_DIR, exist_ok=True)
    
import numpy as np
import tensorflow as tf

import argparse
import pickle
import numpy as np
import gym


## Policy Gradient Network

In [0]:
# Length of one flattened observation vector (80 * 80; see the reshape to
# [batch, 80, 80, 1] in Network.__init__).
OBSERVATIONS_SIZE = 6400


class Network:
    """Convolutional policy network that outputs a single "up" probability.

    The whole TensorFlow 1.x graph (placeholders, layers, loss, optimizer and
    saver) is built in ``__init__``, which also opens an interactive session
    and initialises all variables. The remaining methods run that graph or
    save/restore its variables.
    """
    def __init__(self, learning_rate, checkpoints_dir):
        """Build the graph, open a session and initialise the variables.

        Args:
            learning_rate: step size handed to the Adam optimizer.
            checkpoints_dir: directory in which 'policy_network.ckpt' is
                saved to / restored from.
        """

        self.learning_rate = learning_rate

        # Batch of flattened observations, one row of OBSERVATIONS_SIZE values each.
        self.observations = tf.placeholder(tf.float32, [None, OBSERVATIONS_SIZE])

        # Labels for the log loss, one per row (the script in this file
        # feeds 0 for "down" and 1 for "up" via action_dict).
        self.sampled_actions = tf.placeholder(tf.float32, [None, 1])

        # Per-sample weights applied to the log loss.
        self.rewards = tf.placeholder(tf.float32, [None, 1])

        # NOTE(review): the session is never closed by this class.
        self.sess = tf.InteractiveSession()

        # Path prefix used by both save_checkpoint() and load_checkpoint().
        self.checkpoint_file = os.path.join(checkpoints_dir, 'policy_network.ckpt')

        # input layer
        # Un-flatten each row into an 80x80 single-channel image; the batch
        # dimension is taken dynamically from the fed tensor.
        self.input_layer = tf.reshape(self.observations, [tf.shape(self.observations)[0],80,80,1])

        # cnn layer1
        # 5 filters of 3x3, stride 1, 'SAME' padding -> spatial size stays 80x80.
        self.c1 = tf.layers.conv2d(inputs=self.input_layer,
                             filters = 5,
                             activation = tf.nn.relu,
                             strides = [1, 1], 
                             padding = 'SAME',
                             kernel_size = [3, 3])

        # pool layer1
        # 2x2 max pooling with stride 2 halves each spatial dimension (80 -> 40).
        self.p1 = tf.layers.max_pooling2d(inputs=self.c1, pool_size=[2, 2], strides=2)

        # cnn layer2
        # Same configuration as cnn layer1, applied to the pooled feature maps.
        self.c2 = tf.layers.conv2d(inputs=self.p1,
                             filters = 5,
                             activation = tf.nn.relu,
                             strides = [1, 1], 
                             padding = 'SAME',
                             kernel_size = [3, 3])

        # pool layer2
        # Second halving (40 -> 20), giving the 20x20x5 volume flattened below.
        self.p2 = tf.layers.max_pooling2d(inputs=self.c2, pool_size=[2, 2], strides=2)


        # dense layer
        # Flatten the 20x20x5 feature volume to one row per sample.
        self.flat = tf.reshape(self.p2, [-1, 20 * 20 * 5])
        self.dense = tf.layers.dense(self.flat,
                            units=20,
                            activation=tf.nn.relu)

        # last layer probability
        # Single sigmoid unit: a value in (0, 1) per sample, used by the
        # script in this file as the probability of choosing the "up" action.
        self.up_probability = tf.layers.dense(
            self.dense,
            units=1,
            activation=tf.sigmoid,
            kernel_initializer=tf.contrib.layers.xavier_initializer())


        # Log loss between the fed action labels and the predicted
        # probability, weighted per sample by the fed rewards.
        self.loss = tf.losses.log_loss(
            labels=self.sampled_actions,
            predictions=self.up_probability,
            weights=self.rewards)

        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Created after the optimizer, so the saver is constructed once all
        # of the graph's variables exist.
        self.saver = tf.train.Saver()

        # .run() without an explicit session uses the default session, which
        # the InteractiveSession created above installs on construction.
        tf.global_variables_initializer().run()


    def load_checkpoint(self):
        """Restore the variables from ``self.checkpoint_file`` into the session."""
        print("Loading checkpoint...")
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        """Write the session's current variables to ``self.checkpoint_file``."""
        print("Saving checkpoint...")
        self.saver.save(self.sess, self.checkpoint_file)

    def get_up_probability(self, observations):
        """Evaluate the network output for a single observation.

        Args:
            observations: array that is reshaped to one row (``[1, -1]``), so
                it must hold exactly OBSERVATIONS_SIZE values to match the
                placeholder.

        Returns:
            The evaluated ``self.up_probability`` tensor for that batch of one.
        """
        up_probability = self.sess.run(
            self.up_probability,
            feed_dict={self.observations: observations.reshape([1, -1])})
        return up_probability

    def train(self, state_action_reward_tuples, episode_n):
        """Run one optimizer step on a batch of (state, action, reward) tuples.

        Args:
            state_action_reward_tuples: sequence of 3-tuples; the states,
                actions and rewards are each stacked row-wise with np.vstack
                before being fed to the corresponding placeholders.
            episode_n: not used by this method.
        """
        print("Training with %d (state, action, reward) tuples" %
              len(state_action_reward_tuples))

        # Transpose the list of tuples into three parallel sequences.
        states, actions, _rewards = zip(*state_action_reward_tuples)
        states = np.vstack(states)
        actions = np.vstack(actions)
        _rewards = np.vstack(_rewards)

        feed_dict = {
            self.observations: states,
            self.sampled_actions: actions,
            self.rewards: _rewards
        }
        self.sess.run(self.train_op, feed_dict)

## Set necessary variables and define functions for frame preprocessing and discounted reward calculation

In [0]:
# --- Training configuration -------------------------------------------------
learning_rate = 1e-4
batch_size_episodes = 1  # episodes played between two weight updates of the network
checkpoint_every_n_episodes = 10
load_checkpoint = True
discount_factor = 0.99
render = False

# --- Action encoding --------------------------------------------------------
# Values the gym environment expects for moving the paddle up / down.
UP_ACTION, DOWN_ACTION = 2, 3
# Gym action value -> label used for the policy network's output.
action_dict = {
    DOWN_ACTION: 0,
    UP_ACTION: 1,
}


# From Andrej's code
def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400-element (80x80) 1D float vector.

    Args:
        I: frame as a NumPy array indexed [row, column, channel]. It is not
            modified.

    Returns:
        A flat float64 array of length 6400 holding 0.0 for background pixels
        and 1.0 for everything else (paddles, ball).
    """
    # Crop rows 35..194, downsample rows and columns by 2, keep channel 0.
    # Slicing only yields a view, so copy before the in-place masking below;
    # otherwise the writes would land in the caller's frame.
    I = I[35:195:2, ::2, 0].copy()
    I[I == 144] = 0  # erase background (background type 1)
    I[I == 109] = 0  # erase background (background type 2)
    I[I != 0] = 1  # everything else (paddles, ball) just set to 1
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the dtype it always produced.
    return I.astype(np.float64).ravel()

# From Andrej's code
def discount_rewards(r, discount_factor):
    """Compute the discounted return for every time step of a reward sequence.

    Args:
        r: 1D sequence (list, tuple or array) of per-step rewards. Integer
            input is accepted; it is converted to float64 so the discounted
            values are not truncated.
        discount_factor: multiplier applied to the running return for each
            step further into the past.

    Returns:
        A float64 array with the same length as ``r`` where element ``t`` is
        the discounted sum of rewards from ``t`` up to and including the next
        non-zero reward.
    """
    # Work in float64: np.zeros_like on integer rewards would give an integer
    # buffer and silently truncate every discounted value.
    r = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            # reset the sum, since this was a game boundary (pong specific!)
            running_add = 0.0
        running_add = running_add * discount_factor + r[t]
        discounted_r[t] = running_add
    return discounted_r


## Run it

In [0]:
# Create the environment and preprocess the very first frame.
env = gym.make('Pong-v0')
observation = env.reset()
observation = prepro(observation)
last_observation = None  # no previous frame yet; the first input is the frame itself


network = Network(learning_rate, checkpoints_dir=LOG_DIR)
if load_checkpoint:
    # Restoring from a prefix that was never written raises, which would make
    # the very first run (empty LOG_DIR) crash. Only restore when a
    # checkpoint is actually present.
    if tf.train.checkpoint_exists(network.checkpoint_file):
        network.load_checkpoint()
    else:
        print("No checkpoint found at %s; starting with fresh weights"
              % network.checkpoint_file)

batch_state_action_reward_tuples = []
smoothed_reward = None
episode_n = 1
round_n = 1
n_steps = 0
episode_reward_sum = 0
ep_rewards = [] # for plot
ep_smooth_rewards = [] # for plot

while True:

    if render:
        env.render()

    # The network input is the difference between consecutive preprocessed
    # frames (or the frame itself right after a reset).
    observation_delta = observation - last_observation if last_observation is not None else observation
    last_observation = observation

    # Sample the action from the predicted "up" probability.
    up_probability = network.get_up_probability(observation_delta)[0]
    action = UP_ACTION if np.random.uniform() < up_probability else DOWN_ACTION

    observation, reward, episode_done, _ = env.step(action)
    observation = prepro(observation)
    episode_reward_sum += reward
    n_steps += 1

    # Store the input that led to the action, the action's label and the
    # reward received for it.
    tup = (observation_delta, action_dict[action], reward)
    batch_state_action_reward_tuples.append(tup)

    # A non-zero reward marks the end of a round (one point scored).
    if reward != 0:
        if reward == -1:
            print("Round %d: %d time steps; lost..." % (round_n, n_steps))
        elif reward == +1:
            print("Round %d: %d time steps; won!" % (round_n, n_steps))
        round_n += 1
        n_steps = 0

    if episode_done:
        print("Episode %d finished after %d rounds" % (episode_n, round_n-1))
        round_n = 1

        # exponentially smoothed version of reward
        if smoothed_reward is None:
            smoothed_reward = episode_reward_sum
        else:
            smoothed_reward = smoothed_reward * 0.99 + episode_reward_sum * 0.01
        print("Reward total was %.3f; discounted moving average of reward is %.3f" \
            % (episode_reward_sum, smoothed_reward))

        ep_rewards.append(episode_reward_sum)
        ep_smooth_rewards.append(smoothed_reward)

        episode_reward_sum = 0

        if smoothed_reward > 10.0:
            # Target reached: persist the current weights before stopping so
            # the policy that hit the target is not lost between periodic
            # checkpoints.
            network.save_checkpoint()
            break

        if episode_n % batch_size_episodes == 0:
            states, actions, rewards = zip(*batch_state_action_reward_tuples)
            rewards = discount_rewards(rewards, discount_factor)
            # Standardise the discounted rewards; skip the division when the
            # spread is zero to avoid filling the batch with NaN/inf.
            rewards -= np.mean(rewards)
            reward_std = np.std(rewards)
            if reward_std > 0:
                rewards /= reward_std
            batch_state_action_reward_tuples = list(zip(states, actions, rewards))
            network.train(batch_state_action_reward_tuples, episode_n)
            batch_state_action_reward_tuples = []

        if episode_n % checkpoint_every_n_episodes == 0:
            network.save_checkpoint()
            # NOTE(review): presumably so that re-running this cell resumes
            # from the checkpoint just written even if the flag was initially
            # False -- confirm intent.
            load_checkpoint = True

        # Start the next episode from a fresh environment state.
        episode_n += 1
        observation = env.reset()
        observation = prepro(observation)
        last_observation = None
        

import pandas as pd

# Write each per-episode reward history to its own single-column CSV file
# (no index column), raw totals first and the smoothed series second.
for csv_path, column, history in (
    ('ep_rewards.csv', "ep_rewards", ep_rewards),
    ('ep_smooth_rewards.csv', "ep_smooth_rewards", ep_smooth_rewards),
):
    df = pd.DataFrame(history, columns=[column])
    df.to_csv(csv_path, index=False)
