# Experiment

In [None]:
import os
import sys

from datetime import datetime as dt

import cv2
import gym
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Gym registry id of the environment; it is also used to name the save directory
env_name = 'Pong-v0'
# Instantiate the environment from its registry id
env = gym.make(id=env_name)

In [None]:
# Shape of one observation and size of the discrete action set, read from the env
state_shape, n_actions = env.observation_space.shape, env.action_space.n
print(f'States: {state_shape}\tActions: {n_actions}')

In [None]:
def network(state):
    """Build the Q-network graph for a single observation.

    Args:
        state: Tensor holding one observation; it is reshaped to
            `[1, *state_shape]`, i.e. a batch containing one state.

    Returns:
        tuple: `(Q_value, predict)` where `Q_value` is the `[1, n_actions]`
        output of the final dense layer and `predict` is the index of the
        largest Q-value along axis 1 (shape `[1]`).
    """
    net = tf.reshape(state, [1, *state_shape])
    # 3 convolutional layers.
    # Each one needs a non-linearity: with the default `activation=None` the
    # three convolutions would collapse into a single linear transformation.
    net = tf.layers.conv2d(net, filters=16, kernel_size=5, strides=1, padding='same',
                           activation=tf.nn.relu)
    net = tf.layers.conv2d(net, filters=16, kernel_size=5, strides=2, padding='same',
                           activation=tf.nn.relu)
    net = tf.layers.conv2d(net, filters=32, kernel_size=5, strides=2, padding='same',
                           activation=tf.nn.relu)
    # flattening layer
    net = tf.contrib.layers.flatten(net)
    # 2 fully connected layers; the last one is linear because Q-values are
    # unbounded real numbers
    net = tf.layers.dense(net, units=128, activation=tf.nn.relu)
    Q_value = tf.layers.dense(net, units=n_actions)
    predict = tf.argmax(Q_value, axis=1)
    return Q_value, predict

### Policy Network

In [None]:
# Reset default graph
tf.reset_default_graph()

# Placeholders
# One raw observation; `network` adds the leading batch dimension itself.
state_placeholder = tf.placeholder(tf.float32, shape=state_shape)
# Target Q-values for every action of that observation. They are real-valued
# Bellman targets, so the placeholder has to be float32: an int32 placeholder
# would truncate them when they are fed.
# NOTE: the misspelt name is kept on purpose - `run_episode` feeds it by name.
action_paceholder = tf.placeholder(tf.float32, shape=[1, n_actions])

# Loss function
Q_value, predict = network(state_placeholder)
# Q-learning is a regression of Q(s, .) onto the target values, so use the
# squared error. Softmax cross-entropy is not applicable here because the
# targets are not a probability distribution over actions.
squared_error = tf.squared_difference(action_paceholder, Q_value)
loss = tf.reduce_mean(squared_error)

# Training
# `global_step` is incremented by the optimizer on every `train` run
global_step = tf.Variable(0, trainable=False)
optimizer = tf.train.AdamOptimizer()
train = optimizer.minimize(loss, global_step=global_step)

### TensorFlow's `Session`

In [None]:
# Op that initialises every global variable defined in the graph so far
init = tf.global_variables_initializer()
# Open a session on the default graph and run the initialiser once
sess = tf.Session()
sess.run(fetches=init)

### TensorBoard

In [None]:
# Directory layout: everything for this environment lives under one root
save_dir = f'saved/experiment/{env_name}'
model_dir = os.path.join(save_dir, 'models')
tboard_dir = os.path.join(save_dir, 'tensorboard')

# Leaf paths: the event-file directory and the checkpoint file prefix
logdir = os.path.join(tboard_dir, 'log')
model_path = os.path.join(model_dir, 'model.ckpt')

# Record the training loss and merge every registered summary into a single op
tf.summary.scalar(name='loss', tensor=loss)
merged = tf.summary.merge_all()

# Checkpoint saver first, then the event writer (which also receives the graph)
saver = tf.train.Saver()
writer = tf.summary.FileWriter(logdir, graph=sess.graph)

In [None]:
# Resume from the most recent checkpoint when one exists; otherwise make sure
# the checkpoint directory is there for later saves.
if tf.gfile.Exists(model_dir):
    print('INFO: Attempting to restore last checkpoint')
    # `latest_checkpoint` returns None when the directory holds no checkpoint
    # yet; handle that explicitly instead of letting `restore` fail on None.
    ckpt_file = tf.train.latest_checkpoint(model_dir)
    if ckpt_file is None:
        print(f'INFO: No checkpoint found in {model_dir} - keeping the freshly '
              'initialised variables')
    else:
        try:
            saver.restore(sess=sess, save_path=ckpt_file)
            print(f'INFO: Successfully restored last chekcpoint - {ckpt_file}')
        except Exception as e:
            # Best effort: a failed restore is reported but does not stop the
            # notebook.
            sys.stderr.write(f'ERR: Could not restore checkpoint. {e}\n')
            sys.stderr.flush()
else:
    tf.gfile.MakeDirs(model_dir)
    print(f'INFO: Created checkpoint directory - {model_dir}')

### Let's Play the Game

In [None]:
def run_episode(episodes, **kwargs):
    """Play `episodes` episodes, training the network after every transition.

    Uses the notebook-level `env`, `sess`, graph tensors (`Q_value`, `predict`,
    `train`, `global_step`, `merged`), both placeholders, and `saver`,
    `writer` and `model_path`.

    Args:
        episodes (int): Number of episodes to play.
        **kwargs: Optional settings.
            gamma (float): Discount factor applied to the best next Q-value.
                Defaults to 0.9.
            render (bool): Render the environment on every step.
                Defaults to False.
            logging (bool): Write a one-line progress message to stdout after
                every step. Defaults to True.
            save_step (int): Save a checkpoint and a summary every
                `save_step` episodes. Defaults to 1000; a value that is not
                positive disables saving.
            max_trans_per_episode (int): Maximum number of transitions played
                per episode. Defaults to 200.

    Returns:
        dict: `'wins'` is the number of episodes that finished (`done`) with a
        positive summed reward; `'rewards'` is the sum of every reward seen.
    """
    # Keyword arguments
    gamma = kwargs.get('gamma', 0.9)
    render = kwargs.get('render', False)
    logging = kwargs.get('logging', True)
    save_step = kwargs.get('save_step', 1000)
    max_trans_per_episode = kwargs.get('max_trans_per_episode', 200)

    # Metrics
    metrics = {
        'wins':    0,
        'rewards': 0,
    }
    # Last training feed / step; they stay None until a transition has been
    # trained on, so the save branch never touches unbound names.
    feed_dict = None
    i_global = None

    # Game loop
    for episode in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0

        for _transition in range(max_trans_per_episode):
            if render:
                env.render()
            # Room for E-greedy exploration
            Q, _predict = sess.run([Q_value, predict],
                                   feed_dict={state_placeholder: state})
            # !- Random exploration
            action = _predict[0]

            # Transition to a new state
            new_state, reward, done, _ = env.step(action)

            # Bellman target for the action taken. A terminal transition has
            # no future return, so it must not bootstrap from the next state.
            if done:
                Q[0, action] = reward
            else:
                _next_Q = sess.run(Q_value, feed_dict={state_placeholder: new_state})
                Q[0, action] = reward + gamma * np.max(_next_Q)

            # Train
            feed_dict = {state_placeholder: state, action_paceholder: Q}
            _, i_global = sess.run([train, global_step], feed_dict=feed_dict)

            # Update parameters
            metrics['rewards'] += reward
            episode_reward += reward
            state = new_state

            # Logging
            if logging:
                sys.stdout.write(f'\rEpisode: {episode:,}\tGlobal steps: {i_global:,}'
                                 f'\tReward: {reward}')
                sys.stdout.flush()
            # Episode finished. `done` alone only says the game is over, so a
            # win is counted only when the episode's summed reward is positive.
            # NOTE(review): this matches win/lose scoring with +/- rewards -
            # confirm it is the right criterion for other environments.
            if done:
                if episode_reward > 0:
                    metrics['wins'] += 1
                break

        # Save model at intervals: every `save_step`-th completed episode.
        # (`save_step % episode` had the operands reversed and divided by zero
        # on the very first episode.)
        save_due = bool(save_step) and save_step > 0 and (episode + 1) % save_step == 0
        if save_due and feed_dict is not None:
            saver.save(sess=sess, save_path=model_path, global_step=global_step)
            summary = sess.run(merged, feed_dict=feed_dict)
            writer.add_summary(summary=summary, global_step=i_global)
            print('')  # Break print overriding
    return metrics

In [None]:
# Run 10,000 episodes; the value returned by the call is the cell's output
run_episode(10_000)