In [1]:
import gym 
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.ndimage

In [2]:
env = gym.make("CartPole-v0")

[2017-09-17 16:37:02,977] Making new env: CartPole-v0


In [3]:
# Environment Parameters
n_inputs = [90,80] 
n_actions = env.action_space.n
states, actions, rewards = [], [], []
episode_num = 1
n_episodes = 10000
G = 0
average, avg_loss = [], []
render = False

# Hyper Parameters
learning_rate = 1e-7
gamma = .99
save_path='models/cartPole.ckpt'

# Conv Layers
convs = [64, 128]
kerns = [8, 4]
strides = [4, 2]
pads = 'VALID'


# TF Placeholders
X = tf.placeholder(tf.float32, shape=(None, 90, 80, 1), name="X")
Y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions],name="Y")
eps_rewards = tf.placeholder(dtype=tf.float32, shape=[None,1], name="Episode_Rewards")



In [4]:
def resize(image):
    x = np.around(np.mean(image,-1))
    #x = (x-128)/128 - 1
    x = scipy.misc.imresize(x, [90,80],interp='bicubic')
    return(x)

In [5]:
# CONVOLUTION 1 - 1
with tf.name_scope('conv1_1'):
    filter1 = tf.Variable(tf.truncated_normal([8, 8, 1, 64], dtype=tf.float32,
                            stddev=1/np.sqrt(90*80)), name='weights1')
    stride = [1,4,4,1]
    conv = tf.nn.conv2d(X, filter1, stride, padding='VALID')
    biases = tf.Variable(tf.constant(0.0, shape=[convs[0]], dtype=tf.float32),
                         trainable=True, name='biases1')
    out = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(out)
    
    
# CONVOLUTION 1 - 2
with tf.name_scope('conv1_2'):
    filter2 = tf.Variable(tf.truncated_normal([kerns[1], kerns[1], convs[0], convs[1]], dtype=tf.float32,
                                                stddev=1/np.sqrt(int(np.prod(conv1.get_shape()[1:])))), name='weights2')
    stride = [1,strides[1],strides[0],1]
    conv = tf.nn.conv2d(conv1, filter2, stride, padding=pads)
    biases = tf.Variable(tf.constant(0.0, shape=[convs[1]], dtype=tf.float32),
                         trainable=True, name='biases2')
    out = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(out)
    

#FULLY CONNECTED 1
with tf.name_scope('fc1') as scope:
    shape = int(np.prod(conv2.get_shape()[1:]))
    fc1w = tf.Variable(tf.truncated_normal([shape, fc], dtype=tf.float32, stddev=1/np.sqrt(shape)), name='weights3')
    fc1b = tf.Variable(tf.constant(1.0, shape=[fc], dtype=tf.float32),
                       trainable=True, name='biases3')
    flat = tf.reshape(conv2, [-1, shape])
    out = tf.nn.bias_add(tf.matmul(flat, fc1w), fc1b)
    fc1 = tf.nn.relu(out)
    

#FULLY CONNECTED 2 & SOFTMAX OUTPUT
with tf.name_scope('softmax') as scope:
    fc2w = tf.Variable(tf.truncated_normal([fc, n_actions], dtype=tf.float32,
                                           stddev=1e-1), name='weights4')
    fc2b = tf.Variable(tf.constant(1.0, shape=[n_actions], dtype=tf.float32),
                       trainable=True, name='biases4')
    Ylogits = tf.nn.bias_add(tf.matmul(fc1, fc2w), fc2b)
    output = tf.nn.softmax(Ylogits)





In [6]:
# Apply discount to episode rewards & normalize
def discount_rewards(rewards, gamma):
    dis_r = np.zeros_like(rewards)
    G = 0
    for i in reversed(range(0, len(rewards))):
        G = G * gamma + rewards[i]
        dis_r[i] = G
        
    # Normalize rewards add 1e-3 to std incase it's 0
    dis_r = (dis_r - np.mean(dis_r)) / (np.std(dis_r)+1e-5)
    return dis_r


In [7]:
# Define loss
loss = tf.nn.l2_loss(Y - output)
loss_mean = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=eps_rewards)
train = optimizer.apply_gradients(grads)

In [None]:
# Define Session and initialize variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
state = env.reset()
state = resize(env.render(mode='rgb_array'))

for i in range(n_episodes):
    
    while True:
        
        if render:
            env.render()
        # Get action probability from network
        aprob = sess.run([output], feed_dict={X:state.reshape(1, 90, 80, 1)})
        action = np.random.choice(n_actions, p=aprob[0][0])
        oneHot = np.zeros(n_actions)
        oneHot[action] = 1
    
        # Perform action & store results
        state2, reward, done, info = env.step(action)
        G += reward
    
        # Record history
        states.append(state)
        actions.append(oneHot)
        rewards.append(reward)
    
        # Update current state
        state = resize(env.render(mode='rgb_array'))
    
        if done:
            
            average.append(G)
            rewards = discount_rewards(rewards,gamma)
            
            # Feed history into network
            feed = {X: np.dstack(states).reshape(len(states),90,80,1), eps_rewards: np.vstack(rewards), Y: np.vstack(actions)}
            _, losses = sess.run([train, loss_mean], feed_dict=feed)
            avg_loss.append(losses)
            
            if episode_num % 10 == 0:
                print('Episode: {}   G: {}  Average: {:4.3f}  Avg. Loss: {:4.3f}'.format(episode_num, G, np.mean(average), np.mean(avg_loss)))
            
            states, actions, rewards = [], [], []
            G = 0
            state = env.reset()
            state = resize(env.render(mode='rgb_array'))
            episode_num += 1
            break
                
    if sum(average[episode_num-100:episode_num])/100 >= 195:
        print('Solved in {} Episodes'.format(episode_num))
        break
        
    
    

Episode: 10   G: 23.0  Average: 18.300  Avg. Loss: 4.088
Episode: 20   G: 14.0  Average: 18.750  Avg. Loss: 4.313
Episode: 30   G: 11.0  Average: 17.167  Avg. Loss: 3.821
Episode: 40   G: 12.0  Average: 17.725  Avg. Loss: 3.938
Episode: 50   G: 49.0  Average: 19.260  Avg. Loss: 4.400
Episode: 60   G: 20.0  Average: 19.117  Avg. Loss: 4.442
Episode: 70   G: 14.0  Average: 20.371  Avg. Loss: 4.772
Episode: 80   G: 10.0  Average: 20.425  Avg. Loss: 4.831
Episode: 90   G: 28.0  Average: 20.833  Avg. Loss: 4.961
Episode: 100   G: 14.0  Average: 20.590  Avg. Loss: 4.946
Episode: 110   G: 14.0  Average: 21.036  Avg. Loss: 5.108
Episode: 120   G: 25.0  Average: 22.200  Avg. Loss: 5.419
Episode: 130   G: 14.0  Average: 22.738  Avg. Loss: 5.552
Episode: 140   G: 12.0  Average: 22.586  Avg. Loss: 5.527
Episode: 150   G: 21.0  Average: 22.573  Avg. Loss: 5.538
Episode: 160   G: 22.0  Average: 22.400  Avg. Loss: 5.510
Episode: 170   G: 20.0  Average: 22.071  Avg. Loss: 5.443
Episode: 180   G: 49.0 

Episode: 1420   G: 21.0  Average: 22.341  Avg. Loss: 5.634
Episode: 1430   G: 26.0  Average: 22.323  Avg. Loss: 5.629
Episode: 1440   G: 9.0  Average: 22.310  Avg. Loss: 5.627
Episode: 1450   G: 15.0  Average: 22.287  Avg. Loss: 5.621
Episode: 1460   G: 19.0  Average: 22.338  Avg. Loss: 5.633
Episode: 1470   G: 18.0  Average: 22.333  Avg. Loss: 5.631
Episode: 1480   G: 28.0  Average: 22.316  Avg. Loss: 5.627
Episode: 1490   G: 29.0  Average: 22.333  Avg. Loss: 5.631
Episode: 1500   G: 11.0  Average: 22.288  Avg. Loss: 5.620
Episode: 1510   G: 17.0  Average: 22.266  Avg. Loss: 5.615
Episode: 1520   G: 17.0  Average: 22.242  Avg. Loss: 5.608
Episode: 1530   G: 13.0  Average: 22.250  Avg. Loss: 5.610
Episode: 1540   G: 18.0  Average: 22.240  Avg. Loss: 5.608
Episode: 1550   G: 34.0  Average: 22.235  Avg. Loss: 5.606
Episode: 1560   G: 15.0  Average: 22.227  Avg. Loss: 5.603
Episode: 1570   G: 26.0  Average: 22.218  Avg. Loss: 5.602
Episode: 1580   G: 31.0  Average: 22.228  Avg. Loss: 5.60

In [None]:
aprob = sess.run([output], feed_dict={X:state.reshape(1, 90, 80, 1)})

In [None]:
conv1