In [1]:
import numpy as np
import tensorflow as tf
import random
import dqn
from collections import deque

import gym
# Create the CartPole environment shared by the cells below.
env = gym.make('CartPole-v0')
# NOTE(review): this writes a private attribute of the env object, presumably to
# raise the built-in episode step limit so long episodes are not cut short --
# confirm against the installed gym version.
env._max_episode_steps = 10000

# Network dimensions taken from the environment: the length of the observation
# vector and the number of discrete actions.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Module-level placeholder; main() binds its own local mainDQN and never
# assigns this global.
mainDQN = None

# Discount factor applied to the target network's best next-state Q-value in
# replay_train().
dis = 0.9
# Upper bound on the number of transitions main() keeps in its replay buffer.
REPLAY_MEMORY = 50000

[2017-11-15 14:09:03,038] Making new env: CartPole-v0


In [2]:
class DQN:
    """Small fully connected Q-network living in its own variable scope.

    The graph maps a batch of observations (``input_size`` columns) through two
    tanh hidden layers to ``output_size`` Q-values. The layers use weight
    matrices only (no bias terms). A mean-squared-error loss against a target
    placeholder and an Adam training op are built alongside the network.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        """Store the session and dimensions, then build the graph.

        Args:
            session: TensorFlow session used by ``predict`` and ``update``.
            input_size: Number of features in one observation.
            output_size: Number of Q-values (one per action) produced.
            name: Variable scope under which the weights are created.
        """
        self.session, self.net_name = session, name
        self.input_size, self.output_size = input_size, output_size

        self._build_network()

    def _build_network(self, h_size = 10, h2_size = 10, l_rate = 1e-1):
        """Create placeholders, weights, Q-value output, loss and training op.

        Args:
            h_size: Width of the first hidden layer.
            h2_size: Width of the second hidden layer.
            l_rate: Learning rate handed to the Adam optimizer.
        """
        def xavier_weight(label, rows, cols):
            # Each weight matrix gets its own Xavier initializer instance.
            return tf.get_variable(
                label, shape=[rows, cols],
                initializer=tf.contrib.layers.xavier_initializer())

        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(
                tf.float32, [None, self.input_size], name = "input_x")

            # Weights are created in the order W1, W2, W3; code that pairs the
            # variables of two networks by position relies on this order.
            hidden1 = tf.nn.tanh(
                tf.matmul(self._X, xavier_weight("W1", self.input_size, h_size)))
            hidden2 = tf.nn.tanh(
                tf.matmul(hidden1, xavier_weight("W2", h_size, h2_size)))

            # Linear output layer: one Q-value per action.
            self._Qpred = tf.matmul(
                hidden2, xavier_weight("W3", h2_size, self.output_size))

        # Target Q-values are fed from outside the variable scope.
        self._Y = tf.placeholder(
            shape = [None, self.output_size], dtype = tf.float32)

        # Mean squared difference between targets and predictions.
        residual = self._Y - self._Qpred
        self._loss = tf.reduce_mean(tf.square(residual))

        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        self._train = optimizer.minimize(self._loss)

    def predict(self, state):
        """Return the network's Q-values for a single observation.

        ``state`` is reshaped to one row of ``input_size`` columns before it is
        fed to the network.
        """
        single_row = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict = {self._X: single_row})

    def update(self, x_stack, y_stack):
        """Run one training step on a batch.

        Args:
            x_stack: Batch of observations fed to the input placeholder.
            y_stack: Batch of target Q-values fed to the target placeholder.

        Returns:
            What ``session.run`` yields for ``[loss, train_op]``: the loss
            value followed by the training op's result.
        """
        feed = {self._X: x_stack, self._Y: y_stack}
        return self.session.run([self._loss, self._train], feed_dict = feed)

In [3]:
def replay_train(mainDQN, targetDQN, train_batch, discount=None):
    """Fit ``mainDQN`` on one batch of replayed transitions.

    For every transition the main network's current Q-values are used as the
    regression target, except for the entry of the action actually taken:
    that entry becomes ``reward`` for terminal transitions, and
    ``reward + discount * max(targetDQN.predict(next_state))`` otherwise.

    Args:
        mainDQN: Network being trained; must provide ``predict(state)`` and
            ``update(x_stack, y_stack)``.
        targetDQN: Network providing next-state Q-values via ``predict``.
        train_batch: Iterable of ``(state, action, reward, next_state, done)``
            tuples.
        discount: Discount factor for the bootstrapped term. When ``None``
            (the default) the module-level ``dis`` is used.

    Returns:
        Whatever ``mainDQN.update`` returns for the stacked batch.

    Raises:
        ValueError: If ``train_batch`` is empty; a mean-squared loss over
            zero rows is not a meaningful training step.
    """
    transitions = list(train_batch)
    if not transitions:
        raise ValueError("train_batch must contain at least one transition")
    if discount is None:
        discount = dis

    # Collect rows in lists and stack once at the end; stacking inside the
    # loop would copy the whole accumulated array on every iteration.
    state_rows = []
    target_rows = []
    for state, action, reward, next_state, done in transitions:
        q_values = mainDQN.predict(state)

        if done:
            # Terminal transition: no future return to bootstrap from.
            q_values[0, action] = reward
        else:
            # Bootstrap from the target network's best next-state value.
            q_values[0, action] = reward + discount * np.max(
                targetDQN.predict(next_state))

        state_rows.append(state)
        target_rows.append(q_values)

    return mainDQN.update(np.vstack(state_rows), np.vstack(target_rows))

In [4]:
def get_copy_var_ops(*, dest_scope_name = "target", src_scope_name = "main"):
    """Build assign ops that copy trainable variables between two scopes.

    The trainable variables collected under ``src_scope_name`` and
    ``dest_scope_name`` are paired by position, and each destination variable
    is assigned the current value of its source counterpart.

    Args:
        dest_scope_name: Scope whose variables are overwritten.
        src_scope_name: Scope whose variable values are copied.

    Returns:
        A list of assign ops, one per variable pair, to be run in a session.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(collection_key, scope = src_scope_name)
    targets = tf.get_collection(collection_key, scope = dest_scope_name)

    return [target.assign(source.value())
            for source, target in zip(sources, targets)]

In [5]:
def bot_play(mainDQN):
    """Play one rendered episode, always taking the highest-valued action.

    Uses the module-level ``env``. The accumulated reward is printed once the
    environment reports the episode as done.

    Args:
        mainDQN: Network whose ``predict(state)`` output selects each action.
    """
    observation = env.reset()
    total_reward = 0
    finished = False
    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward

    print("Total score: {}".format(total_reward))

In [6]:
def main():
    """Train a DQN on the module-level ``env`` and then play one episode.

    Each episode is played with an epsilon-greedy policy whose exploration
    rate decays with the episode index; every transition is stored in a
    bounded replay buffer. On every ``train_interval``-th episode (those with
    ``episode % train_interval == 1``) the main network is trained on random
    minibatches from the buffer and its weights are then copied into the
    target network. After the last episode ``bot_play`` runs the trained
    network once.
    """
    max_episodes = 1000
    max_steps = 1000         # an episode is abandoned once it exceeds this
    train_interval = 10      # train on episodes 1, 11, 21, ...
    train_iterations = 50    # minibatch updates per training round
    batch_size = 10
    terminal_penalty = -100  # reward substituted on the step that ends an episode

    # Bounded buffer: once REPLAY_MEMORY entries are stored, appending a new
    # transition discards the oldest one automatically.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name = "main")
        targetDQN = DQN(sess, input_size, output_size, name = "target")
        tf.global_variables_initializer().run()

        # Initial copy q_net -> target_net so both start from the same weights.
        copy_ops = get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate: 1.0 at episode 0, shrinking as episodes pass.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network.
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment.
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = terminal_penalty

                # Save the experience to our buffer.
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps:  # Good enough. Let's move on.
                    break

            print("Episode: {}  step:  {}".format(episode, step_count))

            if episode % train_interval == 1:
                # Never ask for more samples than the buffer holds, otherwise
                # random.sample raises ValueError.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(train_iterations):
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                # Copy q_net -> target_net.
                sess.run(copy_ops)

        bot_play(mainDQN)
            
# Entry point: start training when this code is executed directly.
if __name__ == "__main__":
    main()

Episode: 0  step:  15
Episode: 1  step:  12
Loss:  23.47
Episode: 2  step:  13
Episode: 3  step:  14
Episode: 4  step:  14
Episode: 5  step:  14
Episode: 6  step:  11
Episode: 7  step:  16
Episode: 8  step:  9
Episode: 9  step:  11
Episode: 10  step:  14
Episode: 11  step:  16
Loss:  952.459
Episode: 12  step:  51
Episode: 13  step:  100
Episode: 14  step:  27
Episode: 15  step:  65
Episode: 16  step:  71
Episode: 17  step:  119
Episode: 18  step:  54
Episode: 19  step:  37
Episode: 20  step:  69
Episode: 21  step:  155
Loss:  10.764
Episode: 22  step:  134
Episode: 23  step:  75
Episode: 24  step:  133
Episode: 25  step:  104
Episode: 26  step:  94
Episode: 27  step:  125
Episode: 28  step:  71
Episode: 29  step:  70
Episode: 30  step:  143
Episode: 31  step:  77
Loss:  3.6998
Episode: 32  step:  121
Episode: 33  step:  51
Episode: 34  step:  72
Episode: 35  step:  59
Episode: 36  step:  84
Episode: 37  step:  76
Episode: 38  step:  150
Episode: 39  step:  48
Episode: 40  step:  77
Ep

Loss:  2.15478
Episode: 332  step:  210
Episode: 333  step:  129
Episode: 334  step:  354
Episode: 335  step:  306
Episode: 336  step:  169
Episode: 337  step:  786
Episode: 338  step:  347
Episode: 339  step:  507
Episode: 340  step:  720
Episode: 341  step:  332
Loss:  5.69137
Episode: 342  step:  68
Episode: 343  step:  39
Episode: 344  step:  73
Episode: 345  step:  62
Episode: 346  step:  61
Episode: 347  step:  51
Episode: 348  step:  37
Episode: 349  step:  78
Episode: 350  step:  75
Episode: 351  step:  62
Loss:  8.46276
Episode: 352  step:  9
Episode: 353  step:  10
Episode: 354  step:  11
Episode: 355  step:  10
Episode: 356  step:  8
Episode: 357  step:  9
Episode: 358  step:  8
Episode: 359  step:  9
Episode: 360  step:  8
Episode: 361  step:  14
Loss:  511.877
Episode: 362  step:  53
Episode: 363  step:  122
Episode: 364  step:  143
Episode: 365  step:  81
Episode: 366  step:  102
Episode: 367  step:  89
Episode: 368  step:  71
Episode: 369  step:  94
Episode: 370  step:  

Loss:  3.72388
Episode: 652  step:  1001
Episode: 653  step:  1001
Episode: 654  step:  1001
Episode: 655  step:  1001
Episode: 656  step:  1001
Episode: 657  step:  1001
Episode: 658  step:  1001
Episode: 659  step:  1001
Episode: 660  step:  1001
Episode: 661  step:  1001
Loss:  1.21751
Episode: 662  step:  1001
Episode: 663  step:  1001
Episode: 664  step:  1001
Episode: 665  step:  1001
Episode: 666  step:  1001
Episode: 667  step:  1001
Episode: 668  step:  1001
Episode: 669  step:  1001
Episode: 670  step:  1001
Episode: 671  step:  1001
Loss:  0.692955
Episode: 672  step:  1001
Episode: 673  step:  1001
Episode: 674  step:  1001
Episode: 675  step:  1001
Episode: 676  step:  1001
Episode: 677  step:  1001
Episode: 678  step:  1001
Episode: 679  step:  1001
Episode: 680  step:  1001
Episode: 681  step:  1001
Loss:  2.45398
Episode: 682  step:  1001
Episode: 683  step:  1001
Episode: 684  step:  1001
Episode: 685  step:  1001
Episode: 686  step:  1001
Episode: 687  step:  1001
Epi

Loss:  3.99799
Episode: 952  step:  263
Episode: 953  step:  1001
Episode: 954  step:  1001
Episode: 955  step:  1001
Episode: 956  step:  1001
Episode: 957  step:  1001
Episode: 958  step:  404
Episode: 959  step:  1001
Episode: 960  step:  1001
Episode: 961  step:  401
Loss:  3.49527
Episode: 962  step:  396
Episode: 963  step:  1001
Episode: 964  step:  1001
Episode: 965  step:  1001
Episode: 966  step:  639
Episode: 967  step:  1001
Episode: 968  step:  480
Episode: 969  step:  1001
Episode: 970  step:  1001
Episode: 971  step:  1001
Loss:  1.68945
Episode: 972  step:  1001
Episode: 973  step:  1001
Episode: 974  step:  1001
Episode: 975  step:  1001
Episode: 976  step:  564
Episode: 977  step:  770
Episode: 978  step:  1001
Episode: 979  step:  1001
Episode: 980  step:  767
Episode: 981  step:  1001
Loss:  1.56462
Episode: 982  step:  1001
Episode: 983  step:  1001
Episode: 984  step:  1001
Episode: 985  step:  829
Episode: 986  step:  1001
Episode: 987  step:  1001
Episode: 988  