In [1]:
import numpy as np
import tensorflow as tf
import random
import dqn
from collections import deque

import gym
env = gym.make('CartPole-v0')

# Constants defining our neural network
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

mainDQN = None

dis = 0.9
REPLAY_MEMORY = 50000

[2017-11-14 14:09:35,457] Making new env: CartPole-v0


In [2]:
class DQN:
    def __init__(self, session, input_size, output_size, name="main"):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        
        self._build_network()
        
    def _build_network(self, h_size = 10, l_rate = 1e-1):
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(
                tf.float32, [None, self.input_size], name = "input_x")
        
            # First layer of weights
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                             initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))
        
            # Second Layer of weights
            W2 = tf.get_variable("W2", shape=[h_size, self.output_size],
                             initializer=tf.contrib.layers.xavier_initializer())
        
            # Q prediction
            self._Qpred = tf.matmul(layer1, W2)
        
        # We need to define the parts of the network needed for learning a policy
        self._Y = tf.placeholder(
            shape = [None, self.output_size], dtype = tf.float32)
    
        # Loss function
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        # Learning
        self._train = tf.train.AdamOptimizer(
            learning_rate=l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict = {self._X: x})
    
    def update(self, x_stack, y_stack):
        return self.session.run([self._loss, self._train], feed_dict = 
                               {self._X: x_stack, self._Y: y_stack})

In [3]:
def replay_train(mainDQN, targetDQN, train_batch):
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    
    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        # terminal?
        if done:
            Q[0, action] = reward
        else:
            # get target from target DQN (Q')
            Q[0, action] = reward + dis * np.max(targetDQN.predict(next_state))
            
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
        
    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(x_stack, y_stack)

In [4]:
def get_copy_var_ops(*, dest_scope_name = "target", src_scope_name = "main"):
    # Copy variables src_scope to dest_scope
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
        
    return op_holder

In [5]:
def bot_play(mainDQN):
    # See our trained network in action
    s = env.reset()
    reward_sum = 0
    while True:
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s, reward, done, _ = env.step(a)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break

In [None]:
def main():
    max_episodes = 5000
    # store the previous observations in replay memory
    replay_buffer = deque()
    
    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name = "main")
        targetDQN = DQN(sess, input_size, output_size, name = "target")
        tf.global_variables_initializer().run()
    
        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main")
    
        sess.run(copy_ops)
    
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
        
            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action by greedily from the Q-netsork
                    action = np.argmax(mainDQN.predict(state))
                
                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done: # Penalty
                    reward = -100
                
                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000: # Good enough. Let's move on
                    break

            print("Episode: {}  step:  {}".format(episode, step_count))
            if step_count > 10000:
                pass
                # break

            if episode % 10 == 1:  # train every 10 episode
                # Get a random batch of experiences.
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                #copy q_net -> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
            
if __name__ == "__main__":
    main()

Episode: 0  step:  14
Episode: 1  step:  10
Loss:  0.727751
Episode: 2  step:  58
Episode: 3  step:  29
Episode: 4  step:  31
Episode: 5  step:  18
Episode: 6  step:  35
Episode: 7  step:  24
Episode: 8  step:  28
Episode: 9  step:  15
Episode: 10  step:  32
Episode: 11  step:  23
Loss:  2.52683
Episode: 12  step:  58
Episode: 13  step:  122
Episode: 14  step:  29
Episode: 15  step:  139
Episode: 16  step:  69
Episode: 17  step:  74
Episode: 18  step:  101
Episode: 19  step:  56
Episode: 20  step:  56
Episode: 21  step:  102
Loss:  2.00315
Episode: 22  step:  11
Episode: 23  step:  10
Episode: 24  step:  10
Episode: 25  step:  11
Episode: 26  step:  10
Episode: 27  step:  11
Episode: 28  step:  10
Episode: 29  step:  10
Episode: 30  step:  9
Episode: 31  step:  9
Loss:  474.02
Episode: 32  step:  40
Episode: 33  step:  19
Episode: 34  step:  30
Episode: 35  step:  42
Episode: 36  step:  29
Episode: 37  step:  19
Episode: 38  step:  56
Episode: 39  step:  50
Episode: 40  step:  28
Episo

Loss:  2.78619
Episode: 332  step:  71
Episode: 333  step:  103
Episode: 334  step:  59
Episode: 335  step:  52
Episode: 336  step:  68
Episode: 337  step:  47
Episode: 338  step:  94
Episode: 339  step:  79
Episode: 340  step:  77
Episode: 341  step:  55
Loss:  567.678
Episode: 342  step:  47
Episode: 343  step:  54
Episode: 344  step:  60
Episode: 345  step:  59
Episode: 346  step:  61
Episode: 347  step:  45
Episode: 348  step:  82
Episode: 349  step:  99
Episode: 350  step:  56
Episode: 351  step:  46
Loss:  504.677
Episode: 352  step:  37
Episode: 353  step:  61
Episode: 354  step:  36
Episode: 355  step:  44
Episode: 356  step:  48
Episode: 357  step:  40
Episode: 358  step:  34
Episode: 359  step:  58
Episode: 360  step:  56
Episode: 361  step:  49
Loss:  3.63016
Episode: 362  step:  52
Episode: 363  step:  71
Episode: 364  step:  74
Episode: 365  step:  53
Episode: 366  step:  35
Episode: 367  step:  69
Episode: 368  step:  58
Episode: 369  step:  68
Episode: 370  step:  56
Epi

In [7]:
bot_play(mainDQN)

AttributeError: 'NoneType' object has no attribute 'predict'