In [1]:
import tensorflow as tf
import gym
import numpy as np
import random

In [2]:
GAMMA = 0.9
ALPHA = 0.01
MAX_EPISODE = 10000
MAX_STEPS = 500
INITIAL_EPSILON = 0.5
FINAL_EPSILON = 0.01
REPLAY_SIZE = 10000
BATCH_SIZE = 32
MAX_TEST_STEPS = 1000

class DQN(object):
    def __init__(self, env):
        self.feature_size = env.observation_space.shape[0]
        self.action_num = env.action_space.n
        
        self.params = []
        self.replay = []
        
        self.epsilon = INITIAL_EPSILON
        
        self.input_state = tf.placeholder(tf.float32, shape=(None, self.feature_size))
        
        self.Qvalue = self.create_q_net(self.input_state, self.feature_size, self.action_num)
        
        self.target_qvalue = tf.placeholder(tf.float32, shape=(None,))
        
        self.action = tf.placeholder(tf.float32, shape=(None, self.action_num))
        
        self.train_op = self.create_training_op()
        
        self.session = tf.InteractiveSession()
        self.session.run(tf.initialize_all_variables())
        

        
    def create_q_net(self, x, feature_size, action_num):
        w1 = create_variable('w1', (feature_size, 128), tf.truncated_normal_initializer(), self.params)
        b1 = create_variable('b1', (128), tf.random_normal_initializer(), self.params)
        h1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(x, w1), b1))
        
        w2 = create_variable('w2', (128, action_num), tf.truncated_normal_initializer(), self.params)
        b2 = create_variable('b2', (action_num), tf.random_normal_initializer(), self.params)
        
        q = tf.nn.bias_add(tf.matmul(h1, w2), b2)
        
        return q
    
    
    def create_training_op(self):
        chosen_q = tf.reduce_sum(tf.mul(self.Qvalue, self.action), reduction_indices=1)
        loss = tf.reduce_mean(tf.square(self.target_qvalue - chosen_q))
        optim = tf.train.AdamOptimizer(0.0001).minimize(loss)
        return optim
    
    
    def epsilon_greedy_action(self, state):
        Q_value = self.Qvalue.eval(feed_dict={self.input_state: [state]})
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_num-1)
        else:
            return np.argmax(Q_value)
    
    
    def decrease_epsilon(self):
        self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/10000
        
        
    
    def deterministic_action(self, state):
        Q_value = self.Qvalue.eval(feed_dict={self.input_state: [state]})
        return np.argmax(Q_value)
    
    
    
    def train_network(self):
        inp = random.sample(self.replay, BATCH_SIZE)
        
        indx = np.eye(self.action_num)
        
        x = [t[0] for t in inp]
        a = [indx[t[1]] for t in inp]
        r = [t[2] for t in inp]
        nx = [t[3] for t in inp]
        d = [t[4] for t in inp]
        
        qval = np.max(self.Qvalue.eval(feed_dict={self.input_state: nx}), axis=1)
        
        y = []
        for i in range(BATCH_SIZE):
            if d[i]:
                y.append(r[i])
            else:
                y.append(r[i] + GAMMA * qval[i])
                
        
        self.train_op.run(feed_dict={
                self.input_state: x,
                self.action: a,
                self.target_qvalue: y,
            })
        
        
    def learning(self, state, action, reward, next_state, done):
        self.replay.append([state, action, reward, next_state, done])
        if len(self.replay) > REPLAY_SIZE:
            self.replay.pop(0)
        
        if len(self.replay) > BATCH_SIZE:
            self.train_network()

In [3]:
def create_variable(name, shape, initializer, params):
    w = tf.get_variable(name, shape=shape, initializer=initializer)
    params.append(w)
    return w

In [4]:
env = gym.make('CartPole-v0')
agent = DQN(env)

for i in range(MAX_EPISODE):
    state = env.reset()
    total_reward = 0
    for j in range(MAX_STEPS):
        action = agent.epsilon_greedy_action(state)
        next_state, reward, done, _ = env.step(action)

        total_reward += reward
        agent.learning(state, action, reward, next_state, done)
        state = next_state

        if done:
            print "Episode %d, ended in %3d steps, total_reward = %.3f" % (i, j, total_reward)
            break

    if i % 100 == 0:
        total_reward = 0
        state = env.reset()
        for j in range(MAX_TEST_STEPS):
            env.render()
            action = agent.deterministic_action(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward

            if done:
                print "TEST Episode %d, ended in %3d steps, total_reward = %.3f" % (i, j, total_reward)
                break           

[2016-07-04 17:27:35,692] Making new env: CartPole-v0


Episode 0, ended in  12 steps, total_reward = 13.000
TEST Episode 0, ended in   7 steps, total_reward = 8.000
Episode 1, ended in   8 steps, total_reward = 9.000
Episode 2, ended in  10 steps, total_reward = 11.000
Episode 3, ended in  14 steps, total_reward = 15.000
Episode 4, ended in  11 steps, total_reward = 12.000
Episode 5, ended in  12 steps, total_reward = 13.000
Episode 6, ended in  34 steps, total_reward = 35.000
Episode 7, ended in  11 steps, total_reward = 12.000
Episode 8, ended in  12 steps, total_reward = 13.000
Episode 9, ended in  10 steps, total_reward = 11.000
Episode 10, ended in  10 steps, total_reward = 11.000
Episode 11, ended in  11 steps, total_reward = 12.000
Episode 12, ended in  18 steps, total_reward = 19.000
Episode 13, ended in  23 steps, total_reward = 24.000
Episode 14, ended in  15 steps, total_reward = 16.000
Episode 15, ended in  10 steps, total_reward = 11.000
Episode 16, ended in  15 steps, total_reward = 16.000
Episode 17, ended in   8 steps, tota

KeyboardInterrupt: 