In [1]:
import gym
import tensorflow as tf
import numpy as np
from collections import defaultdict, deque

In [2]:
class Memory():
    def __init__(self, max_size=1000):
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

    def seed(self, env):
        i = 0
        while i < self.max_size:
            state = env.reset()
            done = False
            while True:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                next_state = np.zeros(state.shape) if done else next_state
                self.add((state, action, reward, next_state))
                if done:
                    break
                i += 1

In [82]:
class NeuralAgent(object):
    def __init__(self, state_shape, num_actions, learning_rate=1e-3, hidden_size=32, gamma=0.9, top_two=100):
        self.gamma = gamma
        self.top_two = tf.constant(top_two, dtype=tf.float32)
        
        self.action = tf.placeholder(tf.int32, shape=[None])
        self.actions_one_hot = tf.one_hot(self.action, num_actions)
        
        self.state = tf.placeholder(tf.float32, shape=[None, *state_shape])
        # self.layer0 = tf.layers.dense(self.state, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.layer1 = tf.layers.dense(self.layer0, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.value = tf.layers.dense(self.layer1, num_actions, activation=None)
        self.layer0 = tf.contrib.layers.fully_connected(self.state, hidden_size)
        self.layer1 = tf.contrib.layers.fully_connected(self.layer0, hidden_size)
        self.value = tf.contrib.layers.fully_connected(self.layer1, num_actions, activation_fn=None)

        self.best_values, self.best_actions = tf.nn.top_k(self.value, k=2)
        self.best_action = tf.squeeze(self.best_actions[:, 0])
        self.second_best_action = tf.squeeze(self.best_actions[:, 1])
        self.best_reward = tf.squeeze(self.best_values[:, 0])
        self.second_best_reward = tf.squeeze(self.best_values[:, 1])
#         self.best_action = tf.squeeze(tf.argmax(self.value, axis=1))
#         self.best_reward = tf.squeeze(tf.reduce_max(self.value, axis=1))

        # self.expected_reward = tf.squeeze(tf.gather(self.value, self.action, axis=1))
        self.expected_reward = tf.reduce_sum(tf.multiply(self.value, self.actions_one_hot), axis=1)
        
        # self.reward = tf.placeholder(tf.float32, shape=[None])
        self.target = tf.placeholder(tf.float32, shape=[None])
        # self.next_reward = tf.placeholder(tf.float32, shape=[None])
        # self.loss = tf.nn.l2_loss(self.reward + self.gamma * self.next_reward - self.expected_reward)
        best_actions_float = tf.cast(self.best_actions, tf.float32)
        self.print = tf.Print([self.value, self.best_values, best_actions_float], [self.value, self.best_values, best_actions_float])
        self.loss = tf.reduce_mean(tf.square(self.target - self.expected_reward)) + self.top_two*tf.reduce_mean(tf.square(self.best_reward - self.second_best_reward))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

def train(sess, env, agent, num_episodes, explore_decay=1e-4, batch_size=20, buffer_size=10000):
    memory = Memory(max_size=buffer_size)
    memory.seed(env)
    for i in range(num_episodes):
        epsilon = np.exp(-explore_decay*i)
        state = env.reset()
        total_reward = 0
        while True:
            if np.random.random() < epsilon:
                action = env.action_space.sample()
                value = None
            else:
                action, value = sess.run([agent.best_action, agent.value], feed_dict={
                    agent.state: [state],
                })
            next_state, reward, done, info = env.step(action)
            # if value is not None:
                # print(state, value, action, reward)
            total_reward += reward
            if done:
                # next_state is 0
                # print(state, reward)
                memory.add((state, action, reward, np.zeros(state.shape)))
                break
            memory.add((state, action, reward, next_state))
            next_state = state
            # run training
            loss = train_step(sess, agent, memory, batch_size)

        if i % (num_episodes//30) == 0:
            print(i, loss, epsilon, total_reward)

def train_step(sess, agent, memory, batch_size):
    state, action, reward, next_state = zip(*memory.sample(batch_size))
    next_reward, = sess.run([agent.best_reward], feed_dict={
        agent.state: next_state,
    })
    episode_ends = (next_state == np.zeros(state[0].shape)).all(axis=1)
    target = reward + agent.gamma * next_reward
    target[episode_ends] = -10
    _, loss = sess.run([
        agent.train,
        agent.loss,
    #    agent.print
    ], feed_dict={
        agent.state: state,
        agent.action: action,
        agent.target: target,
    })
    return loss

def play(sess, env, agent):
    state = env.reset()
    env.render()
    done = False
    total_reward = 0
    while not done:
        action, value = sess.run([agent.best_action, agent.value], feed_dict={
            agent.state: [state],
        })
        state, reward, done, info = env.step(action)
        total_reward += reward
        print(state, value, action, reward)
        env.render()
    env.close()
    print(total_reward)

In [84]:
env = gym.make('CartPole-v0')
tf.reset_default_graph()
agent = NeuralAgent(env.reset().shape, env.action_space.n, learning_rate=1e-4, hidden_size=64, gamma=0.999, top_two=1e7)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train(sess, env, agent, 2000, explore_decay=1e-3, batch_size=20, buffer_size=10000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0 203.0741 1.0 26.0
66 3.306761 0.9361308642916188 9.0
132 6.7076674 0.8763409950793732 29.0
198 16.291662 0.8203698531378311 15.0
264 6.0952826 0.7679735396567061 16.0
330 1.2085667 0.7189237334319262 12.0
396 11.009008 0.6730066959373864 23.0
462 1.1906924 0.6300223399419123 16.0
528 6.4316063 0.5897833576128504 12.0
594 1.3088791 0.5521144043069306 11.0
660 1.3286767 0.5168513344916992 18.0
726 11.222813 0.483840486467991 18.0
792 6.1772714 0.4529380127765577 16.0
858 1.1166687 0.4240092533710473 15.0
924 11.093655 0.39692814882588245 19.0
990 6.151258 0.3715766910220457 11.0
1056 16.170982 0.3478444089170874 10.0
1122 6.0968285 0.32562788715856034 10.0
1188 6.174362 0.30483031544319683 14.0
1254 11.08802 0.28536106665812666 12.0
1320 6.209101 0.26713530196585034 10.0
1386 11.166468 0.2500736011120941 9.0
1452 1.088772 0.2341016163455822 24.0
1518 11.09774 0.2191497484416548 9

In [90]:
env = gym.make('CartPole-v0')
play(sess, env, agent)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[-0.00561728 -0.22225645  0.01109633  0.3325386 ] [[0.0038058  0.00372704]] 0 1.0
[-0.01006241 -0.02729419  0.0177471   0.04337545] [[-0.01818687 -0.00615237]] 1 1.0
[-0.0106083   0.16756884  0.01861461 -0.24365568] [[0.00369611 0.00380344]] 1 1.0
[-0.00725692 -0.02781398  0.0137415   0.05484015] [[0.01729968 0.01666563]] 0 1.0
[-0.0078132  -0.22313024  0.0148383   0.35182675] [[0.00333026 0.00316543]] 0 1.0
[-0.0122758  -0.02822241  0.02187484  0.06385949] [[-0.0192674  -0.00571072]] 1 1.0
[-0.01284025 -0.22365105  0.02315203  0.36336301] [[0.00349962 0.00316365]] 0 1.0
[-0.01731327 -0.02886567  0.03041929  0.07806933] [[-0.01928915 -0.00517028]] 1 1.0
[-0.01789059 -0.22441019  0.03198067  0.38019231] [[0.00328123 0.00270816]] 0 1.0
[-0.02237879 -0.02975663  0.03958452  0.09776198] [[-0.01967306 -0.00459427]] 1 1.0
[-0.02297392 -0.2254229   0.04153976  0.40266627] [[0.00303065 0

In [164]:
def human_play(env):
    state = env.reset()
    env.render()
    done = False
    while not done:
        action = int(input())
        state, reward, done, info = env.step(action)
        env.render()
    env.close()

In [167]:
env = gym.make('CartPole-v0')
human_play(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
2


AssertionError: 2 (<class 'int'>) invalid

In [None]:
class Agent(object):
    def __init__(self, num_actions, state_encoder=None, gamma=1, alpha=1e-1):
        # values is a dictionary mapping state -> estimated reward for each action
        self.num_actions = num_actions
        self.gamma = gamma
        self.alpha = alpha
        self.state_encoder = state_encoder
        self.values = defaultdict(lambda: np.zeros(num_actions))
    
    def choose_action(self, state, epsilon=0):
        '''
        Chooses an action according to an epsilon-greedy strategy.
        epsilon=0 corresponds to a pure greedy strategy
        epsilon=1 corresponds to a pure random strategy
        
        Arguments:
            state: a structure representing the environments state
            epsilon: a number between 0 and 1 inclusive
        
        Returns:
            action: an integer representing the action
            reward: the predicted reward
        '''
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
        if state in self.values:
            best = np.argmax(self.values[state])
            if epsilon == 0:
                return best, self.values[state][best]
            probs = np.ones(self.num_actions, dtype=np.float32) * epsilon / self.num_actions
            probs[best] += 1 - epsilon
        else:
            probs = np.ones(self.num_actions, dtype=np.float32) / float(self.num_actions)
        action = np.random.choice(self.num_actions, p=probs)
        return action, self.values[state][action]
    
    def step(self, state, action, reward, next_state):
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
            next_state = self.state_encoder.encode_state(next_state)
        next_action, _ = self.choose_action(next_state, epsilon=0)
        true_value = reward + self.gamma * self.values[next_state][next_action]
        error = true_value - self.values[state][action]
        self.values[state][action] += self.alpha * error
        return error

class Discretizer(object):
    def __init__(self, env, num_buckets, num_sample_episodes=1000):
        self.env = env
        samples = []
        for i in range(num_sample_episodes):
            state = env.reset()
            samples.append(state)
            done = False
            while not done:
                state, reward, done, info = env.step(np.random.choice(env.action_space.n))
                samples.append(state)
        samples = np.array(samples)
        self.low = np.min(samples, axis=0)
        self.high = np.max(samples, axis=0)
        self.range = self.high - self.low
        self.n = num_buckets

    def encode_state(self, state):
        '''Encode state takes in an environments state and returns a tuple.'''
        d = np.round((state - self.low) / self.range * self.n).astype(np.int32)
        return tuple(np.clip(d, 0, self.n))
    

def train(env, agent, num_episodes):
    batch_error = []
    batch_steps = []
    batch_rewards = []
    for i in range(num_episodes):
        epsilon = np.exp((-1e-4)*i)
        state = env.reset()
        errors = list()
        steps = 1
        total_reward = 0
        while True:
            action, pred_reward = agent.choose_action(state, epsilon=epsilon)
            next_state, reward, done, info = env.step(action)
            total_reward += reward * 10 - 1
            error = agent.step(state, action, reward, next_state)
#             errors.append(error**2)
            errors.append(error)
            steps += 1
            state = next_state
            if done:
                break
        batch_error.append(np.mean(errors))
        batch_steps.append(steps)
        batch_rewards.append(total_reward)
        if i % 100 == 0:
            print("Error: %.2f\tReward: %.2f\tSteps: %d" % (np.mean(batch_error), np.mean(batch_rewards), np.mean(batch_steps)))
            batch_error = []
            batch_steps = []
            batch_rewards = []

def play(env, agent):
    state = env.reset()
    env.render()
    done = False
    errors = []
    rewards = []
    steps = 0
    while not done:
        # use greedy strategy
        action, pred_reward = agent.choose_action(state, epsilon=0)
        state, reward, done, info = env.step(action)
        errors.append(reward - pred_reward)
        rewards.append(reward)
        steps += 1
        env.render()
    env.close()
    return sum(rewards), steps, np.mean(np.array(errors)**2)