In [1]:
import gym
import tensorflow as tf
import numpy as np
from collections import defaultdict, deque

In [2]:
class Memory():
    def __init__(self, max_size=1000):
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

    def seed(self, env):
        i = 0
        while i < self.max_size:
            state = env.reset()
            done = False
            while True:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                next_state = np.zeros(state.shape) if done else next_state
                self.add((state, action, reward, next_state))
                if done:
                    break
                i += 1

In [53]:
class NeuralAgent(object):
    def __init__(self, state_shape, num_actions, learning_rate=1e-3, hidden_size=32, gamma=0.9):
        self.gamma = gamma
        
        self.action = tf.placeholder(tf.int32, shape=[None])
        self.actions_one_hot = tf.one_hot(self.action, num_actions)
        
        self.state = tf.placeholder(tf.float32, shape=[None, *state_shape])
        # self.layer0 = tf.layers.dense(self.state, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.layer1 = tf.layers.dense(self.layer0, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.value = tf.layers.dense(self.layer1, num_actions, activation=None)
        self.layer0 = tf.contrib.layers.fully_connected(self.state, hidden_size)
        self.layer1 = tf.contrib.layers.fully_connected(self.layer0, hidden_size)
        self.value = tf.contrib.layers.fully_connected(self.layer1, num_actions, activation_fn=None)

        self.best_values, self.best_actions = tf.nn.top_k(self.value, k=2)
        self.best_action = tf.squeeze(self.best_actions[:, 0])
        self.second_best_action = tf.squeeze(self.best_actions[:, 1])
        self.best_reward = tf.squeeze(self.best_values[:, 0])
        self.second_best_reward = tf.squeeze(self.best_values[:, 1])
#         self.best_action = tf.squeeze(tf.argmax(self.value, axis=1))
#         self.best_reward = tf.squeeze(tf.reduce_max(self.value, axis=1))

        # self.expected_reward = tf.squeeze(tf.gather(self.value, self.action, axis=1))
        self.expected_reward = tf.reduce_sum(tf.multiply(self.value, self.actions_one_hot), axis=1)
        
        # self.reward = tf.placeholder(tf.float32, shape=[None])
        self.target = tf.placeholder(tf.float32, shape=[None])
        # self.next_reward = tf.placeholder(tf.float32, shape=[None])
        # self.loss = tf.nn.l2_loss(self.reward + self.gamma * self.next_reward - self.expected_reward)
        self.loss = tf.reduce_mean(tf.square(self.target - self.expected_reward)) + tf.reduce_mean(tf.square(self.best_reward - self.second_best_reward))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

def train(sess, env, agent, num_episodes, explore_decay=1e-4, batch_size=20, buffer_size=10000):
    memory = Memory(max_size=buffer_size)
    memory.seed(env)
    for i in range(num_episodes):
        epsilon = np.exp(-explore_decay*i)
        state = env.reset()
        total_reward = 0
        while True:
            if np.random.random() < epsilon:
                action = env.action_space.sample()
                value = None
            else:
                action, value = sess.run([agent.best_action, agent.value], feed_dict={
                    agent.state: [state],
                })
            next_state, reward, done, info = env.step(action)
            # if value is not None:
                # print(state, value, action, reward)
            total_reward += reward
            if done:
                # next_state is 0
                # print(state, reward)
                memory.add((state, action, reward, np.zeros(state.shape)))
                break
            memory.add((state, action, reward, next_state))
            next_state = state
            # run training
            loss = train_step(sess, agent, memory, batch_size)

        if i % (num_episodes//30) == 0:
            print(i, loss, epsilon, total_reward)

def train_step(sess, agent, memory, batch_size):
    state, action, reward, next_state = zip(*memory.sample(batch_size))
    next_reward, = sess.run([agent.best_reward], feed_dict={
        agent.state: next_state,
    })
    episode_ends = (next_state == np.zeros(state[0].shape)).all(axis=1)
    target = reward + agent.gamma * next_reward
    target[episode_ends] = 0
    _, loss = sess.run([agent.train, agent.loss], feed_dict={
        agent.state: state,
        agent.action: action,
        agent.target: target,
    })
    return loss

def play(sess, env, agent):
    state = env.reset()
    env.render()
    done = False
    while not done:
        action, value = sess.run([agent.best_action, agent.value], feed_dict={
            agent.state: [state],
        })
        state, reward, done, info = env.step(action)
        print(state, value, action, reward)
        env.render()
    env.close()

In [55]:
env = gym.make('CartPole-v0')
tf.reset_default_graph()
agent = NeuralAgent(env.reset().shape, env.action_space.n, learning_rate=1e-4, hidden_size=64, gamma=0.999)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train(sess, env, agent, 1000, explore_decay=1e-4, batch_size=20, buffer_size=10000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0 1.0145406 1.0 12.0
33 6.6106834 0.9967054390154381 16.0
66 80.244316 0.9934217321629571 26.0
99 76.3348 0.9901488436829572 26.0
132 20.144272 0.9868867379336502 12.0
165 214.69324 0.9836353793906724 16.0
198 316.42194 0.9803947326466971 46.0
231 119.69615 0.9771647624110493 12.0
264 144.49698 0.9739454335093212 14.0
297 155.05605 0.9707367108829891 15.0
330 11.88345 0.967538559589032 9.0
363 146.74892 0.9643509447995507 77.0
396 273.38467 0.9611738318013887 35.0
429 23.965414 0.958007185995754 16.0
462 32.716953 0.9548509728978424 17.0
495 301.8011 0.9517051581364622 24.0
528 64.54038 0.9485697074536594 21.0
561 316.02863 0.9454445867043453 15.0
594 15.578945 0.9423297618559239 15.0
627 24.464447 0.9392251989879219 21.0
660 134.21796 0.9361308642916188 52.0
693 356.30508 0.9330467240696795 56.0
726 347.13812 0.9299727447357862 12.0
759 116.21411 0.9269088928142737 21.0
792 28.1

In [56]:
env = gym.make('CartPole-v0')
play(sess, env, agent)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[ 0.02698689  0.21116858  0.03766834 -0.23525833] [[48.649868 48.749046]] 1 1.0
[ 0.03121026  0.40573266  0.03296318 -0.51582555] [[48.751907 48.946613]] 1 1.0
[ 0.03932491  0.6003753   0.02264666 -0.79794149] [[48.667076 48.909157]] 1 1.0
[ 0.05133242  0.79517934  0.00668783 -1.08341506] [[49.053036 49.32095 ]] 1 1.0
[ 0.067236    0.9902124  -0.01498047 -1.3739919 ] [[50.333035 50.608376]] 1 1.0
[ 0.08704025  1.18551836 -0.0424603  -1.67132207] [[52.854507 53.14676 ]] 1 1.0
[ 0.11075062  1.38110706 -0.07588675 -1.97692027] [[55.74008  56.066074]] 1 1.0
[ 0.13837276  1.57694173 -0.11542515 -2.29211615] [[58.87269  59.244522]] 1 1.0
[ 0.16991159  1.77292386 -0.16126747 -2.61799291] [[62.08731  62.505493]] 1 1.0
[ 0.20537007  1.96887538 -0.21362733 -2.95531375] [[65.77322  66.221344]] 1 1.0


In [164]:
def human_play(env):
    state = env.reset()
    env.render()
    done = False
    while not done:
        action = int(input())
        state, reward, done, info = env.step(action)
        env.render()
    env.close()

In [167]:
env = gym.make('CartPole-v0')
human_play(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
2


AssertionError: 2 (<class 'int'>) invalid

In [None]:
class Agent(object):
    def __init__(self, num_actions, state_encoder=None, gamma=1, alpha=1e-1):
        # values is a dictionary mapping state -> estimated reward for each action
        self.num_actions = num_actions
        self.gamma = gamma
        self.alpha = alpha
        self.state_encoder = state_encoder
        self.values = defaultdict(lambda: np.zeros(num_actions))
    
    def choose_action(self, state, epsilon=0):
        '''
        Chooses an action according to an epsilon-greedy strategy.
        epsilon=0 corresponds to a pure greedy strategy
        epsilon=1 corresponds to a pure random strategy
        
        Arguments:
            state: a structure representing the environments state
            epsilon: a number between 0 and 1 inclusive
        
        Returns:
            action: an integer representing the action
            reward: the predicted reward
        '''
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
        if state in self.values:
            best = np.argmax(self.values[state])
            if epsilon == 0:
                return best, self.values[state][best]
            probs = np.ones(self.num_actions, dtype=np.float32) * epsilon / self.num_actions
            probs[best] += 1 - epsilon
        else:
            probs = np.ones(self.num_actions, dtype=np.float32) / float(self.num_actions)
        action = np.random.choice(self.num_actions, p=probs)
        return action, self.values[state][action]
    
    def step(self, state, action, reward, next_state):
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
            next_state = self.state_encoder.encode_state(next_state)
        next_action, _ = self.choose_action(next_state, epsilon=0)
        true_value = reward + self.gamma * self.values[next_state][next_action]
        error = true_value - self.values[state][action]
        self.values[state][action] += self.alpha * error
        return error

class Discretizer(object):
    def __init__(self, env, num_buckets, num_sample_episodes=1000):
        self.env = env
        samples = []
        for i in range(num_sample_episodes):
            state = env.reset()
            samples.append(state)
            done = False
            while not done:
                state, reward, done, info = env.step(np.random.choice(env.action_space.n))
                samples.append(state)
        samples = np.array(samples)
        self.low = np.min(samples, axis=0)
        self.high = np.max(samples, axis=0)
        self.range = self.high - self.low
        self.n = num_buckets

    def encode_state(self, state):
        '''Encode state takes in an environments state and returns a tuple.'''
        d = np.round((state - self.low) / self.range * self.n).astype(np.int32)
        return tuple(np.clip(d, 0, self.n))
    

def train(env, agent, num_episodes):
    batch_error = []
    batch_steps = []
    batch_rewards = []
    for i in range(num_episodes):
        epsilon = np.exp((-1e-4)*i)
        state = env.reset()
        errors = list()
        steps = 1
        total_reward = 0
        while True:
            action, pred_reward = agent.choose_action(state, epsilon=epsilon)
            next_state, reward, done, info = env.step(action)
            total_reward += reward * 10 - 1
            error = agent.step(state, action, reward, next_state)
#             errors.append(error**2)
            errors.append(error)
            steps += 1
            state = next_state
            if done:
                break
        batch_error.append(np.mean(errors))
        batch_steps.append(steps)
        batch_rewards.append(total_reward)
        if i % 100 == 0:
            print("Error: %.2f\tReward: %.2f\tSteps: %d" % (np.mean(batch_error), np.mean(batch_rewards), np.mean(batch_steps)))
            batch_error = []
            batch_steps = []
            batch_rewards = []

def play(env, agent):
    state = env.reset()
    env.render()
    done = False
    errors = []
    rewards = []
    steps = 0
    while not done:
        # use greedy strategy
        action, pred_reward = agent.choose_action(state, epsilon=0)
        state, reward, done, info = env.step(action)
        errors.append(reward - pred_reward)
        rewards.append(reward)
        steps += 1
        env.render()
    env.close()
    return sum(rewards), steps, np.mean(np.array(errors)**2)