In [1]:
import gym
import tensorflow as tf
import numpy as np
from collections import defaultdict, deque

In [108]:
class Memory():
    def __init__(self, max_size=1000, seed_length=20):
        self.max_size = max_size
        self.seed_length = seed_length
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

    def seed(self, env):
        i = 0
        while i < self.seed_length:
            state = env.reset()
            done = False
            while True:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                next_state = np.zeros(state.shape) if done else next_state
                self.add((state, action, reward, next_state))
                if done:
                    break
                i += 1

In [215]:
class NeuralAgent(object):
    def __init__(self, state_shape, num_actions, learning_rate=1e-3, hidden_size=32, gamma=0.9, top_two=100):
        self.gamma = gamma
        self.top_two = tf.constant(top_two, dtype=tf.float32)
        
        self.action = tf.placeholder(tf.int32, shape=[None])
        self.actions_one_hot = tf.one_hot(self.action, num_actions)
        
        self.state = tf.placeholder(tf.float32, shape=[None, *state_shape])
        # self.layer0 = tf.layers.dense(self.state, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.layer1 = tf.layers.dense(self.layer0, hidden_size, activation=tf.nn.relu, kernel_initializer=tf.truncated_normal_initializer(stddev=1e-1))
        # self.value = tf.layers.dense(self.layer1, num_actions, activation=None)
        self.layer0 = tf.contrib.layers.fully_connected(self.state, hidden_size)
        self.layer1 = tf.contrib.layers.fully_connected(self.layer0, hidden_size)
        self.value = tf.contrib.layers.fully_connected(self.layer1, num_actions, activation_fn=None)

        self.best_values, self.best_actions = tf.nn.top_k(self.value, k=2)
        self.best_action = tf.squeeze(self.best_actions[:, 0])
        self.second_best_action = tf.squeeze(self.best_actions[:, 1])
        self.best_reward = tf.squeeze(self.best_values[:, 0])
        self.second_best_reward = tf.squeeze(self.best_values[:, 1])
#         self.best_action = tf.squeeze(tf.argmax(self.value, axis=1))
#         self.best_reward = tf.squeeze(tf.reduce_max(self.value, axis=1))

        # self.expected_reward = tf.squeeze(tf.gather(self.value, self.action, axis=1))
        self.expected_reward = tf.reduce_sum(tf.multiply(self.value, self.actions_one_hot), axis=1)
        
        # self.reward = tf.placeholder(tf.float32, shape=[None])
        self.target = tf.placeholder(tf.float32, shape=[None])
        # self.next_reward = tf.placeholder(tf.float32, shape=[None])
        # self.loss = tf.nn.l2_loss(self.reward + self.gamma * self.next_reward - self.expected_reward)
        best_actions_float = tf.cast(self.best_actions, tf.float32)
        self.print = tf.Print([self.value, self.best_values, best_actions_float], [self.value, self.best_values, best_actions_float])
        self.loss = tf.reduce_mean(tf.square(self.target - self.expected_reward)) + self.top_two*tf.reduce_mean(tf.square(self.best_reward - self.second_best_reward))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

def train(sess, env, agent, num_episodes, explore_decay=1e-4, batch_size=20, buffer_size=10000):
    memory = Memory(max_size=buffer_size, seed_length=batch_size)
    memory.seed(env)
    for i in range(num_episodes):
        epsilon = np.exp(-explore_decay*i)
        state = env.reset()
        # one random step to get it moving
        state, reward, done, _ = env.step(env.action_space.sample())
        total_reward = 0
        total_reward += reward
        while True:
            if np.random.random() < epsilon:
                action = env.action_space.sample()
                value = None
            else:
                action, value = sess.run([agent.best_action, agent.value], feed_dict={
                    agent.state: [state],
                })
            next_state, reward, done, info = env.step(action)
            # if value is not None:
                # print(state, value, action, reward)
            total_reward += reward
            if done:
                # next_state is 0
                # print(state, reward)
                memory.add((state, action, reward, np.zeros(state.shape)))
                break
            memory.add((state, action, reward, next_state))
            state = next_state
            # run training
            loss = train_step(sess, agent, memory, batch_size)

        if i % (num_episodes//30) == 0:
            print(i, loss, epsilon, total_reward)

def train_step(sess, agent, memory, batch_size):
    state, action, reward, next_state = zip(*memory.sample(batch_size))
    next_reward, = sess.run([agent.best_reward], feed_dict={
        agent.state: next_state,
    })
    episode_ends = (next_state == np.zeros(state[0].shape)).all(axis=1)
    target = reward + agent.gamma * next_reward
    target[episode_ends] = -10
    _, loss = sess.run([
        agent.train,
        agent.loss,
    #    agent.print
    ], feed_dict={
        agent.state: state,
        agent.action: action,
        agent.target: target,
    })
    return loss

def play(sess, env, agent):
    try:
        state = env.reset()
        env.render()
        done = False
        total_reward = 0
        # one random step to get it moving
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        env.render()
        total_reward += reward
        while not done:
            action, value = sess.run([agent.best_action, agent.value], feed_dict={
                agent.state: [state],
            })
            state, reward, done, info = env.step(action)
            total_reward += reward
            print(state, value, action, reward)
            env.render()
        env.close()
        print(total_reward)
    finally:
        env.close()

In [220]:
env = gym.make('CartPole-v1')
tf.reset_default_graph()
agent = NeuralAgent(env.reset().shape, env.action_space.n, learning_rate=1e-4, hidden_size=64, gamma=0.99, top_two=0)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train(sess, env, agent, 2000, explore_decay=5e-3, batch_size=20, buffer_size=10000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0 6.0854683 1.0 11.0
66 1.0594946 0.7189237334319262 61.0
132 24.13586 0.5168513344916992 17.0
198 35.351227 0.3715766910220457 125.0
264 2.5518212 0.26713530196585034 62.0
330 0.3209721 0.19204990862075408 224.0
396 0.12070046 0.13806923731089282 290.0
462 0.07378008 0.09926125155964566 500.0
528 1.8764896 0.07136126955638605 11.0
594 0.10458519 0.05130331033191911 403.0
660 0.07975681 0.036883167401239994 500.0
726 0.036237795 0.02651618440889418 486.0
792 0.084814034 0.019063114291611637 292.0
858 0.70962465 0.013704925297364945 12.0
924 20.591652 0.009852796061187257 9.0
990 0.08315927 0.0070834089290521185 390.0
1056 0.16619228 0.00509243079269919 500.0
1122 0.037723083 0.0036610693577310053 443.0
1188 0.05842433 0.0026320296510131984 500.0
1254 0.029255103 0.001892228583209938 500.0
1320 0.086191036 0.0013603680375478928 500.0
1386 0.570027 0.0009780008683953946 500.0
1452 

In [221]:
env = gym.make('CartPole-v1')
play(sess, env, agent)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[-0.00425674  0.04142584  0.05493316  0.0667296 ] [[83.0953   90.782974]] 1 1.0
[-0.00342822  0.23571893  0.05626775 -0.20812819] [[89.49143 92.1352 ]] 1 1.0
[0.00128615 0.03983943 0.05210519 0.10176005] [[91.79396 90.07947]] 0 1.0
[ 0.00208294  0.23417742  0.05414039 -0.17403932] [[89.55337 92.68454]] 1 1.0
[0.00676649 0.0383241  0.0506596  0.13521971] [[92.28485  91.040535]] 0 1.0
[ 0.00753297  0.2326852   0.053364   -0.14106036] [[89.42551  93.106415]] 1 1.0
[0.01218668 0.03684118 0.05054279 0.16796905] [[92.58145 91.91248]] 0 1.0
[ 0.0129235   0.2312046   0.05390217 -0.10835052] [[88.85094  93.222015]] 1 1.0
[ 0.01754759  0.42551436  0.05173516 -0.38355228] [[92.574615 92.62364 ]] 1 1.0
[ 0.02605788  0.22969751  0.04406412 -0.07501664] [[91.874825 88.02566 ]] 0 1.0
[ 0.03065183  0.42416097  0.04256378 -0.35347803] [[92.61963  92.756325]] 1 1.0
[ 0.03913505  0.22846044  0.0354

[ 0.50829238  0.22698379 -0.00522939 -0.01507812] [[90.70218 86.8695 ]] 0 1.0
[ 0.51283205  0.03193723 -0.00553095  0.2759503 ] [[90.59551 90.52074]] 0 1.0
[ 5.13470797e-01  2.27137650e-01 -1.19482720e-05 -1.84719524e-02] [[86.85936 91.04893]] 1 1.0
[ 5.18013550e-01  4.22259773e-01 -3.81387321e-04 -3.11158649e-01] [[90.33324 90.5446 ]] 1 1.0
[ 0.52645875  0.22714326 -0.00660456 -0.01859602] [[90.92362 87.04049]] 0 1.0
[ 0.53100161  0.03211664 -0.00697648  0.27199579] [[90.649704 90.51128 ]] 0 1.0
[ 0.53164394  0.22733744 -0.00153656 -0.02287936] [[86.968254 91.08218 ]] 1 1.0
[ 0.53619069  0.4224814  -0.00199415 -0.31604669] [[90.38179 90.51053]] 1 1.0
[ 0.54464032  0.22738791 -0.00831509 -0.02399332] [[91.0489 87.0759]] 0 1.0
[ 0.54918808  0.03238618 -0.00879495  0.26605455] [[90.79675 90.52331]] 0 1.0
[ 0.5498358   0.22763255 -0.00347386 -0.02938935] [[87.09545 91.08108]] 1 1.0
[ 0.55438845  0.03256059 -0.00406165  0.26219551] [[90.54052 90.49548]] 0 1.0
[ 0.55503966  0.22774028  0.00

[ 0.88356606  0.03698364 -0.00571507  0.16462215] [[91.09519 90.80985]] 0 1.0
[ 0.88430573  0.23218694 -0.00242262 -0.12985823] [[87.194176 91.10555 ]] 1 1.0
[ 0.88894947  0.03709977 -0.00501979  0.16205941] [[91.0346   90.767075]] 0 1.0
[ 0.88969146  0.23229322 -0.0017786  -0.1322029 ] [[87.13806 91.06573]] 1 1.0
[ 0.89433733  0.0371968  -0.00442266  0.15991838] [[90.9859  90.73771]] 0 1.0
[ 0.89508126  0.23238178 -0.00122429 -0.1341565 ] [[87.094826 91.040016]] 1 1.0
[ 0.8997289   0.03727739 -0.00390742  0.15813993] [[90.949974 90.72204 ]] 0 1.0
[ 9.00474447e-01  2.32455066e-01 -7.44620698e-04 -1.35773126e-01] [[87.062485 91.02648 ]] 1 1.0
[ 0.90512355  0.03734379 -0.00346008  0.15667479] [[90.926186 90.71926 ]] 0 1.0
[ 9.05870424e-01  2.32515108e-01 -3.26587402e-04 -1.37097699e-01] [[87.03939  91.023415]] 1 1.0
[ 0.91052073  0.03739784 -0.00306854  0.15548218] [[90.912865 90.727715]] 0 1.0
[ 9.11268683e-01  2.32563588e-01  4.11021691e-05 -1.38167222e-01] [[87.024124 91.029434]] 1 1.

[ 0.96951025 -0.33080294 -0.02270003  0.25466061] [[74.90581  74.380646]] 0 1.0
[ 0.96289419 -0.13536435 -0.01760682 -0.04509498] [[70.60028  74.666695]] 1 1.0
[ 0.96018691 -0.33022946 -0.01850872  0.24198124] [[74.34396 73.81662]] 0 1.0
[ 0.95358232 -0.13484808 -0.0136691  -0.05648188] [[70.037025 74.11076 ]] 1 1.0
[ 0.95088535 -0.3297714  -0.01479873  0.2318572 ] [[73.86985 73.35366]] 0 1.0
[ 0.94428993 -0.13444115 -0.01016159 -0.06545677] [[69.546974 73.64825 ]] 1 1.0
[ 0.9416011  -0.32941595 -0.01147072  0.22400288] [[73.451675 72.9603  ]] 0 1.0
[ 0.93501278 -0.13413195 -0.00699067 -0.07227617] [[69.11721  73.267525]] 1 1.0
[ 0.93233015 -0.32915298 -0.00843619  0.21819299] [[73.10014 72.64864]] 0 1.0
[ 0.92574709 -0.13391146 -0.00407233 -0.07713908] [[68.73652  72.959175]] 1 1.0
[ 0.92306886 -0.32897479 -0.00561511  0.21425623] [[72.81277 72.41765]] 0 1.0
[ 0.91648936 -0.13377302 -0.00132999 -0.08019265] [[68.39499  72.715675]] 1 1.0
[ 0.9138139  -0.32887588 -0.00293384  0.21207036

[ 0.60232593  0.05774371  0.00728313 -0.29336053] [[71.635605 71.9233  ]] 1 1.0
[ 0.6034808  -0.13748132  0.00141592  0.00161047] [[72.78297  68.875885]] 0 1.0
[ 0.60073118 -0.33262355  0.00144813  0.2947398 ] [[72.00034 71.98552]] 0 1.0
[ 0.59407871 -0.13752227  0.00734293  0.00251395] [[67.73497 72.69391]] 1 1.0
[ 0.59132826  0.05749361  0.00739321 -0.28784318] [[71.66319  72.006256]] 1 1.0
[ 0.59247813 -0.13773299  0.00163634  0.0071623 ] [[72.87768 69.01053]] 0 1.0
[ 0.58972347  0.05736545  0.00177959 -0.28500389] [[72.01842 72.07378]] 1 1.0
[ 0.59087078 -0.13778183 -0.00392049  0.00823977] [[73.04232 68.90695]] 0 1.0
[ 0.58811515 -0.33284734 -0.00375569  0.29968316] [[72.285965 71.99582 ]] 0 1.0
[ 0.5814582  -0.13767206  0.00223797  0.00581813] [[68.21148  72.844284]] 1 1.0
[ 0.57870476  0.05741773  0.00235433 -0.28615785] [[71.86288  71.882675]] 1 1.0
[ 0.57985311 -0.13773772 -0.00336882  0.00726669] [[72.90225 68.73172]] 0 1.0
[ 0.57709836 -0.3328112  -0.00322349  0.29888481] [[

In [164]:
def human_play(env):
    state = env.reset()
    env.render()
    done = False
    while not done:
        action = int(input())
        state, reward, done, info = env.step(action)
        env.render()
    env.close()

In [167]:
env = gym.make('CartPole-v0')
human_play(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
2


AssertionError: 2 (<class 'int'>) invalid

In [None]:
class Agent(object):
    def __init__(self, num_actions, state_encoder=None, gamma=1, alpha=1e-1):
        # values is a dictionary mapping state -> estimated reward for each action
        self.num_actions = num_actions
        self.gamma = gamma
        self.alpha = alpha
        self.state_encoder = state_encoder
        self.values = defaultdict(lambda: np.zeros(num_actions))
    
    def choose_action(self, state, epsilon=0):
        '''
        Chooses an action according to an epsilon-greedy strategy.
        epsilon=0 corresponds to a pure greedy strategy
        epsilon=1 corresponds to a pure random strategy
        
        Arguments:
            state: a structure representing the environments state
            epsilon: a number between 0 and 1 inclusive
        
        Returns:
            action: an integer representing the action
            reward: the predicted reward
        '''
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
        if state in self.values:
            best = np.argmax(self.values[state])
            if epsilon == 0:
                return best, self.values[state][best]
            probs = np.ones(self.num_actions, dtype=np.float32) * epsilon / self.num_actions
            probs[best] += 1 - epsilon
        else:
            probs = np.ones(self.num_actions, dtype=np.float32) / float(self.num_actions)
        action = np.random.choice(self.num_actions, p=probs)
        return action, self.values[state][action]
    
    def step(self, state, action, reward, next_state):
        if self.state_encoder is not None:
            state = self.state_encoder.encode_state(state)
            next_state = self.state_encoder.encode_state(next_state)
        next_action, _ = self.choose_action(next_state, epsilon=0)
        true_value = reward + self.gamma * self.values[next_state][next_action]
        error = true_value - self.values[state][action]
        self.values[state][action] += self.alpha * error
        return error

class Discretizer(object):
    def __init__(self, env, num_buckets, num_sample_episodes=1000):
        self.env = env
        samples = []
        for i in range(num_sample_episodes):
            state = env.reset()
            samples.append(state)
            done = False
            while not done:
                state, reward, done, info = env.step(np.random.choice(env.action_space.n))
                samples.append(state)
        samples = np.array(samples)
        self.low = np.min(samples, axis=0)
        self.high = np.max(samples, axis=0)
        self.range = self.high - self.low
        self.n = num_buckets

    def encode_state(self, state):
        '''Encode state takes in an environments state and returns a tuple.'''
        d = np.round((state - self.low) / self.range * self.n).astype(np.int32)
        return tuple(np.clip(d, 0, self.n))
    

def train(env, agent, num_episodes):
    batch_error = []
    batch_steps = []
    batch_rewards = []
    for i in range(num_episodes):
        epsilon = np.exp((-1e-4)*i)
        state = env.reset()
        errors = list()
        steps = 1
        total_reward = 0
        while True:
            action, pred_reward = agent.choose_action(state, epsilon=epsilon)
            next_state, reward, done, info = env.step(action)
            total_reward += reward * 10 - 1
            error = agent.step(state, action, reward, next_state)
#             errors.append(error**2)
            errors.append(error)
            steps += 1
            state = next_state
            if done:
                break
        batch_error.append(np.mean(errors))
        batch_steps.append(steps)
        batch_rewards.append(total_reward)
        if i % 100 == 0:
            print("Error: %.2f\tReward: %.2f\tSteps: %d" % (np.mean(batch_error), np.mean(batch_rewards), np.mean(batch_steps)))
            batch_error = []
            batch_steps = []
            batch_rewards = []

def play(env, agent):
    state = env.reset()
    env.render()
    done = False
    errors = []
    rewards = []
    steps = 0
    while not done:
        # use greedy strategy
        action, pred_reward = agent.choose_action(state, epsilon=0)
        state, reward, done, info = env.step(action)
        errors.append(reward - pred_reward)
        rewards.append(reward)
        steps += 1
        env.render()
    env.close()
    return sum(rewards), steps, np.mean(np.array(errors)**2)