In [1]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class FfAgentDiscrete(object):
    def __init__(self, session, input_size, output_size, gamma=0.99):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.gamma = gamma
        
        self.observations_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])
        # expected sum of discounted rewards
        self.esdr_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        self.v_s_ph  = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s)
        self.v_sp_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s')
        self.r_ph    = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # r_t+1
        self.actions_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])
        #self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        
        # Shared-parameter policy and value network
        W1 = tf.get_variable("w1", [self.input_size, 128], initializer=tf.initializers.random_normal(stddev=0.01))
        b1 = tf.get_variable("b1", [128], initializer=tf.initializers.random_normal(stddev=0.01))
        W2p = tf.get_variable("w2p", [128, self.output_size], initializer=tf.initializers.random_normal(stddev=0.01)) # policy
        b2p = tf.get_variable("b2p", [self.output_size], initializer=tf.initializers.random_normal(stddev=0.01))
        W2v = tf.get_variable("w2v", [128, 1], initializer=tf.initializers.random_normal(stddev=0.01)) # value
        b2v = tf.get_variable("b2v", [1], initializer=tf.initializers.random_normal(stddev=0.01))
        
        l1 = tf.nn.relu(tf.matmul(self.observations_ph, W1) + b1)
        # this will need to be changed to accommodate the range and character of action values
        l2_logits = tf.matmul(l1, W2p) + b2p
        l2p = tf.nn.softmax(l2_logits)
        l2v = tf.matmul(l1, W2v) + b2v
        
        self.reinforce_loss = tf.reduce_sum(
            (self.esdr_ph)*tf.nn.softmax_cross_entropy_with_logits(logits=l2_logits, labels=self.actions_ph)
        )
        self.reinforce_optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.reinforce_loss)
        
        self.actor_critic_loss = -1.*tf.reduce_sum((self.esdr_ph - self.v_s_ph)*tf.log(l2p)) + tf.reduce_sum(tf.square(l2v - self.esdr_ph))
        self.actor_critic_optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.actor_critic_loss)
        
        self.action_predictions = l2p
        self.esdr_predictions = l2v
        
    # For advantage:
    #    Add single timestep reward samples
    #    Add placeholders for estimated V(s) and V(s')
    def trainSarBatches(self, states, actions, discounted_rewards):
        '''
        Expects inputs to be numpy arrays of shape:
            states = [batch_size, num_state_features]
            actions = [batch_size, num_available_actions]
            discounted_rewards = [batch_size, 1]
        
        The idea is that all episodes have been parsed through and shuffled into
        one big batch of training data.
        '''
        #print("Shape of states:", states.shape)
        advantage_feeds = {
            self.observations_ph: states
        }
        #print("shape of discounted rewards:", discounted_rewards.shape)
        
        advantage_fetches = self.esdr_predictions
        
        v_predictions = self.session.run(advantage_fetches, feed_dict=advantage_feeds)
        #print(v_predictions.shape)
        
        optimize_feeds = {
            self.observations_ph: states,
            self.esdr_ph: discounted_rewards,
            self.v_s_ph: v_predictions,
            self.actions_ph: actions
        }
        
        optimize_fetches = [
            self.reinforce_loss,
            #self.actor_critic_loss,
            self.action_predictions,
            self.esdr_predictions,
            self.reinforce_optimizer
            #self.actor_critic_optimizer
        ]
        
        loss, action_predictions, esdr_predictions, _ = self.session.run(optimize_fetches, feed_dict=optimize_feeds)
        return loss, action_predictions, esdr_predictions
    
    def predict(self, state):
        '''
        Expects state to have the shape [num_state_features]
        '''
        
        feeds = {
            self.observations_ph: np.array([state])
        }
        
        fetches = [
            self.action_predictions,
            self.esdr_predictions
        ]
        action_predictions, esdr_predictions = self.session.run(fetches, feed_dict=feeds)
        return action_predictions, esdr_predictions

In [3]:
def prepSarData(states, actions, rewards, gamma=0.99):
    '''
    Converts temporally synced lists of states, actions, and rewards into shuffled
    numpy matrices for training.
    '''
    #print("lengths")
    #print(len(states), len(actions), len(rewards))
    discounted_sum_rewards = 0
    discounted_rewards = []
    for i in range(len(rewards) - 1, -1, -1):
        discounted_sum_rewards = gamma*discounted_sum_rewards + rewards[i]
        discounted_rewards.append(discounted_sum_rewards)
    discounted_rewards = np.expand_dims(np.array(discounted_rewards[::-1]), axis=1)
    
    actions = np.array(actions)
    states = np.array(states)
    indices = [i for i in range(len(actions))]
    np.random.shuffle(indices)
    
    actions_shuffled = actions[indices]
    states_shuffled = states[indices]
    discounted_rewards_shuffled = discounted_rewards[indices]
    
    return states_shuffled, actions_shuffled, discounted_rewards_shuffled

In [4]:
def accumulateData(env, agent, max_steps=1000, max_rollouts=50):
    states = []
    actions = []
    rewards = []
    for rollout_count in range(max_rollouts):
        ep_states = []
        ep_actions = []
        ep_rewards = []
        ep_state_t = env.reset()
        ep_states.append(ep_state_t)
        for t in range(max_steps):
            ep_action_t = np.random.choice(a=range(agent.output_size), p=agent.predict(ep_state_t)[0][0]) #discrete
            #print(ep_action_t)
            #print(ep_action_t)
            
            ep_state_tp1, ep_reward_tp1, done, _ = env.step(ep_action_t)
            ep_states.append(ep_state_tp1)
            ep_action_t_onehot = np.zeros((2,)) #discrete
            ep_action_t_onehot[ep_action_t] = 1. #discrete
            ep_actions.append(ep_action_t_onehot) #discrete
            
            ep_rewards.append(ep_reward_tp1)
            if done:
                ep_states.pop(-1)
                #ep_rewards.pop(-1)
                break
            ep_state_t = ep_state_tp1
        states.append(ep_states)
        actions.append(ep_actions)
        rewards.append(ep_rewards)
    return states, actions, rewards

In [5]:
env = gym.make("CartPole-v0")
session = tf.Session()
print(env.observation_space.shape)
print(env.action_space)
num_actions = 0
while env.action_space.contains(num_actions):
    num_actions += 1
agent = FfAgentDiscrete(session, env.observation_space.shape[0], num_actions)

session.run(tf.global_variables_initializer())

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
(4,)
Discrete(2)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [6]:
average_rewards = []
for i in range(100):
    states, actions, rewards = accumulateData(env, agent)
    #print("accmulated data:", len(states[0]), len(actions[0]), len(rewards[0]))
    for j in range(len(actions)):
        ret = prepSarData(states[j], actions[j], rewards[j])
        #print(ret[0].shape, ret[1].shape, ret[2].shape)
        for k in range(1):
            agent.trainSarBatches(ret[0], ret[1], ret[2])
    print(i)
    average_reward = np.average([sum(r) for r in rewards])
    print("average reward: ", average_reward)
    average_rewards.append(average_reward)
    
env.close()

0
average reward:  22.06
1
average reward:  28.74
2
average reward:  32.78
3
average reward:  36.84
4
average reward:  51.52
5
average reward:  54.9
6
average reward:  54.36
7
average reward:  58.66
8
average reward:  53.0
9
average reward:  41.5
10
average reward:  48.14
11
average reward:  49.8
12
average reward:  50.94
13
average reward:  61.46
14
average reward:  72.94
15
average reward:  57.5
16
average reward:  58.74
17
average reward:  69.28
18
average reward:  64.84
19
average reward:  49.92
20
average reward:  56.6
21
average reward:  103.24
22
average reward:  100.12
23
average reward:  100.52
24
average reward:  112.72
25
average reward:  148.9
26
average reward:  150.9
27
average reward:  163.6
28
average reward:  161.88
29
average reward:  159.52
30
average reward:  170.88
31
average reward:  163.1
32
average reward:  167.04
33
average reward:  167.54
34
average reward:  165.26
35
average reward:  168.12
36
average reward:  174.62
37
average reward:  179.94
38
average rewa

plt.figure()
plt.plot(average_rewards)
plt.show()
env.close()

In [7]:
#print(type(gym.envs.registry.all()))
env_ids = [espec.id for espec in gym.envs.registry.all()]
for e in sorted(env_ids):
    print(e)


Acrobot-v1
AirRaid-ram-v0
AirRaid-ram-v4
AirRaid-ramDeterministic-v0
AirRaid-ramDeterministic-v4
AirRaid-ramNoFrameskip-v0
AirRaid-ramNoFrameskip-v4
AirRaid-v0
AirRaid-v4
AirRaidDeterministic-v0
AirRaidDeterministic-v4
AirRaidNoFrameskip-v0
AirRaidNoFrameskip-v4
Alien-ram-v0
Alien-ram-v4
Alien-ramDeterministic-v0
Alien-ramDeterministic-v4
Alien-ramNoFrameskip-v0
Alien-ramNoFrameskip-v4
Alien-v0
Alien-v4
AlienDeterministic-v0
AlienDeterministic-v4
AlienNoFrameskip-v0
AlienNoFrameskip-v4
Amidar-ram-v0
Amidar-ram-v4
Amidar-ramDeterministic-v0
Amidar-ramDeterministic-v4
Amidar-ramNoFrameskip-v0
Amidar-ramNoFrameskip-v4
Amidar-v0
Amidar-v4
AmidarDeterministic-v0
AmidarDeterministic-v4
AmidarNoFrameskip-v0
AmidarNoFrameskip-v4
Ant-v2
Assault-ram-v0
Assault-ram-v4
Assault-ramDeterministic-v0
Assault-ramDeterministic-v4
Assault-ramNoFrameskip-v0
Assault-ramNoFrameskip-v4
Assault-v0
Assault-v4
AssaultDeterministic-v0
AssaultDeterministic-v4
AssaultNoFrameskip-v0
AssaultNoFrameskip-v4
Asterix-ra