In [1]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
class FfAgentContinuous(object):
    def __init__(self, session, input_size, output_size, gamma=0.99):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.gamma = gamma
        
        self.observations_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])
        # expected sum of discounted rewards
        self.esdr_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        self.v_s_ph  = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s)
        self.v_sp_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s')
        self.r_ph    = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # r_t+1
        self.actions_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])
        #self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        
        advantage = self.esdr_ph - self.v_s_ph
        mean_adv = tf.reduce_mean(advantage)
        stddev_adv = mean_adv*mean_adv - tf.reduce_mean(advantage*advantage)
        self.adv_normalized = (advantage - mean_adv)/stddev_adv
        
        # Shared-parameter policy and value network
        W1 = tf.get_variable("w1", [self.input_size, 128], initializer=tf.initializers.random_normal(stddev=0.01))
        b1 = tf.get_variable("b1", [128], initializer=tf.initializers.random_normal(stddev=0.01))
        W2p = tf.get_variable("w2p", [128, self.output_size], initializer=tf.initializers.random_normal(stddev=0.01)) # policy
        b2p = tf.get_variable("b2p", [self.output_size], initializer=tf.initializers.random_normal(stddev=0.01))
        W2v = tf.get_variable("w2v", [128, 1], initializer=tf.initializers.random_normal(stddev=0.01)) # value
        b2v = tf.get_variable("b2v", [1], initializer=tf.initializers.random_normal(stddev=0.01))
        
        l1 = tf.nn.relu(tf.matmul(self.observations_ph, W1) + b1)
        # this will need to be changed to accommodate the range and character of action values
        l2_logits = tf.matmul(l1, W2p) + b2p
        l2p = 2*tf.nn.tanh(l2_logits)
        l2v = tf.matmul(l1, W2v) + b2v
        
        #self.reinforce_loss = tf.reduce_sum(
        #    (self.esdr_ph)*tf.nn.softmax_cross_entropy_with_logits(logits=l2_logits, labels=self.actions_ph)
        #)
        self.reinforce_loss = tf.reduce_sum(
            (self.esdr_ph)*tf.square(self.actions_ph - l2p)
        )
        self.reinforce_optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.reinforce_loss)
        
        #self.actor_critic_loss = -1.*tf.reduce_sum(
        #    (self.esdr_ph - self.v_s_ph)*tf.square(self.actions_ph - l2p)
        #) + tf.reduce_sum(tf.square(l2v - self.esdr_ph))
        self.actor_critic_loss = -1.*tf.reduce_sum(
            (self.adv_normalized)*tf.square(self.actions_ph - l2p)
        ) + tf.reduce_sum(tf.square(l2v - self.esdr_ph))
        self.actor_critic_optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.actor_critic_loss)
        
        self.action_predictions = l2p
        self.esdr_predictions = l2v
        
    # For advantage:
    #    Add single timestep reward samples
    #    Add placeholders for estimated V(s) and V(s')
    def trainSarBatches(self, states, actions, discounted_rewards):
        '''
        Expects inputs to be numpy arrays of shape:
            states = [batch_size, num_state_features]
            actions = [batch_size, num_available_actions]
            discounted_rewards = [batch_size, 1]
        
        The idea is that all episodes have been parsed through and shuffled into
        one big batch of training data.
        '''
        
        advantage_feeds = {
            self.observations_ph: states
        }
        #print("shape of discounted rewards:", discounted_rewards.shape)
        
        advantage_fetches = self.esdr_predictions
        
        v_predictions = self.session.run(advantage_fetches, feed_dict=advantage_feeds)
        #print(v_predictions.shape)
        
        optimize_feeds = {
            self.observations_ph: states,
            self.esdr_ph: discounted_rewards,
            self.v_s_ph: v_predictions,
            self.actions_ph: actions
        }
        
        optimize_fetches = [
            #self.reinforce_loss,
            self.actor_critic_loss,
            self.action_predictions,
            self.esdr_predictions,
            #self.reinforce_optimizer
            self.actor_critic_optimizer
        ]
        
        loss, action_predictions, esdr_predictions, _ = self.session.run(optimize_fetches, feed_dict=optimize_feeds)
        return loss, action_predictions, esdr_predictions
    
    def predict(self, state):
        '''
        Expects state to have the shape [num_state_features]
        '''
        
        feeds = {
            self.observations_ph: np.array([state])
        }
        #print("state received by agent:", state)
        fetches = [
            self.action_predictions,
            self.esdr_predictions
        ]
        action_predictions, esdr_predictions = self.session.run(fetches, feed_dict=feeds)
        return action_predictions, esdr_predictions

In [4]:
def prepSarData(states, actions, rewards, gamma=0.99):
    '''
    Converts temporally synced lists of states, actions, and rewards into shuffled
    numpy matrices for training.
    '''
    #print(len(states), len(actions), len(rewards))
    discounted_sum_rewards = 0
    discounted_rewards = []
    for i in range(len(rewards) - 1, -1, -1):
        discounted_sum_rewards = gamma*discounted_sum_rewards + rewards[i]
        discounted_rewards.append(discounted_sum_rewards)
    discounted_rewards = np.expand_dims(np.array(discounted_rewards[::-1]), axis=1)
    
    actions = np.array(actions)
    states = np.array(states)
    indices = [i for i in range(len(actions))]
    np.random.shuffle(indices)
    
    actions_shuffled = actions[indices]
    states_shuffled = states[indices]
    discounted_rewards_shuffled = discounted_rewards[indices]
    
    return actions_shuffled, states_shuffled, discounted_rewards_shuffled

In [5]:
def accumulateData(env, agent, max_steps=1000, max_rollouts=50):
    states = []
    actions = []
    rewards = []
    for rollout_count in range(max_rollouts):
        ep_states = []
        ep_actions = []
        ep_rewards = []
        ep_state_t = env.reset()
        ep_states.append(ep_state_t)
        for t in range(max_steps):
            ep_action_t = np.random.normal(loc=agent.predict(ep_state_t)[0][0])
            #print(ep_action_t)
            ep_action_t = min(max(ep_action_t, [-2.0]), [2.0])
            #print(ep_action_t)
            ep_state_tp1, ep_reward_tp1, done, _ = env.step(ep_action_t)

            ep_actions.append(ep_action_t)
            ep_states.append(ep_state_tp1)
            ep_rewards.append(ep_reward_tp1)
            if done:
                ep_states.pop(-1)
                #ep_rewards.pop(-1)
                break
            ep_state_t = ep_state_tp1
        states.append(ep_states)
        actions.append(ep_actions)
        rewards.append(ep_rewards)
    return states, actions, rewards

In [6]:
#print(type(gym.envs.registry.all()))
env_ids = [espec.id for espec in gym.envs.registry.all()]
for e in sorted(env_ids):
    print(e)


Acrobot-v1
AirRaid-ram-v0
AirRaid-ram-v4
AirRaid-ramDeterministic-v0
AirRaid-ramDeterministic-v4
AirRaid-ramNoFrameskip-v0
AirRaid-ramNoFrameskip-v4
AirRaid-v0
AirRaid-v4
AirRaidDeterministic-v0
AirRaidDeterministic-v4
AirRaidNoFrameskip-v0
AirRaidNoFrameskip-v4
Alien-ram-v0
Alien-ram-v4
Alien-ramDeterministic-v0
Alien-ramDeterministic-v4
Alien-ramNoFrameskip-v0
Alien-ramNoFrameskip-v4
Alien-v0
Alien-v4
AlienDeterministic-v0
AlienDeterministic-v4
AlienNoFrameskip-v0
AlienNoFrameskip-v4
Amidar-ram-v0
Amidar-ram-v4
Amidar-ramDeterministic-v0
Amidar-ramDeterministic-v4
Amidar-ramNoFrameskip-v0
Amidar-ramNoFrameskip-v4
Amidar-v0
Amidar-v4
AmidarDeterministic-v0
AmidarDeterministic-v4
AmidarNoFrameskip-v0
AmidarNoFrameskip-v4
Ant-v2
Assault-ram-v0
Assault-ram-v4
Assault-ramDeterministic-v0
Assault-ramDeterministic-v4
Assault-ramNoFrameskip-v0
Assault-ramNoFrameskip-v4
Assault-v0
Assault-v4
AssaultDeterministic-v0
AssaultDeterministic-v4
AssaultNoFrameskip-v0
AssaultNoFrameskip-v4
Asterix-ra

In [7]:
pendulum = gym.make("Pendulum-v0")
session = tf.Session()
print(pendulum.observation_space.shape)
print(pendulum.action_space)
num_actions = len(pendulum.action_space.high)
agent = FfAgentContinuous(session, pendulum.observation_space.shape[0], num_actions)

session.run(tf.global_variables_initializer())

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
(3,)
Box(1,)


  result = entry_point.load(False)


In [None]:
average_rewards = []
for i in range(10000):
    states, actions, rewards = accumulateData(pendulum, agent)
    #print(actions[0:10])
    #print(rewards[0:10])
    states_pro = []
    actions_pro = []
    rewards_pro = []
    for j in range(len(actions)):
        ret = prepSarData(actions[j], states[j], rewards[j])
        mean_reward = np.average(ret[1])
        stdev_reward = np.std(ret[1])
        states_pro.append(ret[0])
        #actions_pro.append((ret[1] - mean_reward)/stdev_reward)
        actions_pro.append(ret[1])
        rewards_pro.append(ret[2])
        #print(ret[0].shape, ret[1].shape, ret[2].shape)
        #for k in range(10):
        #    agent.trainSarBatches(ret[0], ret[1], ret[2])
    for k in range(10):
        train_index = np.random.choice(a=range(len(states_pro)))
        agent.trainSarBatches(states_pro[train_index], actions_pro[train_index], rewards_pro[train_index])
    print(i)
    average_reward = np.average([sum(r) for r in rewards])
    print("average reward: ", average_reward)
    average_rewards.append(average_reward)

plt.figure()
plt.plot(average_rewards)
plt.show()
pendulum.close()

0
average reward:  -1248.5779345535227
1
average reward:  -1444.729583085761
2
average reward:  -1406.1604705862594
3
average reward:  -1399.5166997222661
4
average reward:  -1368.2750354923683
5
average reward:  -1440.197960318318
6
average reward:  -1430.7396279616814
7
average reward:  -1420.8275684790956
8
average reward:  -1404.3918824421767
9
average reward:  -1374.7409250692988
10
average reward:  -1379.6512138211972
11
average reward:  -1423.7907944426424
12
average reward:  -1389.197619604485
13
average reward:  -1397.59274937789
14
average reward:  -1380.538874567583
15
average reward:  -1424.4620422841704
16
average reward:  -1413.6562672197877
17
average reward:  -1404.3492443125072
18
average reward:  -1403.3942951343079
19
average reward:  -1430.7935901623086
20
average reward:  -1373.8368729020679
21
average reward:  -1356.7246631653832
22
average reward:  -1373.1642977696717
23
average reward:  -1432.6339524396762
24
average reward:  -1433.2371249671423
25
average rewar

204
average reward:  -1391.6178769841129
205
average reward:  -1403.527874454418
206
average reward:  -1409.7627816900142
207
average reward:  -1439.994549808761
208
average reward:  -1420.662309541362
209
average reward:  -1381.9219905221203
210
average reward:  -1388.541007268365
211
average reward:  -1424.0988850170957
212
average reward:  -1409.3714356114892
213
average reward:  -1396.9223696258878
214
average reward:  -1388.3966566016577
215
average reward:  -1459.9655422700043
216
average reward:  -1454.1284403557665
217
average reward:  -1431.1040379748852
218
average reward:  -1454.6198206650436
219
average reward:  -1429.7993983002223
220
average reward:  -1392.855105441896
221
average reward:  -1386.773022301012
222
average reward:  -1393.3108026141701
223
average reward:  -1389.254910234741
224
average reward:  -1422.9087360549488
225
average reward:  -1395.4249030093952
226
average reward:  -1427.090673234168
227
average reward:  -1438.9296638946419
228
average reward:  -14

406
average reward:  -1394.6118518130047
407
average reward:  -1440.2257563893515
408
average reward:  -1375.2176942862543
409
average reward:  -1400.5451232754453
410
average reward:  -1408.3427819267056
411
average reward:  -1406.4498038006611
412
average reward:  -1403.4261839103267
413
average reward:  -1434.673311768435
414
average reward:  -1428.8607809309478
415
average reward:  -1388.7285065984356
416
average reward:  -1376.8189329169886
417
average reward:  -1372.231509321331
418
average reward:  -1420.6790252818757
419
average reward:  -1398.724705392138
420
average reward:  -1398.497464344105
421
average reward:  -1440.1610058309984
422
average reward:  -1446.8425604016072
423
average reward:  -1415.1088999585832
424
average reward:  -1415.0144666107444
425
average reward:  -1437.9580756987152
426
average reward:  -1435.9721462776254
427
average reward:  -1416.5325910600575
428
average reward:  -1429.8741108879665
429
average reward:  -1355.6796758531432
430
average reward: 

608
average reward:  -1420.9875051376844
609
average reward:  -1416.517888853201
610
average reward:  -1399.7082734675496
611
average reward:  -1424.8128844552775
612
average reward:  -1373.3693357483578
613
average reward:  -1441.5918497748776
614
average reward:  -1374.9306014149008
615
average reward:  -1397.51416171761
616
average reward:  -1398.4637729915603
617
average reward:  -1414.668807073331
618
average reward:  -1409.9932574476054
619
average reward:  -1399.512517030284
620
average reward:  -1382.1680387216998
621
average reward:  -1396.677081821107
622
average reward:  -1442.7843748203568
623
average reward:  -1459.8168321864277
624
average reward:  -1367.4106036882895
625
average reward:  -1407.6653996902498
626
average reward:  -1406.1245657823324
627
average reward:  -1442.809938899799
628
average reward:  -1437.9206918559805
629
average reward:  -1424.1457712363135
630
average reward:  -1429.2477257652824
631
average reward:  -1401.4510550348493
632
average reward:  -1

810
average reward:  -1420.6323407948669
811
average reward:  -1403.9031176393628
812
average reward:  -1451.2682484826232
813
average reward:  -1426.4253406346234
814
average reward:  -1411.258616861563
815
average reward:  -1409.2819236605212
816
average reward:  -1420.8195992787523
817
average reward:  -1426.5462226996547
818
average reward:  -1384.3269337059912
819
average reward:  -1404.8389757982811
820
average reward:  -1416.7319914192951
821
average reward:  -1389.941835796002
822
average reward:  -1422.9241681369924
823
average reward:  -1358.0214086923456
824
average reward:  -1373.700627679421
825
average reward:  -1404.055133471051
826
average reward:  -1362.8395495119485
827
average reward:  -1418.3635282167459
828
average reward:  -1409.6041272787866
829
average reward:  -1371.5486058484782
830
average reward:  -1417.652730048541
831
average reward:  -1384.2703070164985
832
average reward:  -1405.4449034023967
833
average reward:  -1417.3586573630191
834
average reward:  

1011
average reward:  -1375.4886307715833
1012
average reward:  -1404.5211972923755
1013
average reward:  -1409.2756111995423
1014
average reward:  -1409.957044245803
1015
average reward:  -1389.4634955372499
1016
average reward:  -1380.7452410729265
1017
average reward:  -1407.6838910198674
1018
average reward:  -1392.2550445751203
1019
average reward:  -1411.620506745101
1020
average reward:  -1414.5828657279976
1021
average reward:  -1453.8850829410983
1022
average reward:  -1432.252501716646
1023
average reward:  -1418.1328573752135
1024
average reward:  -1439.6510403322482
1025
average reward:  -1475.047491864046
1026
average reward:  -1393.452769425814
1027
average reward:  -1362.9219461573587
1028
average reward:  -1403.278437288708
1029
average reward:  -1441.3920271791792
1030
average reward:  -1402.3332181254884
1031
average reward:  -1429.4181055408783
1032
average reward:  -1401.1517802398073
1033
average reward:  -1391.8534764325834
1034
average reward:  -1475.219313773047

1208
average reward:  -1447.8953602883205
1209
average reward:  -1393.0774518915987
1210
average reward:  -1433.7496873226692
1211
average reward:  -1450.8190422864939
1212
average reward:  -1398.4561148993917
1213
average reward:  -1417.1351655336816
1214
average reward:  -1410.6245210399238
1215
average reward:  -1425.1810978205299
1216
average reward:  -1373.5985261807173
1217
average reward:  -1434.9329946959674
1218
average reward:  -1429.0444150922876
1219
average reward:  -1393.2399271654815
1220
average reward:  -1392.9515681740231
1221
average reward:  -1372.4768966286179
1222
average reward:  -1410.3972196911666
1223
average reward:  -1423.7545645210882
1224
average reward:  -1418.1617681311486
1225
average reward:  -1369.0440950949837
1226
average reward:  -1407.9214218820011
1227
average reward:  -1412.960293999099
1228
average reward:  -1383.683189940278
1229
average reward:  -1395.6329373060476
1230
average reward:  -1399.8607902891672
1231
average reward:  -1425.03413336

1405
average reward:  -1452.9781306152172
1406
average reward:  -1407.0054430866765
1407
average reward:  -1405.3806156467717
1408
average reward:  -1384.9229734885055
1409
average reward:  -1430.6124176552348
1410
average reward:  -1426.2459065959163
1411
average reward:  -1442.0246495268998
1412
average reward:  -1426.1225021211797
1413
average reward:  -1418.3035068084253
1414
average reward:  -1437.4259196501248
1415
average reward:  -1405.3531948515238
1416
average reward:  -1433.2860542793608
1417
average reward:  -1437.9198337841947
1418
average reward:  -1410.5526431182934
1419
average reward:  -1400.6548164668534
1420
average reward:  -1413.8820440348434
1421
average reward:  -1430.0305774152878
1422
average reward:  -1402.7052370279864
1423
average reward:  -1397.2457423673186
1424
average reward:  -1422.5303150151676
1425
average reward:  -1412.685139633985
1426
average reward:  -1387.9749221892034
1427
average reward:  -1373.0229915775772
1428
average reward:  -1433.3858561

1602
average reward:  -1391.4991318002146
1603
average reward:  -1436.1450627753773
1604
average reward:  -1427.1439414713432
1605
average reward:  -1416.9996886356873
1606
average reward:  -1433.7547653774052
1607
average reward:  -1423.9387209832971
1608
average reward:  -1425.974812173858
1609
average reward:  -1422.6114169069388
1610
average reward:  -1387.4287204334166
1611
average reward:  -1367.5358856513808
1612
average reward:  -1394.4337638944087
1613
average reward:  -1379.6694806077096
1614
average reward:  -1410.8591245611005
1615
average reward:  -1396.9606915924535
1616
average reward:  -1408.2899821746475
1617
average reward:  -1398.0805247034266
1618
average reward:  -1469.8595374480244
1619
average reward:  -1450.79925703797
1620
average reward:  -1408.0124845987718
1621
average reward:  -1427.2814022471232
1622
average reward:  -1428.6991071232903
1623
average reward:  -1436.2457093026753
1624
average reward:  -1444.0468149318813
1625
average reward:  -1410.236017201

1798
average reward:  -1413.6944531067213
1799
average reward:  -1409.268767603857
1800
average reward:  -1411.3595250140893
1801
average reward:  -1432.6644345631764
1802
average reward:  -1404.1936871429332
1803
average reward:  -1411.2369893088626
1804
average reward:  -1429.4913613896767
1805
average reward:  -1387.7836018901376
1806
average reward:  -1406.0528385808977
1807
average reward:  -1384.9732478003334
1808
average reward:  -1413.5310824982696
1809
average reward:  -1418.6747319419417
1810
average reward:  -1415.8200208009148
1811
average reward:  -1406.549240394714
1812
average reward:  -1400.941367402495
1813
average reward:  -1434.6955129080493
1814
average reward:  -1421.2919301760908
1815
average reward:  -1357.9197544690112
1816
average reward:  -1426.8136548725195
1817
average reward:  -1379.3641917730697
1818
average reward:  -1377.5002457487292
1819
average reward:  -1395.305122444263
1820
average reward:  -1387.034281716579
1821
average reward:  -1424.08760157330