In [1]:
## Taken from https://github.com/marload/DeepRL-TensorFlow2 ##

import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, concatenate
# import tensorflow_federated as tff

In [2]:
import gym
import argparse
import numpy as np
import random
from collections import deque

tf.keras.backend.set_floatx('float64')

# parser = argparse.ArgumentParser()
# parser.add_argument('--gamma', type=float, default=0.99)
# parser.add_argument('--actor_lr', type=float, default=0.0005)
# parser.add_argument('--critic_lr', type=float, default=0.001)
# parser.add_argument('--batch_size', type=int, default=64)
# parser.add_argument('--tau', type=float, default=0.05)
# parser.add_argument('--train_start', type=int, default=2000)

# args = parser.parse_args()



# class Args:
#     gamma = 0.99
#     actor_lr = 0.0005
#     critic_lr = 0.0005
#     batch_size = 64
#     tau = 0.05
#     train_start = 400
#     episodes = 10
#     N = 3
#     epochs = 100

# args = Args()

In [3]:
class ReplayBuffer:
    def __init__(self, capacity=20000):
        self.buffer = deque(maxlen=capacity)
    
    def put(self, state, action, reward, next_state, done):
        self.buffer.append([state, action, reward, next_state, done])
    
    def sample(self):
        sample = random.sample(self.buffer, wandb.config.batch_size)
        states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
        states = np.array(states).reshape(wandb.config.batch_size, -1)
        next_states = np.array(next_states).reshape(wandb.config.batch_size, -1)
        return states, actions, rewards, next_states, done
    
    def size(self):
        return len(self.buffer)

In [4]:
class Actor:
    def __init__(self, state_dim, action_dim, action_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.actor_lr)

    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(wandb.config.actor['layer1'], activation='relu'),
            Dense(wandb.config.actor['layer2'], activation='relu'),
            Dense(self.action_dim, activation='tanh'),
            Lambda(lambda x: x * self.action_bound)
        ])

    def train(self, states, q_grads):
        with tf.GradientTape() as tape:
            grads = tape.gradient(self.model(states), self.model.trainable_variables, -q_grads)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
    
    def predict(self, state):
        return self.model.predict(state)

    def get_action(self, state):
        state = np.reshape(state, [1, self.state_dim])
        return self.model.predict(state)[0]

In [5]:
class Critic:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.critic_lr)

    def create_model(self):
        state_input = Input((self.state_dim,))
        s1 = Dense(wandb.config.critic['state1'], activation='relu')(state_input)
        s2 = Dense(wandb.config.critic['state2'], activation='relu')(s1)
        action_input = Input((self.action_dim,))
        a1 = Dense(wandb.config.critic['actor1'], activation='relu')(action_input)
        c1 = concatenate([s2, a1], axis=-1)
        c2 = Dense(wandb.config.critic['cat1'], activation='relu')(c1)
        output = Dense(1, activation='linear')(c2)
        return tf.keras.Model([state_input, action_input], output)
    
    def predict(self, inputs):
        return self.model.predict(inputs)
    
    def q_grads(self, states, actions):
        actions = tf.convert_to_tensor(actions)
        with tf.GradientTape() as tape:
            tape.watch(actions)
            q_values = self.model([states, actions])
            q_values = tf.squeeze(q_values)
        return tape.gradient(q_values, actions)

    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, actions, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model([states, actions], training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [6]:
class Agent:
    def __init__(self, env, iden = 0):
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_bound = self.env.action_space.high[0]

        self.buffer = ReplayBuffer()

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.critic = Critic(self.state_dim, self.action_dim)
        
        self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.target_critic = Critic(self.state_dim, self.action_dim)

        actor_weights = self.actor.model.get_weights()
        critic_weights = self.critic.model.get_weights()
        self.target_actor.model.set_weights(actor_weights)
        self.target_critic.model.set_weights(critic_weights)
        
        self.iden = iden
        
    
    def target_update(self):
        actor_weights = self.actor.model.get_weights()
        t_actor_weights = self.target_actor.model.get_weights()
        critic_weights = self.critic.model.get_weights()
        t_critic_weights = self.target_critic.model.get_weights()

        for i in range(len(actor_weights)):
            t_actor_weights[i] = wandb.config.tau * actor_weights[i] + (1 - wandb.config.tau) * t_actor_weights[i]

        for i in range(len(critic_weights)):
            t_critic_weights[i] = wandb.config.tau * critic_weights[i] + (1 - wandb.config.tau) * t_critic_weights[i]
        
        self.target_actor.model.set_weights(t_actor_weights)
        self.target_critic.model.set_weights(t_critic_weights)


    def td_target(self, rewards, q_values, dones):
        targets = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                targets[i] = rewards[i]
            else:
                targets[i] = wandb.config.gamma * q_values[i]
        return targets

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch
    
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (mu-x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)
    
    def replay(self):
        for _ in range(10):
            states, actions, rewards, next_states, dones = self.buffer.sample()
            target_q_values = self.target_critic.predict([next_states, self.target_actor.predict(next_states)])
            td_targets = self.td_target(rewards, target_q_values, dones)
            
            self.critic.train(states, actions, td_targets)
            
            s_actions = self.actor.predict(states)
            s_grads = self.critic.q_grads(states, s_actions)
            grads = np.array(s_grads).reshape((-1, self.action_dim))
            self.actor.train(states, grads)
            self.target_update()

    def train(self, max_episodes=1000):
        for ep in range(max_episodes):      # train a bunch of episodes
            episode_reward, done = 0, False

            state = self.env.reset()
            bg_noise = np.zeros(self.action_dim)
            while not done:    # run till done by hitting the action that's done
#                 self.env.render()
                action = self.actor.get_action(state)   # pick an action, add noise, clip the action
                noise = self.ou_noise(bg_noise, dim=self.action_dim)
                action = np.clip(action + noise, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                self.buffer.put(state, action, (reward+8)/8, next_state, done)
                bg_noise = noise     # why does the noise wander in such a weird way
                episode_reward += reward
                state = next_state
                
            if self.buffer.size() >= wandb.config.batch_size and self.buffer.size() >= wandb.config.train_start:    # update the states if enough
                self.replay()                
            print('EP{} EpisodeReward={}'.format(ep, episode_reward))
            wandb.log({'Reward' + str(self.iden): episode_reward})
            
        return episode_reward

In [None]:
# def main():
if __name__ == "__main__":
    
    try: wandb.finish()
    except: pass
    
    ####configurations
    wandb.init(name='DDPG-multiplemax-long', project="deep-rl-tf2")
    env_name = 'Pendulum-v0'

    
    wandb.config.gamma = 0.99
    wandb.config.actor_lr = 0.001
    wandb.config.critic_lr = 0.0001
    wandb.config.batch_size = 64
    wandb.config.tau = 0.005
    wandb.config.train_start = 400
    wandb.config.episodes = 5
    wandb.config.num = 3
    wandb.config.epochs = 200

    wandb.config.actor = {'layer1': 128, 'layer2' : 128}
    wandb.config.critic = {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}
    
    print(wandb.config)
    
    # main run    
    N = wandb.config.num
    agents = []
    
    # set up the agent
    for i in range(N):
        env_t = gym.make(env_name)
        agents.append(Agent(env_t, i))

    # start the training
    for z in range(wandb.config.epochs):

        rewards = []
        reward = 0
        # train the agent
        for j in range(len(agents)):
            print('Training Agent {}'.format(agents[j].iden))
            rewards.append(agents[j].train(wandb.config.episodes))
            reward += rewards[-1]
    
        reward = reward / N
        print('Epoch={}\t Average reward={}'.format(z, reward))
        wandb.log({'batch': z, 'Epoch': reward})

        index = np.argmax(np.array(rewards))
        
        # get the average - actor and critic
        critic_avg = []
        actor_avg = []

        for i in range(len(agents[index].actor.model.get_weights())):
            
#             actor_t = agents[0].actor.model.get_weights()[i]

#             for j in range(1, N):
#                 actor_t += agents[j].actor.model.get_weights()[i]

#             actor_t = actor_t / N
            actor_t = agents[index].actor.model.get_weights()[i]
            actor_avg.append(actor_t)


        for i in range(len(agents[index].critic.model.get_weights())):
#             critic_t = agents[0].critic.model.get_weights()[i]

#             for j in range(1, N):
#                 critic_t += agents[j].critic.model.get_weights()[i]

#             critic_t = critic_t / N
            critic_t = agents[index].critic.model.get_weights()[i]
            critic_avg.append(critic_t)


        # set the average
        for j in range(N):
            agents[j].actor.model.set_weights(actor_avg)
            agents[j].critic.model.set_weights(critic_avg)


#     for j in range(N):
#         print("Agent {}, actor {}".format(j, agents[j].actor.model.get_weights()))
#         print("------------------------")
#         print("Agent {}, critic {}".format(j, agents[j].critic.model.get_weights()))
#         print("------------------------")

    
    wandb.finish()
    
# if __name__ == "__main__":
#     main()
    

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mvictor-qin[0m (use `wandb login --relogin` to force relogin)


{'gamma': 0.99, 'actor_lr': 0.001, 'critic_lr': 0.0001, 'batch_size': 64, 'tau': 0.005, 'train_start': 400, 'episodes': 5, 'num': 3, 'epochs': 200, 'actor': {'layer1': 128, 'layer2': 128}, 'critic': {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}}
Training Agent 0
EP0 EpisodeReward=-1865.661897081727
EP1 EpisodeReward=-1896.9704881271523
EP2 EpisodeReward=-1332.532200490319
EP3 EpisodeReward=-1623.0147684270898
EP4 EpisodeReward=-1314.7808665527693
Training Agent 1
EP0 EpisodeReward=-1275.2120864119536
EP1 EpisodeReward=-1602.5868291956035
EP2 EpisodeReward=-1302.0490003300629
EP3 EpisodeReward=-1478.24253949674
EP4 EpisodeReward=-1463.1121287973185
Training Agent 2
EP0 EpisodeReward=-1684.6560017237125
EP1 EpisodeReward=-1713.8583043870194
EP2 EpisodeReward=-1444.509307557442
EP3 EpisodeReward=-1338.4681808349153
EP4 EpisodeReward=-897.5733376772048
Epoch=0	 Average reward=-1225.1554443424309
Training Agent 0
EP0 EpisodeReward=-1403.8583521804526
EP1 EpisodeReward=-1778.3824

EP0 EpisodeReward=-1376.1430779322347
EP1 EpisodeReward=-1489.8308501023776
EP2 EpisodeReward=-1344.4822056732296
EP3 EpisodeReward=-1499.686513955285
EP4 EpisodeReward=-1546.5462261862822
Training Agent 1
EP0 EpisodeReward=-1520.4357550869474
EP1 EpisodeReward=-1520.3315043380028
EP2 EpisodeReward=-1513.0137890188375
EP3 EpisodeReward=-1285.5423153418553
EP4 EpisodeReward=-1620.7046717259002
Training Agent 2
EP0 EpisodeReward=-1597.3167231577306
EP1 EpisodeReward=-1164.4621411901455
EP2 EpisodeReward=-1535.3940563827516
EP3 EpisodeReward=-1502.6248113279341
EP4 EpisodeReward=-1024.5714992777835
Epoch=12	 Average reward=-1397.2741323966554
Training Agent 0
EP0 EpisodeReward=-1636.1134603214489
EP1 EpisodeReward=-1513.5148928188196
EP2 EpisodeReward=-1598.7229827458652
EP3 EpisodeReward=-1640.6944292843118
EP4 EpisodeReward=-1509.385596563419
Training Agent 1
EP0 EpisodeReward=-1647.7491870888955
EP1 EpisodeReward=-1252.3224383642826
EP2 EpisodeReward=-988.5883547272438
EP3 EpisodeRewar

EP2 EpisodeReward=-1457.539770366491
EP3 EpisodeReward=-1544.1437235476417
EP4 EpisodeReward=-1522.2601538524193
Training Agent 2
EP0 EpisodeReward=-1644.0765460123619
EP1 EpisodeReward=-1143.9033774880513
EP2 EpisodeReward=-1081.0741535888542
EP3 EpisodeReward=-1431.2912028612552
EP4 EpisodeReward=-1535.85529158993
Epoch=24	 Average reward=-1544.579564690887
Training Agent 0
EP0 EpisodeReward=-1162.5576326487808
EP1 EpisodeReward=-1636.7082323647155
EP2 EpisodeReward=-1543.6735933109223
EP3 EpisodeReward=-1502.410832918514
EP4 EpisodeReward=-1606.392627955628
Training Agent 1
EP0 EpisodeReward=-1480.437899310554
EP1 EpisodeReward=-1638.1205177887273
EP2 EpisodeReward=-1618.251161779326
EP3 EpisodeReward=-1240.9298408372592
EP4 EpisodeReward=-1475.6616271401501
Training Agent 2
EP0 EpisodeReward=-1247.617323156422
EP1 EpisodeReward=-1569.012248031042
EP2 EpisodeReward=-1497.5703476702636
EP3 EpisodeReward=-1332.9689958725203
EP4 EpisodeReward=-1054.01627524138
Epoch=25	 Average reward=

EP4 EpisodeReward=-1546.0963073635246
Epoch=36	 Average reward=-1415.4161423223566
Training Agent 0
EP0 EpisodeReward=-1323.7475269489846
EP1 EpisodeReward=-1495.414528712887
EP2 EpisodeReward=-1066.1798189898502
EP3 EpisodeReward=-1102.7273017098519
EP4 EpisodeReward=-1341.6664625550263
Training Agent 1
EP0 EpisodeReward=-1193.886217419976
EP1 EpisodeReward=-1511.4043121086866
EP2 EpisodeReward=-1301.994129042693
EP3 EpisodeReward=-1138.2178991047235
EP4 EpisodeReward=-1330.6152872376988
Training Agent 2
EP0 EpisodeReward=-1307.8407776758959
EP1 EpisodeReward=-1312.1155472439195
EP2 EpisodeReward=-1376.3257056415691
EP3 EpisodeReward=-1616.2352273787785
EP4 EpisodeReward=-1284.7701738712162
Epoch=37	 Average reward=-1319.0173078879805
Training Agent 0
EP0 EpisodeReward=-1347.9466028045172
EP1 EpisodeReward=-960.9029473485954
EP2 EpisodeReward=-1116.3461375799952
EP3 EpisodeReward=-1484.6929043358045
EP4 EpisodeReward=-1457.6247115714023
Training Agent 1
EP0 EpisodeReward=-1517.5457388

EP4 EpisodeReward=-856.2618341560701
Training Agent 1
EP0 EpisodeReward=-866.196619364701
EP1 EpisodeReward=-1443.9921527646718
EP2 EpisodeReward=-1184.211280986331
EP3 EpisodeReward=-1109.9000565910503
EP4 EpisodeReward=-1476.8121216095033
Training Agent 2
EP0 EpisodeReward=-1210.3204878177148
EP1 EpisodeReward=-966.5809981658992
EP2 EpisodeReward=-1162.1354511986715
EP3 EpisodeReward=-1108.088371590336
EP4 EpisodeReward=-1046.3312261459957
Epoch=49	 Average reward=-1126.468393970523
Training Agent 0
EP0 EpisodeReward=-1128.6750746964565
EP1 EpisodeReward=-1120.2697621777186
EP2 EpisodeReward=-1478.3392323928101
EP3 EpisodeReward=-1537.3477460563993
EP4 EpisodeReward=-703.7110085614612
Training Agent 1
EP0 EpisodeReward=-1119.826868058274
EP1 EpisodeReward=-1240.2578880039473
EP2 EpisodeReward=-1527.7990592158253
EP3 EpisodeReward=-954.4306505687496
EP4 EpisodeReward=-1091.9572520693355
Training Agent 2
EP0 EpisodeReward=-1568.9716371995523
EP1 EpisodeReward=-1177.6997169796414
EP2 Ep

In [35]:
# import pandas
wandb.finish()
api = wandb.Api()
run = api.run("victor-qin/deep-rl-tf2/1s1ac3wo")
temp = run.history()
# print(run.scan_history())
# enumerate(run.history())

In [26]:
if run.state == "finished":
    for i, row in enumerate(run.scan_history()):
        try: print(row["_runtime"],'\t', row["Epoch"])
        except: pass

204 	 -1397.4979261457054
411 	 -1314.97677473863
616 	 -1675.7039031101483
819 	 -1728.7964805138092
1026 	 -1527.2705949310778
1237 	 -1578.1663064042016
1447 	 -1562.6616137080212
1656 	 -1570.4740155175275
1866 	 -1766.325876451435
2070 	 -1645.5299593648642
2285 	 -1418.3971006402744
2502 	 -1555.6624956007024
2718 	 -1214.3700774544943
2932 	 -1561.5437283128983
3144 	 -1403.9980968619263
3407 	 -1367.411475604592
3623 	 -1393.962258191116
3830 	 -1406.6523690831484
4029 	 -1101.6098018066943
4231 	 -1269.6366791527905
4449 	 -1498.5555164487466
4673 	 -1298.3502109839903
4894 	 -1638.4867399099965
5118 	 -1412.7765626408752
5334 	 -994.9669943700595
5534 	 -1320.8340664813616
5742 	 -1543.0258378183478
5951 	 -1445.0463899960896
6160 	 -1340.442215837614
6363 	 -1392.9779538323928
6585 	 -1430.2436505694286
6800 	 -1292.2180494001896
7001 	 -1243.6286137615236
7201 	 -1505.2007945599473
7407 	 -1393.522384897824
7618 	 -1316.374957483535
7821 	 -1227.821696099155
8018 	 -1256.30