In [2]:
## Taken from https://github.com/marload/DeepRL-TensorFlow2 ##

import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, concatenate
import numba
# import tensorflow_federated as tff

In [3]:
import gym
import argparse
import numpy as np
import random
from collections import deque

# Make float64 the Keras default float type so the networks built below use float64.
tf.keras.backend.set_floatx('float64')

# parser = argparse.ArgumentParser()
# parser.add_argument('--gamma', type=float, default=0.99)
# parser.add_argument('--actor_lr', type=float, default=0.0005)
# parser.add_argument('--critic_lr', type=float, default=0.001)
# parser.add_argument('--batch_size', type=int, default=64)
# parser.add_argument('--tau', type=float, default=0.05)
# parser.add_argument('--train_start', type=int, default=2000)

# args = parser.parse_args()



# class Args:
#     gamma = 0.99
#     actor_lr = 0.0005
#     critic_lr = 0.0005
#     batch_size = 64
#     tau = 0.05
#     train_start = 400
#     episodes = 10
#     N = 3
#     epochs = 100

# args = Args()

In [4]:
class ReplayBuffer:
    """Bounded FIFO store of transitions for experience replay.

    Backed by ``deque(maxlen=capacity)``: once ``capacity`` transitions are held,
    appending a new one discards the oldest.
    """

    def __init__(self, capacity=20000):
        self.buffer = deque(maxlen=capacity)

    def put(self, state, action, reward, next_state, done):
        """Append one ``[state, action, reward, next_state, done]`` transition."""
        transition = [state, action, reward, next_state, done]
        self.buffer.append(transition)

    def sample(self):
        """Draw ``wandb.config.batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done)`` of NumPy arrays.
            ``states`` and ``next_states`` are flattened to ``(batch_size, -1)``.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        batch_size = wandb.config.batch_size
        minibatch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        columns = [np.asarray(column) for column in zip(*minibatch)]
        states, actions, rewards, next_states, done = columns
        states = states.reshape(batch_size, -1)
        next_states = next_states.reshape(batch_size, -1)
        return states, actions, rewards, next_states, done

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [5]:
class Actor:
    """Deterministic policy network: maps a state to an action scaled by ``action_bound``.

    Layer widths come from ``wandb.config.actor`` and the Adam learning rate from
    ``wandb.config.actor_lr``; both are read when the instance is constructed.
    """

    def __init__(self, state_dim, action_dim, action_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.actor_lr)

    def create_model(self):
        """Build the MLP: two ReLU hidden layers, a tanh output, then scaling by the bound."""
        widths = wandb.config.actor
        layers = [
            Input((self.state_dim,)),
            Dense(widths['layer1'], activation='relu'),
            Dense(widths['layer2'], activation='relu'),
            Dense(self.action_dim, activation='tanh'),
            # tanh yields values in [-1, 1]; multiply by the bound to get the action range.
            Lambda(lambda x: x * self.action_bound),
        ]
        return tf.keras.Sequential(layers)

    def train(self, states, q_grads):
        """Apply one policy-gradient step.

        Args:
            states: batch of states fed to the policy.
            q_grads: gradient of the critic's Q-value w.r.t. the actions; it is
                negated and used as the upstream gradient of the actor output, so
                the optimizer's descent step increases Q.
        """
        with tf.GradientTape() as tape:
            actions = self.model(states)
        policy_grads = tape.gradient(
            actions, self.model.trainable_variables, output_gradients=-q_grads)
        self.opt.apply_gradients(zip(policy_grads, self.model.trainable_variables))

    def predict(self, state):
        """Return ``self.model.predict(state)`` for a batch of states."""
        return self.model.predict(state)

    def get_action(self, state):
        """Return the policy's action for a single state (reshaped to a batch of one)."""
        batch = np.reshape(state, [1, self.state_dim])
        return self.predict(batch)[0]

In [6]:
class Critic:
    """Q-network: scores a (state, action) pair with a single scalar value.

    Layer widths come from ``wandb.config.critic`` and the Adam learning rate from
    ``wandb.config.critic_lr``; both are read when the instance is constructed.
    """

    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.critic_lr)

    def create_model(self):
        """Build the two-branch network: state and action are embedded separately, then merged."""
        widths = wandb.config.critic

        # State branch: two ReLU layers.
        state_in = Input((self.state_dim,))
        state_h1 = Dense(widths['state1'], activation='relu')(state_in)
        state_h2 = Dense(widths['state2'], activation='relu')(state_h1)

        # Action branch: one ReLU layer.
        action_in = Input((self.action_dim,))
        action_h1 = Dense(widths['actor1'], activation='relu')(action_in)

        # Merge both branches and regress a single linear Q-value.
        merged = concatenate([state_h2, action_h1], axis=-1)
        merged_h1 = Dense(widths['cat1'], activation='relu')(merged)
        q_out = Dense(1, activation='linear')(merged_h1)
        return tf.keras.Model([state_in, action_in], q_out)

    def predict(self, inputs):
        """Return ``self.model.predict(inputs)``; ``inputs`` is ``[states, actions]``."""
        return self.model.predict(inputs)

    def q_grads(self, states, actions):
        """Return the gradient of the (squeezed) Q-values with respect to ``actions``."""
        action_tensor = tf.convert_to_tensor(actions)
        with tf.GradientTape() as tape:
            # The actions are plain tensors, not variables, so they must be watched explicitly.
            tape.watch(action_tensor)
            q = tf.squeeze(self.model([states, action_tensor]))
        return tape.gradient(q, action_tensor)

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between the predicted Q-values and the TD targets."""
        return tf.keras.losses.MeanSquaredError()(td_targets, v_pred)

    def train(self, states, actions, td_targets):
        """Run one gradient step on the TD error and return the loss.

        Raises:
            AssertionError: if the prediction and target shapes differ.
        """
        weights = self.model.trainable_variables
        with tf.GradientTape() as tape:
            v_pred = self.model([states, actions], training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        critic_grads = tape.gradient(loss, weights)
        self.opt.apply_gradients(zip(critic_grads, weights))
        return loss

In [7]:
class Agent:
    """DDPG agent: online and target actor/critic networks, a replay buffer and the training loop.

    Hyperparameters (``gamma``, ``tau``, ``batch_size``, ``train_start``) are read
    from ``wandb.config`` when they are used, so the wandb run and its config must
    be set up before an agent is built or trained.
    """

    def __init__(self, env, iden = 0):
        """Build the networks for ``env``.

        Args:
            env: environment exposing ``observation_space`` / ``action_space``;
                the first entry of ``action_space.high`` is used as the symmetric
                action bound.
            iden: agent id; used as the suffix of the logged ``Reward<iden>`` metric.
        """
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_bound = self.env.action_space.high[0]

        self.buffer = ReplayBuffer()

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.critic = Critic(self.state_dim, self.action_dim)

        self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.target_critic = Critic(self.state_dim, self.action_dim)

        # Target networks start as exact copies of the online networks.
        actor_weights = self.actor.model.get_weights()
        critic_weights = self.critic.model.get_weights()
        self.target_actor.model.set_weights(actor_weights)
        self.target_critic.model.set_weights(critic_weights)

        self.iden = iden

    def target_update(self):
        """Soft-update both targets: ``target <- tau * online + (1 - tau) * target``."""
        tau = wandb.config.tau
        pairs = (
            (self.actor.model, self.target_actor.model),
            (self.critic.model, self.target_critic.model),
        )
        for online, target in pairs:
            blended = [
                tau * w + (1 - tau) * t_w
                for w, t_w in zip(online.get_weights(), target.get_weights())
            ]
            target.set_weights(blended)

    def td_target(self, rewards, q_values, dones):
        """Compute the one-step TD targets for the critic.

        ``target = reward`` for terminal transitions and
        ``target = reward + gamma * Q_target(next_state, next_action)`` otherwise.

        Args:
            rewards: per-transition rewards, one entry per row of ``q_values``.
            q_values: target-critic values of the next states; not modified.
            dones: per-transition terminal flags.

        Returns:
            A new array with the same shape as ``q_values``.
        """
        # Copy so the caller's array is never written through.
        q_next = np.array(q_values)
        if not np.issubdtype(q_next.dtype, np.floating):
            q_next = q_next.astype(np.float64)

        # Reshape the per-transition vectors so they broadcast against q_next.
        column = (-1,) + (1,) * (q_next.ndim - 1)
        reward_col = np.asarray(rewards, dtype=q_next.dtype).reshape(column)
        done_col = np.asarray(dones, dtype=bool).reshape(column)

        bootstrapped = reward_col + wandb.config.gamma * q_next
        # np.where (rather than multiplying by a mask) keeps a non-finite Q-value
        # of a terminal next state out of the target.
        return np.where(done_col, reward_col, bootstrapped)

    def list_to_batch(self, list):
        """Concatenate the arrays in ``list`` along axis 0 (``list`` must be non-empty)."""
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        """Advance the noise state ``x`` by one mean-reverting step.

        The state is pulled towards ``mu`` at rate ``rho`` and perturbed by
        Gaussian noise of scale ``sigma * sqrt(dt)``; ``dim`` is the size of the
        sampled noise vector.
        """
        return x + rho * (mu-x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)

    def replay(self):
        """Run 10 critic/actor updates on sampled minibatches, soft-updating the targets after each."""
        for _ in range(10):
            states, actions, rewards, next_states, dones = self.buffer.sample()

            # Critic step: regress Q(s, a) onto the TD target built from the target networks.
            target_q_values = self.target_critic.predict([next_states, self.target_actor.predict(next_states)])
            td_targets = self.td_target(rewards, target_q_values, dones)
            self.critic.train(states, actions, td_targets)

            # Actor step: follow dQ/da evaluated at the actor's own actions.
            s_actions = self.actor.predict(states)
            s_grads = self.critic.q_grads(states, s_actions)
            grads = np.array(s_grads).reshape((-1, self.action_dim))
            self.actor.train(states, grads)

            self.target_update()

    def train(self, max_episodes=1000):
        """Interact with the environment for ``max_episodes`` episodes.

        After each episode, once the buffer holds at least ``batch_size`` and
        ``train_start`` transitions, ``replay`` is run. Uses the gym API in
        which ``reset()`` returns the state and ``step()`` returns a 4-tuple.

        Returns:
            The raw (unscaled) reward of the last episode, or 0 if no episode ran.
        """
        # Defined up front so the return value exists even when max_episodes <= 0.
        episode_reward = 0
        for ep in range(max_episodes):
            episode_reward, done = 0, False

            state = self.env.reset()
            bg_noise = np.zeros(self.action_dim)
            while not done:
                # Policy action plus exploration noise, clipped to the valid range.
                action = self.actor.get_action(state)
                noise = self.ou_noise(bg_noise, dim=self.action_dim)
                action = np.clip(action + noise, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                # The buffer stores a rescaled reward, (reward + 8) / 8, while the
                # logged episode reward stays raw.
                # NOTE(review): presumably this normalises Pendulum's reward range -- confirm.
                self.buffer.put(state, action, (reward+8)/8, next_state, done)
                # Carry the noise state over: each sample starts from the previous
                # one, so exploration noise is correlated across steps of an episode.
                bg_noise = noise
                episode_reward += reward
                state = next_state

            if self.buffer.size() >= wandb.config.batch_size and self.buffer.size() >= wandb.config.train_start:
                self.replay()
            print('EP{} EpisodeReward={}'.format(ep, episode_reward))
            wandb.log({'Reward' + str(self.iden): episode_reward})

        return episode_reward

In [25]:
# def main():
if __name__ == "__main__":

    # Best effort: close a run left open by a previous execution of this cell.
    # A failure here must not block the new run, but it is reported instead of hidden.
    try:
        wandb.finish()
    except Exception as exc:
        print('wandb.finish() failed before starting a new run: {}'.format(exc))

    ####configurations
    wandb.init(name='DDPG-multiple-long', project="deep-rl-tf2")
    env_name = 'Pendulum-v0'

    wandb.config.gamma = 0.99
    wandb.config.actor_lr = 0.001
    wandb.config.critic_lr = 0.0001
    wandb.config.batch_size = 64
    wandb.config.tau = 0.005
    wandb.config.train_start = 400
    wandb.config.episodes = 5      # episodes each agent trains per epoch
    wandb.config.num = 3           # number of agents
    wandb.config.epochs = 2        # train-then-average rounds

    wandb.config.actor = {'layer1': 128, 'layer2' : 128}
    wandb.config.critic = {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}

    print(wandb.config)

    # main run
    N = wandb.config.num
    agents = []

    # set up the agents, each with its own environment instance
    for i in range(N):
        env_t = gym.make(env_name)
        agents.append(Agent(env_t, i))

    # start the training
    for z in range(wandb.config.epochs):

        reward = 0
        # train every agent independently and accumulate its last-episode reward
        for j in range(len(agents)):
            print('Training Agent {}'.format(agents[j].iden))
            reward += agents[j].train(wandb.config.episodes)

        reward = reward / N
        print('Epoch={}\t Average reward={}'.format(z, reward))
        wandb.log({'batch': z, 'Epoch': reward})

        # Average the online actor and critic weights across agents, layer by layer.
        # get_weights() copies every array, so it is called once per agent.
        all_actor_weights = [agent.actor.model.get_weights() for agent in agents]
        all_critic_weights = [agent.critic.model.get_weights() for agent in agents]
        actor_avg = [np.mean(layer, axis=0) for layer in zip(*all_actor_weights)]
        critic_avg = [np.mean(layer, axis=0) for layer in zip(*all_critic_weights)]

        # Push the average back into every agent's online networks; the target
        # networks are left untouched and follow through the soft updates.
        for j in range(N):
            agents[j].actor.model.set_weights(actor_avg)
            agents[j].critic.model.set_weights(critic_avg)

    # write things out: save every agent's weights in the run directory and upload them
    for j in range(N):
        for role, network in (('actor', agents[j].actor), ('critic', agents[j].critic)):
            prefix = wandb.run.dir + "/" + wandb.run.id + "-agent{}-{}.txt".format(j, role)
            network.model.save_weights(prefix)
            # With a non-.h5 path, TF2 Keras writes several files sharing this
            # prefix, so a glob is needed to upload all of them.
            wandb.save(prefix + "*", base_path=wandb.run.dir)

    wandb.finish()
    
# if __name__ == "__main__":
#     main()
    

{'gamma': 0.99, 'actor_lr': 0.001, 'critic_lr': 0.0001, 'batch_size': 64, 'tau': 0.005, 'train_start': 400, 'episodes': 5, 'num': 3, 'epochs': 2, 'actor': {'layer1': 128, 'layer2': 128}, 'critic': {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}}
Training Agent 0
EP0 EpisodeReward=-1618.2978326110708
EP1 EpisodeReward=-1320.6920827798501
EP2 EpisodeReward=-1309.780001567266
EP3 EpisodeReward=-1669.3555635656592
EP4 EpisodeReward=-1555.8289035570126
Training Agent 1
EP0 EpisodeReward=-1154.3043278771856
EP1 EpisodeReward=-1197.2978441269431
EP2 EpisodeReward=-1415.4956745005588
EP3 EpisodeReward=-1216.7546197784213
EP4 EpisodeReward=-1472.3796587933227
Training Agent 2
EP0 EpisodeReward=-1715.7008176403021
EP1 EpisodeReward=-1015.2949755587437
EP2 EpisodeReward=-1650.783800363374
EP3 EpisodeReward=-1590.536368846822
EP4 EpisodeReward=-1397.2612344751867
Epoch=0	 Average reward=-1475.1565989418407
Training Agent 0
EP0 EpisodeReward=-1260.2629027340104
EP1 EpisodeReward=-1182.474

0,1
Reward0,-1745.06052
_step,31.0
_runtime,219.0
_timestamp,1606342121.0
Reward1,-1209.30537
Reward2,-1287.49936
batch,1.0
Epoch,-1413.95508


0,1
Reward0,▃▆▆▂▃▇█▄▇▁
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
Reward1,▆▅▃▅▃▄▂█▁▅
Reward2,▂█▃▃▅▆▁▆▅▆
batch,▁█
Epoch,▁█


In [23]:
# Save every agent's actor weights into the W&B run directory and upload them.
# Iterates over `agents` directly rather than relying on a loop variable left
# behind by an earlier cell. Needs an active run, since it reads `wandb.run`.
for agent in agents:
    actor_prefix = wandb.run.dir + "/" + wandb.run.id + "-agent{}-actor.txt".format(agent.iden)
    agent.actor.model.save_weights(actor_prefix)
    # With a non-.h5 path, TF2 Keras writes several files sharing this prefix,
    # so saving the bare prefix matches no file; upload with a glob instead.
    wandb.save(actor_prefix + "*", base_path=wandb.run.dir)
# st = './' + wandb.run.dir + '/' + wandb.run.id
# print(st)



[]

In [24]:
# End the active W&B run; the summary and history tables below are printed by this call.
wandb.finish()

0,1
Reward0,-1195.23171
_step,36.0
_runtime,2979.0
_timestamp,1606340545.0
Reward1,-1785.90016
Reward2,-1275.39651
batch,1.0
Epoch,-1418.84279
Reward5,-1326.74586


0,1
Reward0,▃█▆▄▄▅▁▄▃▆
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█████
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█████
Reward1,▄▇█▃▃█▂▁▁▁
Reward2,▅█▅▆▁▅▃▆▂▅
batch,▁█
Epoch,▁█
Reward5,▁▂▃▅█


In [22]:
# Smoke test: build a fresh agent (id 5, so its episodes are logged as "Reward5"),
# load saved weights into its online actor only, then train it for a few episodes.
env_t = gym.make('Pendulum-v0')

test = Agent(env_t, 5)
# NOTE(review): this loads the bare "<run dir>/<run id>" prefix, whereas the save
# cell in this notebook writes "<run dir>/<run id>-agent{j}-actor.txt" -- confirm
# which checkpoint is meant to be loaded here.
test.actor.model.load_weights(wandb.run.dir + "/" + wandb.run.id)
test.train(wandb.config.episodes)

EP0 EpisodeReward=-1814.7723910898906
EP1 EpisodeReward=-1778.3727616439553
EP2 EpisodeReward=-1695.057725756254
EP3 EpisodeReward=-1544.5524373830822
EP4 EpisodeReward=-1326.7458598531082


-1326.7458598531082

In [None]:
# import pandas
# Close the active run (if any) before querying an earlier run through the public API.
wandb.finish()
api = wandb.Api()
# Hard-coded "<entity>/<project>/<run id>" path of one specific earlier run.
run = api.run("victor-qin/deep-rl-tf2/1s1ac3wo")
# Logged history of that run. NOTE(review): `temp` is not used by any cell visible here.
temp = run.history()
# print(run.scan_history())
# enumerate(run.history())

In [None]:
# Print "<runtime>\t<epoch-average reward>" for each history row of the fetched run.
if run.state == "finished":
    for row in run.scan_history():
        try:
            print(row["_runtime"],'\t', row["Epoch"])
        except KeyError:
            # Rows that lack "_runtime" or "Epoch" are skipped; in this notebook
            # "Epoch" is only logged once per epoch, not once per episode.
            # Any other error is left to surface.
            continue

In [None]:
# critic_avg = []
# actor_avg = []
# for i in range(len(agent1.actor.model.get_weights())):
#     critic_avg.append(agent1.critic.model.get_weights()[i] + agent2.critic.model.get_weights()[i])
#     actor_avg.append(agent1.actor.model.get_weights()[i] + agent2.actor.model.get_weights()[i])
    
#     agent1.critic.model.set_weights = critic_avg[i]
#     agent2.critic.model.set_weights = critic_avg[i]
    
#     agent1.actor.model.set_weights = actor_avg[i]
#     agent2.actor.model.set_weights = actor_avg[i]
    
# agent1.train(1)
# agent2.train(1)

In [None]:
# def main():
#     wandb.init(name='DDPG', project="deep-rl-tf2")
#     env_name = 'Pendulum-v0'
#     env = gym.make(env_name)
#     agent = Agent(env)
#     agent.train()


# if __name__ == "__main__":
#     main()