In [1]:
import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda

import gym
import argparse
import numpy as np

# Make float64 the default floating-point dtype for Keras layers and weights
# created after this point (the Actor and Critic models defined below).
tf.keras.backend.set_floatx('float64')

# parser = argparse.ArgumentParser()
# parser.add_argument('--gamma', type=float, default=0.99)
# parser.add_argument('--update_interval', type=int, default=5)
# parser.add_argument('--actor_lr', type=float, default=0.0005)
# parser.add_argument('--critic_lr', type=float, default=0.001)
# parser.add_argument('--clip_ratio', type=float, default=0.1)
# parser.add_argument('--lmbda', type=float, default=0.95)
# parser.add_argument('--epochs', type=int, default=3)

# args = parser.parse_args()

# class Args:
#     gamma = 0.99
#     update_interval = 5
#     actor_lr = 0.0005
#     critic_lr = 0.001
#     batch_size = 64
#     clip_ratio = 0.1
#     lmbda = 0.95
#     intervals = 3
    
#     episodes = 10
#     N = 3
#     epochs = 100

# args = Args()


In [2]:
class Actor:
    """Gaussian policy network trained with the PPO clipped surrogate loss.

    The Keras model maps a state to two heads: the mean ``mu`` (tanh output
    scaled by ``action_bound``) and the standard deviation ``std`` (softplus
    output) of a diagonal Gaussian over actions. Hidden layer sizes and the
    learning rate / clip ratio are read from ``wandb.config``.

    Args:
        state_dim: Number of state features fed to the network.
        action_dim: Number of action components produced by each head.
        action_bound: Symmetric bound; sampled actions are clipped to
            ``[-action_bound, action_bound]``.
        std_bound: ``[low, high]`` pair the standard deviation is clipped to
            before it is used for sampling or for log-probabilities.
    """

    def __init__(self, state_dim, action_dim, action_bound, std_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.std_bound = std_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.actor_lr)

    def get_action(self, state):
        """Sample one action for ``state`` and return its log-probability.

        Args:
            state: A single state; it is reshaped to ``(1, state_dim)``.

        Returns:
            ``(log_policy, action)`` where ``action`` has ``action_dim``
            entries clipped to the action bounds and ``log_policy`` is the
            value of :meth:`log_pdf` for that (clipped) action.
        """
        state = np.reshape(state, [1, self.state_dim])
        mu, std = self.model.predict(state)
        # Clip std *before* sampling so the behaviour distribution is the same
        # one that log_pdf() evaluates. Previously the action was drawn with
        # the raw std while its log-probability used the clipped std, so the
        # stored "old policy" log-prob did not match the sampling distribution.
        std = np.clip(std, self.std_bound[0], self.std_bound[1])
        action = np.random.normal(mu[0], std[0], size=self.action_dim)
        action = np.clip(action, -self.action_bound, self.action_bound)
        log_policy = self.log_pdf(mu, std, action)

        return log_policy, action

    def log_pdf(self, mu, std, action):
        """Log-density of ``action`` under the diagonal Gaussian ``N(mu, std)``.

        ``std`` is clipped to ``std_bound`` first. Per-dimension log-densities
        are summed over axis 1 with ``keepdims=True``, so a batch of ``B``
        rows yields a ``(B, 1)`` result.
        """
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / \
            var - 0.5 * tf.math.log(var * 2 * np.pi)
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)

    def create_model(self):
        """Build the two-headed policy network (outputs ``[mu, std]``)."""
        state_input = Input((self.state_dim,))
        dense_1 = Dense(wandb.config.actor['layer1'], activation='relu')(state_input)
        dense_2 = Dense(wandb.config.actor['layer2'], activation='relu')(dense_1)
        out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
        # tanh gives values in [-1, 1]; scale them to the action range.
        mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
        std_output = Dense(self.action_dim, activation='softplus')(dense_2)
        return tf.keras.models.Model(state_input, [mu_output, std_output])

    def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):
        """PPO clipped surrogate loss (negated so that it can be minimised).

        Args:
            log_old_policy: Log-probabilities recorded when the actions were
                sampled; treated as constants.
            log_new_policy: Log-probabilities under the current parameters.
            actions: Unused; kept so the signature stays compatible.
            gaes: Advantage estimates; treated as constants.

        Returns:
            Scalar mean of ``-min(ratio * gaes, clipped_ratio * gaes)``.
        """
        ratio = tf.exp(log_new_policy - tf.stop_gradient(log_old_policy))
        gaes = tf.stop_gradient(gaes)
        clipped_ratio = tf.clip_by_value(
            ratio, 1.0-wandb.config.clip_ratio, 1.0+wandb.config.clip_ratio)
        surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
        return tf.reduce_mean(surrogate)

    def train(self, log_old_policy, states, actions, gaes):
        """Run one gradient step on the clipped surrogate loss.

        Returns:
            The loss value computed before the parameter update.
        """
        with tf.GradientTape() as tape:
            mu, std = self.model(states, training=True)
            log_new_policy = self.log_pdf(mu, std, actions)
            loss = self.compute_loss(
                log_old_policy, log_new_policy, actions, gaes)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


In [3]:
class Critic:
    """State-value network fitted to TD targets with a mean-squared-error loss.

    Hidden layer widths and the learning rate come from ``wandb.config``.
    """

    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.critic_lr)

    def create_model(self):
        """Assemble the value network: three ReLU layers and a linear scalar head."""
        widths = wandb.config.critic
        stack = [Input((self.state_dim,))]
        for key in ('layer1', 'layer2', 'layer3'):
            stack.append(Dense(widths[key], activation='relu'))
        stack.append(Dense(1, activation='linear'))
        return tf.keras.Sequential(stack)

    def compute_loss(self, v_pred, td_targets):
        """Return the mean squared error between ``td_targets`` and ``v_pred``."""
        return tf.keras.losses.MeanSquaredError()(td_targets, v_pred)

    def train(self, states, td_targets):
        """Take one optimizer step on the value loss and return that loss.

        The targets are wrapped in ``stop_gradient`` so they act as constants;
        predictions and targets must have identical shapes.
        """
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            fixed_targets = tf.stop_gradient(td_targets)
            loss = self.compute_loss(v_pred, fixed_targets)
        params = self.model.trainable_variables
        self.opt.apply_gradients(zip(tape.gradient(loss, params), params))
        return loss


In [4]:
class Agent:
    """PPO agent that couples one environment with an Actor and a Critic.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``reset()`` returning a state, and ``step(action)`` returning the
            4-tuple ``(next_state, reward, done, info)``.
        iden: Identifier used to tag this agent's rewards in wandb logs.
    """

    # Rewards are stored for training as (reward + REWARD_SHIFT) / REWARD_SCALE.
    # NOTE(review): the values look tuned for the Pendulum environment used by
    # the main cell -- confirm before reusing with another environment.
    REWARD_SHIFT = 8
    REWARD_SCALE = 8

    def __init__(self, env, iden = 0):
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_bound = self.env.action_space.high[0]
        self.std_bound = [1e-2, 1.0]

        # Not used by the training code in this class: Actor and Critic each
        # create and apply their own optimizer. Kept so existing attribute
        # access keeps working.
        self.actor_opt = tf.keras.optimizers.Adam(wandb.config.actor_lr)
        self.critic_opt = tf.keras.optimizers.Adam(wandb.config.critic_lr)
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.action_bound, self.std_bound)
        self.critic = Critic(self.state_dim)

        self.iden = iden

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute GAE advantages and the matching value targets.

        Args:
            rewards: Rewards of the collected steps, one row per step.
            v_values: Critic values of the collected states, one row per step.
            next_v_value: Critic value of the state following the last step;
                used as the bootstrap value unless ``done`` is true.
            done: Whether the last step ended the episode (bootstrap with 0).

        Returns:
            ``(gae, n_step_targets)``, both shaped like ``rewards``, where
            ``n_step_targets[k] = gae[k] + v_values[k]``.
        """
        n_step_targets = np.zeros_like(rewards)
        gae = np.zeros_like(rewards)
        gae_cumulative = 0
        forward_val = 0

        if not done:
            forward_val = next_v_value

        # Walk backwards so each step can reuse the accumulated advantage of
        # the steps that follow it.
        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + wandb.config.gamma * forward_val - v_values[k]
            gae_cumulative = wandb.config.gamma * wandb.config.lmbda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def list_to_batch(self, list):
        """Stack a non-empty list of arrays along axis 0 into one batch.

        The parameter name shadows the ``list`` builtin; it is kept so that
        keyword callers are not broken.

        Raises:
            ValueError: If ``list`` is empty.
        """
        # One concatenate instead of repeated np.append, which re-copied the
        # whole batch for every element.
        return np.concatenate(list, axis=0)

    def _update_networks(self, state_batch, action_batch, reward_batch,
                         old_policy_batch, next_state, done):
        """Train actor and critic on the collected transitions."""
        states = self.list_to_batch(state_batch)
        actions = self.list_to_batch(action_batch)
        rewards = self.list_to_batch(reward_batch)
        old_policys = self.list_to_batch(old_policy_batch)

        v_values = self.critic.model.predict(states)
        next_v_value = self.critic.model.predict(next_state)

        gaes, td_targets = self.gae_target(
            rewards, v_values, next_v_value, done)

        # Reuse the same batch for several gradient steps.
        for _ in range(wandb.config.intervals):
            self.actor.train(old_policys, states, actions, gaes)
            self.critic.train(states, td_targets)

    def train(self, max_episodes=1000):
        """Run ``max_episodes`` episodes, updating the networks as data arrives.

        An update is triggered every ``wandb.config.update_interval`` steps
        and at the end of each episode. The raw (un-normalised) episode
        reward is printed and logged to wandb under ``'Reward<iden>'``.

        Returns:
            The raw total reward of the last episode, or 0 when
            ``max_episodes`` is 0.
        """
        # Defined up front so that max_episodes == 0 returns 0 instead of
        # raising UnboundLocalError.
        episode_reward = 0

        for ep in range(max_episodes):
            state_batch = []
            action_batch = []
            reward_batch = []
            old_policy_batch = []

            episode_reward, done = 0, False

            state = self.env.reset()

            while not done:
                # self.env.render()
                log_old_policy, action = self.actor.get_action(state)

                next_state, reward, done, _ = self.env.step(action)

                state = np.reshape(state, [1, self.state_dim])
                # Use action_dim rather than a hard-coded 1 so environments
                # with multi-dimensional actions do not fail here.
                action = np.reshape(action, [1, self.action_dim])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                log_old_policy = np.reshape(log_old_policy, [1, 1])

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(
                    (reward + self.REWARD_SHIFT) / self.REWARD_SCALE)
                old_policy_batch.append(log_old_policy)

                if len(state_batch) >= wandb.config.update_interval or done:
                    self._update_networks(
                        state_batch, action_batch, reward_batch,
                        old_policy_batch, next_state, done)

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    old_policy_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            print('EP{} EpisodeReward={}'.format(ep, episode_reward))
            wandb.log({'Reward' + str(self.iden): episode_reward})

        return episode_reward


In [None]:
def average_weights(weight_sets):
    """Average several models' weights tensor by tensor.

    Args:
        weight_sets: One entry per model, each a list of arrays as returned
            by ``model.get_weights()``; all entries must line up in length
            and shapes.

    Returns:
        A new list with one averaged array per weight tensor. The input
        arrays are not modified.

    Raises:
        ValueError: If ``weight_sets`` is empty.
    """
    weight_sets = list(weight_sets)
    if not weight_sets:
        raise ValueError('average_weights() needs at least one set of weights')
    count = len(weight_sets)
    # zip(*...) groups the i-th tensor of every model together.
    return [np.sum(same_tensor, axis=0) / count
            for same_tensor in zip(*weight_sets)]


if __name__ == "__main__":
    import sys  # only needed for the untruncated weight dump at the end

    # Best-effort: close a run that a previous execution of this cell may
    # have left open. A failure here must not stop the new run.
    try:
        wandb.finish()
    except Exception as err:
        print('Ignoring error from wandb.finish(): {}'.format(err))

    ####configurations
    wandb.init(name='PPO-multiple-long', project="deep-rl-tf2")
    env_name = 'Pendulum-v0'

    wandb.config.gamma = 0.99
    wandb.config.update_interval = 5
    wandb.config.actor_lr = 0.0005
    wandb.config.critic_lr = 0.001
    wandb.config.batch_size = 64
    wandb.config.clip_ratio = 0.1
    wandb.config.lmbda = 0.95
    wandb.config.intervals = 3

    wandb.config.episodes = 5
    wandb.config.num = 3
    wandb.config.epochs = 200

    wandb.config.actor = {'layer1': 32, 'layer2' : 32}
    wandb.config.critic = {'layer1': 32, 'layer2' : 32, 'layer3': 16}

    print(wandb.config)

    # main run
    N = wandb.config.num
    agents = []

    # set up the agents, each with its own environment instance
    for i in range(N):
        env_t = gym.make(env_name)
        agents.append(Agent(env_t, i))

    # start the training
    for z in range(wandb.config.epochs):

        reward = 0
        # train every agent independently for a few episodes
        for agent in agents:
            print('Training Agent {}'.format(agent.iden))
            # Agent.train returns the reward of its *last* episode only, so
            # the value logged below averages those last-episode rewards.
            reward += agent.train(wandb.config.episodes)

        reward = reward / N
        print('Epoch={}\t Average reward={}'.format(z, reward))
        wandb.log({'batch': z, 'Epoch': reward})

        # get the average - actor and critic
        # get_weights() copies every tensor, so fetch once per agent.
        actor_avg = average_weights(
            [agent.actor.model.get_weights() for agent in agents])
        critic_avg = average_weights(
            [agent.critic.model.get_weights() for agent in agents])

        # set the average
        # NOTE: only the model weights are synchronised; every agent keeps
        # its own optimizer objects.
        for agent in agents:
            agent.actor.model.set_weights(actor_avg)
            agent.critic.model.set_weights(critic_avg)

    # write things out
    for j in range(N):
        networks = (('actor', agents[j].actor.model),
                    ('critic', agents[j].critic.model))
        for label, model in networks:
            path = "agent{}-{}.txt".format(j, label)
            # Without a raised threshold NumPy abbreviates arrays with more
            # than 1000 elements using "...", which made the dump lossy.
            with np.printoptions(threshold=sys.maxsize):
                dump = str(model.get_weights())
            with open(path, "w") as f:
                f.write(dump)
            wandb.save(path)

    wandb.finish()

{'gamma': 0.99, 'update_interval': 5, 'actor_lr': 0.0005, 'critic_lr': 0.001, 'batch_size': 64, 'clip_ratio': 0.1, 'lmbda': 0.95, 'intervals': 3, 'episodes': 5, 'num': 3, 'epochs': 200, 'actor': {'layer1': 32, 'layer2': 32}, 'critic': {'layer1': 32, 'layer2': 32, 'layer3': 16}}
Training Agent 0
EP0 EpisodeReward=-1664.5169533860847
EP1 EpisodeReward=-1826.4128968716336
EP2 EpisodeReward=-1087.9888521201874
EP3 EpisodeReward=-1501.3778726966254
EP4 EpisodeReward=-1342.9093457667946
Training Agent 1
EP0 EpisodeReward=-1396.1159420464223
EP1 EpisodeReward=-1664.627745580286
EP2 EpisodeReward=-1527.551076731109
EP3 EpisodeReward=-1548.8385124540102
EP4 EpisodeReward=-1526.9715217677403
Training Agent 2
EP0 EpisodeReward=-1498.6069248862202
EP1 EpisodeReward=-1406.521938994423
EP2 EpisodeReward=-1455.239321374338
EP3 EpisodeReward=-1492.9091363408036
EP4 EpisodeReward=-1596.2103027970516
Epoch=0	 Average reward=-1488.6970567771957
Training Agent 0
EP0 EpisodeReward=-1432.5729383124021
EP1 E

EP0 EpisodeReward=-1416.405314580213
EP1 EpisodeReward=-1320.6026979995781
EP2 EpisodeReward=-1469.3904611527507
EP3 EpisodeReward=-1420.2965380377652
EP4 EpisodeReward=-1225.3596291322153
Training Agent 1
EP0 EpisodeReward=-1169.9040885320705
EP1 EpisodeReward=-1338.0637944673774
EP2 EpisodeReward=-1294.207154429655
EP3 EpisodeReward=-1284.3358454679335
EP4 EpisodeReward=-1197.5554429929546
Training Agent 2
EP0 EpisodeReward=-1306.4562591876474
EP1 EpisodeReward=-1064.6915205550185
EP2 EpisodeReward=-1411.7772989435746
EP3 EpisodeReward=-1391.2531247276
EP4 EpisodeReward=-1155.5039526482144
Epoch=12	 Average reward=-1192.806341591128
Training Agent 0
EP0 EpisodeReward=-1160.9134278680904
EP1 EpisodeReward=-1203.684040343666
EP2 EpisodeReward=-1141.7336354262725
EP3 EpisodeReward=-1423.0271945790255
EP4 EpisodeReward=-1458.9110819065938
Training Agent 1
EP0 EpisodeReward=-1160.3408955231519
EP1 EpisodeReward=-1242.8832873114904
EP2 EpisodeReward=-1237.3430206093026
EP3 EpisodeReward=-1

EP2 EpisodeReward=-1438.4183931230396
EP3 EpisodeReward=-1375.6688828694248
EP4 EpisodeReward=-1317.1221749136278
Training Agent 2
EP0 EpisodeReward=-900.1419054918812
EP1 EpisodeReward=-1100.3807876842752
EP2 EpisodeReward=-1244.265729187993
EP3 EpisodeReward=-1126.2865512228964
EP4 EpisodeReward=-1175.821030245861
Epoch=24	 Average reward=-1147.328982494215
Training Agent 0
EP0 EpisodeReward=-1341.405619568132
EP1 EpisodeReward=-1231.3335916625354
EP2 EpisodeReward=-905.3055835998216
EP3 EpisodeReward=-1236.0816641256597
EP4 EpisodeReward=-1526.5622310220692
Training Agent 1
EP0 EpisodeReward=-1164.0672243449992
EP1 EpisodeReward=-1201.029064832471
EP2 EpisodeReward=-841.4433186254979
EP3 EpisodeReward=-1075.7901080614295
EP4 EpisodeReward=-1220.582465308517
Training Agent 2
EP0 EpisodeReward=-1048.7765256887478
EP1 EpisodeReward=-885.5903629245963
EP2 EpisodeReward=-912.8750877399658
EP3 EpisodeReward=-931.2089320645243
EP4 EpisodeReward=-764.0553570525745
Epoch=25	 Average reward=-

EP0 EpisodeReward=-1211.6217032215966
EP1 EpisodeReward=-953.8446537078573
EP2 EpisodeReward=-912.7136734067799
EP3 EpisodeReward=-957.1487538160691
EP4 EpisodeReward=-901.8757037942061
Training Agent 1
EP0 EpisodeReward=-952.6341771336023
EP1 EpisodeReward=-392.0032231174125
EP2 EpisodeReward=-266.9632388633265
EP3 EpisodeReward=-1552.280328151975
EP4 EpisodeReward=-763.2500476415464
Training Agent 2
EP0 EpisodeReward=-792.3071063309568
EP1 EpisodeReward=-930.8162010994047
EP2 EpisodeReward=-390.9575169339331
EP3 EpisodeReward=-671.1699342106837
EP4 EpisodeReward=-970.9310360833389
Epoch=37	 Average reward=-878.6855958396972
Training Agent 0
EP0 EpisodeReward=-909.5646695758657
EP1 EpisodeReward=-1329.3756022978919
EP2 EpisodeReward=-928.9892827280554
EP3 EpisodeReward=-648.9670723413113
EP4 EpisodeReward=-274.5726403556639
Training Agent 1
EP0 EpisodeReward=-518.0699817575805
EP1 EpisodeReward=-519.6643358249678
EP2 EpisodeReward=-780.1989553876448
EP3 EpisodeReward=-653.823855474019

EP4 EpisodeReward=-127.92345705645045
Training Agent 2
EP0 EpisodeReward=-772.8557601938979
EP1 EpisodeReward=-647.1494573879572
EP2 EpisodeReward=-1548.7651839001862
EP3 EpisodeReward=-274.744395393136
EP4 EpisodeReward=-1505.3583057774822
Epoch=49	 Average reward=-1067.8460708869743
Training Agent 0
EP0 EpisodeReward=-130.46694032656458
EP1 EpisodeReward=-1559.8406628482928
EP2 EpisodeReward=-660.3087702132489
EP3 EpisodeReward=-1440.2329841090186
EP4 EpisodeReward=-1062.9186713733152
Training Agent 1
EP0 EpisodeReward=-793.0870978970604
EP1 EpisodeReward=-257.01527206437214
EP2 EpisodeReward=-925.9275613836684
EP3 EpisodeReward=-1058.5376069175254
EP4 EpisodeReward=-1172.3514819052018
Training Agent 2
EP0 EpisodeReward=-1033.8233882245052
EP1 EpisodeReward=-642.2922559535938
EP2 EpisodeReward=-1576.8393621274151
EP3 EpisodeReward=-1077.8466404869941
EP4 EpisodeReward=-1515.3668493653286
Epoch=50	 Average reward=-1250.212334214615
Training Agent 0
EP0 EpisodeReward=-1042.665778257782

EP1 EpisodeReward=-648.8189800180219
EP2 EpisodeReward=-659.2986785497405
EP3 EpisodeReward=-1236.5653451256662
EP4 EpisodeReward=-1359.4075932773262
Training Agent 1
EP0 EpisodeReward=-135.62120900925433
EP1 EpisodeReward=-408.0468126369727
EP2 EpisodeReward=-527.2714539398801
EP3 EpisodeReward=-393.00091943870484
EP4 EpisodeReward=-421.97872120160906
Training Agent 2
EP0 EpisodeReward=-129.45099362264096
EP1 EpisodeReward=-379.74938086332514
EP2 EpisodeReward=-784.6973246633355
EP3 EpisodeReward=-1209.9178479806762
EP4 EpisodeReward=-1285.6589288319683
Epoch=62	 Average reward=-1022.3484144369678
Training Agent 0
EP0 EpisodeReward=-1081.3828394143204
EP1 EpisodeReward=-126.93953705783194
EP2 EpisodeReward=-1084.9754151729708
EP3 EpisodeReward=-924.887634440992
EP4 EpisodeReward=-1094.039762647558
Training Agent 1
EP0 EpisodeReward=-260.435241879705
EP1 EpisodeReward=-653.9618464547968
EP2 EpisodeReward=-127.84397679016706
EP3 EpisodeReward=-258.63993981184376
EP4 EpisodeReward=-789.9

EP0 EpisodeReward=-658.1508237698991
EP1 EpisodeReward=-533.8981173709428
EP2 EpisodeReward=-722.3680653734565
EP3 EpisodeReward=-924.4546803858444
EP4 EpisodeReward=-780.7233802791905
Epoch=74	 Average reward=-947.8727757148718
Training Agent 0
EP0 EpisodeReward=-822.526254037127
EP1 EpisodeReward=-1043.0421527149458
EP2 EpisodeReward=-1186.361278587714
EP3 EpisodeReward=-561.8895791439709
EP4 EpisodeReward=-630.8149847355489
Training Agent 1
EP0 EpisodeReward=-661.1858830621646
EP1 EpisodeReward=-777.9846353924448
EP2 EpisodeReward=-783.9926375743468
EP3 EpisodeReward=-903.8254736684062
EP4 EpisodeReward=-1206.317535543852
Training Agent 2
EP0 EpisodeReward=-1032.0682275495724
EP1 EpisodeReward=-653.4400488350506
EP2 EpisodeReward=-1562.2276026875938
EP3 EpisodeReward=-1096.7607808311827
EP4 EpisodeReward=-1048.6700973314264
Epoch=75	 Average reward=-961.9342058702758
Training Agent 0
EP0 EpisodeReward=-646.7264398343622
EP1 EpisodeReward=-1049.0767832529257
EP2 EpisodeReward=-1275.3

EP3 EpisodeReward=-779.0720623963571
EP4 EpisodeReward=-648.3325459292404
Training Agent 1
EP0 EpisodeReward=-643.7966330889287
EP1 EpisodeReward=-1092.902108474552
EP2 EpisodeReward=-1217.0050654743582
EP3 EpisodeReward=-1336.712286971902
EP4 EpisodeReward=-1423.2271037599533
Training Agent 2
EP0 EpisodeReward=-525.8118085108232
EP1 EpisodeReward=-926.1542866069987
EP2 EpisodeReward=-941.3208768962585
EP3 EpisodeReward=-402.18475129258866
EP4 EpisodeReward=-611.979928897726
Epoch=87	 Average reward=-894.5131928623065
Training Agent 0
EP0 EpisodeReward=-523.563647967728
EP1 EpisodeReward=-263.5511297830764
EP2 EpisodeReward=-533.5504980615223
EP3 EpisodeReward=-524.9889738192319
EP4 EpisodeReward=-269.69594760485563
Training Agent 1
EP0 EpisodeReward=-658.2368343053081
EP1 EpisodeReward=-652.4991679630031
EP2 EpisodeReward=-825.2110859561964
EP3 EpisodeReward=-1218.7687821481752
EP4 EpisodeReward=-1310.9829328209707
Training Agent 2
EP0 EpisodeReward=-931.9684158316553
EP1 EpisodeRewar

EP3 EpisodeReward=-8.46462679136666
EP4 EpisodeReward=-6.547064901238234
Epoch=99	 Average reward=-488.24825424475347
Training Agent 0
EP0 EpisodeReward=-529.5260429791219
EP1 EpisodeReward=-662.3452326200796
EP2 EpisodeReward=-531.4755811439986
EP3 EpisodeReward=-541.4844593910915
EP4 EpisodeReward=-134.35219399414498
Training Agent 1
EP0 EpisodeReward=-629.9073765807735
EP1 EpisodeReward=-684.0966789278305
EP2 EpisodeReward=-534.9945218373194
EP3 EpisodeReward=-697.3446093551287
EP4 EpisodeReward=-291.1987083881531
Training Agent 2
EP0 EpisodeReward=-8.991064535706979
EP1 EpisodeReward=-559.6518902371754
EP2 EpisodeReward=-2.1740694528016915
EP3 EpisodeReward=-732.189758703005
EP4 EpisodeReward=-1062.958947349477
Epoch=100	 Average reward=-496.1699499105917
Training Agent 0
EP0 EpisodeReward=-270.26915737442005
EP1 EpisodeReward=-269.82222635356777
EP2 EpisodeReward=-263.17054431484047
EP3 EpisodeReward=-1064.8564512964772
EP4 EpisodeReward=-7.61764361232555
Training Agent 1
EP0 Epis

EP0 EpisodeReward=-267.5360051176558
EP1 EpisodeReward=-405.0225761902247
EP2 EpisodeReward=-194.01341245246826
EP3 EpisodeReward=-1193.719352540475
EP4 EpisodeReward=-913.5784964910605
Training Agent 2
EP0 EpisodeReward=-405.3382382686698
EP1 EpisodeReward=-3.1949985509955647
EP2 EpisodeReward=-5.162671379614226
EP3 EpisodeReward=-533.5387030381347
EP4 EpisodeReward=-534.1519577455998
Epoch=112	 Average reward=-694.8494541485112
Training Agent 0
EP0 EpisodeReward=-9.619072137106569
EP1 EpisodeReward=-277.4916371867635
EP2 EpisodeReward=-674.6260481140527
EP3 EpisodeReward=-927.6399404745637
EP4 EpisodeReward=-136.23815910627778
Training Agent 1
EP0 EpisodeReward=-803.7553290237868
EP1 EpisodeReward=-407.45445546792934
EP2 EpisodeReward=-273.35743237482353
EP3 EpisodeReward=-263.1062942792274
EP4 EpisodeReward=-133.48809829394798
Training Agent 2
EP0 EpisodeReward=-276.79922943016726
EP1 EpisodeReward=-647.5412177920211
EP2 EpisodeReward=-808.0733951965143
EP3 EpisodeReward=-403.875399

EP3 EpisodeReward=-1114.2700107602586
EP4 EpisodeReward=-1107.7659085804503
Epoch=124	 Average reward=-844.2557303737825
Training Agent 0
EP0 EpisodeReward=-1041.5752195709633
EP1 EpisodeReward=-255.4458722277431
EP2 EpisodeReward=-273.6793791765754
EP3 EpisodeReward=-128.5075329439765
EP4 EpisodeReward=-265.1109561615597
Training Agent 1
EP0 EpisodeReward=-130.79354184095092
EP1 EpisodeReward=-660.1483036086596
EP2 EpisodeReward=-458.5710305973335
EP3 EpisodeReward=-267.4137737909638
EP4 EpisodeReward=-1089.6823463958453
Training Agent 2
EP0 EpisodeReward=-128.19816858386423
EP1 EpisodeReward=-1086.4193988561237
EP2 EpisodeReward=-418.21664056409463
EP3 EpisodeReward=-272.81269544426704
EP4 EpisodeReward=-264.23013064330905
Epoch=125	 Average reward=-539.6744777335713
Training Agent 0
EP0 EpisodeReward=-132.3302635564835
EP1 EpisodeReward=-135.01869397428132
EP2 EpisodeReward=-724.668218778763
EP3 EpisodeReward=-1232.8559136679867
EP4 EpisodeReward=-1223.2029108507795
Training Agent 1

EP0 EpisodeReward=-262.72205290694546
EP1 EpisodeReward=-790.7954865266731
EP2 EpisodeReward=-136.0774828408385
EP3 EpisodeReward=-5.149091665200754
EP4 EpisodeReward=-940.0167902457343
Training Agent 2
EP0 EpisodeReward=-135.0027355165069
EP1 EpisodeReward=-165.9555822493747
EP2 EpisodeReward=-270.8407931618315
EP3 EpisodeReward=-410.4462471475787
EP4 EpisodeReward=-139.21302596966137
Epoch=137	 Average reward=-403.744459588598
Training Agent 0
EP0 EpisodeReward=-555.2919933845791
EP1 EpisodeReward=-398.64264095304867
EP2 EpisodeReward=-528.3748421638891
EP3 EpisodeReward=-435.89823976547916
EP4 EpisodeReward=-526.666936227943
Training Agent 1
EP0 EpisodeReward=-276.20544671611765
EP1 EpisodeReward=-693.193027534103
EP2 EpisodeReward=-4.776841787809774
EP3 EpisodeReward=-265.95610326649285
EP4 EpisodeReward=-3.081238332299956
Training Agent 2
EP0 EpisodeReward=-417.0740657738691
EP1 EpisodeReward=-129.39089510079867
EP2 EpisodeReward=-263.95316891521253
EP3 EpisodeReward=-274.51456123

EP4 EpisodeReward=-130.01668004365848
Epoch=149	 Average reward=-343.9237099708796
Training Agent 0
EP0 EpisodeReward=-522.1397747584975
EP1 EpisodeReward=-630.2208678056818
EP2 EpisodeReward=-784.9676356338454
EP3 EpisodeReward=-526.4822187755668
EP4 EpisodeReward=-654.4956555332884
Training Agent 1
EP0 EpisodeReward=-129.63067722517934
EP1 EpisodeReward=-278.7945469576061
EP2 EpisodeReward=-754.4962224305647
EP3 EpisodeReward=-1098.7104983030792
EP4 EpisodeReward=-1229.5276828901517
Training Agent 2
EP0 EpisodeReward=-134.19476945600664
EP1 EpisodeReward=-400.15799106574474
EP2 EpisodeReward=-542.1381138260218
EP3 EpisodeReward=-526.7155133421926
EP4 EpisodeReward=-569.4991996282162
Epoch=150	 Average reward=-817.8408460172187
Training Agent 0
EP0 EpisodeReward=-397.28882207937403
EP1 EpisodeReward=-940.7316909313917
EP2 EpisodeReward=-1064.42097023561
EP3 EpisodeReward=-769.8771075518608
EP4 EpisodeReward=-819.4640926023505
Training Agent 1
EP0 EpisodeReward=-134.59391225065662
EP1 

EP1 EpisodeReward=-1091.1989749560926
EP2 EpisodeReward=-1069.8455575872972
EP3 EpisodeReward=-1146.3403524096916
EP4 EpisodeReward=-1112.155596203772
Training Agent 2
EP0 EpisodeReward=-1076.174856244368
EP1 EpisodeReward=-1088.1070685463155
EP2 EpisodeReward=-942.8163977969052
EP3 EpisodeReward=-745.2043662067287
EP4 EpisodeReward=-911.8329776858549
Epoch=162	 Average reward=-1124.627740497844
Training Agent 0
EP0 EpisodeReward=-1110.8672995734055
EP1 EpisodeReward=-928.0567102456266
EP2 EpisodeReward=-889.586185000728
EP3 EpisodeReward=-1037.3863001103734
EP4 EpisodeReward=-1000.3073324380814
Training Agent 1
EP0 EpisodeReward=-1196.4189519320419
EP1 EpisodeReward=-1119.5860464043767
EP2 EpisodeReward=-797.1394677386371
EP3 EpisodeReward=-999.405006286803
EP4 EpisodeReward=-794.4902256218747
Training Agent 2
EP0 EpisodeReward=-929.7471071323723
EP1 EpisodeReward=-1167.9107662438075
EP2 EpisodeReward=-1115.0197400053667
EP3 EpisodeReward=-1228.5728835956063
EP4 EpisodeReward=-1103.49

EP4 EpisodeReward=-991.1856506894906
Epoch=174	 Average reward=-922.7860666354774
Training Agent 0
EP0 EpisodeReward=-956.8313820838428
EP1 EpisodeReward=-925.3585874655217
EP2 EpisodeReward=-786.9315048819855
EP3 EpisodeReward=-849.2912519029929
EP4 EpisodeReward=-1298.3014490855635
Training Agent 1
EP0 EpisodeReward=-884.1684868797339
EP1 EpisodeReward=-1142.52069212976
EP2 EpisodeReward=-937.657714223539
EP3 EpisodeReward=-775.2275565212576
EP4 EpisodeReward=-808.2463911864784
Training Agent 2
EP0 EpisodeReward=-935.8094286807219
EP1 EpisodeReward=-867.8054205207739
EP2 EpisodeReward=-940.2115682646527
EP3 EpisodeReward=-1036.1957761548467
EP4 EpisodeReward=-964.3947911951142
Epoch=175	 Average reward=-1023.6475438223853
Training Agent 0
EP0 EpisodeReward=-818.4463217650477
EP1 EpisodeReward=-530.3287294656997
EP2 EpisodeReward=-1212.0469561747213
EP3 EpisodeReward=-676.24712123654
EP4 EpisodeReward=-1219.4906649601185
Training Agent 1
EP0 EpisodeReward=-909.577698789975
EP1 Episode