In [23]:
import gym
import numpy as np
import slimevolleygym

from maddpg import MADDPGAgent
# Build the SlimeVolley environment from its gym id. The `slimevolleygym`
# import above is never referenced by name in this notebook; presumably
# importing it is what registers "SlimeVolley-v0" with gym -- TODO confirm.
env = gym.make("SlimeVolley-v0")

In [27]:
# Observation and action sizes, one list entry per agent.
obs_dim_list = [12, 12]
action_dim_list = [3, 3]

# Training hyperparameters passed to the MADDPG agent.
BUFFER_SIZE = 10 ** 6   # replay buffer size
BATCH_SIZE = 32         # minibatch size
GAMMA = 0.95            # discount factor
LR_ACTOR = 1e-2         # learning rate of the actor
LR_CRITIC = 1e-2        # learning rate of the critic
TAU = 1e-2              # soft update
UPDATE_EVERY = 100      # update network every X samples added to replay buffer

# Create the multi-agent learner from the dimensions and hyperparameters
# defined above; each agent is configured with shared_obs disabled.
maddpg = MADDPGAgent(
    obs_dim_list,
    action_dim_list,
    shared_obs=False,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
    buffer_size=BUFFER_SIZE,
    update_every=UPDATE_EVERY,
)

In [36]:
NUM_EPISODES = 1000   # number of training episodes to run
PRINT_EVERY = 100     # keep a permanent progress line every this many episodes
TMAX = 100            # maximum number of environment steps per episode

# Episodes are numbered 1..NUM_EPISODES inclusive. range() excludes its stop
# value, so the stop must be NUM_EPISODES + 1; a stop of NUM_EPISODES would run
# only NUM_EPISODES - 1 episodes.
for e in range(1, NUM_EPISODES + 1):
    score1, score2 = 0, 0

    # NOTE(review): env.reset() yields a single observation, which is reused
    # here as the second agent's initial observation. From the first step on,
    # the second agent's view is taken from info['otherObs'] instead -- confirm
    # that sharing the initial observation is intended.
    obs1 = env.reset()
    obs2 = obs1
    obs = [obs1, obs2]

    for t in range(TMAX):
        a1, a2 = maddpg.get_actions(obs)

        # One env step takes both agents' actions; the second agent's
        # observation is returned through the info dict.
        obs1, r1, done, info = env.step(a1, a2)
        obs2 = info['otherObs']
        next_obs = [obs1, obs2]

        # The second agent's reward is the negation of the first agent's.
        maddpg.step(obs, [a1, a2], [r1, -r1], next_obs, done)

        obs = next_obs

        score1 += r1
        score2 += -r1

        if done:
            break

    # Build the status line once: it is always written in place (leading '\r',
    # no newline) and, every PRINT_EVERY episodes, written again with a newline
    # so that line stays visible in the output.
    status = '\rEpisode {}\t score1 {:6d}\t score2 {:6d}'.format(e, score1, score2)
    print(status, end="")
    if e % PRINT_EVERY == 0:
        print(status)

Episode 100	 score1     -1	 score2      1
Episode 200	 score1      1	 score2     -1
Episode 300	 score1     -1	 score2      1
Episode 400	 score1      0	 score2      0
Episode 500	 score1      0	 score2      0
Episode 600	 score1      1	 score2     -1
Episode 700	 score1      1	 score2     -1
Episode 800	 score1     -1	 score2      1
Episode 900	 score1      0	 score2      0
Episode 999	 score1      1	 score2     -1

In [None]:
# Scratch rollout: queries `maddpg` for actions and steps `env` up to TMAX
# times. This cell defines none of `obs`, `maddpg`, `env` or `TMAX` itself; it
# relies on them being left in the kernel by earlier cells.
# NOTE(review): in the lines visible here `next_obs` is built but `obs` is
# never reassigned and `done` is never checked, so every iteration would query
# actions for the same `obs` -- confirm whether the loop body is complete.
for t in range(TMAX):
        
    a1, a2 = maddpg.get_actions(obs)
        
    # The second agent's observation comes back through the info dict.
    obs1, r1, done, info = env.step(a1, a2)
    obs2 = info['otherObs']
    next_obs = [obs1, obs2]