In [11]:
import celery
import numpy as np
from copy import deepcopy
import json
from celery import group
from worker import upload_params, NumpyEncoder
from common.arguments import get_args
from common.utils import make_env
from common.replay_buffer import Buffer
from tqdm import tqdm
from agent import Agent
import torch
import os
import numpy as np
import matplotlib.pyplot as plt

In [12]:
# Experiment configuration, kept as a plain dict so it can be serialised
# to JSON and unpacked into an `Arguments` object further down.
myargs = {
    # environment / rollout
    "scenario_name": "simple_tag",
    "max_episode_len": 100,
    "time_steps": 2_000_000,
    "num_adversaries": 1,
    # optimisation and exploration
    "lr_actor": 1e-4,
    "lr_critic": 1e-3,
    "epsilon": 0.1,
    "noise_rate": 0.1,
    "gamma": 0.95,
    "tau": 0.01,
    # replay buffer
    "buffer_size": 500_000,
    "batch_size": 256,
    # checkpointing
    "save_dir": "./model",
    "save_rate": 2000,
    "model_dir": "",
    # evaluation
    "evaluate_episodes": 10,
    "evaluate_episodes_len": 100,
    "evaluate": False,
    "evaluate_rate": 1000,
    "render": False,
}

In [None]:
# Upload the parameters: serialise the configuration to JSON (NumpyEncoder is
# the project's encoder class) and dispatch it through the `upload_params`
# task signature wrapped in a Celery group.
upload_payload = json.dumps({"args": myargs}, cls=NumpyEncoder)
upload_job = group(upload_params.s(json_dump=upload_payload))
response = upload_job()

In [13]:
# Pull every configuration entry out of `myargs` into a module-level name of
# the same spelling, grouped by theme.

# environment / rollout
scenario_name, num_adversaries = myargs["scenario_name"], myargs["num_adversaries"]
max_episode_len, time_steps = myargs["max_episode_len"], myargs["time_steps"]

# optimisation and exploration
lr_actor, lr_critic = myargs["lr_actor"], myargs["lr_critic"]
epsilon, noise_rate = myargs["epsilon"], myargs["noise_rate"]
gamma, tau = myargs["gamma"], myargs["tau"]

# replay buffer
buffer_size, batch_size = myargs["buffer_size"], myargs["batch_size"]

# checkpointing
save_dir, save_rate, model_dir = myargs["save_dir"], myargs["save_rate"], myargs["model_dir"]

# evaluation
evaluate_episodes, evaluate_episodes_len = myargs["evaluate_episodes"], myargs["evaluate_episodes_len"]
evaluate, evaluate_rate, render = myargs["evaluate"], myargs["evaluate_rate"], myargs["render"]

class Arguments:
    """Plain attribute container for the experiment settings.

    Every constructor argument is stored unchanged on the instance under the
    same name. Two extras keep the container consistent with the code that
    reads it in this notebook:

    * ``batch_size`` is accepted as an optional trailing keyword, because the
      training loop reads ``args.batch_size``.
    * ``evaluate_episode_len`` is stored as an alias of
      ``evaluate_episodes_len``, because the evaluation loop reads the
      singular spelling.

    Args:
        scenario_name, max_episode_len, num_adversaries, lr_actor, lr_critic,
        epsilon, noise_rate, gamma, tau, buffer_size, save_dir, save_rate,
        model_dir, evaluate: stored as-is; they are not read by the code
            visible in this notebook (presumably consumed by ``make_env`` and
            the agent / buffer modules; verify there).
        time_steps: number of iterations of the training loop.
        evaluate_episodes: number of episodes run per evaluation.
        evaluate_episodes_len: number of environment steps per evaluation
            episode.
        evaluate_rate: an evaluation is triggered every ``evaluate_rate``
            training steps.
        render: when truthy, the evaluation loop calls ``env.render()``.
        batch_size: sample size requested from the replay buffer, and the
            minimum buffer fill before learning starts. Defaults to 256, the
            value used in this notebook's configuration cell.
    """

    def __init__(self, scenario_name, max_episode_len, time_steps, num_adversaries, lr_actor, lr_critic,
                 epsilon, noise_rate, gamma, tau, buffer_size, save_dir, save_rate, model_dir, evaluate_episodes,
                 evaluate_episodes_len, evaluate, evaluate_rate, render, batch_size=256):
        self.scenario_name = scenario_name
        self.max_episode_len = max_episode_len
        self.time_steps = time_steps
        self.num_adversaries = num_adversaries
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.epsilon = epsilon
        self.noise_rate = noise_rate
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        # Previously dropped: the config defines it and the training loop reads it.
        self.batch_size = batch_size
        self.save_dir = save_dir
        self.save_rate = save_rate
        self.model_dir = model_dir
        self.evaluate_episodes = evaluate_episodes
        self.evaluate_episodes_len = evaluate_episodes_len
        # Alias under the singular spelling that the evaluation loop reads.
        self.evaluate_episode_len = evaluate_episodes_len
        self.evaluate = evaluate
        self.evaluate_rate = evaluate_rate
        self.render = render

# Bundle the unpacked settings into an attribute-style object. Keywords are
# used so a value cannot silently land in the wrong slot.
args = Arguments(
    scenario_name=scenario_name,
    max_episode_len=max_episode_len,
    time_steps=time_steps,
    num_adversaries=num_adversaries,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    epsilon=epsilon,
    noise_rate=noise_rate,
    gamma=gamma,
    tau=tau,
    buffer_size=buffer_size,
    save_dir=save_dir,
    save_rate=save_rate,
    model_dir=model_dir,
    evaluate_episodes=evaluate_episodes,
    evaluate_episodes_len=evaluate_episodes_len,
    evaluate=evaluate,
    evaluate_rate=evaluate_rate,
    render=render,
)
# The training loop reads `args.batch_size`, but the positional constructor
# signature has no slot for it, so attach the configured value explicitly.
args.batch_size = batch_size

# make_env hands back the environment together with an args object.
# NOTE(review): presumably it also attaches n_agents / n_players, which the
# training loop reads; confirm in common.utils.
env, args = make_env(args)


In [None]:
def run(self):
    """Run the training loop for ``self.args.time_steps`` environment steps.

    Per step:

    1. Every ``self.episode_limit`` steps (step 0 included) the environment
       is reset.
    2. Each agent in ``self.agents`` picks an action from its own observation
       using the current ``self.noise`` and ``self.epsilon``. Players with
       index ``n_agents .. n_players - 1`` get a random action of the form
       ``[0, a, 0, b, 0]`` with ``a`` and ``b`` uniform in ``[-1, 1)``.
    3. The transition, restricted to the first ``n_agents`` entries, is
       stored in ``self.buffer``.
    4. Once the buffer holds at least ``batch_size`` entries, a batch is
       sampled and every agent learns from it, given the list of the other
       agents.
    5. Every ``evaluate_rate`` steps (not at step 0) ``self.evaluate()`` is
       called, and the history of evaluation results is plotted to
       ``<save_path>/plt.png`` and saved with ``np.save``.
    6. ``self.noise`` and ``self.epsilon`` are each reduced by 5e-7, but
       never below 0.05.

    Reads ``self.args``, ``self.env``, ``self.agents``, ``self.buffer``,
    ``self.episode_limit``, ``self.save_path``; mutates ``self.noise`` and
    ``self.epsilon``. Returns ``None``.
    """
    returns = []
    for time_step in tqdm(range(self.args.time_steps)):
        # Reset the environment. Step 0 always matches, so `s` is bound
        # before its first use below.
        if time_step % self.episode_limit == 0:
            s = self.env.reset()
        u = []
        actions = []
        with torch.no_grad():
            for agent_id, agent in enumerate(self.agents):
                action = agent.select_action(s[agent_id], self.noise, self.epsilon)
                u.append(action)
                actions.append(action)
        # Players beyond the learning agents act randomly; their actions go to
        # the environment but are not stored in the buffer (`u` excludes them).
        for _ in range(self.args.n_agents, self.args.n_players):
            actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
        s_next, r, done, info = self.env.step(actions)
        self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents])
        s = s_next
        if self.buffer.current_size >= self.args.batch_size:
            transitions = self.buffer.sample(self.args.batch_size)
            # Learning runs locally here; the original "send to worker" note
            # has no implementation in this cell.
            for agent in self.agents:
                other_agents = self.agents.copy()
                other_agents.remove(agent)
                agent.learn(transitions, other_agents)
        if time_step > 0 and time_step % self.args.evaluate_rate == 0:
            returns.append(self.evaluate())
            fig = plt.figure()
            try:
                plt.plot(range(len(returns)), returns)
                plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit))
                plt.ylabel('average returns')
                plt.savefig(self.save_path + '/plt.png', format='png')
            finally:
                # Without this, one figure per evaluation stays open for the
                # whole run.
                plt.close(fig)
            # np.save appends ".npy" when the name lacks it, so this is
            # written as "returns.pkl.npy" in NumPy format, not as a pickle.
            np.save(self.save_path + '/returns.pkl', returns)
        # Linear decay of exploration, floored at 0.05.
        self.noise = max(0.05, self.noise - 0.0000005)
        self.epsilon = max(0.05, self.epsilon - 0.0000005)

def evaluate(self):
    """Run evaluation episodes and return the mean return of agent 0.

    For each of ``self.args.evaluate_episodes`` episodes the environment is
    reset and stepped a fixed number of times. Agents act with noise and
    epsilon both set to 0. Players with index ``n_agents .. n_players - 1``
    get a random action of the form ``[0, a, 0, b, 0]`` with ``a`` and ``b``
    uniform in ``[-1, 1)``. Only ``r[0]`` (the first reward returned by
    ``env.step``) is accumulated. Each episode's total is printed.

    The per-episode step count is read from ``args.evaluate_episode_len``
    and, if that attribute is absent, from ``args.evaluate_episodes_len``,
    which is the spelling this notebook's ``Arguments`` class stores.

    Returns:
        Sum of the per-episode totals divided by ``args.evaluate_episodes``.
    """
    episode_len = getattr(self.args, "evaluate_episode_len", None)
    if episode_len is None:
        # Fallback for argument objects that only carry the plural spelling.
        episode_len = self.args.evaluate_episodes_len

    returns = []
    for _ in range(self.args.evaluate_episodes):
        # reset the environment
        s = self.env.reset()
        rewards = 0
        for _ in range(episode_len):
            if self.args.render:
                self.env.render()
            actions = []
            with torch.no_grad():
                for agent_id, agent in enumerate(self.agents):
                    # Noise and epsilon are both 0 during evaluation.
                    action = agent.select_action(s[agent_id], 0, 0)
                    actions.append(action)
            for _ in range(self.args.n_agents, self.args.n_players):
                actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
            s_next, r, done, info = self.env.step(actions)
            rewards += r[0]
            s = s_next
        returns.append(rewards)
        print('Returns is', rewards)
    return sum(returns) / self.args.evaluate_episodes