# Simple: training a DQN agent on PettingZoo MPE `simple_v3` with Tianshou
## Make an environment

In [2]:
from pettingzoo.mpe import simple_v3
# Build one PettingZoo MPE `simple_v3` environment for interactive inspection:
# episodes are capped at 25 cycles and actions are discrete rather than continuous.
env = simple_v3.env(max_cycles=25, continuous_actions=False)

In [27]:
# Reset the environment, then take the first listed agent as the handle used
# for the observation/action space queries in the following cells.
# NOTE(review): presumably `env.agents` is only populated after reset() — confirm
# against the PettingZoo API docs.
env.reset()
agent = env.agents[0]
print(agent)

agent_0


In [28]:
# Observation space of `agent` (the recorded output shows a 4-element float32 Box).
env.observation_space(agent)

Box(-inf, inf, (4,), float32)

In [29]:
# Action space of `agent` (the recorded output shows 5 discrete actions).
env.action_space(agent)

Discrete(5)

## Set up the Vectorized Environments

In [30]:
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv

def _get_env():
    """Create one Tianshou-wrapped ``simple_v3`` environment.

    ``DummyVectorEnv`` is handed callables rather than environment instances,
    so this factory is passed to it uncalled.
    """
    raw_env = simple_v3.env(max_cycles=25, continuous_actions=False)
    return PettingZooEnv(raw_env)

# Ten environment factories for training and ten for evaluation. Each list entry
# is the `_get_env` callable itself, not an environment instance.
train_envs = DummyVectorEnv([_get_env] * 10)
test_envs = DummyVectorEnv([_get_env] * 10)

## Build the network

In [31]:
import torch
import numpy as np
from torch import nn

class Net(nn.Module):
    """Fully connected Q-network mapping a flattened observation to per-action values.

    Three hidden layers of width 128 with ReLU activations, followed by a linear
    output layer.

    Args:
        state_shape: Observation shape (an int or a sequence of ints); the product
            of its entries is the input width.
        action_shape: Action shape (an int or a sequence of ints); the product of
            its entries is the output width.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; store plain Python ints as layer sizes.
        in_dim = int(np.prod(state_shape))
        out_dim = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_dim, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_dim),
        )

    def forward(self, obs, state=None, info=None):
        """Compute the network output for a batch of observations.

        Args:
            obs: Batch of observations (array-like or ``torch.Tensor``) whose first
                axis is the batch dimension; all remaining axes are flattened.
            state: Passed through unchanged (this network keeps no recurrent state).
            info: Accepted for interface compatibility and ignored. Defaults to
                ``None`` rather than a shared mutable ``{}``.

        Returns:
            A ``(logits, state)`` tuple where ``logits`` has shape
            ``(batch, prod(action_shape))``.
        """
        # Convert arrays *and* tensors to float32 on the same device as the weights,
        # so float64/int inputs do not fail inside the first linear layer.
        device = self.model[0].weight.device
        obs = torch.as_tensor(obs, dtype=torch.float, device=device)
        batch = obs.shape[0]
        # reshape (unlike view) also accepts non-contiguous inputs.
        logits = self.model(obs.reshape(batch, -1))
        return logits, state
    
# Fetch each space once. Spaces with an empty `.shape` (the Discrete action space
# shown earlier) fall back to their `.n` attribute.
obs_space = env.observation_space(agent)
act_space = env.action_space(agent)
state_shape = obs_space.shape or obs_space.n
action_shape = act_space.shape or act_space.n

# Q-network and its Adam optimizer (learning rate 1e-3).
net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

## Set up the Policy

In [32]:
from tianshou.policy import DQNPolicy
# DQN policy that wraps `net` as its model and updates it with the `optim` optimizer.
# NOTE(review): the hyperparameter notes below follow the tianshou DQNPolicy
# documentation — confirm against the installed tianshou version.
policy = DQNPolicy(
    model=net,
    optim=optim,
    action_space=env.action_space(agent),
    discount_factor=0.9,  # reward discount factor
    estimation_step=3,  # presumably the number of steps used for n-step returns
    target_update_freq=320  # presumably the target-network sync interval — units to be confirmed
)

## Set up the Collectors

In [33]:
from tianshou.data import Collector, VectorReplayBuffer
# Training collector: stores transitions in a vectorized replay buffer of total
# size 20000. The number of sub-buffers is derived from the number of training
# environments instead of repeating the literal 10, so the two cannot drift apart.
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20000, len(train_envs)),
    exploration_noise=True,
)
# Evaluation collector: no replay buffer is passed; exploration noise stays on.
test_collector = Collector(policy, test_envs, exploration_noise=True)

## Train Policy with a Trainer

In [34]:
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
# Create a TensorBoard summary writer targeting 'log/dqn' and wrap it in tianshou's
# TensorboardLogger, which is handed to the trainer below.
writer = SummaryWriter('log/dqn')
logger = TensorboardLogger(writer)

In [38]:
from tianshou.trainer import OffpolicyTrainer

def _set_train_eps(epoch, env_step):
    """Callback passed as ``train_fn``: set the policy's epsilon to 0.1."""
    return policy.set_eps(0.1)


def _set_test_eps(epoch, env_step):
    """Callback passed as ``test_fn``: set the policy's epsilon to 0.05."""
    return policy.set_eps(0.05)


def _reward_threshold_reached(mean_rewards):
    """Callback passed as ``stop_fn``: true once the mean reward is at least 0.6.

    NOTE(review): every test reward in the recorded run is negative (best is
    about -6.89), so this threshold was never reached and all 10 epochs ran.
    """
    return mean_rewards >= 0.6


trainer = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    train_fn=_set_train_eps,
    test_fn=_set_test_eps,
    stop_fn=_reward_threshold_reached,
    logger=logger,
)
result = trainer.run()
print(f'Finished training! Use {result["duration"]}')

Epoch #1: 10001it [00:10, 971.58it/s, env_step=10000, len=25, loss=5.699, n/ep=10, n/st=10, rew=-11.65]                            


Epoch #1: test_reward: -8.595039 ± 7.131145, best_reward: -8.595039 ± 7.131145 in #1


Epoch #2: 10001it [00:09, 1012.25it/s, env_step=20000, len=25, loss=3.209, n/ep=10, n/st=10, rew=-7.66]                           


Epoch #2: test_reward: -9.683536 ± 7.779126, best_reward: -8.595039 ± 7.131145 in #1


Epoch #3: 10001it [00:10, 961.31it/s, env_step=30000, len=25, loss=0.079, n/ep=10, n/st=10, rew=-8.77]                            


Epoch #3: test_reward: -7.906466 ± 7.785225, best_reward: -7.906466 ± 7.785225 in #3


Epoch #4: 10001it [00:09, 1016.27it/s, env_step=40000, len=25, loss=0.051, n/ep=10, n/st=10, rew=-5.42]                           


Epoch #4: test_reward: -9.016955 ± 9.452644, best_reward: -7.906466 ± 7.785225 in #3


Epoch #5: 10001it [00:09, 1013.81it/s, env_step=50000, len=25, loss=0.044, n/ep=10, n/st=10, rew=-7.90]                           


Epoch #5: test_reward: -7.366735 ± 7.846527, best_reward: -7.366735 ± 7.846527 in #5


Epoch #6: 10001it [00:10, 931.52it/s, env_step=60000, len=25, loss=0.036, n/ep=10, n/st=10, rew=-7.52]                            


Epoch #6: test_reward: -8.135940 ± 7.735059, best_reward: -7.366735 ± 7.846527 in #5


Epoch #7: 10001it [00:11, 897.93it/s, env_step=70000, len=25, loss=0.034, n/ep=10, n/st=10, rew=-7.39]                            


Epoch #7: test_reward: -7.666644 ± 8.502152, best_reward: -7.366735 ± 7.846527 in #5


Epoch #8: 10001it [00:09, 1007.05it/s, env_step=80000, len=25, loss=0.043, n/ep=10, n/st=10, rew=-16.64]                          


Epoch #8: test_reward: -6.889538 ± 7.526689, best_reward: -6.889538 ± 7.526689 in #8


Epoch #9: 10001it [00:10, 976.07it/s, env_step=90000, len=25, loss=0.027, n/ep=10, n/st=10, rew=-10.68]                           


Epoch #9: test_reward: -9.483051 ± 8.605454, best_reward: -6.889538 ± 7.526689 in #8


Epoch #10: 10001it [00:10, 997.61it/s, env_step=100000, len=25, loss=0.035, n/ep=10, n/st=10, rew=-11.28]                            


Epoch #10: test_reward: -7.541169 ± 7.900672, best_reward: -6.889538 ± 7.526689 in #8
Finished training! Use 107.93s


In [39]:
# Dump the full summary returned by the trainer's run(); the recorded output shows
# timings, step/episode counts and the best test reward.
print(result)

{'duration': '107.93s', 'train_time/model': '65.85s', 'test_step': 27500, 'test_episode': 1100, 'test_time': '5.51s', 'test_speed': '4994.87 step/s', 'best_reward': -6.8895384718012584, 'best_result': '-6.89 ± 7.53', 'train_step': 100000, 'train_episode': 4000, 'train_time/collector': '36.57s', 'train_speed': '976.30 step/s'}


: 