In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from pettingzoo.sisl import pursuit_v4

In [3]:
class Policy(nn.Module):
    def __init__(self, num_actions=5):
        super().__init__()

        self.network = nn.Sequential(
            self._layer_init(nn.Conv2d(3, 32, 3, padding=1)),
            nn.MaxPool2d(kernel_size=2,stride=2,ceil_mode=True),
            nn.ReLU(),
            self._layer_init(nn.Conv2d(32, 64, 3, padding=1)),
            nn.MaxPool2d(kernel_size=2,stride=2),
            nn.ReLU(),
            nn.Flatten(),
            self._layer_init(nn.Linear(64 * 2 * 2,64)),
            nn.ReLU(),
            self._layer_init(nn.Linear(64,num_actions)),
            nn.Softmax(dim=-1)
        )
    def _layer_init(self,layer,std = np.sqrt(2),bias = 0):
        torch.nn.init.orthogonal_(layer.weight,std)
        torch.nn.init.constant_(layer.bias,bias)
        return layer
    def forward(self,x):
        if len(x.shape)==3:
            x =torch.tensor(x, dtype=torch.float).permute(2, 0, 1).unsqueeze(0) # transform shape[7,7,3] to [1,3,7,7]
        else:
            x =torch.tensor(x, dtype=torch.float).permute(0,3,1,2)
        return self.network(x)

In [4]:

class PPO:
    def __init__(self,lr, gamma,clip_ratio):
        self.policy = Policy()
        self.optimizer = optim.Adam(self.policy.parameters(),lr=lr)
        self.gamma =gamma
        self.clip_ratio = clip_ratio

    def update (self,rollouts):
        obs,act,rew,logp_old,adv = rollouts
        obs = np.array(obs)
        returns = np.zeros_like(rew)
        for t in reversed(range(len(rew))):
            if t==len(rew)-1:
                returns[t]=rew[t]
            else:
                returns[t]=rew[t]+self.gamma*returns[t+1]
        values = self.policy(obs).detach().numpy()
        adv = returns -np.sum(values,axis=1)

        act = torch.tensor(act).long()
        logp_old=torch.tensor(logp_old).float()
        pi_old = self.policy(obs).gather(1,act.unsqueeze(-1)).squeeze(-1)
        ratio = torch.exp(torch.log(pi_old))
        surr1 = ratio*torch.from_numpy(adv).float()
        surr2 = torch.clamp(ratio,1-self.clip_ratio,1+self.clip_ratio)*torch.from_numpy(adv).float()
        loss = -torch.min(surr1,surr2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [64]:
agent = PPO(lr =0.01,gamma=0.01,clip_ratio=0.01)

In [6]:
from collections import deque
from tqdm import tqdm
def train(env,epochs,steps_per_epoch,batch_size,lr,gamma,clip_ratio):
    ppo = PPO(lr=lr,gamma = gamma,clip_ratio=clip_ratio)
    ep_reward = deque(maxlen=10)
    for epoch in range(epochs):
        obs_buf,act_buf,rew_buf,logp_buf = [],[],[],[]
        for _ in tqdm(range(steps_per_epoch)):
            env.reset(seed=42)
            obs,_,_,_,_ = env.last()
            ep_reward.append(0)
            for t in range(batch_size):
                probs = ppo.policy(obs)
                m = Categorical(probs)
                act = m.sample()
                logp = m.log_prob(act)
                obs_buf.append(obs)
                act_buf.append(act)
                rew_buf.append(0)
                logp_buf.append(logp)
                env.step(act.item())
                obs,rew,done,_,_= env.last()
                ep_reward[-1]+=rew
                rew_buf[-1]+=rew
                if done:
                    break
            ppo.update((obs_buf,act_buf,rew_buf,logp_buf,np.zeros_like(rew_buf)))
        print("Epoch: {}, Avg Reward: {:.2f}".format(epoch,np.mean(ep_reward)))

In [66]:
policy(observation)

tensor([[0.3394, 0.1336, 0.1356, 0.2565, 0.1350]], grad_fn=<SoftmaxBackward0>)

In [7]:
n_evaders =2
n_pursuers = 1
n_catch =1
max_cycles = 200
env = pursuit_v4.env(n_evaders = n_evaders, n_pursuers = n_pursuers,max_cycles =max_cycles,n_catch=n_catch)
env.reset(seed=42)


In [102]:
observation.shape

(7, 7, 3)

In [8]:
train(env,epochs=5,steps_per_epoch=max_cycles//10,batch_size=128,lr=0.2,gamma=0.99,clip_ratio=0.2)

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:02<00:00,  7.92it/s]


Epoch: 0, Avg Reward: -12.69


100%|██████████| 20/20 [00:02<00:00,  8.45it/s]


Epoch: 1, Avg Reward: -12.70


100%|██████████| 20/20 [00:02<00:00,  8.39it/s]


Epoch: 2, Avg Reward: -12.71


100%|██████████| 20/20 [00:02<00:00,  8.76it/s]


Epoch: 3, Avg Reward: -12.72


100%|██████████| 20/20 [00:02<00:00,  8.59it/s]

Epoch: 4, Avg Reward: -12.69





In [14]:
cnt = 0
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    cnt+=1
    if termination or truncation:
        action = None
    else:
        # this is where you would insert your policy
        action = env.action_space(agent).sample()
    env.step(action)
env.close()
print(cnt)



201


In [17]:
agent.policy(observation)

AttributeError: 'str' object has no attribute 'policy'

In [6]:
#device=torch.device('cuda')
device= torch.device('cpu')

In [10]:
a = torch.rand(1)
a.to(device)

tensor([0.0066], device='cuda:0')

In [20]:
agent1 = Agent().to(device)
optimizer = optim.Adam(params = agent1.parameters(), lr=0.001,eps=1e-5)