In [1]:
## Auto-format notebook
# %load_ext lab_black

In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.onnx
import shutil
from time import strftime, time
from collections import deque, namedtuple
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import IPython.display
from functools import partial
from itertools import chain

import gym
from gym import Env, Wrapper

from pommerman import make
from pommerman.agents import BaseAgent, RandomAgent, SimpleAgent
from graphic_pomme_env import graphic_pomme_env
from graphic_pomme_env.wrappers import PommerEnvWrapperFrameSkip2

# Reminder for whoever runs the cell: the message quoted below can be ignored.
print(
    '''Hint: just ignore the error "Import error NSDE! You will not be able to render --> Cannot connect to 'None'"'''
)
# Collect and list every registered gym environment whose id starts with "Pomme".
pomenvs = [es.id for es in gym.envs.registry.all() if es.id.startswith("Pomme")]
print("\n".join(pomenvs))
# Resources loaded through graphic_pomme_env (presumably rendering assets — TODO confirm).
res = graphic_pomme_env.load_resources()
N_PLAYERS = 2
NUM_STACK = 5

NUM_ACTIONS = 6
# Action-id legend. It is a bare string expression at the end of the cell,
# so the notebook also echoes it as the cell's output.
"""
0 Stop
1 Up
2 Down
3 Left
4 Right
5 Bomb
"""

['/home/raphael/uni/jku/2_sem/reinforcement_learning/assignments', '/home/raphael/.pyenv/versions/3.7.12/lib/python37.zip', '/home/raphael/.pyenv/versions/3.7.12/lib/python3.7', '/home/raphael/.pyenv/versions/3.7.12/lib/python3.7/lib-dynload', '', '/home/raphael/.local/share/virtualenvs/assignments-geAH-P9e/lib/python3.7/site-packages', '/home/raphael/.local/share/virtualenvs/assignments-geAH-P9e/lib/python3.7/site-packages/IPython/extensions', '/home/raphael/.ipython']
/home/raphael/.local/share/virtualenvs/assignments-geAH-P9e/lib/python3.7/site-packages
['/home/raphael/.local/share/virtualenvs/assignments-geAH-P9e/lib/python3.7/site-packages', '/home/raphael/uni/jku/2_sem/reinforcement_learning/assignments', '/home/raphael/.pyenv/versions/3.7.12/lib/python37.zip', '/home/raphael/.pyenv/versions/3.7.12/lib/python3.7', '/home/raphael/.pyenv/versions/3.7.12/lib/python3.7/lib-dynload', '', '/home/raphael/.local/share/virtualenvs/assignments-geAH-P9e/lib/python3.7/site-packages', '/home/

'\n0 Stop\n1 Up\n2 Down\n3 Left\n4 Right\n5 Bomb\n'

In [3]:
from dataclasses import dataclass
from typing import Any
from datetime import datetime
from copy import deepcopy
from tqdm.notebook import tqdm
import random

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

import pandas as pd
import numpy as np

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Plan to train agent

1. train against a static agent
2. train against a random agent that never places bombs
3. train against a random agent that also places bombs
4. train against itself


## Define ReplayBuffer

In [5]:
# @dataclass
# class Transition:
#     state: Any
#     action: int
#     reward: float
#     next_state: Any
#     done: bool


"""
Idea to implement:
If agent wins we will add the rewards of all the steps which led to this win.
If the agent looses we will add the loss to all the steps which led to this loss.

Given the agent plants a bomb 10 timesteps ahead of the win - it should get more reward on this action.
Otherwise there are multiple options:
- equally split the rewards over all the steps in this episode
- give reward to the n steps before the win / loss
- decrease the rewards over n or all timesteps

Problem:
- some states might be the same -> will therefore be higher and lower in the end
- need to figure out if this happend in the same episode: DONE
- need to figure out a good value for the reward since 200 might be too much
- need to find a good size of n



Implement the equal over n episodes
if the agent wins - it maybe approached the opponent and placed a bomb and then got out of the way
so 10 steps bomb + 10 steps approaching should be good

but if the agent looses
not too much steps behind are bad
in most cases it should take the agent 2 steps to get away from a bomb -> I add a extra one to be sure
so 4 steps will be for looses

draws aren't really supportet yet :P we use the loose stuff since he gets in the way of the bomb and there aren't any pos rewards anyway

"""


class ReplayBuffer:
    """Bounded replay buffer with end-of-episode reward shaping.

    Every stored transition is a mutable list
    ``[state, action, reward, next_state, done, episode]``.

    When a terminal transition is added, part of the terminal reward is
    redistributed onto the most recent transitions of the same episode:

    * win  (``reward > 0``): the last ``split_over_win`` steps each receive a
      share that grows towards the end of the episode, and a bomb action
      placed 9-11 steps before the win gets an extra bonus;
    * loss / draw (``reward <= 0``): the penalty is split equally over the
      last ``split_over_loose`` steps.

    Args:
        num_actions: number of discrete actions (used for the action histogram).
        size: maximum number of transitions kept in the buffer.
        remove_low_reward: probability that an eviction removes the first
            transition with the lowest reward instead of the oldest one.
    """

    # Reward magnitude stored for a shaped terminal transition.
    TERMINAL_REWARD = 20
    # Action id of "Bomb" and the bonus granted for a well-timed bomb before a win.
    BOMB_ACTION = 5
    BOMB_BONUS = 10

    def __init__(self, num_actions, size=10_000, remove_low_reward=0.0):
        self.size = size
        self.transition = []
        self.num_actions = num_actions
        self.remove_low_reward = remove_low_reward

    def add(
        self,
        state,
        action,
        reward,
        next_state,
        done,
        episode,
        split_over_win=30,
        split_over_loose=4,
    ):
        """Store one transition and, on episode end, shape the preceding rewards.

        Shaping only happens when the buffer already holds more than
        ``split_over`` transitions; otherwise the transition is stored with
        the reward exactly as passed in.
        """
        # ">=" keeps the buffer at most `size` long once the new item is appended.
        if self.length() >= self.size:
            self.remove()

        split_over = split_over_win if reward > 0 else split_over_loose

        if done and self.length() > split_over:
            won = reward > 0
            reward = self.TERMINAL_REWARD if won else -self.TERMINAL_REWARD
            tail = self._episode_tail(episode, split_over)
            if won:
                self._shape_win(tail, reward)
            else:
                self._shape_loss(tail, reward)

        self.transition.append([state, action, reward, next_state, done, episode])

    def _episode_tail(self, episode, count):
        """Return the last ``count`` stored transitions of ``episode`` (same list objects)."""
        if count <= 0:
            # lst[-0:] would be the whole list, so handle this explicitly.
            return []
        return [t for t in self.transition if t[5] == episode][-count:]

    def _shape_win(self, tail, reward):
        """Add an increasing share of ``reward`` to ``tail`` in place, plus the bomb bonus."""
        n_steps = len(tail)
        for i, step in enumerate(tail):
            step[2] = step[2] + (reward * 2) / (n_steps * 2 - i)

        # A bomb placed roughly ten steps before the win most likely caused it.
        for idx in range(-11, -8):
            if abs(idx) <= n_steps and tail[idx][1] == self.BOMB_ACTION:
                tail[idx][2] = tail[idx][2] + self.BOMB_BONUS

    def _shape_loss(self, tail, reward):
        """Split ``reward`` equally over ``tail`` in place; no-op for an empty tail."""
        if not tail:
            return
        share = reward / len(tail)
        for step in tail:
            step[2] = step[2] + share

    def length(self):
        """Return the number of stored transitions."""
        return len(self.transition)

    def remove(self):
        """Evict one transition: the oldest, or (with probability
        ``remove_low_reward``) the first one carrying the lowest reward."""
        if not self.transition:
            return
        if np.random.random() < self.remove_low_reward:
            # min() over indices returns the first index holding the minimum reward.
            idx = min(
                range(len(self.transition)), key=lambda i: self.transition[i][2]
            )
        else:
            idx = 0
        self.transition.pop(idx)

    def sample_batch(self, batch_size):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where states and
            next_states are numpy arrays and the other three are tensors on the
            module-level ``device``.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored transitions.
        """
        batch = random.sample(self.transition, batch_size)
        states, actions, rewards, next_states, dones, _ = map(np.array, zip(*batch))
        actions = torch.from_numpy(actions.astype(int)).to(device, dtype=torch.int)
        rewards = torch.from_numpy(rewards).to(device, dtype=torch.float32)
        dones = torch.from_numpy(dones.astype(int)).to(device, dtype=torch.int)

        return states, actions, rewards, next_states, dones

    def get_action_distribution(self):
        """Return a list with, per action id, how many stored transitions used it."""
        counts = [0] * self.num_actions
        for step in self.transition:
            action = int(step[1])
            if 0 <= action < self.num_actions:
                counts[action] += 1
        return counts

    def save_episode(self, episode):
        """Write the states and actions of ``episode`` to ``./demos`` as ``.npy`` files.

        Raises:
            ValueError: if the buffer holds no transition of that episode.
        """
        eps = [t for t in self.transition if t[5] == episode]
        if not eps:
            raise ValueError(f"no transitions stored for episode {episode}")

        states = np.array([t[0] for t in eps])
        actions = np.array([t[1] for t in eps])

        os.makedirs("./demos", exist_ok=True)
        # One timestamp for both files so they can be paired up again later.
        stamp = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
        np.save(f"./demos/states_{stamp}.npy", states, allow_pickle=True)
        np.save(f"./demos/actions_{stamp}.npy", actions, allow_pickle=True)

## Define cheap opponent agents

In [6]:
def static_agent(frame_stack):
    """Opponent policy that ignores its observation and always returns action 0."""
    _ = frame_stack  # observation intentionally unused
    return 0


def rand_no_bomb_agent(frame_stack):
    """Opponent policy drawing a uniform random action from every id except the last one.

    The observation is ignored; the highest action id (``NUM_ACTIONS - 1``)
    is never returned.
    """
    _ = frame_stack  # observation intentionally unused
    allowed_actions = NUM_ACTIONS - 1
    return np.random.randint(allowed_actions)


def rand_agent(frame_stack):
    """Opponent policy drawing a uniform random action from all ``NUM_ACTIONS`` ids.

    The observation is ignored.
    """
    _ = frame_stack  # observation intentionally unused
    choice = np.random.randint(NUM_ACTIONS)
    return choice


def model_agent(frame_stack, model):
    """Opponent policy that plays the greedy action of ``model``.

    Args:
        frame_stack: object exposing ``get_obersvation()``; its result is
            converted to a numpy array and then to a tensor.
        model: callable mapping that tensor to a tensor of action scores.

    Returns:
        Index of the largest score (``np.argmax`` over the flattened output).
    """
    with torch.no_grad():
        frames = np.array(frame_stack.get_obersvation())
        scores = model(torch.from_numpy(frames))
        scores = scores.detach().cpu().numpy()

    return np.argmax(scores)

## Define our Agent Model


In [7]:
class PommermanAgent(nn.Module):
    """Convolutional Q-network: stacked frames in, one score per action out.

    Three size-preserving convolutions, each followed by a
    ``MaxPool2d(kernel_size=2, stride=1)`` and a ReLU, feed a three-layer
    fully connected head.

    Args:
        num_stack: number of stacked frames, i.e. input channels.
        num_actions: number of output scores.
        input_size: ``(height, width)`` of a single frame. The default
            ``(56, 48)`` yields the 305280 flattened features that existing
            checkpoints were trained with.

    Raises:
        ValueError: if ``input_size`` is too small to survive the three poolings.
    """

    def __init__(self, num_stack, num_actions=6, input_size=(56, 48)):
        super().__init__()
        # Layer order must not change: checkpoint keys are positional
        # ("features.0", "features.3", ...).
        self.features = nn.Sequential(
            nn.Conv2d(num_stack, 32, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2, stride=1),
            # nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=1),
            nn.ReLU(inplace=True),
            # nn.AdaptiveAvgPool2d(1),
        )

        height, width = input_size
        if height <= 3 or width <= 3:
            raise ValueError(f"input_size must exceed 3 in both dimensions, got {input_size}")
        # The convolutions keep the spatial size; each of the three poolings
        # (kernel 2, stride 1) trims one row and one column.
        flat_features = 128 * (height - 3) * (width - 3)

        self.classifier = nn.Sequential(
            nn.Linear(flat_features, 64),
            # nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, num_actions),
        )

    def forward(self, x):
        """Score every action for ``x``.

        Args:
            x: a tensor, numpy array or nested sequence of frames, shaped
                ``(num_stack, H, W)`` or ``(batch, num_stack, H, W)``.

        Returns:
            Float tensor of shape ``(batch, num_actions)``; a single
            observation is treated as a batch of one.
        """
        if not torch.is_tensor(x):
            x = torch.as_tensor(np.asarray(x))
        # Tensors are never routed through numpy: that would fail for CUDA or
        # grad-requiring inputs and would cut the input out of a traced graph.
        # Follow the module's own parameters instead of a global device.
        x = x.to(device=next(self.parameters()).device, dtype=torch.float32)

        if x.dim() == 3:
            x = x.unsqueeze(0)

        x = self.features(x)
        x = x.reshape(x.size(0), -1)
        x = self.classifier(x)

        return x

In [8]:
def get_random_environment(oponent, random_env):
    """Create a ``PommerEnvWrapperFrameSkip2`` environment.

    Args:
        oponent: forwarded as ``opponent_actor`` to the wrapper.
        random_env: when truthy, randomise the board parameters (number of
            rigid blocks, wooden blocks and items) before returning.

    Returns:
        The freshly constructed environment.
    """
    # since this is also used for evaluation I see no need to train for different values atm
    env = PommerEnvWrapperFrameSkip2(num_stack=5, start_pos=0, opponent_actor=oponent)

    if not random_env:
        return env

    # Draw order (rigid, wood, items) is kept so seeded runs stay reproducible.
    rigid_count = 2 * np.random.randint(low=1, high=6)
    wood_count = 2 * np.random.randint(low=3, high=7)
    item_count = np.random.randint(low=1, high=wood_count)
    env.set_board_params(
        num_rigid=rigid_count, num_wood=wood_count, num_items=item_count
    )

    return env

In [9]:
# Smoke test: build a non-randomised environment (None is passed as the opponent) and reset it once.
env = get_random_environment(None, False)

env.reset()

# Bare expression at the end of the cell, so the notebook echoes env.board.
env.board

'GraphicOVOCompact-v0'

## Pre-Train our Agent


In [10]:
class Demonstrations(Dataset):
    """Dataset of recorded (state, action) demonstration pairs.

    ``actions`` and ``states`` are indexed with the same position; the
    dataset length is taken from the first dimension of ``actions``.
    """

    def __init__(self, actions, states):
        self.states = states
        self.actions = actions

    def __len__(self):
        n_samples = self.actions.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Order matters to consumers: state first, action second.
        return self.states[idx], self.actions[idx]

In [11]:
# Load the recorded demonstration actions and states from the working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only load trusted files.
loaded_actions = np.load("demo_actions.npy", allow_pickle=True)
loades_states = np.load("demo_states.npy", allow_pickle=True)

# Sanity check: print both shapes so mismatched leading dimensions are visible.
print(loaded_actions.shape)
print(loades_states.shape)

(2553,)
(2553, 5, 56, 48)


In [12]:
# Wrap the loaded arrays in the Dataset used by the pre-training loop.
demos = Demonstrations(loaded_actions, loades_states)

In [13]:
def train(net, demos, loss_func, optimizer, epoch):
    """Run one supervised pass over the demonstrations.

    Args:
        net: model to train; called with a numpy batch of frames.
        demos: dataset yielding ``(frame, action)`` pairs.
        loss_func: loss applied to ``(prediction, action)``.
        optimizer: optimizer stepping ``net``'s parameters.
        epoch: accepted for interface compatibility; not used.

    Returns:
        Mean of the per-batch loss values of this pass.

    Raises:
        ValueError: if ``demos`` is empty.

    Reads the module-level ``batchsize`` and ``device``.
    """
    del epoch  # not needed by the training pass itself

    if len(demos) == 0:
        raise ValueError("demos is empty; nothing to train on")

    net.train()

    loader = DataLoader(
        demos,
        batch_size=batchsize,
        num_workers=1,
        shuffle=True,
        drop_last=False,
        pin_memory=True,
    )

    batch_losses = []

    for frame, action in loader:
        # The frames are handed to the model as a numpy array, as before.
        frame = frame.numpy()
        action = action.to(device)

        prediction = net(frame)
        loss = loss_func(prediction, action)

        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)

In [14]:
# Settings for the supervised pre-training on demonstrations.
# `batchsize` is read as a global inside train().
learning_rate = 1e-4
weight_decay = 1e-5
batchsize = 32
n_epochs = 50

In [15]:
# agent = PommermanAgent(num_stack=5, num_actions=NUM_ACTIONS).to(device)
# loss_func = nn.CrossEntropyLoss().to(device)

# optimizer = optim.Adam(agent.parameters(), lr=learning_rate, weight_decay=weight_decay)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# losses = []

# with tqdm(range(n_epochs)) as pbar:
#     for i_ep in pbar:
#         running_loss = train(agent, demos, loss_func, optimizer, i_ep + 1)
#         losses.append(running_loss)
#         scheduler.step()

#         pbar.set_description(
#             f"Epoch: \t{i_ep}\t{running_loss}\t{datetime.now().strftime('%Y_%m_%d-%H_%M_%S')}"
#         )


# checkpoint_dict = {
#     "model_params": agent.state_dict(),
#     "timesteps": 0,
# }
# torch.save(
#     checkpoint_dict,
#     "./final_challenge/pre_trained_model.p",
# )

In [16]:
# plt.plot(losses)
# plt.show()

## Define our Training


In [17]:
class Runner:
    """Trains ``agent`` with DQN-style updates against a single opponent.

    Args:
        name: prefix of the checkpoint files written to ``./final_challenge``.
        agent: online network that is optimised and used to act.
        target_agent: network providing the bootstrap targets; it is
            soft-updated towards ``agent``.
        buffer: replay buffer offering ``add``, ``length`` and ``sample_batch``.
        optimizer: optimizer over ``agent``'s parameters.
        loss: loss applied to ``(predicted_q, target_q)``.
        eps_min, eps_max, eps_decay: epsilon decays linearly from ``eps_max``
            by ``timesteps / eps_decay`` and is floored at ``eps_min``.
        batch_size: number of transitions sampled per learning step.
        update_every: the target network is soft-updated whenever the step
            counter is a multiple of this value.
        gamma: discount factor.
        tau: interpolation weight of the soft update.
        oponent, random_env: forwarded to ``get_random_environment``.
    """

    def __init__(
        self,
        name,
        agent,
        target_agent,
        buffer,
        optimizer,
        loss,
        eps_min,
        eps_max,
        eps_decay,
        batch_size,
        update_every,
        gamma,
        tau,
        oponent,
        random_env,
    ):
        self.name = name
        self.agent = agent
        self.target_agent = target_agent
        self.buffer = buffer
        self.optimizer = optimizer
        self.loss = loss
        self.eps_min = eps_min
        self.eps_max = eps_max
        self.eps_decay = eps_decay
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau

        self.oponent = oponent
        self.random_env = random_env

        # Initialised empty; run() returns its own lists and does not fill these.
        self.returns = []
        self.losses = []

    def soft_update(self):
        """Move the target network towards the online network.

        target <- tau * agent + (1 - tau) * target, parameter by parameter.
        """
        for agent_param, target_agent_param in zip(
            self.agent.parameters(), self.target_agent.parameters()
        ):
            # Use the instance's tau, not whatever global happens to exist.
            target_agent_param.data.copy_(
                self.tau * agent_param.data
                + (1 - self.tau) * target_agent_param.data
            )

    def _select_action(self, agent_obs, timesteps):
        """Pick an epsilon-greedy action for ``agent_obs``."""
        eps = max(self.eps_min, self.eps_max - timesteps / self.eps_decay)

        if np.random.choice([0, 1], p=[1 - eps, eps]) == 1:
            # NOTE(review): exploration never samples action 0 (low=1) — confirm this is intended.
            return np.random.randint(low=1, high=NUM_ACTIONS, size=1)[0]

        with torch.no_grad():
            self.agent.eval()
            net_out = self.agent(agent_obs).detach().cpu().numpy()
        return np.argmax(net_out)

    def _learn_step(self):
        """Sample one batch, take one optimizer step and return the loss value."""
        self.agent.train()
        self.optimizer.zero_grad()

        states, actions, rewards, next_states, dones = self.buffer.sample_batch(
            self.batch_size
        )

        with torch.no_grad():
            next_state_preds = self.target_agent(next_states)

        # Bootstrapped target; terminal transitions contribute the reward only.
        q_values = rewards + self.gamma * (
            (1 - dones) * torch.max(next_state_preds, dim=1)[0]
        )

        state_preds = self.agent(states)
        # Q-value of the action actually taken in each sampled transition.
        predictions = state_preds.gather(
            1, actions.to(torch.int64).unsqueeze(1)
        ).squeeze(1)

        loss = self.loss(predictions, q_values)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def run(self, timesteps, n_epochs):
        """Play ``n_epochs`` episodes, learning after every step once the buffer is warm.

        Args:
            timesteps: starting value of the global step counter (drives epsilon).
            n_epochs: number of episodes to play.

        Returns:
            ``(returns, losses, agent, timesteps)`` — one return per episode,
            one loss per learning step, the trained agent and the final step
            counter.
        """
        returns = []
        losses = []

        # Checkpoints are written below; make sure the target directory exists.
        os.makedirs("./final_challenge", exist_ok=True)

        env = get_random_environment(self.oponent, self.random_env)
        agent_obs, _ = env.reset()
        agent_obs = np.array(agent_obs).astype(np.uint8)

        ws = 0
        ls = 0
        ds = 0

        with tqdm(range(n_epochs)) as pbar:
            for i in pbar:
                ret = 0
                done = False

                while not done:
                    a = self._select_action(agent_obs, timesteps)

                    agent_step, _ = env.step(a)
                    next_agent_obs, r, done, _ = agent_step
                    next_agent_obs = np.array(next_agent_obs).astype(np.uint8)

                    if done:
                        if r == 1:
                            ws = ws + 1
                        elif r == -1:
                            ls = ls + 1
                        else:
                            ds = ds + 1

                    r = r * 20  # winning or loosing is now +20 / -20

                    if r == 0:  # to encourage agent to win fast
                        r = r - 0.03

                    ret = ret + r

                    self.buffer.add(agent_obs, a, r, next_agent_obs, done, i)
                    agent_obs = next_agent_obs

                    timesteps = timesteps + 1

                    if (
                        self.buffer.length() > self.batch_size
                        and self.buffer.length() > self.update_every
                    ):
                        losses.append(self._learn_step())

                        if timesteps % self.update_every == 0:
                            self.soft_update()

                    if done:
                        if self.random_env:
                            env = get_random_environment(self.oponent, self.random_env)
                        agent_obs, _ = env.reset()
                        # Same conversion as for the very first observation.
                        agent_obs = np.array(agent_obs).astype(np.uint8)

                        pbar.set_description(
                            f"Episode: \t{i}\t{ret}\tW/L/D:{ws}/{ls}/{ds}\t{datetime.now().strftime('%Y_%m_%d-%H_%M_%S')}"
                        )

                # One entry per finished episode, not one per environment step.
                returns.append(ret)

                if i % 200 == 0:
                    checkpoint_dict = {
                        "model_params": self.agent.state_dict(),
                        "timesteps": timesteps,
                    }
                    torch.save(
                        checkpoint_dict,
                        f"./final_challenge/{self.name}-{datetime.now().strftime('%Y_%m_%d-%H_%M_%S')}.p",
                    )

        return returns, losses, self.agent, timesteps

In [18]:
@dataclass
class Curriculm:
    """One training stage consumed by CurriculumRunner."""

    # Label printed while running and handed to Runner as the checkpoint name.
    name: str
    # Opponent handed to Runner (which forwards it to get_random_environment).
    oponent: Any
    # Number of episodes passed to Runner.run.
    n_epochs: int
    # Forwarded to Runner; selects randomised board parameters.
    random_env: bool
    # When True the stage is repeated until the mean return reaches `threshold`.
    run_till_above: bool
    # Mean return required to leave the stage when `run_till_above` is set.
    threshold: int = 105


class CurriculumRunner:
    """Runs a list of curriculum stages one after another on the same agent.

    For every stage a fresh replay buffer, optimizer and ``Runner`` are
    created. A stage with ``run_till_above`` set is repeated until its mean
    return reaches the stage's ``threshold``.

    Args:
        curriculums: iterable of ``Curriculm`` stages.
        agent: network to train; the target network starts as a copy of it.
        learning_rate: Adam learning rate.
        buffer_size: replay buffer capacity.
        eps_min, eps_max, eps_decay, batch_size, update_every, gamma, tau:
            forwarded unchanged to ``Runner``.
    """

    def __init__(
        self,
        curriculums,
        agent,
        learning_rate,
        buffer_size,
        eps_min,
        eps_max,
        eps_decay,
        batch_size,
        update_every,
        gamma,
        tau,
    ):
        self.curriculums = curriculums

        self.agent = agent
        # Copy the agent instead of re-building a hard-coded architecture, so
        # the target network always matches the agent that was passed in.
        self.target_agent = deepcopy(self.agent)
        self.target_agent.load_state_dict(self.agent.state_dict())

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate

        self.eps_min = eps_min
        self.eps_max = eps_max
        self.eps_decay = eps_decay
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau

        self.returns = []
        self.losses = []

        self.buffer = None

    @staticmethod
    def _mean(values):
        """Arithmetic mean, or NaN for an empty sequence."""
        return sum(values) / len(values) if values else float("nan")

    def run(self):
        """Train through all stages.

        Returns:
            ``(returns, losses)`` — the mean return and mean loss of every
            stage attempt, in execution order.
        """
        for curriculum in self.curriculums:
            while True:
                print(f"Running: {curriculum.name}")
                self.buffer = ReplayBuffer(
                    num_actions=NUM_ACTIONS, size=self.buffer_size
                )
                optimizer = optim.Adam(self.agent.parameters(), lr=self.learning_rate)

                runner = Runner(
                    name=curriculum.name,
                    agent=self.agent,
                    target_agent=self.target_agent,
                    buffer=self.buffer,
                    optimizer=optimizer,
                    loss=torch.nn.MSELoss(),
                    eps_min=self.eps_min,
                    eps_max=self.eps_max,
                    eps_decay=self.eps_decay,
                    batch_size=self.batch_size,
                    update_every=self.update_every,
                    gamma=self.gamma,
                    tau=self.tau,
                    oponent=curriculum.oponent,
                    random_env=curriculum.random_env,
                )

                # NOTE(review): every attempt restarts the step counter at 0,
                # so epsilon starts again from eps_max — confirm this is intended.
                rs, ls, agent, timesteps = runner.run(
                    timesteps=0, n_epochs=curriculum.n_epochs
                )

                self.agent = agent
                self.target_agent.load_state_dict(self.agent.state_dict())

                # No learning step may have happened (buffer never warm), so
                # either list can be empty.
                mean_returns = self._mean(rs)
                mean_losses = self._mean(ls)

                self.returns.append(mean_returns)
                self.losses.append(mean_losses)

                plt.plot(rs, label="returns")
                plt.plot(ls, label="mean_losses", alpha=0.2)
                plt.legend()
                plt.show()

                print(f"action distribution -> {self.buffer.get_action_distribution()}")

                if not curriculum.run_till_above or not rs:
                    # Either no threshold to reach, or no episode was played,
                    # in which case retrying could never succeed.
                    break
                elif mean_returns >= curriculum.threshold:
                    print(f"Learned section with: {mean_returns} in {timesteps}")
                    break
                else:
                    print(f"Retry with: {mean_returns} in {timesteps}")

        return self.returns, self.losses

## Define Hyperparameters and initialize everything

In [19]:
# Curriculum stages handed to CurriculumRunner, in execution order.
# Only the "new_f_simple_bomb" stage is active; the other entries are kept
# commented out for reference.
cs = [
    # Curriculm(
    #     name="new_static_oponent",
    #     oponent=static_agent,
    #     n_epochs=2_000,
    #     random_env=False,
    #     run_till_above=False,
    #     threshold=-5,
    # ),
    # Curriculm(
    #     name="new_rand_no_bomb",
    #     oponent=rand_no_bomb_agent,
    #     n_epochs=2_000,
    #     random_env=False,
    #     run_till_above=False,
    #     threshold=-6,
    # ),
    # Curriculm(
    #     name="new_f_rand_bomb",
    #     oponent=rand_agent,
    #     n_epochs=1_000,
    #     random_env=False,
    #     run_till_above=True,
    #     threshold=10,
    # ),
    # Active stage: oponent=None is forwarded as-is to the environment wrapper;
    # the stage repeats until the mean return reaches 10.
    Curriculm(
        name="new_f_simple_bomb",
        oponent=None,
        n_epochs=1_000,
        random_env=False,
        run_till_above=True,
        threshold=10,
    ),
    # Curriculm(
    #     name="static_oponent_random",
    #     oponent=static_agent,
    #     n_epochs=10_000,
    #     random_env=True
    # ),
    # Curriculm(
    #     name="rand_no_bomb_random",
    #     oponent=rand_no_bomb_agent,
    #     n_epochs=10_000,
    #     random_env=True,
    # ),
    # Curriculm(
    #     name="rand_bomb_random",
    #     oponent=rand_agent,
    #     n_epochs=10_000,
    #     random_env=True),
]

# Resume from a saved checkpoint: build a fresh network, then load the stored weights.
agent = PommermanAgent(num_stack=5, num_actions=NUM_ACTIONS).to(device)
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
loaded = torch.load("final_challenge/new_f_simple_bomb-2022_06_20-23_20_45.p", map_location=device)
# NOTE(review): the stored step counter is read here, but CurriculumRunner.run
# starts every Runner with timesteps=0, so this value is not used for training.
timesteps = loaded["timesteps"]
agent.load_state_dict(loaded["model_params"])


# Settings for the reinforcement-learning phase.
# `learning_rate` rebinds the name set earlier for pre-training.
learning_rate = 1e-3
buffer_size = 3_500

# Epsilon-greedy schedule: starts at eps_max, decreases by timesteps / eps_decay, floored at eps_min.
eps_min = 0.0
eps_max = 0.8
eps_decay = 10_000
batch_size = 256
update_every = 4
gamma = 0.99
tau = 1e-3

In [20]:
# Wire the curriculum list, the loaded agent and the hyperparameters defined above together.
curriculum_runner = CurriculumRunner(
    curriculums=cs,
    agent=agent,
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    eps_min=eps_min,
    eps_max=eps_max,
    eps_decay=eps_decay,
    batch_size=batch_size,
    update_every=update_every,
    gamma=gamma,
    tau=tau,
)

In [21]:
# (5, 56, 48) ERROR
# Start training; returns the per-attempt mean returns and mean losses.
# The recorded output below shows this run was interrupted manually.
returns, losses = curriculum_runner.run()

Running: new_f_simple_bomb


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
# Build a default environment and take one initial observation as the example input for the export.
env = get_random_environment(None, False)
obs, _ = env.reset()

# agent.eval()

# Fresh network; its weights are replaced by the checkpoint loaded below.
agent = PommermanAgent(num_stack=5, num_actions=NUM_ACTIONS).to(device)

# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
loaded = torch.load("final_challenge/new_f_simple_bomb-2022_06_20-19_53_22.p", map_location=device)
timesteps = loaded["timesteps"]
agent.load_state_dict(loaded["model_params"])



onnx_path = "./submission_loaded_old.onnx"
state_for_onnx = np.array(obs, dtype=np.float32)
# NOTE(review): the export traces forward() with this example input; any NumPy
# round-trip inside forward would be recorded as a constant, so confirm that the
# exported model's output really depends on its input.
torch.onnx.export(agent,
                  torch.from_numpy(state_for_onnx).float(), # example model input
                  onnx_path, # file path
                  export_params=True, # save trained parameters
                  opset_version=10,
                  do_constant_folding=True)



### plt.plot(curriculum_runner.losses)
# With the plot call above commented out, nothing is drawn in this cell before show().
plt.show()