# Imports/tools

In [147]:
!./containers_run.sh

Starting container 1
74945919425dce3484f01a5129892be1b5e3628bd4d9d62835f1f359c3d774bb


In [4]:
import gym
import ptan
import time
import copy
import numpy as np
import universe
import collections
from typing import List, Optional, Tuple
from universe import vectorized
from universe.wrappers.experimental import SoftmaxClickMouse

from PIL import Image, ImageDraw
import matplotlib.pylab as plt

%matplotlib inline

  spec = yaml.load(f)


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
# Docker image name handed to env.configure(docker_image=...) in make_env
DOCKER_IMAGE = "shmuma/miniwob:latest"
# Environment id handed to gym.make() in make_env
ENV_NAME = "wob.mini.ClickDialog-v0"

In [7]:
def remotes_url(port_ofs=0, hostname='localhost', count=8):
    """
    Builds the connection endpoints URL for a set of containers.

    Every container offset ``ofs`` in ``[port_ofs, port_ofs + count)`` contributes
    one ``<hostname>:<5900 + ofs>+<15900 + ofs>`` entry; the entries are joined
    with commas and prefixed with ``vnc://``.

    You should tweak the arguments if you're not using standalone installation.

    :param port_ofs: offset of the first container, added to both base ports
    :param hostname: host the containers are reachable on
    :param count: number of consecutive containers to include
    :return: URL string, ``"vnc://"`` alone when ``count`` is zero
    """
    def endpoint(ofs):
        # both ports of one container are shifted by the same offset
        return "{}:{}+{}".format(hostname, 5900 + ofs, 15900 + ofs)

    return "vnc://" + ",".join(map(endpoint, range(port_ofs, port_ofs + count)))

In [8]:
def make_env(wrapper_func = lambda env: env, count: int = 1, fps: float = 5) -> universe.envs.VNCEnv:
    """
    Builds the vectorized env.

    The environment ENV_NAME is created, passed through ``wrapper_func`` and then
    configured to connect to ``count`` containers (see remotes_url) running
    DOCKER_IMAGE.

    :param wrapper_func: callable applied to the freshly created env before it
        is configured; the default leaves the env untouched
    :param count: number of remotes to connect to
    :param fps: frames-per-second value forwarded to ``configure``
    :return: the wrapped and configured environment
    """
    # VNC connection settings forwarded to configure() unchanged
    vnc_options = {
        'encoding': 'tight', 'compress_level': 0,
        'fine_quality_level': 100, 'subsample_level': 0,
    }

    wrapped = wrapper_func(gym.make(ENV_NAME))
    remotes = remotes_url(count=count)
    print("Remotes URL: %s" % remotes)

    wrapped.configure(remotes=remotes, docker_image=DOCKER_IMAGE, fps=fps, vnc_kwargs=vnc_options)
    return wrapped

In [9]:
def join_env(env: universe.envs.VNCEnv):
    """
    Resets the env and keeps stepping it until every observation is available.

    While at least one observation in the vector is None, a randomly sampled
    action is sent for every slot and the observations are refreshed from the
    step result.

    :param env: vectorized environment to reset
    :return: list of observations, none of which is None
    """
    obs_n = env.reset()
    while not all(obs is not None for obs in obs_n):
        random_actions = [env.action_space.sample() for _ in obs_n]
        # rewards, done flags and infos of the warm-up steps are discarded
        obs_n, _, _, _ = env.step(random_actions)
    return obs_n

In [10]:
class MiniWoBCropper(vectorized.ObservationWrapper):
    """
    Cuts the WoB area out of every observation and reorders the image axes
    into PyTorch (C, H, W) format.

    With ``keep_text=True`` each observation becomes an ``(image, text)`` tuple,
    where text is the space-joined 'instruction' entries of the observation's
    'text' list.
    """
    # Area of interest
    WIDTH = 160
    HEIGHT = 210
    X_OFS = 10
    Y_OFS = 75

    def __init__(self, env, keep_text=False):
        """
        :param env: vectorized environment to wrap
        :param keep_text: if True, the instruction text is returned next to the image
        """
        super().__init__(env)
        self.keep_text = keep_text
        img_space = gym.spaces.Box(low=0, high=255, shape=(3, self.HEIGHT, self.WIDTH))
        self.observation_space = (
            gym.spaces.Tuple(spaces=(img_space, gym.spaces.Space)) if keep_text else img_space
        )

    def _observation(self, observation_n):
        """
        Converts a list of raw observations; None entries are passed through as is.
        """
        rows = slice(self.Y_OFS, self.Y_OFS + self.HEIGHT)
        cols = slice(self.X_OFS, self.X_OFS + self.WIDTH)

        converted = []
        for obs in observation_n:
            if obs is not None:
                # crop the area of interest, then move the channel axis to the front
                img = np.transpose(obs['vision'][rows, cols, :], (2, 0, 1))
                if self.keep_text:
                    text = " ".join(entry.get('instruction', '') for entry in obs.get('text', [{}]))
                    obs = (img, text)
                else:
                    obs = img
            converted.append(obs)
        return converted

In [11]:
class Model(nn.Module):
    """
    Convolutional network producing one output value per action.

    Two convolution + ReLU stages (``self.conv``) are followed by a single
    linear layer (``self.fc``) applied to the flattened convolution output.
    """
    def __init__(self, input_shape, n_actions):
        """
        :param input_shape: shape of one input sample, channels first
        :param n_actions: size of the output vector
        """
        super().__init__()

        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=5, stride=5),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2),
            nn.ReLU(),
        )

        # the linear layer's input size depends on the image size, so it is measured
        self.fc = nn.Sequential(
            nn.Linear(self._get_conv_out(input_shape), n_actions),
        )

    def _get_conv_out(self, shape):
        """
        Returns the number of values the conv part produces for one sample of given shape.
        """
        dummy = torch.zeros(1, *shape)
        return self.conv(dummy).numel()

    def forward(self, x):
        """
        Scales the input by 1/256, applies the conv part and the linear layer.

        :param x: batch of inputs, first dimension is the batch
        :return: tensor of shape (batch, n_actions)
        """
        scaled = x.float() / 256
        batch_size = scaled.size()[0]
        features = self.conv(scaled)
        return self.fc(features.view(batch_size, -1))

In [12]:
@torch.no_grad()
def unpack_batch(batch: List[ptan.experience.ExperienceFirstLast], net: nn.Module, gamma: float, device="cpu"):
    """
    Converts a batch of transitions into tensors and computes bootstrapped targets.

    For every transition the target is ``reward + gamma * max_a net(last_state)[a]``;
    for transitions whose ``last_state`` is None (episode ended) the bootstrapped
    part is zero, so the target is just the reward.

    :param batch: transitions exposing ``state``, ``action``, ``reward`` and
        ``last_state`` (None when the episode ended)
    :param net: network used to evaluate the last states
    :param gamma: discount factor applied to the value of the last state
    :param device: device the tensors are moved to
    :return: tuple ``(states_v, actions_v, targets_v)``
    """
    states = []
    actions = []
    rewards = []
    done_masks = []
    last_states = []
    for exp in batch:
        is_done = exp.last_state is None
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        done_masks.append(is_done)
        # finished episodes have no last state; the first state is used as a
        # placeholder of the right shape, its value gets masked out below
        last_states.append(exp.state if is_done else exp.last_state)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    last_states_v = torch.tensor(last_states).to(device)
    # explicit boolean mask instead of indexing with a plain Python list
    done_mask_v = torch.tensor(done_masks, dtype=torch.bool).to(device)

    best_last_q_v = torch.max(net(last_states_v), dim=1)[0]
    best_last_q_v[done_mask_v] = 0.0
    # the future value has to be discounted, otherwise gamma is silently ignored
    return states_v, actions_v, best_last_q_v * gamma + rewards_v

In [None]:
class TrajectoryTracking(vectorized.Wrapper):
    """
    Keeps last N trajectories from the environment.

    One trajectory is recorded per sub-environment. It starts with the
    observation returned by reset() (when one is available) and continues with
    one ``(observation, action, reward)`` tuple per step. A trajectory is moved
    into ``self.trajectories`` when its episode is done or when the env is reset.
    """
    def __init__(self, env, size: int):
        """
        :param env: vectorized environment to wrap
        :param size: maximum number of finished trajectories to keep
        """
        super(TrajectoryTracking, self).__init__(env)
        # finished trajectories, the oldest ones are dropped once `size` is exceeded
        self.trajectories = collections.deque(maxlen=size)
        # one list of not-yet-finished steps per sub-environment, created on first reset
        self._in_progress = []

    def reset(self):
        """
        Resets the env, flushing every non-empty unfinished trajectory first.
        """
        obs_n = self.env.reset()
        if not self._in_progress:
            self._in_progress = [[] for _ in obs_n]
        for t, obs in zip(self._in_progress, obs_n):
            if t:
                self.trajectories.append(list(t))
                t.clear()
            # compare with None explicitly: truth-testing a multi-element
            # ndarray observation raises ValueError
            if obs is not None:
                t.append(obs)
        return obs_n

    def step(self, action_n):
        """
        Steps the env, records the transition of every sub-environment and
        stores the trajectories of the episodes which have just finished.
        """
        obs_n, r_n, done_n, info_n = self.env.step(action_n)
        for t, obs, r, act, done in zip(self._in_progress, obs_n, r_n, action_n, done_n):
            t.append((obs, act, r))
            if done:
                self.trajectories.append(list(t))
                t.clear()
        return obs_n, r_n, done_n, info_n

In [None]:
class MiniWoBTrackClicks(vectorized.Wrapper):
    """
    Track the clicks.

    Every click made during the episode is painted as a filled cell into
    ``self.clicks_buf``, and that buffer is appended to the observation as one
    extra channel. Only the first sub-environment is handled.
    """
    # The action index is decoded as column = action // GRID_SIZE,
    # row = action % GRID_SIZE, every cell being CELL_SIZE x CELL_SIZE pixels
    GRID_SIZE = 16
    CELL_SIZE = 10
    # the click region is offset down by 50 pixels, i.e. 5 cells
    ROW_OFS = 5

    def __init__(self, env):
        """
        :param env: vectorized environment with a (C, H, W) observation space
        """
        super(MiniWoBTrackClicks, self).__init__(env)
        s = env.observation_space.shape
        self.clicks_buf = np.zeros(s[1:], dtype=np.uint8)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(s[0]+1, s[1], s[2]))

    def _with_clicks(self, obs):
        # append the click buffer as one more channel (vstack copies the data)
        return np.vstack((obs, [self.clicks_buf]))

    def reset(self):
        """
        Clears the click buffer and resets the env.

        A ready observation gets the (empty) click channel appended, so that
        reset() and step() both return what observation_space declares.
        """
        self.clicks_buf[:] = 0
        obs_n = self.env.reset()
        if obs_n[0] is not None:
            obs_n[0] = self._with_clicks(obs_n[0])
        return obs_n

    def step(self, action_n):
        """
        Steps the env, records the click and extends the observation with the
        click channel. Nothing is recorded while the observation is None.
        """
        # TODO: implement me properly for N environments :)
        obs_n, r_n, done_n, info_n = self.env.step(action_n)
        if obs_n[0] is None:
            return obs_n, r_n, done_n, info_n

        # track the click
        cell = self.CELL_SIZE
        x = action_n[0] // self.GRID_SIZE
        y = action_n[0] % self.GRID_SIZE + self.ROW_OFS
        self.clicks_buf[y*cell:(y+1)*cell, x*cell:(x+1)*cell] = 255

        obs_n[0] = self._with_clicks(obs_n[0])
        if done_n[0]:
            # the next episode starts with no clicks recorded
            self.clicks_buf[:] = 0
        return obs_n, r_n, done_n, info_n

In [None]:
class MiniWoBSlowpoke(vectorized.Wrapper):
    """
    Limits how often we can click.

    Within an episode, a step is delayed until at least ``click_delay`` seconds
    have passed since the previous step returned. The first step after a reset
    or after a finished episode is not delayed. Only the first sub-environment's
    done flag is looked at.
    """
    def __init__(self, env, click_delay: float = 1):
        """
        :param env: vectorized environment to wrap
        :param click_delay: minimum number of seconds between two steps
        """
        super(MiniWoBSlowpoke, self).__init__(env)
        self.click_delay = click_delay
        # monotonic timestamp of the previous step, None when there is nothing to wait for
        self.click_ts = None

    def reset(self):
        """
        Forgets the previous click time and resets the env.
        """
        self.click_ts = None
        return self.env.reset()

    def step(self, action_n):
        """
        Sleeps for the remaining part of the delay (if any), then steps the env.
        """
        # TODO: implement me properly for N environments :)
        if self.click_ts is not None:
            # monotonic clock: the delay must not be affected by system clock adjustments
            dt = self.click_delay - (time.monotonic() - self.click_ts)
            if dt > 0:
                time.sleep(dt)
        obs_n, r_n, done_n, info_n = self.env.step(action_n)
        self.click_ts = None if done_n[0] else time.monotonic()
        return obs_n, r_n, done_n, info_n

# Possible directions to explore

* Try a different environment
* Better exploration, for example NoisyNetworks ([code can be found here](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter07/lib/dqn_model.py#L9))
* Try DQN tweaks, like Dueling, N-steps, Prioritized replay buffers, [examples are here](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter07)
* Tweak params
* Incorporate human demonstrations into the learning process (there is no time to describe it here, but it is covered in chapter 13 of my book; [code is here](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter13))
* Your favorite method :)