<a href="https://colab.research.google.com/github/tb0se/RL-MiniHack-Project/blob/Dev/src/reinforce/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# SETUP NLE (GOOGLE COLAB)
# System-level setup cell: every line below is a shell command run through IPython's `!` escape.

# Python and most build deps
!sudo apt update
!sudo apt-get install -y build-essential autoconf libtool pkg-config python3-dev python3-pip python3-numpy git flex bison libbz2-dev

# Recent cmake version, taken from the Kitware apt repository (key added first, then the repo).
!wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
!sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
# NOTE(review): only `apt-get update` runs under sudo here; the install after `&&` runs as the current user.
!sudo apt-get update && apt-get --allow-unauthenticated install -y cmake kitware-archive-keyring

# Delete the cmake binary that is currently first on PATH, then print the version of
# whichever cmake `which` resolves to afterwards.
# Presumably this drops a pre-installed cmake so the Kitware one is picked up -- verify on the target image.
!sudo rm $(which cmake)
!$(which cmake) --version

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.38)] [Co[0m                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
[33m0% [2 InRelease 27.2 kB/88.7 kB 31%] [Connecting to security.ubuntu.com (91.189[0m[33m0% [1 InRelease gpgv 242 kB] [2 InRelease 30.1 kB/88.7 kB 34%] [Connecting to s[0m                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
[33m0% [1 InRelease gpgv 242 kB] [2 InRelease 75.0 kB/88.7 kB 85%] [Connecting to s[0m[33m0% [1 InRelease gpgv 242 kB] [2 InRelease 79.3 kB/88.7 kB 89%] [Connecting to s[0m[33m0% [1 InRelease gpgv 242 kB] [Waiting for headers] [Connecting to security.ubun[0m                                                   

In [3]:
# Install the `nle` and `minihack` packages with pip; no versions are pinned.
!pip3 install nle
!pip3 install minihack

Collecting nle
  Downloading nle-0.7.3.tar.gz (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 10.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: nle
  Building wheel for nle (PEP 517) ... [?25l[?25hdone
  Created wheel for nle: filename=nle-0.7.3-cp37-cp37m-linux_x86_64.whl size=2874647 sha256=2f51d4e3b6f0ccb1b9e44b0b04102eee6cfbaa24b9f469940f7053bf906514c0
  Stored in directory: /root/.cache/pip/wheels/25/9a/cc/5df4c522352fb289d67b08b16cbb28b2131d982798343e681f
Successfully built nle
Installing collected packages: pybind11, nle
Successfully installed nle-0.7.3 pybind11-2.8.1
Collecting minihack
  Downloading minihack-0.1.1-py3-none-any.whl (259 kB)
[K     |████████████

In [4]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from gym import ObservationWrapper
from gym.spaces import Box
from nle import nethack
import minihack

In [5]:
# Torch device shared by the cells below: CUDA when torch reports it as available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression so the cell output shows whether CUDA was detected.
torch.cuda.is_available()

True

In [72]:
# Gym wrapper
class GrayScaleObservation(ObservationWrapper):
    r"""Convert the ``'pixel_crop'`` image of each observation from RGB to gray scale.

    The wrapped environment must expose a dict-like observation space whose
    ``'pixel_crop'`` entry has shape ``(height, width, 3)``.

    :param env: Environment to wrap.
    :param keep_dim: When True the converted image keeps a trailing channel
        axis of size 1, i.e. ``(height, width, 1)``; otherwise it is
        ``(height, width)``.
    """

    def __init__(self, env, keep_dim=False):
        import copy

        super(GrayScaleObservation, self).__init__(env)
        self.keep_dim = keep_dim

        pixel_space = env.observation_space['pixel_crop']
        assert (
            len(pixel_space.shape) == 3
            and pixel_space.shape[-1] == 3
        )

        height, width = pixel_space.shape[:2]
        gray_shape = (height, width, 1) if self.keep_dim else (height, width)

        # Work on a private copy of the space: the wrapped env still emits RGB
        # frames, so the space object it owns must keep describing them.
        self.observation_space = copy.deepcopy(env.observation_space)
        self.observation_space['pixel_crop'] = Box(
            low=0, high=255, shape=gray_shape, dtype=np.uint8
        )

    def _to_gray(self, image):
        """Return ``image`` (RGB array) as gray scale, honouring ``keep_dim``."""
        import cv2

        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        if self.keep_dim:
            gray = np.expand_dims(gray, -1)
        return gray

    def observation(self, observation):
        """Convert the image part of ``observation`` to gray scale.

        :param observation: Either a dict observation containing a
            ``'pixel_crop'`` RGB array, or a bare RGB array.
        :return: For a dict, a shallow copy whose ``'pixel_crop'`` entry is the
            gray-scale image (all other entries are passed through unchanged);
            for a bare array, the gray-scale array.
        """
        if isinstance(observation, dict):
            # The declared space is keyed by 'pixel_crop', so only that entry
            # is an image; handing the whole dict to cv2 would fail.
            converted = dict(observation)
            converted['pixel_crop'] = self._to_gray(observation['pixel_crop'])
            return converted
        return self._to_gray(observation)

# Simple Neural Network (Policy)

In [12]:
class PolicyNetwork(nn.Module):
    """Small convolutional policy network that outputs action probabilities.

    Layout: two (conv -> ReLU -> 2x2 max-pool) stages, then a hidden linear
    layer with ReLU and dropout, then a linear layer followed by softmax.

    The first linear layer is sized for ``10 * 34 * 34`` features, which is
    what the convolutional stack produces for a ``3 x 144 x 144`` input
    (144 -> 142 -> 71 -> 68 -> 34 per side).
    """

    # Channels * height * width produced by the conv stack for a 144x144 input.
    _FLAT_FEATURES = 10 * 34 * 34

    def __init__(self, obs_space=4, hidden=16, act_space=2):
        ''' Build the layers.
        :params obs_space: Observation space (Default=4). Accepted for
            interface compatibility; the layer sizes do not depend on it.
        :params hidden: Hidden layer size (Default=16)
        :params act_space: Number of actions, i.e. output size (Default=2)
        '''
        super(PolicyNetwork, self).__init__()

        # Convolutional and pooling layers
        self.conv1 = nn.Conv2d(3, 20, 3)
        self.conv2 = nn.Conv2d(20, 10, 4)
        self.pool = nn.MaxPool2d(2, 2)

        # Linear layers
        self.linear1 = nn.Linear(self._FLAT_FEATURES, hidden)
        self.linear2 = nn.Linear(hidden, act_space)

        # Dropout
        self.dropout = nn.Dropout(p=0.6)

    def forward(self, x):
        """Map a batch of images to per-action probabilities.

        :param x: Float tensor of shape ``(batch, 3, height, width)``.
        :return: Tensor of shape ``(batch, act_space)`` whose rows sum to 1.
        :raises RuntimeError: If the image size does not yield the number of
            features ``linear1`` was built for.
        """
        # 2 Conv, relu and pooling layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Flatten each sample separately. A `view(-1, n)` would silently fold
        # several wrongly-sized samples into one row instead of failing.
        x = x.flatten(start_dim=1)

        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        actions = self.linear2(x)

        act_probs = F.softmax(actions, dim=1)

        return act_probs

# REINFORCE Agent

In [7]:
class Agent():
    """REINFORCE (Monte-Carlo policy gradient) agent.

    Collects one trajectory through ``save_trajectory`` and performs a single
    policy-gradient update on it in ``train``.

    :param obs_size: Stored as ``self.obs_size``; not otherwise used here.
    :param action_size: Stored as ``self.action_size``; not otherwise used here.
    :param policy_model: Module mapping a ``(1, C, H, W)`` float tensor to a
        ``(1, n_actions)`` tensor of action probabilities.
    :param optimizer: Optimizer over ``policy_model``'s parameters.
    :param gamma: Discount factor applied to future rewards.
    """

    def __init__(self,obs_size, action_size, policy_model, optimizer, gamma):
        # Small constant keeping the return normalisation away from a zero division.
        self.eps = np.finfo(np.float32).eps.item()
        self.obs_size = obs_size
        self.action_size = action_size
        self.policy_model = policy_model
        self.optim = optimizer
        self.gamma = gamma

        self.states = []
        self.actions = []
        self.gradients = []
        self.rewards = []
        self.probs = []

    def _device(self):
        """Return the device of the policy's parameters (CPU if it has none)."""
        first_param = next(self.policy_model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def __compute_returns(self,rewards):
        """Turn per-step rewards into discounted, normalised returns.

        :param rewards: Sequence of rewards in time order.
        :return: 1-D float32 tensor with one return per step. With two or more
            steps the returns are standardised (zero mean, unit std); shorter
            sequences are returned unnormalised.
        """
        returns = []
        cumul_rets = 0.0

        # Walk backwards so each return reuses the one after it, then restore
        # time order once (cheaper than inserting at the front every step).
        for reward in reversed(rewards):
            cumul_rets = reward + cumul_rets*self.gamma
            returns.append(cumul_rets)
        returns.reverse()

        # Explicit float dtype: integer rewards would otherwise give a Long
        # tensor, for which mean()/std() are undefined.
        returns = torch.tensor(returns, dtype=torch.float32, device=self._device())

        # Baseline. The sample std of a single value is NaN, which would turn
        # the loss and then the weights into NaN, so skip it in that case.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + self.eps)
        return returns

    def choose_action(self,  state):
        """Sample an action for ``state`` from the current policy.

        :param state: NumPy array laid out ``(height, width, channels)``.
        :return: ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is the log-probability tensor of that action, still
            attached to the autograd graph.
        """
        state = torch.from_numpy(state).float().to(self._device())
        # Channels-last -> channels-first, then add the batch axis.
        batch_state = state.permute(2, 0, 1).unsqueeze(0)

        probs = self.policy_model(batch_state)

        dist = Categorical(probs)
        action = dist.sample()

        return action.item(), dist.log_prob(action)

    def save_trajectory(self, state, action, reward, prob):
        """Append one transition to the current trajectory.

        :param state: Observation the action was chosen from.
        :param action: Action that was taken.
        :param reward: Reward received for the action.
        :param prob: Log-probability tensor returned by ``choose_action``.
        """
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.probs.append(prob)

    def __reset_trajectory(self):
        """Drop every per-step buffer so the next episode starts empty."""
        self.states, self.actions, self.probs = [], [], []
        self.gradients, self.rewards = [], []

    def train(self,):
        """Run one REINFORCE update on the stored trajectory, then clear it.

        Does nothing except clearing the buffers when no step was stored.
        """
        if not self.probs:
            self.__reset_trajectory()
            return

        G = self.__compute_returns(self.rewards)

        # reshape(-1) lets torch.cat accept both 0-d and 1-element log-probs.
        policy_loss = [
            (-prob*ret).reshape(-1) for ret, prob in zip(G, self.probs)
        ]

        # Backpropagation
        self.optim.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optim.step()

        # Reset
        self.__reset_trajectory()

In [14]:
def main(env, agent, seed, num_episodes, max_episode_len):
    """Train ``agent`` on ``env`` with one update per episode.

    :param env: Environment with ``seed``, ``reset`` and ``step``; observations
        are dicts from which only ``'pixel_crop'`` is used.
    :param agent: Object providing ``choose_action``, ``save_trajectory`` and
        ``train``.
    :param seed: Seed applied to torch (CPU and CUDA), NumPy and ``env``.
    :param num_episodes: Number of episodes to run.
    :param max_episode_len: Upper bound on steps taken in one episode.
    :return: List with the summed reward of every episode, in order, so the
        caller can plot or analyse the learning curve.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    env.seed(seed)

    tot_scores = []

    for ep in range(num_episodes):

        state = env.reset()['pixel_crop']
        score = 0

        # Create episode
        for step in range(max_episode_len):
            act, prob = agent.choose_action(state)

            new_state, reward, done, _ = env.step(act)

            score += reward

            agent.save_trajectory(state, act, reward, prob)

            if done:
                break
            state = new_state['pixel_crop']

        tot_scores.append(score)

        # One policy update from the full episode just collected.
        agent.train()

        if ep % 100 == 0:
            print(f'Episode {ep+1}\tLast score: {score}\tAverage reward: {np.array(tot_scores).mean()}')

    return tot_scores
        

# Training

In [21]:
# Initialise
# Both 'pixel_crop' and 'glyphs' are requested from the environment; the code in this
# notebook only reads 'pixel_crop'.
env = gym.make("MiniHack-Quest-Hard-v0",observation_keys=("pixel_crop",'glyphs'))
print('Action space:',env.action_space.n)
print('Observation shape:',env.observation_space['pixel_crop'].shape)

# First dimension of the pixel_crop shape (144 in the recorded output below).
env_obs_space = env.observation_space['pixel_crop'].shape[0]
policy_model = PolicyNetwork(obs_space=env_obs_space, hidden=128, act_space=env.action_space.n)
print(policy_model,'\n')
# Move the policy to DEVICE before building the optimizer over its parameters.
policy_model.to(DEVICE)
optimizer = optim.Adam(policy_model.parameters(), lr=1e-2)

agent = Agent(env_obs_space, env.action_space.n, policy_model, optimizer, gamma=0.99)

# Train for 1500 episodes of at most 1000 steps each.
main(env, agent, seed=54, num_episodes=1500, max_episode_len=1000)

Action space: 78
Observation shape: (144, 144, 3)
PolicyNetwork(
  (conv1): Conv2d(3, 20, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(20, 10, kernel_size=(4, 4), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear1): Linear(in_features=11560, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=78, bias=True)
  (dropout): Dropout(p=0.6, inplace=False)
) 

Episode 0	Last score: -9.699999999999838	Average reward: -9.699999999999838




Episode 100	Last score: -0.08	Average reward: -1.8073267326732436
Episode 200	Last score: 0.0	Average reward: -0.9906467661691414
Episode 300	Last score: 0.0	Average reward: -0.67810631229235
Episode 400	Last score: -0.03	Average reward: -0.529127182044881
Episode 500	Last score: -0.01	Average reward: -0.4245508982035873
Episode 600	Last score: 0.0	Average reward: -0.3823294509151365
Episode 700	Last score: 0.0	Average reward: -0.33124108416547365
Episode 800	Last score: 0.0	Average reward: -0.3014606741572995
Episode 900	Last score: 0.0	Average reward: -0.28029966703662235
Episode 1000	Last score: 0.0	Average reward: -0.27261738261737906
Episode 1100	Last score: 0.0	Average reward: -0.2483651226158006
Episode 1200	Last score: 0.0	Average reward: -0.2328143213988313
Episode 1300	Last score: 0.0	Average reward: -0.21507302075326393
Episode 1400	Last score: 0.0	Average reward: -0.19980014275517233


# Plots

# Visualise environment

## 1. Install rendering libraries

In [None]:
# Install the rendering dependencies. stdout and stderr of both commands are sent to
# /dev/null, so a failed install produces no visible output here.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

## 2. Import rendering libraries

In [None]:
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import math
import glob
import io
import base64
import cv2
from IPython.display import HTML
from IPython import display as ipythondisplay

## 3. Create virtual display

In [None]:
from pyvirtualdisplay import Display
# Create and start a non-visible 1400x900 virtual display.
# Presumably needed so rendering works on a machine without a screen -- TODO confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

## 4. Render function

In [None]:
def show_video(video_dir="video"):
  """Embed one recorded ``.mp4`` from ``video_dir`` in the notebook output.

  The files matching ``<video_dir>/*.mp4`` are sorted by name and the first
  one is shown as an inline, base64-encoded HTML ``<video>`` element.
  Prints ``"Could not find video"`` when no file matches.

  :param video_dir: Directory searched for ``.mp4`` files (default ``"video"``).
  """
  # Sorted so the choice does not depend on the order glob happens to return.
  mp4list = sorted(glob.glob('{}/*.mp4'.format(video_dir)))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only open inside `with`: the handle is always closed and no write
    # permission on the recording is required.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

## 5. Gym wrapper for rendering

In [None]:
# Switch off OpenCL use in OpenCV for this session.
cv2.ocl.setUseOpenCL(False)

class RenderRGB(gym.Wrapper):
    """Wrapper that renders an environment from one of its pixel observations.

    The most recent ``obs[key_name]`` seen by ``reset``/``step`` is cached and
    handed out by ``render``. ``"rgb_array"`` is registered in the wrapped
    env's ``metadata['render.modes']``.

    :param env: Environment whose observations are dicts containing ``key_name``.
    :param key_name: Observation key holding the image to render.
    """

    def __init__(self, env, key_name="pixel"):
        super().__init__(env)
        self.last_pixels = None
        self.viewer = None
        self.key_name = key_name

        # Register the mode on the env's own list, but only once: wrapping the
        # same env (or re-running the cell) must not pile up duplicate entries.
        render_modes = env.metadata.setdefault('render.modes', [])
        if "rgb_array" not in render_modes:
            render_modes.append("rgb_array")

    def step(self, action):
        """Step the wrapped env and remember the new frame."""
        obs, reward, done, info = self.env.step(action)
        self.last_pixels = obs[self.key_name]
        return obs, reward, done, info

    def render(self, mode="human", **kwargs):
        """Render the cached frame.

        :param mode: ``"human"`` shows the frame in a ``SimpleImageViewer`` and
            returns whether the viewer is open; any other mode returns the
            cached frame itself (``None`` before the first ``reset``).
        """
        img = self.last_pixels

        # Hacky but works
        if mode != "human":
            return img

        # Imported lazily: only the on-screen path needs the viewer.
        from gym.envs.classic_control import rendering

        if self.viewer is None:
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(img)
        return self.viewer.isopen

    def reset(self, **kwargs):
        """Reset the wrapped env (forwarding any keyword arguments) and cache the first frame."""
        obs = self.env.reset(**kwargs)
        self.last_pixels = obs[self.key_name]
        return obs

    def close(self):
        """Close the image viewer if one was opened.

        NOTE(review): the wrapped env itself is left open, as before.
        """
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

## Visualise

In [None]:
# Wrap under a separate name so re-running this cell does not stack another
# RenderRGB/Monitor pair on top of `env`.
vis_env = RenderRGB(env, 'pixel_crop')
# Record into "video": that is the directory show_video() reads by default.
vis_env = gym.wrappers.Monitor(vis_env, "video", force=True)

vis_scores = []


for ep in range(50):
  state = vis_env.reset()['pixel_crop']
  score = 0

  while True:
    vis_env.render()

    act, log_prob = agent.choose_action(state)
    new_state, reward, done, info = vis_env.step(act)
    score += reward
    if done:
      break
    # choose_action expects the image array, not the whole observation dict.
    state = new_state['pixel_crop']

  vis_scores.append(score)

vis_env.close()
show_video()