In [None]:
# Python packages: nes-py (pinned) and the Super Mario Bros gym environment.
# NOTE(review): gym-super-mario-bros is installed unpinned and may replace the
# pinned nes-py with whatever version it requires — confirm the resulting versions.
!pip install nes-py==0.2.6
!pip install gym-super-mario-bros
# System libraries (ffmpeg plus X11/OpenGL shared objects) — presumably needed by
# OpenCV and for video export; confirm.
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!apt install -y libgl1-mesa-glx
# OpenCV bindings; imported as cv2 in the import cell.
!pip install opencv-python

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
from gym import Wrapper
from nes_py.wrappers import JoypadSpace
from gym.spaces import Box

import random
from tqdm import tqdm
import pickle
import numpy as np
import cv2
import collections
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): this flag is not read by any code visible in this notebook —
# confirm whether it is still needed.
FILE = True

In [None]:
class ActorCritic(nn.Module):
    """Recurrent actor-critic network over stacked image frames.

    Four 3x3, stride-2 convolutions (32 channels each) feed an ``nn.LSTMCell``
    with 512 hidden units. Two linear heads read the LSTM hidden state: one
    yields ``num_actions`` action logits, the other a single state value.
    The LSTM input size of ``32 * 6 * 6`` matches 84x84 inputs, which the four
    stride-2 convolutions reduce to 6x6 feature maps.
    """

    def __init__(self, num_inputs, num_actions):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=2, padding=1)
        self.conv1 = nn.Conv2d(num_inputs, 32, **conv_kwargs)
        self.conv2 = nn.Conv2d(32, 32, **conv_kwargs)
        self.conv3 = nn.Conv2d(32, 32, **conv_kwargs)
        self.conv4 = nn.Conv2d(32, 32, **conv_kwargs)
        self.lstm = nn.LSTMCell(32 * 6 * 6, 512)
        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_actions)
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialise conv/linear weights and zero every bias."""
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.LSTMCell):
                # Only the LSTM biases are touched; its weights keep the
                # default initialisation.
                for bias in (layer.bias_ih, layer.bias_hh):
                    nn.init.constant_(bias, 0)

    def forward(self, x, hx, cx):
        """Run one recurrent step.

        Returns ``(action_logits, state_value, hx, cx)`` where ``hx``/``cx``
        are the updated LSTM hidden and cell states.
        """
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(conv(x))
        features = x.view(x.size(0), -1)  # flatten everything but the batch axis
        hx, cx = self.lstm(features, (hx, cx))
        logits = self.actor_linear(hx)
        value = self.critic_linear(hx)
        return logits, value, hx, cx

In [None]:
def process_frame(frame):
    """Turn an RGB frame into a grayscale array of shape (1, 84, 84) scaled by 1/255.

    When ``frame`` is None an all-zero array of the same shape is returned.
    """
    if frame is None:
        return np.zeros((1, 84, 84))
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84))
    # Prepend a channel axis and scale the pixel values by 1/255.
    return resized[None, :, :] / 255.

class CustomReward(Wrapper):
    """Gym wrapper that preprocesses frames and reshapes the reward.

    Each observation is passed through ``process_frame``. On top of the
    wrapped environment's reward, ``step`` adds:

    * the score gained since the previous step, divided by 100;
    * -1 when ``info["x_pos"]`` did not increase since the previous step;
    * +50 / -50 at episode end, depending on ``info["flag_get"]``.

    The sum is divided by 10 before being returned.
    """

    def __init__(self, env=None):
        super(CustomReward, self).__init__(env)
        # NOTE(review): process_frame scales pixels by 1/255, yet the declared
        # upper bound is 255 — left unchanged; confirm whether any consumer
        # relies on these bounds.
        self.observation_space = Box(low=0, high=255, shape=(1, 84, 84))
        self.curr_score = 0
        self.prev_x = None  # x position seen on the previous step of this episode

    def step(self, action):
        """Step the wrapped env and return ``(frame, shaped_reward, done, info)``."""
        state, reward, done, info = self.env.step(action)
        state = process_frame(state)
        reward += (info["score"] - self.curr_score) / 100.
        # Penalise standing still or moving backwards; skipped on the first
        # step of an episode, where there is no previous position.
        if self.prev_x is not None and info["x_pos"] - self.prev_x <= 0:
            reward += -1
        self.prev_x = info["x_pos"]
        self.curr_score = info["score"]

        if done:
            if info["flag_get"]:
                reward += 50
            else:
                reward -= 50
        return state, reward / 10., done, info

    def reset(self):
        """Reset the wrapped env and all per-episode tracking state."""
        self.curr_score = 0
        # Clear the previous position as well; otherwise the first step of the
        # new episode is compared with where the last episode ended.
        self.prev_x = None
        return process_frame(self.env.reset())


class CustomSkipFrame(Wrapper):
    """Gym wrapper that repeats each action and stacks the resulting frames.

    ``step`` applies the action once, then up to ``skip`` more times, and
    returns the ``skip`` collected frames concatenated along the first axis
    with a leading batch axis, as float32. The reward returned is the sum of
    the rewards of every wrapped step taken during the call.
    """

    def __init__(self, env, skip=4):
        super(CustomSkipFrame, self).__init__(env)
        # The channel count follows ``skip`` so the declared space matches the
        # number of frames actually stacked by step() and reset().
        self.observation_space = Box(low=0, high=255, shape=(skip, 84, 84))
        self.skip = skip

    def step(self, action):
        """Repeat ``action`` and return ``(stacked_frames, total_reward, done, info)``."""
        state, reward, done, info = self.env.step(action)
        # Start the sum with the first step's reward so nothing is dropped.
        total_reward = reward
        states = []
        for _ in range(self.skip):
            if not done:
                state, reward, done, info = self.env.step(action)
                total_reward += reward
            # Once the episode has ended the last frame is repeated as padding.
            states.append(state)
        states = np.concatenate(states, 0)[None, :, :, :]
        return states.astype(np.float32), total_reward, done, info

    def reset(self):
        """Reset the wrapped env and return its first frame repeated ``skip`` times."""
        state = self.env.reset()
        states = np.concatenate([state for _ in range(self.skip)], 0)[None, :, :, :]
        return states.astype(np.float32)

In [None]:
class GlobalAdam(torch.optim.Adam):
    """Adam whose per-parameter state is created eagerly at construction.

    For every parameter the ``step`` counter and both moment buffers are
    allocated up front, and the two moment buffers are moved to shared memory.
    """

    def __init__(self, params, lr):
        super().__init__(params, lr=lr)
        for param_group in self.param_groups:
            for param in param_group['params']:
                slot = self.state[param]
                slot['step'] = torch.zeros(1)
                for key in ('exp_avg', 'exp_avg_sq'):
                    slot[key] = torch.zeros_like(param.data)
                # Only the moment buffers are shared; the step counter is not.
                for key in ('exp_avg', 'exp_avg_sq'):
                    slot[key].share_memory_()

In [None]:
# Upper bound on the number of environment steps train() collects per rollout
# before each optimizer update.
num_local_steps = 300
# train() keeps looping while its rollout counter is <= this value.
episode_max = 500

# Discount factor applied to future rewards and values in train().
gamma = 0.9
# Multiplied with gamma when decaying the running advantage estimate in train().
tau = 1.0
# Weight of the entropy term in train()'s total loss.
beta = 0.1 # change from 0.01

# World and stage numbers formatted into the gym_super_mario_bros environment id.
world = 1
stage = 1

# NOTE(review): these two are only referenced from commented-out code in the
# visible notebook — confirm whether they are still needed.
max_actions = 200
num_global_steps = 5e6

# Action set handed to JoypadSpace; its length is used as the number of policy outputs.
action_type = SIMPLE_MOVEMENT

In [None]:
def train():
  """Train the actor-critic agent on the configured Super Mario Bros stage.

  Builds the wrapped environment, resumes from the checkpoint file when one
  exists, then repeatedly collects a rollout of at most ``num_local_steps``
  steps (ending early when the episode finishes) and applies one optimizer
  step on the combined actor, critic and entropy loss.

  Reads the module-level settings ``world``, ``stage``, ``action_type``,
  ``num_local_steps``, ``episode_max``, ``gamma``, ``tau`` and ``beta``.
  The loop runs while the rollout counter is ``<= episode_max``. Weights are
  written to the checkpoint file every 100 rollouts and once more at the end.
  """
  import os  # local import: the notebook's import cell does not bring in os

  checkpoint_path = "./a2c_super_mario_bros_1_1_S.pt"

  torch.manual_seed(123)
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
  env = JoypadSpace(env, action_type)
  env = CustomReward(env)
  env = CustomSkipFrame(env)

  local_model = ActorCritic(env.observation_space.shape[0], len(action_type))
  if os.path.isfile(checkpoint_path):
    # Resume only when a checkpoint exists, so a first run starts from scratch
    # instead of crashing. map_location lets a checkpoint saved on GPU load on
    # a CPU-only machine.
    local_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
  local_model.to(device)
  local_model.train()

  optimizer = GlobalAdam(local_model.parameters(), lr=1e-4)

  state = torch.from_numpy(env.reset()).to(device)
  done = True
  curr_episode = 0

  while curr_episode <= episode_max:
    curr_episode += 1

    if curr_episode % 100 == 0:
      torch.save(local_model.state_dict(), checkpoint_path)
      print("Episode {}".format(curr_episode))

    if done:
      # New episode: start from a blank recurrent state.
      h_0 = torch.zeros((1, 512), dtype=torch.float, device=device)
      c_0 = torch.zeros((1, 512), dtype=torch.float, device=device)
    else:
      # Same episode continues: keep the state but cut the graph so gradients
      # do not flow into the previous rollout.
      h_0 = h_0.detach()
      c_0 = c_0.detach()

    log_policies = []
    values = []
    rewards = []
    entropies = []

    for _ in range(num_local_steps):
      logits, value, h_0, c_0 = local_model(state, h_0, c_0)
      policy = F.softmax(logits, dim=1)
      # Log-probabilities are required for both the entropy and the
      # policy-gradient term; plain softmax here would make both wrong.
      log_policy = F.log_softmax(logits, dim=1)
      entropy = -(policy * log_policy).sum(1, keepdim=True)

      action = Categorical(policy).sample().item()

      state, reward, done, info = env.step(action)
      state = torch.from_numpy(state).to(device)

      values.append(value)
      log_policies.append(log_policy[0, action])
      rewards.append(reward)
      entropies.append(entropy)

      if done:
        state = torch.from_numpy(env.reset()).to(device)
        break

    print(info["score"], info["flag_get"])

    # Value used to bootstrap the return: zero after a terminal step,
    # otherwise the critic's estimate for the last observed state.
    R = torch.zeros((1, 1), dtype=torch.float, device=device)
    if not done:
      # The bootstrap value is a fixed target, so it is computed without
      # recording gradients.
      with torch.no_grad():
        _, R, _, _ = local_model(state, h_0, c_0)

    actor_loss = 0
    critic_loss = 0
    entropy_loss = 0
    gae = torch.zeros((1, 1), dtype=torch.float, device=device)
    next_value = R

    # Walk the rollout backwards, accumulating the discounted return R and the
    # advantage estimate gae.
    for value, log_policy, reward, entropy in reversed(list(zip(values, log_policies, rewards, entropies))):
      gae = gae * gamma * tau
      gae = gae + reward + gamma * next_value.detach() - value.detach()
      next_value = value
      actor_loss = actor_loss + log_policy * gae
      R = R * gamma + reward
      critic_loss = critic_loss + (R - value) ** 2 / 2
      entropy_loss = entropy_loss + entropy

    total_loss = -actor_loss + critic_loss - beta * entropy_loss

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

  # Persist the final weights; the periodic save alone would drop every update
  # made after the last multiple of 100.
  torch.save(local_model.state_dict(), checkpoint_path)

In [None]:
# Launch training; reads the hyperparameters and uses the wrappers defined in the cells above.
train()

NameError: ignored

In [None]:
import imageio

def test():
    """Roll out the saved policy greedily and write the frames to ./replay.mp4.

    Loads the checkpoint into a fresh ``ActorCritic``, then always plays the
    highest-scoring action. The rollout ends when ``info["flag_get"]`` is set,
    when ``info["time"]`` reaches 0, or after more than 1000 steps. If an
    episode ends before that, the environment and the recurrent state are
    reset and play continues.

    Every rendered frame is collected and saved as a 30 fps video, also when
    the rollout is interrupted or raises. Reads the module-level ``world``,
    ``stage`` and ``action_type``.
    """
    torch.manual_seed(123)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
    env = JoypadSpace(env, action_type)
    env = CustomReward(env)
    env = CustomSkipFrame(env)

    local_model = ActorCritic(env.observation_space.shape[0], len(action_type))
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    local_model.load_state_dict(
        torch.load("./a2c_super_mario_bros_1_1_S.pt", map_location=device))
    local_model = local_model.to(device)
    local_model.eval()

    def blank_state():
        # Fresh LSTM hidden/cell state for the start of an episode.
        return (torch.zeros((1, 512), dtype=torch.float, device=device),
                torch.zeros((1, 512), dtype=torch.float, device=device))

    images = []
    try:
        state = torch.from_numpy(env.reset()).to(device)
        h_0, c_0 = blank_state()
        # Copy each frame when it is captured, in case render() hands back a
        # buffer that the emulator reuses — TODO confirm.
        images.append(np.array(env.render(mode='rgb_array')))

        curr_step = 0
        while True:
            curr_step += 1

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                logits, _, h_0, c_0 = local_model(state, h_0, c_0)
            # Softmax is monotonic, so the argmax of the logits is the greedy action.
            action = torch.argmax(logits, dim=1).item()

            next_state, reward, done, info = env.step(action)
            images.append(np.array(env.render(mode='rgb_array')))

            if info["flag_get"]:
                print("World {} stage {} completed".format(world, stage))
                break

            if info["time"] == 0 or curr_step > 1000:
                print("ERROR")
                break

            if done:
                # Episode over: continue from the reset observation rather
                # than the terminal frame, with a blank recurrent state.
                state = torch.from_numpy(env.reset()).to(device)
                h_0, c_0 = blank_state()
            else:
                state = torch.from_numpy(next_state).to(device)
    finally:
        # Save whatever was recorded even if the rollout was interrupted,
        # then release the emulator.
        try:
            if images:
                imageio.mimsave("./replay.mp4", images, fps=30)
        finally:
            env.close()

In [None]:
# System packages: Python OpenGL bindings, ffmpeg, and the Xvfb virtual framebuffer.
!apt-get install -y python3-opengl ffmpeg xvfb

In [None]:
# Python wrapper used in the next cell to start a virtual display.
!pip3 install pyvirtualdisplay

In [None]:
# Virtual display
# Starts an invisible (visible=0) 1400x900 display — presumably so rendering
# works on a machine without a physical screen; confirm it is required for
# render(mode='rgb_array').
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7b32b5abcb20>

In [None]:
# Run the greedy evaluation rollout; test() writes the recorded frames to ./replay.mp4.
test()

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.warn(


2
-0.1
2
0.1
2
0.1
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
3
0.2
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
3
-6.5
3
0.1
2
0.1
2
0.1
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
3
0.2
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
3
0.3
3
0.3
3
0.3
3
0.3
3
0.3
3
0.3
3
-0.1
2
-0.1
2
-0.1
2
0.1
2
0.1
2
0.1
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
3
0.3
3
0.3
3
-6.5
2
-0.1
2
0.1
2
0.1
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
3
0.2
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
3
-6.5
3
0.1
2
0.1
2
0.1
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
2
0.2
2
0.2
2
0.2
2
0.1
3
0.2
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
3
0.3
2
0.3
3
0.3
3
0.3
3
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2
0.3
2


KeyboardInterrupt: ignored

In [None]:
def record_video(env, policy, out_directory, fps=30):
  """
  Play one episode with ``policy`` and save the rendered frames as a video.

  :param env: environment providing ``reset()``, ``step(action)`` and ``render(mode='rgb_array')``
  :param policy: object whose ``act(state)`` returns a pair with the action first
  :param out_directory: output path handed to ``imageio.mimsave``
  :param fps: frames per second of the saved video
  """
  state = env.reset()
  # The first frame is captured before any action is taken.
  frames = [env.render(mode='rgb_array')]
  finished = False
  while not finished:
    action, _ = policy.act(state)
    state, _reward, finished, _info = env.step(action)
    frames.append(env.render(mode='rgb_array'))
  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)