In [1]:
import os
import gym
import pickle
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
import numpy as np
import heapq
import ppaquette_gym_doom

In [2]:
# Build the Doom "defend the center" scenario through the Gym registry.
ENV_ID = 'ppaquette/DoomDefendCenter-v0'
env = gym.make(ENV_ID)

INFO:gym.envs.registration:Making new env: ppaquette/DoomDefendCenter-v0
[2017-03-23 01:09:31,753] Making new env: ppaquette/DoomDefendCenter-v0


In [3]:
# Agent
def _identity_encoder(observation):
  """
  Default observation encoder: use the raw observation as the state.

  Defined as a named module-level function (not a lambda) so that an agent
  built with the default encoder can still be pickled by QAgent.save.
  """
  return observation


class QAgent:
  """
  Tabular Q-learning agent.

  The learned table lives in `self.policy`, a nested dict of the form
  {state: {action_key: Q value}}. Observations are turned into (hashable)
  states by `observation_encoder`; actions are keyed by their `str()` form
  and the original action objects are remembered in `self.action_mapping`.

  Parameters
  ----------
  learn_rate : float
    Factor applied to the best Q value of the next state in the update
    (i.e. it acts as the discount on future value in `learn_Q`).
  observation_encoder : callable
    Maps a raw observation to the state used as a key of `self.policy`.
    Defaults to the identity.
  verbose : bool
    Stored on the instance; not read by any method of this class.
  """

  def __init__(self,
               learn_rate=0.05,
               observation_encoder=_identity_encoder,
               verbose=True):
    # Weight of the temporal-difference error when blending it into the
    # current Q value (see learn_Q).
    self.alpha = 0.9
    self.policy = dict() # Q: state => {action key => value}
    self.total_reward = 0
    self.learn_rate = learn_rate
    self.action_mapping = {} # Map [action str] => [action object]
    self.observation_encoder = observation_encoder
    self.verbose = verbose
    self.reset()

  def reset(self):
    """Clear the per-episode bookkeeping (last state and reward total)."""
    self.prev_state = 0
    self.total_reward = 0

  def learn_aggregate(self, observation, observation_, action, reward):
    """
    Learn from one environment transition.

    Encodes both observations into states, updates the Q table for the
    (state, action) pair, remembers how to decode the action key back to
    the action object and accumulates the episode reward.
    """
    # Encode [observation] => [state]
    state = self.observation_encoder(observation)
    state_ = self.observation_encoder(observation_)
    action_key = str(action)
    self.learn_Q(state, state_, action_key, reward)
    self.action_mapping[action_key] = action
    self.prev_state = state
    self.total_reward += reward

  def learn_Q(self, state, state_, action, reward):
    """
    Update the Q value of taking [action] on [state].

    new_Q = Q + alpha * (reward + learn_rate * max_a' Q(state_, a') - Q)

    The blended value is stored for both new and already-known states.
    (Previously a known state had its entry overwritten with the raw
    reward, discarding the computed update.)
    """
    curr_Q = self.Q(state, action)
    _, max_Q = self.find_best_action(state_)
    new_Q = curr_Q + self.alpha * (reward + self.learn_rate * max_Q - curr_Q)
    self.policy.setdefault(state, {})[action] = new_Q

  def find_best_action(self, state):
    """
    Find the recorded action with the highest Q value on [state].

    Returns (action_key, Q value). Returns (-1, -1) when the state is
    unknown, and also when no recorded action has a Q value strictly
    greater than -1 (the default value of an unseen pair), so callers can
    treat -1 as "nothing better than the default is known".
    """
    best_act, max_Q = -1, -1
    for act, value in self.policy.get(state, {}).items():
      if value > max_Q:
        best_act, max_Q = act, value
    return (best_act, max_Q)

  def decode_action(self, action):
    """Return the action object recorded for an action key, or None."""
    return self.action_mapping.get(action)

  def Q(self, state, action):
    """
    Get recorded Q value of (state, action); -1 if the pair is unknown.
    """
    return self.policy.get(state, {}).get(action, -1)

  @staticmethod
  def load(path, default):
    """
    Load a pickled agent from [path], or return [default] if there is no
    such file (a notice is printed in that case).
    """
    if os.path.isfile(path):
      # SECURITY: unpickling can execute arbitrary code. Only load model
      # files that were produced by QAgent.save from a trusted source.
      with open(path, 'rb') as f:
        return pickle.load(f)
    print('MODEL NOT FOUND, initialising a brand new one.')
    return default

  @staticmethod
  def save(path, agent):
    """Pickle [agent] to [path], replacing any existing file."""
    with open(path, 'wb') as f:
      return pickle.dump(agent, f)



In [4]:
# Computer vision utils
from PIL import Image
from scipy.stats import threshold
from scipy.signal import medfilt
from scipy.misc import toimage
def encode_screen(observation):
  """
  Encode a screen frame into the state string used by the agent.

  Keeps every 3rd row of rows 150..229 and every 2nd column, takes channel
  index 0 only (presumably the red channel of an RGB frame -- confirm
  against the environment), then pixelates and projects the result.
  """
  band = observation[150:230:3, ::2, 0]
  cleaned = pixelate(band)
  return projection(cleaned)

def show(mat):
  """
  Display the matrix `mat` as an image (debugging helper).

  NOTE(review): relies on `toimage` imported from scipy.misc, which is not
  available in recent SciPy releases -- confirm the pinned SciPy version.
  """
  image = toimage(mat)
  image.show()

"""
Coarse pixelate
"""
def pixelate(observation, threshold_min=100, kernel_size=3):
  """
  Coarse pixelate: zero out dim pixels, then median-filter the result.

  Parameters
  ----------
  observation : array
    Pixel intensities; it is not modified.
  threshold_min : number
    Values strictly below this are replaced by 0 (default 100).
  kernel_size : int
    Size of the median filter window passed to `medfilt` (default 3).

  Returns
  -------
  A new array of the same shape as `observation`.
  """
  # Threshold. Done with a NumPy mask on a copy instead of
  # scipy.stats.threshold, which was removed from SciPy in 1.0; the
  # semantics are the same (values < threshold_min become 0).
  m = np.array(observation, copy=True)
  m[m < threshold_min] = 0
  # Remove noise
  return medfilt(m, kernel_size)

def projection(pixels):
  """
  Collapse a 2-D pixel matrix into a short string signature.

  1. Horizontal projection: a column is flagged when the sum of its
     pixels is below 2400.
  2. Reduce: windows of 5 columns are inspected, and each window yields
     True when more than 3 of its columns are flagged.

  Returns the `str()` of the resulting list of booleans.
  """
  _, width = pixels.shape
  flagged = np.zeros(width)
  for col in range(width):
    if np.sum(pixels[:, col]) < 2400:
      flagged[col] = 1

  window = 5
  # NOTE(review): consecutive windows start window + 1 columns apart, so
  # one column between each pair of windows is never inspected -- confirm
  # this is intended before changing it (it alters every state key).
  step = window + 1
  reduced = [
    np.count_nonzero(flagged[begin:begin + window]) > 3
    for begin in range(0, width, step)
  ]
  return str(reduced)
        

In [5]:
# Play
model_name = 'QAgent.pkl'
num_episodes = 5
all_actions = set(range(3)) # Attack / Right / Left

# Resume from the saved agent when the model file exists; otherwise start
# with a fresh agent that encodes screens with encode_screen.
bot = QAgent.load(
  model_name,
  QAgent(learn_rate=0.8, observation_encoder=encode_screen))

for i in range(num_episodes):
  print('Ep#{} started...'.format(i))
  observation = env.reset()
  done = False
  while not done:
    env.render()

    # Use the best known action for this state; fall back to a random
    # one when the agent has nothing recorded (sentinel -1).
    state = bot.observation_encoder(observation)
    action, _ = bot.find_best_action(state)
    if action != -1:
      print('... Best action : ', action)
      action = bot.decode_action(action)
    else:
      action = env.action_space.sample()

    # Step the environment and learn from the transition.
    observation_, reward, done, info = env.step(action)
    bot.learn_aggregate(observation, observation_, action, reward)
    observation = np.copy(observation_)

  # Episode finished: report, persist the agent, clear episode state.
  print('[Done]')
  print('...Total reward : {}'.format(bot.total_reward))
  # Save the agent
  QAgent.save(model_name, bot)
  bot.reset()
    


MODEL NOT FOUND, initialising a brand new one.
Ep#0 started...
... Best action :  [1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 9, 4, -28, -13, 40]
... Best action :  [0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, -2, -9, 39, 13, -47]
... Best action :  [0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, -2, -9, 39, 13, -47]
... Best action :  [0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, -2, -9, 39, 13, -47]
... Best action :  [1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 9, 4, -28, -13, 40]
... Best action :  [1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 9, 4, -28, -13, 40]
... Best a