In [1]:
import os
import gym
import pickle
import matplotlib.pyplot as plt
import numpy as np
import ppaquette_gym_doom

In [2]:
# Build the Doom "defend the center" scenario through the Gym registry.
# The id is resolved by Gym; it is presumably registered by the
# `ppaquette_gym_doom` import in the first cell -- TODO confirm.
ENV_ID = 'ppaquette/DoomDefendCenter-v0'
env = gym.make(ENV_ID)

INFO:gym.envs.registration:Making new env: ppaquette/DoomDefendCenter-v0
[2017-04-26 20:47:43,780] Making new env: ppaquette/DoomDefendCenter-v0


In [3]:
def _identity_encoder(observation):
    """
    Default observation encoder: use the observation itself as the state.
    Defined at module level (instead of as a lambda) so that an agent
    built with the default encoder can still be pickled by `Agent.save`.
    """
    return observation


class Agent():
    """
    Tabular learning agent.

    It records, in plain dictionaries:
      - V: a value per state                  (`value_policy`)
      - Q: a value per (state, action)        (`policy`)
      - the last observed next state for each (state, action)
        (`state_transition`)

    States are whatever `observation_encoder` returns. They are used as
    dictionary keys, so they must be hashable.
    """

    # Value reported for a state / (state, action) that was never recorded
    DEFAULT_VALUE = -1

    def __init__(self,
                 learn_rate=0.05,
                 observation_encoder=_identity_encoder,
                 verbose=True):
        """
        learn_rate          : step size used when updating V and Q
        observation_encoder : callable mapping an observation to a hashable state
        verbose             : stored on the agent as `self.verbose`
        """
        self.alpha = 0.9 # Discount factor applied to future rewards / values
        self.policy = dict() # Q: [state] => [action] => value
        self.value_policy = dict() # V: [state] => value
        self.total_reward = 0
        self.learn_rate = learn_rate
        self.observation_encoder = observation_encoder
        self.verbose = verbose # Previously accepted but never stored
        self.history = [] # List of (state, action, reward)
        self.state_transition = {} # Mapping of [state] => [action] => [state']

    def reset(self):
        """
        Forget the recorded (state, action, reward) history.
        The learned V / Q tables and the transitions are kept.
        """
        self.history.clear()

    def learn_aggregate(self, observation, observation_, action, reward):
        """
        Record one step (observation --action--> observation_ with [reward])
        and update both V and Q from it.
        """
        # Encode [observation] => [state]
        state  = self.observation_encoder(observation)
        state_ = self.observation_encoder(observation_)
        self.history.append((state, action, reward))
        # Remember the most recent outcome of taking [action] on [state]
        self.state_transition.setdefault(state, {})[action] = state_
        # Update the model policies
        self.update_V(state)
        self.learn_Q(state, state_, action, reward)

    def find_best_action(self, state):
        """
        Find the recorded action on [state] whose next state
        has the highest V value.

        Returns a tuple (action, value).
        If no transition from [state] has been recorded yet,
        returns (None, V(state)).
        On ties the action recorded first wins.
        """
        transitions = self.state_transition.get(state)
        if not transitions:
            return (None, self.V(state))
        act_next_reward = [(a, self.V(s_)) for (a, s_) in transitions.items()]
        return max(act_next_reward, key=lambda pair: pair[1])

    def R(self):
        """
        Discounted sum of every reward in the recorded history:
        the i-th recorded reward (oldest first, i = 0) is weighted
        by alpha ** i. Returns 0 when the history is empty.
        """
        # Parenthesised on purpose: the discount is raised to the power of i,
        # not the reward (`alpha * r ** i` would bind as `alpha * (r ** i)`).
        return sum((self.alpha ** i) * r
                   for i, (_, _, r) in enumerate(self.history))

    def update_V(self, state):
        """
        Move V(state) by learn_rate * (R() + alpha * V(state) - V(state)),
        store the new value and return the applied increment.
        """
        # NOTE(review): the bootstrap term uses V of [state] itself rather
        # than of a successor state -- confirm this is intended.
        current = self.V(state)
        R_ = self.learn_rate * (self.R() + self.alpha * current - current)
        self.value_policy[state] = current + R_
        return R_

    def V(self, state):
        """
        Get recorded V value of (state), or DEFAULT_VALUE if unknown
        """
        return self.value_policy.get(state, self.DEFAULT_VALUE)

    def Q(self, state, action):
        """
        Get recorded Q value of (state, action), or DEFAULT_VALUE if unknown
        """
        return self.policy.get(state, {}).get(action, self.DEFAULT_VALUE)

    def learn_V(self, state, state_, action, reward):
        """
        Predict the reward of the state we would get
        (placeholder: not implemented, does nothing)
        """
        pass

    def learn_Q(self, state, state_, action, reward):
        """
        Predict the reward we would get
        if take [action] on [state]

        Q(state, action) is moved towards
        reward + alpha * (best value reachable from state_)
        with step size learn_rate, matching the roles `learn_rate`
        and `alpha` play in `update_V`.
        """
        curr_Q  = self.Q(state, action)
        _,max_Q = self.find_best_action(state_)
        new_Q   = curr_Q + self.learn_rate * (reward + self.alpha * max_Q - curr_Q)
        # Always overwrite with the updated estimate (the raw reward used to be
        # stored here whenever [state] was already known, discarding new_Q).
        self.policy.setdefault(state, {})[action] = new_Q

    @staticmethod
    def load(path, default):
        """
        Load a pickled agent from [path].
        If [path] is not an existing file, print a notice and return [default].
        """
        if os.path.isfile(path):
            # SECURITY: unpickling can execute arbitrary code.
            # Only load model files you created yourself.
            with open(path,'rb') as f:
                return pickle.load(f)
        print('MODEL NOT FOUND, initialising a brand new one.')
        return default

    @staticmethod
    def save(path,agent):
        """
        Pickle [agent] to [path], replacing any existing file.
        The agent's `observation_encoder` must itself be picklable
        (e.g. a module-level function, not a lambda).
        """
        with open(path,'wb') as f:
            pickle.dump(agent, f)

In [4]:
# Computer vision utils
from PIL import Image
from scipy.stats import threshold
from scipy.signal import medfilt
from scipy.misc import toimage
def encode_screen(observation):
  """
  Encode a screen observation as a state string.

  Keeps every 3rd row of rows 150..229 and every 2nd column, takes
  channel 0 only (named `r` originally -- presumably the red channel
  of an RGB frame, TODO confirm), then passes that band through
  `pixelate` and `projection`.
  """
  # Crop, downsample and select channel 0 in a single indexing step
  band = observation[150:230:3, ::2, 0]
  denoised = pixelate(band)
  return projection(denoised)

def show(mat):
  """
  Debug helper: convert [mat] with `toimage` and call `.show()` on the result.
  """
  image = toimage(mat)
  image.show()

def pixelate(observation, threshmin=100, kernel_size=3):
  """
  Coarse pixelate

  Sets every value below [threshmin] to 0, then removes isolated noise
  with a median filter of size [kernel_size]. The input is not modified.

  The thresholding is done with plain NumPy masking, so this function
  does not rely on `scipy.stats.threshold` (removed in SciPy 1.0).

  observation : array of pixel intensities
  threshmin   : values strictly below this become 0
  kernel_size : median filter window size, passed to `medfilt`
  Returns the filtered array.
  """
  # Threshold (on a copy, so the caller's frame stays untouched)
  m = np.array(observation, copy=True)
  m[m < threshmin] = 0
  # Remove noise
  return medfilt(m, kernel_size)

def projection(pixels, dark_sum=2400, stride=5, dark_count=3, step=None):
  """
  Horizontal projection of a 2D pixel array, reduced to a short string.

  1. A column is flagged when the sum of its values is below [dark_sum].
  2. Columns are scanned in windows of [stride] columns; a window yields
     True when more than [dark_count] of its columns are flagged.
     The last window may be shorter than [stride].
  3. The list of window flags is returned as its `str()` form,
     e.g. "[True, False]", so it can be used as a dictionary key.

  pixels     : 2D array (rows, columns)
  dark_sum   : column-sum threshold (exclusive upper bound)
  stride     : window width in columns
  dark_count : a window is True when it has MORE than this many flagged columns
  step       : distance between window starts; defaults to stride + 1

  NOTE(review): with the default step of stride + 1, one column between
  consecutive windows is never inspected -- confirm this gap is intended.
  The default is kept so previously produced state strings stay valid.
  """
  if step is None:
    step = stride + 1
  if stride < 1 or step < 1:
    raise ValueError('stride and step must be positive')

  pixels = np.asarray(pixels)
  _, w = pixels.shape  # Enforces a 2D input, as before

  # Horizontal projection (all columns at once)
  flagged = np.sum(pixels, axis=0) < dark_sum

  # Reduce; bool() keeps the string form independent of NumPy scalar reprs
  pj = [bool(np.count_nonzero(flagged[start:start + stride]) > dark_count)
        for start in range(0, w, step)]
  return str(pj)