In [1]:
import os
import gym
import pickle
import matplotlib.pyplot as plt
import numpy as np
import ppaquette_gym_doom
from collections import deque

In [2]:
# Gym id of the classic Doom environment used throughout this notebook
DOOM_ENV_ID = 'ppaquette/DoomDefendCenter-v0'
env = gym.make(DOOM_ENV_ID)

INFO:gym.envs.registration:Making new env: ppaquette/DoomDefendCenter-v0
[2017-05-05 23:12:00,894] Making new env: ppaquette/DoomDefendCenter-v0


In [3]:
def _identity(observation):
    """Default observation encoder: the observation itself is the state key."""
    return observation


class TDAgent():
    """Tabular temporal-difference agent.

    One value is kept per (state, action key) pair in ``policy`` and is
    nudged towards ``reward + alpha * value(next_state, action)`` on every
    call to ``learn_V``.
    """

    def __init__(self,
                 learn_rate=0.05,
                 observation_encoder=_identity,
                 verbose=True):
        """
        Args:
            learn_rate: step size of each value update.
            observation_encoder: callable turning a raw observation into a
                hashable state key. The default is a module-level function
                (not a lambda) so that a default-constructed agent can be
                pickled by ``save``.
            verbose: stored on the instance; no method of this class reads it.
        """
        self.alpha = 1.0 # Weight applied to the next state's value in learn_V
        self.policy = dict() # state => action key => value
        self.total_reward = 0 # Reward accumulated since the last reset()
        self.learn_rate = learn_rate
        self.observation_encoder = observation_encoder
        self.verbose = verbose
        self.action_mapping = {} # Map [action str] => [action object]
        self.history = deque() # List of (state, action, reward), first is newest

    def reset(self):
        """Forget the per-episode history and reward; learned values are kept."""
        self.history.clear()
        self.total_reward = 0

    def learn_aggregate(self, observation, observation_, action, reward):
        """Record one transition and update the value table from it.

        Args:
            observation: raw observation before the action.
            observation_: raw observation after the action.
            action: action object that was taken; keyed by ``str(action)``.
            reward: reward received for the transition.
        """
        # Encode [observation] => [state]
        state = self.observation_encoder(observation)
        state_ = self.observation_encoder(observation_)
        action_key = str(action)
        self.history.appendleft((state, action_key, reward))
        self.action_mapping[action_key] = action
        self.total_reward += reward
        self.learn_V(state, action_key, reward, state_)

    def find_best_action(self, state):
        """Find the highest-valued action recorded for [state].

        Returns:
            ``(action key, value)`` of the best action whose value is
            strictly greater than 0, or ``(-1, 0)`` when the state is unknown
            or no recorded action beats 0 (callers treat -1 as "explore").
        """
        best_return = 0
        best_action = -1
        for action, value in self.policy.get(state, {}).items():
            # Strict comparison: an action has to beat the baseline of 0
            if value > best_return:
                best_action, best_return = action, value
        return (best_action, best_return)

    def decode_action(self, action):
        """Map an action key back to the action object, or None if unseen."""
        return self.action_mapping.get(action)

    def V(self, state, action):
        """Query the value stored for (state, action).

        Unknown pairs are inserted into the table with value 0, so a query
        also registers the pair.
        """
        return self.policy.setdefault(state, {}).setdefault(action, 0)

    def learn_V(self, state, action, reward, next_state):
        """Update the value of (state, action) from one observed transition.

        The bootstrap term is the value of the *same* action in
        ``next_state``; querying it registers that pair with value 0 if new.
        """
        old_V = self.V(state, action)
        new_V = self.V(next_state, action)

        self.policy[state][action] = old_V + \
            self.learn_rate * (reward + self.alpha * new_V - old_V)

    @staticmethod
    def load(path, default):
        """Load a pickled agent from [path], or return [default] if absent.

        NOTE: pickle.load executes arbitrary code embedded in the file, so
        only load model files that you created yourself.
        """
        if os.path.isfile(path):
            with open(path, 'rb') as f:
                return pickle.load(f)
        print('MODEL NOT FOUND, initialising a brand new one.')
        return default

    @staticmethod
    def save(path, agent):
        """Pickle [agent] to [path], overwriting any existing file."""
        with open(path, 'wb') as f:
            pickle.dump(agent, f)

In [None]:
# Computer vision utils
from PIL import Image
from scipy.stats import threshold
from scipy.signal import medfilt
from scipy.misc import toimage
def encode_screen(observation):
  """Turn a raw screen frame into the string state key used by the agent.

  Keeps rows 150..229 (every 3rd row) and every 2nd column of channel 0,
  then runs the result through pixelate() and projection().
  """
  # Crop, downsample and select the first colour channel in one indexing step
  band = observation[150:230:3, ::2, 0]
  cleaned = pixelate(band)
  return projection(cleaned)

def show(mat):
  """Convert [mat] to an image with toimage() and open it in a viewer."""
  image = toimage(mat)
  image.show()

"""
Coarse pixelate
"""
def pixelate(observation, threshmin=100, kernel_size=3):
  """Coarse pixelate: zero out dim pixels, then median-filter the result.

  Args:
    observation: array of pixel intensities; it is not modified.
    threshmin: values strictly below this are replaced by 0.
    kernel_size: window size handed to medfilt for noise removal.

  Returns:
    A new array with the same shape as [observation].
  """
  # Threshold on a copy with a plain NumPy mask. This is what
  # scipy.stats.threshold(a, threshmin=..., newval=0) did; that function
  # was removed in SciPy 1.0, so it is no longer called here.
  m = np.array(observation, copy=True)
  m[m < threshmin] = 0
  # Remove noise
  return medfilt(m, kernel_size)

def projection(pixels):
  """Collapse a 2-D pixel array into a short string of per-window flags.

  A column is marked when its pixel sum is below 2400. Windows of 5 columns
  are then taken at starts 0, 6, 12, ... (one column is skipped between
  consecutive windows); a window's flag is set when more than 3 of its
  columns are marked. The list of flags is returned as a string.
  """
  # Horizontal projection: one mark per column
  _, width = pixels.shape
  marked = np.array(
    [np.sum(pixels[:, col]) < 2400 for col in range(width)],
    dtype=bool)

  # Reduce: one flag per window
  window, step = 5, 6
  flags = [
    np.count_nonzero(marked[lo:lo + window]) > 3
    for lo in range(0, width, step)
  ]
  return str(flags)

In [None]:
# Play (TRAINING)
model_name = 'TDAgent.pkl'
score_records = [] # Scores of each round
turn_records = [] # Number of turns lasting each round
num_episodes = 10
all_actions = set(range(3)) # Attack / Right / Left

bot = TDAgent.load(model_name,
                   TDAgent(learn_rate=0.8,
                           observation_encoder=encode_screen))

for i in range(num_episodes):
  # Log every episode of a short run, otherwise every 25th
  log_episode = num_episodes < 25 or i % 25 == 0
  if log_episode:
    print('Ep#{} started...'.format(i))
  observation = env.reset()
  num_turns = 0
  episode_reward = 0 # Tracked here so the score does not rely on agent state
  while True:
    env.render()

    # Exploit the best known action for this state if one has a positive
    # learned value; otherwise explore with a random action.
    state = bot.observation_encoder(observation)
    action = None
    known_values = bot.policy.get(state)
    if known_values:
      best_key = max(known_values, key=known_values.get)
      if known_values[best_key] > 0:
        action = bot.decode_action(best_key)
    if action is None:
      action = env.action_space.sample()

    observation_, reward, done, info = env.step(action)
    bot.learn_aggregate(observation, observation_, action, reward)
    episode_reward += reward
    observation = np.copy(observation_)
    num_turns += 1
    if done:
      score_records.append(episode_reward)
      turn_records.append(num_turns)
      if log_episode:
        print('...[DONE] Total reward : {}'.format(episode_reward))
      # Save the agent
      TDAgent.save(model_name, bot)
      bot.reset()
      break

print('All {} episodes of training are DONE.'.format(num_episodes))

MODEL NOT FOUND, initialising a brand new one.
Ep#0 started...
action space size := 1
action space size := 2
action space size := 3
action space size := 4
action space size := 5
action space size := 6
action space size := 7
action space size := 8
action space size := 9
action space size := 10
action space size := 11
action space size := 12
action space size := 13
action space size := 14
action space size := 15
action space size := 16
action space size := 17
action space size := 18
action space size := 19
action space size := 20
action space size := 21
action space size := 22
action space size := 23
action space size := 24
action space size := 25
action space size := 26
action space size := 27
action space size := 28
action space size := 29
action space size := 30
action space size := 31
action space size := 32
action space size := 33
action space size := 34
action space size := 35
action space size := 36
action space size := 37
action space size := 38
action space size := 39
action spa

In [None]:
# Show training score
%matplotlib inline

def moving_avg(arr, wnd=8):
  """Trailing moving average of [arr].

  Element i of the result is the mean of the last [wnd] values up to and
  including arr[i]; the first wnd-1 elements average over the shorter
  prefix that is available, so the result has the same length as [arr].

  Args:
    arr: sequence of numbers.
    wnd: window length, at least 1.

  Returns:
    List of floats, one per input element.

  Raises:
    ValueError: if wnd is smaller than 1.
  """
  if wnd < 1:
    raise ValueError('wnd must be at least 1')
  values = list(arr)
  averages = []
  for end in range(1, len(values) + 1):
    # Clamp the start at 0: a negative start would index from the end
    window = values[max(0, end - wnd):end]
    averages.append(sum(window) / float(len(window)))
  return averages

# One figure per recorded series, both smoothed with moving_avg()
training_curves = (
  ('Score/game', score_records),
  ('Turns/game', turn_records),
)
for figure_number, (y_label, series) in enumerate(training_curves, start=1):
  plt.figure(figure_number)
  plt.xlabel('# Episode')
  plt.ylabel(y_label)
  plt.plot(moving_avg(series))
plt.show()

In [None]:
# Rank states by the best value learned for any of their actions.
# bot.policy maps state => action key => value, so each entry below
# describes one state: (best value, number of actions recorded for it).
list_actions = []
for action_values in bot.policy.values():
    values = list(action_values.values())
    list_actions.append((max(values), len(values)))

# Sort by best value only, highest first. Each line is printed from the
# sorted tuple itself rather than from variables left over by the loop above.
ranked = sorted(list_actions, key=lambda entry: entry[0], reverse=True)
for i, (best_value, num_actions) in enumerate(ranked):
    print('state #{} : Best Reward = {} (among {:3d} actions)'.format(i, best_value, num_actions))

In [None]:
# Now play from experience and record the scores
