We will build a Q learning agent to learn how to play the move to beacon mini game.

In [1]:
import math
import numpy as np
from pysc2.agents import base_agent
from pysc2.lib import actions
from pysc2.lib import features
from pysc2.env import sc2_env, run_loop, available_actions_printer
from pysc2 import maps
from absl import flags

_AI_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_AI_SELECTED = features.SCREEN_FEATURES.selected.index
_NO_OP = actions.FUNCTIONS.no_op.id
_MOVE_SCREEN = actions.FUNCTIONS.Attack_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_SELECT_POINT = actions.FUNCTIONS.select_point.id
_MOVE_RAND = 1000
_BACKGROUND = 0
_AI_SELF = 1
_AI_ALLIES = 2
_AI_NEUTRAL = 3
_AI_HOSTILE = 4
_SELECT_ALL = [0]
_NOT_QUEUED = [0]

In [2]:
# define our actions
# it can choose to move to
# the beacon or to do nothing
# it can select the marine or deselect
# the marine, it can move to a random point
possible_actions = [
    _NO_OP,
    _SELECT_ARMY,
    _SELECT_POINT,
    _MOVE_SCREEN,
    _MOVE_RAND
]
possible_actions

[0, 7, 2, 12, 1000]

In [3]:
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 500

def get_eps_threshold(steps_done):
    return EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)

# define the state
def get_state(obs):
    # get the positions of the marine and the beacon
    ai_view = obs.observation['screen'][_AI_RELATIVE]
    beaconxs, beaconys = (ai_view == _AI_NEUTRAL).nonzero()
    marinexs, marineys = (ai_view == _AI_SELF).nonzero()
    marinex, mariney = marinexs.mean(), marineys.mean()
        
    marine_on_beacon = np.min(beaconxs) <= marinex <=  np.max(beaconxs) and np.min(beaconys) <= mariney <=  np.max(beaconys)
        
    # get a 1 or 0 for whether or not our marine is selected
    ai_selected = obs.observation['screen'][_AI_SELECTED]
    marine_selected = int((ai_selected == 1).any())
    
    return (marine_selected, marine_on_beacon), [beaconxs, beaconys]

def get_beacon_location(ai_relative_view):
    '''returns the location indices of the beacon on the map'''
    return (ai_relative_view == _AI_NEUTRAL).nonzero() 

class QTable(object):
    def __init__(self, actions, lr=0.01, reward_decay=0.9):
        self.lr = lr
        self.actions = actions
        self.reward_decay = reward_decay
        self.states_list = set()
        self.q_table = np.zeros((0, len(possible_actions))) # create a Q table
        
    def get_action(self, state, steps):
        if np.random.rand() < get_eps_threshold(steps):
            return np.random.randint(0, len(self.actions))
        else:
            if state not in self.states_list:
                self.add_state(state)
            idx = list(self.states_list).index(state)
            q_values = self.q_table[idx]
            return int(np.argmax(q_values))
    
    def add_state(self, state):
        self.q_table = np.vstack([self.q_table, np.zeros((1, len(possible_actions)))])
        self.states_list.add(state)
    
    def update_qtable(self, state, next_state, action, reward):
        if state not in self.states_list:
            self.add_state(state)
        if next_state not in self.states_list:
            self.add_state(next_state)
        # how much reward 
        state_idx = list(self.states_list).index(state)
        next_state_idx = list(self.states_list).index(next_state)
        # calculate q labels
        q_state = self.q_table[state_idx, action]
        q_next_state = self.q_table[next_state_idx].max()
        q_targets = reward + (self.reward_decay * q_next_state)
        # calculate our loss 
        loss = q_targets - q_state
        # update the q value for this state/action pair
        self.q_table[state_idx, action] += self.lr * loss
        return loss
    
    def get_size(self):
        print(self.q_table.shape)
    
class Agent3(base_agent.BaseAgent):
    def __init__(self):
        super(Agent3, self).__init__()
        self.qt = QTable(possible_actions)
        
    def step(self, obs, steps):
        '''Step function gets called automatically by pysc2 environment'''
        super(Agent3, self).step(obs)
        state, beacon_pos = get_state(obs)
        action = self.qt.get_action(state, steps)
        func = actions.FunctionCall(_NO_OP, [])
        
        if possible_actions[action] == _NO_OP:
            func = actions.FunctionCall(_NO_OP, [])
        elif state[0] and possible_actions[action] == _MOVE_SCREEN:
            beacon_x, beacon_y = beacon_pos[0].mean(), beacon_pos[1].mean()
            func = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, [beacon_y, beacon_x]])
        elif possible_actions[action] == _SELECT_ARMY:
            func = actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        elif state[0] and possible_actions[action] == _SELECT_POINT:
            ai_view = obs.observation['screen'][_AI_RELATIVE]
            backgroundxs, backgroundys = (ai_view == _BACKGROUND).nonzero()
            point = np.random.randint(0, len(backgroundxs))
            backgroundx, backgroundy = backgroundxs[point], backgroundys[point]
            func = actions.FunctionCall(_SELECT_POINT, [_NOT_QUEUED, [backgroundy, backgroundx]])
        elif state[0] and possible_actions[action] == _MOVE_RAND:
            # move somewhere that is not the beacon
            beacon_x, beacon_y = beacon_pos[0].max(), beacon_pos[1].max()
            movex, movey = np.random.randint(beacon_x, 64), np.random.randint(beacon_y, 64)
            func = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, [movey, movex]])
        return state, action, func

In [4]:
FLAGS = flags.FLAGS
FLAGS(['run_sc2'])

viz = False
save_replay = False
steps_per_episode = 0 # 0 actually means unlimited
MAX_EPISODES = 100
MAX_STEPS = 400
total_steps = 0

# create a map
beacon_map = maps.get('MoveToBeacon')

# create an envirnoment
with sc2_env.SC2Env(agent_race=None,
                    bot_race=None,
                    difficulty=None,
                    map_name=beacon_map,
                    visualize=viz) as env:
    agent = Agent3()
    for i in range(MAX_EPISODES):
        print('Starting episode.')
        ep_reward = 0
        obs = env.reset()
        for j in range(MAX_STEPS):
            total_steps += 1
            state, action, func = agent.step(obs[0], total_steps)
            obs = env.step(actions=[func])
            next_state, _ = get_state(obs[0])
            reward = obs[0].reward
            ep_reward += reward
            loss = agent.qt.update_qtable(state, next_state, action, reward)
        print('Episode Reward: {}, Explore threshold: {}, Q loss: {}'.format(ep_reward, get_eps_threshold(total_steps), loss))
    if save_replay:
        env.save_replay(Agent3.__name__)

Starting episode.


  
  ret = ret.dtype.type(ret / rcount)


Episode Reward: 12, Explore threshold: 0.4319296194996383, Q loss: 0.023910463865926202
Starting episode.
Episode Reward: 11, Explore threshold: 0.22161204029545706, Q loss: 0.04142643829312305
Starting episode.
Episode Reward: 3, Explore threshold: 0.12711026029600064, Q loss: -0.005607384205412298
Starting episode.
Episode Reward: 3, Explore threshold: 0.08464787338161128, Q loss: -0.005887233744322706
Starting episode.
Episode Reward: 4, Explore threshold: 0.06556829305542405, Q loss: -0.006717998007231038
Starting episode.
Episode Reward: 3, Explore threshold: 0.056995284991667026, Q loss: -0.006327417101438297
Starting episode.
Episode Reward: 1, Explore threshold: 0.053143184159010495, Q loss: -0.0030889159882449904
Starting episode.
Episode Reward: 3, Explore threshold: 0.05141232368219785, Q loss: -0.004695781222213362
Starting episode.
Episode Reward: 1, Explore threshold: 0.050634597937120183, Q loss: -0.0035450500421394596
Starting episode.
Episode Reward: 2, Explore thresho

Episode Reward: 44, Explore threshold: 0.05, Q loss: -0.10352868613317101
Starting episode.
Episode Reward: 44, Explore threshold: 0.05, Q loss: 0.8983357588190155
Starting episode.
Episode Reward: 39, Explore threshold: 0.05, Q loss: -0.0997603639414234
Starting episode.
Episode Reward: 44, Explore threshold: 0.05, Q loss: 0.163971703223458
Starting episode.
Episode Reward: 40, Explore threshold: 0.05, Q loss: -0.09902019651432437
Starting episode.
Episode Reward: 43, Explore threshold: 0.05, Q loss: -0.10034731764565863
Starting episode.
Episode Reward: 42, Explore threshold: 0.05, Q loss: -0.10082770033242994
Starting episode.
Episode Reward: 39, Explore threshold: 0.05, Q loss: -0.09842674104476334
Starting episode.
Episode Reward: 44, Explore threshold: 0.05, Q loss: 0.9014751272764222
Starting episode.
Episode Reward: 42, Explore threshold: 0.05, Q loss: -0.09921306803691898
Starting episode.
Episode Reward: 44, Explore threshold: 0.05, Q loss: -0.10062659139479413
Starting episo

In [5]:
agent.reward/MAX_EPISODES

34.270000000000003

In [6]:
agent.qt.q_table.shape

(3, 5)

You may have noticed our Q learning agent actually outperforms an agent that is told to simple move to the beacon? How is that possible? Let's examine the Q Table.

In [7]:
for state in agent.qt.states_list:
    print(state, agent.qt.q_table[list(agent.qt.states_list).index(state)])

(1, False) [ 0.892184    0.95317795  0.89820497  1.0305249   0.84618695]
(0, False) [ 0.26923404  1.74120036  0.09948828  0.18952466  0.21669156]
(1, True) [ 0.15577372  1.90523865  0.11505073  0.22864302  0.15269515]


So when it has the marine selected but its not at the beacon, state=(1, False), our agent learns that moving to the beacon has the highest value 1.03 (action at index 3).

When it doesnt have the marine selected and its not at the beacon, state=(0,False), our agent learns to select the marine has the highest value 1.74 (action at index 1).

When it is one the beacon and it has the marine selected, state=(1,True) it learns that reselecting, continuing to select the marine gives the highest reward.

Structuring the porblem is very important. Originally I had a different state representation and it didn't learn anything (much fo anything).  had represented the state as the position of the beacon and the position of the marine but there were so many states and the Q learning agent didn't have any function that could tell it "here is what those locations mean."

I didn't want to teach it to recognize the beacon as well as move to the beacon so I just taught it to move to beacon as a block decision. If it chooses to move to the beacon

Our bot can choose to do 2 things. It can choose to just sit there and do _NO_OP or it canmove to the beacon. It has to learn to move to the beacon.

It would be nice to also teach it to recognize the becaon, let's examie this next. 