The first step is to create an AI that can play the mineral collection game.

In [None]:
import math
import numpy as np
from pysc2.agents import base_agent
from pysc2.lib import actions
from pysc2.lib import features
from pysc2.env import sc2_env, run_loop, available_actions_printer, environment
from pysc2 import maps
from absl import flags

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [None]:
# Indices of the screen feature layers that get_state() reads.
_AI_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_AI_SELECTED = features.SCREEN_FEATURES.selected.index
# Values compared against the player_relative layer in get_state():
# _AI_SELF locates the marine, _AI_NEUTRAL locates the beacon.
_AI_SELF = 1
_AI_NEUTRAL = 3
# pysc2 function ids used when building FunctionCall actions.
_NO_OP = actions.FUNCTIONS.no_op.id
# NOTE(review): the name says "move" but this is bound to Attack_screen,
# not Move_screen -- confirm that this is intended.
_MOVE_SCREEN = actions.FUNCTIONS.Attack_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
# Argument values passed next to the function ids above.
_SELECT_ALL = [0]
_NOT_QUEUED = [0]

Here is how our AI will see the game, in terms of states and actions:

```
state: 
    marine_selected - a 0 or 1 representing if the marine is selected
    marinex_avg - the x position of the middle of the marine
    mariney_avg - the y position of the middle of the marine
    beaconx_avg - the x position of the middle of the beacon
    beacony_avg - the y position of the middle of the beacon
    
actions:
    select - the AI can take the action to select the marine or not select it
    movex - once selected the AI can choose to move the marine along the X direction
    movey - once selected the AI can choose to move the marine along the Y direction
```

For example a state might look like this:


```
(1,5,6,25,25) # selected marine at position (5,6), beacon at (25,25)
```

And some of the next possible states might look like:

```
(1,6,7,25,25)
(1,5,5,25,25)
(1,4,5,25,25)
(1,5,7,25,25)
(0,5,6,25,25) # unselecting the marine
```

If our algorithm works right - at the beginning we expect to go from a state like:

```
(0,x,y,x,y)
```

To:
```
(1,x,y,x,y)
```

Step 1 should be selecting the marine.

In [7]:
# Training hyperparameters.
# Learning rate handed to the Adam optimizer.
LR = 0.01
# Number of transitions sampled from the replay buffer per update_dqn() call;
# update_dqn() does nothing until the buffer holds at least this many.
BATCH_SIZE = 128
# First Adam beta coefficient (the optimizer is built with betas=(BETA, 0.99)).
BETA = 0.5
# Discount factor applied to next-state Q-values in update_dqn().
GAMMA = 0.999
# Exploration schedule used by get_eps_threshold(): the threshold starts at
# EPS_START and decays exponentially towards EPS_END, with EPS_DECAY as the
# decay constant in steps.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 500
# Bounds of the training loop: number of episodes and steps per episode.
MAX_EPISODES = 100
MAX_STEPS = 500

In [8]:
class Agent3(base_agent.BaseAgent):
    """Agent whose step() delegates the choice of action to decision_function."""

    def step(self, obs):
        """Run the base-class step, then return ``decision_function(obs)``.

        NOTE(review): ``decision_function`` is not defined in the visible part
        of this file; calling step() raises NameError unless it is defined
        elsewhere -- confirm.
        """
        super(Agent3, self).step(obs)
        return decision_function(obs)

In [12]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions, sampled uniformly at random.

    Transitions are stored exactly as given; the buffer never inspects them.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.capacity = capacity
        self.transitions = []
        # Kept so existing attribute access keeps working; no method reads it.
        self.index = 0

    def store_transition(self, transition):
        """Store ``transition``; once full, overwrite a randomly chosen slot."""
        # If we're below capacity, just add the transition.
        if len(self.transitions) < self.capacity:
            self.transitions.append(transition)
        # At capacity: replace a randomly chosen stored transition.
        else:
            rand_index = np.random.randint(0, len(self.transitions))
            self.transitions[rand_index] = transition

    def get_transitions_batches(self, batch_size):
        """Return a list of ``batch_size`` transitions drawn with replacement.

        The stored objects themselves are returned. They are deliberately not
        packed into a NumPy array: a transition mixes tuples, ``None`` and
        scalars, and building an array from such ragged data raises
        ``ValueError`` on NumPy >= 1.24.
        """
        idxs = np.random.choice(len(self.transitions), batch_size)
        return [self.transitions[i] for i in idxs]

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.transitions)

class DQN(nn.Module):
    """Fully connected Q-network: in_size -> 128 -> 32 -> out_size.

    ReLU follows the two hidden layers; the output layer is linear.
    """

    def __init__(self, in_size, out_size):
        super(DQN, self).__init__()
        first_width, second_width = 128, 32
        self.fc1 = nn.Linear(in_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, out_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of states to one Q-value per action."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)
    
def get_eps_threshold(steps_done):
    """Return the exploration threshold after ``steps_done`` steps.

    The value starts at EPS_START and decays exponentially towards EPS_END,
    with EPS_DECAY as the decay constant.
    """
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    span = EPS_START - EPS_END
    return EPS_END + span * decay

def update_dqn(replay_buf, dqn, optimizer, batch_size=None, gamma=None):
    """Run one Q-learning update on a batch sampled from ``replay_buf``.

    Each transition is indexed as ``(state, next_state, action, reward, done)``
    where ``next_state`` is ``None`` for a terminal step.

    Args:
        replay_buf: object supporting ``len()`` and
            ``get_transitions_batches(batch_size)``.
        dqn: network mapping a batch of states to per-action Q-values.
        optimizer: optimizer over the parameters of ``dqn``.
        batch_size: number of transitions to sample; defaults to BATCH_SIZE.
        gamma: discount factor; defaults to GAMMA.

    Returns:
        The summed squared TD error of the batch, or ``None`` when the buffer
        holds fewer than ``batch_size`` transitions and no update was made.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if gamma is None:
        gamma = GAMMA
    if len(replay_buf) < batch_size:
        return None
    transitions = replay_buf.get_transitions_batches(batch_size)

    batch_states = Variable(torch.stack([torch.Tensor(trans[0]) for trans in transitions]))
    batch_actions = Variable(torch.LongTensor([int(trans[2]) for trans in transitions]))
    # Rewards stay floats: casting to int would silently turn a shaped
    # reward such as 0.1 into 0.
    batch_rewards = Variable(torch.Tensor([float(trans[3]) for trans in transitions]))
    # Positions in the batch whose next state exists (non-terminal steps).
    nonterminal_idx = [i for i, trans in enumerate(transitions) if trans[1] is not None]

    # Q-values predicted for the actions that were stored.
    state_qvalues = dqn(batch_states)
    state_qvalues = torch.gather(state_qvalues, 1, batch_actions.view(-1, 1)).view(-1)

    # Best next-state Q-value; stays 0 for terminal transitions.
    next_state_qvalues = Variable(torch.zeros(len(transitions)))
    if nonterminal_idx:
        next_states = Variable(
            torch.stack([torch.Tensor(transitions[i][1]) for i in nonterminal_idx]))
        # detach(): the target is a constant label, so no gradient may flow
        # back through the next-state estimate.
        best_next = dqn(next_states).max(1)[0].detach().view(-1)
        next_state_qvalues[nonterminal_idx] = best_next

    # The 'labels' that the predicted Q-values are regressed against.
    qvalue_labels = (next_state_qvalues * gamma) + batch_rewards

    # Summed squared error, followed by one optimizer step.
    loss = torch.sum((state_qvalues - qvalue_labels) ** 2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
    
def get_state(obs):
    """Summarise an observation as a 5-tuple.

    Returns ``(marine_selected, marine_x, marine_y, beacon_x, beacon_y)``.
    ``marine_selected`` is 1 when any cell of the selected layer equals 1,
    otherwise 0. Each ``*_x`` is the mean index along the first axis of the
    matching cells in the player_relative layer, and each ``*_y`` the mean
    index along the second axis, in the order ``nonzero()`` returns them.
    """
    screen = obs.observation['screen']

    def mean_position(mask):
        # nonzero() gives one index array per axis; average each of them.
        first_axis, second_axis = mask.nonzero()
        return first_axis.mean(), second_axis.mean()

    relative = screen[_AI_RELATIVE]
    beacon_x, beacon_y = mean_position(relative == _AI_NEUTRAL)
    marine_x, marine_y = mean_position(relative == _AI_SELF)

    # 1 if the marine is currently selected, else 0.
    selected_layer = screen[_AI_SELECTED]
    marine_selected = int((selected_layer == 1).any())

    return (marine_selected, marine_x, marine_y, beacon_x, beacon_y)

def get_action(obs, dqn, steps, actiondict):
    """Pick an epsilon-greedy action and the screen target it leads to.

    Args:
        obs: observation passed to get_state().
        dqn: network giving one Q-value per entry of ``actiondict``.
        steps: number of steps taken so far; sets the exploration threshold.
        actiondict: maps ``str(action)`` to a ``[movey, movex]`` offset.

    Returns:
        ``(action, desty, destx)`` where ``action`` is the index of the action
        that was actually chosen, random or greedy. The caller stores it in the
        replay buffer, so it must match the move that is executed.
    """
    current_state = get_state(obs)
    state_in = Variable(torch.FloatTensor(current_state))
    # One Q-value per candidate action.
    q_values = dqn(state_in).data.view(-1).numpy()
    thresh = get_eps_threshold(steps)
    if np.random.rand() < thresh:
        # Explore: take some random action.
        action = np.random.randint(0, len(actiondict))
    else:
        # Exploit: take the action the network rates highest.
        action = int(np.argmax(q_values))
    movey, movex = actiondict[str(action)]
    # NOTE(review): only the lower bound is clamped; a target past the far
    # edge of the screen is passed through unchanged -- confirm this is safe.
    destx = max(0, current_state[1] + movex)
    desty = max(0, current_state[2] + movey)
    # x and y are handed back swapped, matching how the caller builds the move.
    return action, desty, destx

In [None]:
# Parse absl flags with a dummy argv so the flag values are populated
# when running inside a notebook.
FLAGS = flags.FLAGS
FLAGS(['run_sc2'])

# Run configuration.
viz = False  # passed to SC2Env as visualize
total_steps = 0  # steps taken across all episodes; drives the epsilon schedule
save_replay = True  # save a replay once all episodes have finished
episode_rewards = []  # one entry appended per finished episode
rp_buf = ReplayBuffer(1000)
beacon_map = maps.get('MoveToBeacon')
# Maps the action index (as a string) to the [movey, movex] offset that
# get_action() adds to the marine position.
actiondict = {
    '0':[0, 0],
    '1':[1, 0],
    '2':[1, 1],
    '3':[0, 1],
    '4':[-1, 0],
    '5':[-1, -1],
    '6':[0, -1],
    '7':[1, -1],
    '8':[-1, 1]
}

# Training loop: play MAX_EPISODES episodes, storing every transition in the
# replay buffer and updating the network after each non-terminal step.
with sc2_env.SC2Env(agent_race=None, bot_race=None, difficulty=None, map_name=beacon_map, visualize=viz) as env:
    # create a dqn, agent
    dqn = DQN(5, 9)
    optimizer = optim.Adam(dqn.parameters(), lr=LR, betas=(BETA, 0.99))
    agent = Agent3()
    for i in range(MAX_EPISODES):
        ep_reward = 0
        ep_reward_extra = 0
        obs = env.reset()
        for j in range(MAX_STEPS):
            total_steps += 1
            # select our marine when the move action is not yet available
            if _MOVE_SCREEN not in obs[0].observation['available_actions']:
                obs = env.step(actions=[actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
            # get the state
            state1 = get_state(obs[0])
            # take an action in the env
            action, desty, destx = get_action(obs[0], dqn, total_steps, actiondict)
            func = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, [desty, destx]])
            obs = env.step(actions=[func])
            # get the new state
            state2 = get_state(obs[0])
            # record the reward and
            # whether the episode is done
            reward = obs[0].reward
            ep_reward += reward
            # Optional reward shaping, currently disabled: add a bonus when
            # the marine ends the step closer to the beacon.
            # orig_dist = math.hypot(state1[1] - state1[3], state1[2] - state1[4])
            # new_dist = math.hypot(state2[1] - state2[3], state2[2] - state2[4])
            # if new_dist < orig_dist:
            #     reward += 1
            ep_reward_extra += reward
            done = int(obs[0].step_type == environment.StepType.LAST)

            # a terminal step has no next state
            if done:
                state2 = None
            rp_buf.store_transition((state1, state2, action, reward, done))

            if done:
                # track the episode total, not just the last step's reward
                episode_rewards.append(ep_reward)
                print('episode_done reward: {}, extra {}'.format(ep_reward, ep_reward_extra))
                break
            loss = update_dqn(rp_buf, dqn, optimizer)

    if save_replay:
        env.save_replay(Agent3.__name__)

Sources:

http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

https://github.com/xhujoy/pysc2-agents

https://arxiv.org/abs/1708.04782

What is wrong with my model?

Does the Q function learn, does the loss go down? Yes.


Right now the bot just wanders around randomly; there isn't enough reward feedback. So I'll give it a bit of a hint: if it gets closer to the goal, I'll give it 0.1 reward. This seems to help a little; it actually gets to the goal more than once.


All states are ending up with the same q value close to 0.