# Grid World --- pass through all grids (backpropagation for value updates)

In [1]:
import numpy as np

In [2]:
# Grid dimensions; positions are (row, col) tuples with 0-based indices.
BOARD_ROWS = 3
BOARD_COLS = 4
# Cell a freshly constructed State occupies by default.
START = (0, 0)
# Cell that State.check_end() treats as the end position.
END = (2, 3)

In [3]:
class State:
    """Position of the agent on the BOARD_ROWS x BOARD_COLS grid."""

    def __init__(self, state=START):
        """Place the agent at ``state``, a (row, col) tuple (START by default)."""
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state
        self.is_end = False   # only changed by check_end()

    def check_end(self):
        """Set ``is_end`` to True when the current position equals END."""
        if self.state == END:
            self.is_end = True

    def next_position(self, action):
        """Return the (row, col) reached by taking ``action`` from here.

        Any action other than "up", "down" or "left" is treated as "right".
        A move that would leave the board returns the current position
        unchanged.
        """
        offsets = {
            "up": (-1, 0),
            "down": (1, 0),
            "left": (0, -1),
        }
        # Everything that is not up/down/left moves one column to the right.
        d_row, d_col = offsets.get(action, (0, 1))
        candidate = (self.state[0] + d_row, self.state[1] + d_col)

        inside_rows = 0 <= candidate[0] < BOARD_ROWS
        inside_cols = 0 <= candidate[1] < BOARD_COLS
        if inside_rows and inside_cols:
            return candidate
        return self.state

In [4]:
class Agent:
    """Tabular agent that must visit every grid cell before an episode ends.

    An episode finishes only when the agent stands on END *and* every cell
    of the board has been visited.  A reward of 1 is then propagated
    backwards through the recorded (state, action) trajectory to update
    ``Q_values``.
    """

    def __init__(self, alpha=0.3, exp_rate=1, decay_gamma=0.9):
        """Create an agent at the default State with all Q-values set to 0.

        Args:
            alpha: Learning rate used in the backward value update.
            exp_rate: Threshold compared against a uniform [0, 1) draw in
                get_action(); with the default of 1 every action is random.
            decay_gamma: Discount applied to the reward at each backward step.
        """
        self.actions = ["up", "down", "left", "right"]      # action space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = alpha
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma

        # Q_values[(row, col)][action] -> current value estimate.
        self.Q_values = {
            (i, j): {a: 0 for a in self.actions}
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
        }
        # past_all[row][col] is True once the cell was visited this episode.
        self.past_all = [[False] * BOARD_COLS for _ in range(BOARD_ROWS)]

        self.steps = []     # number of moves of each finished episode
        self.states = []    # [state, action] pairs of the running episode

    def optimal_action(self):
        """Return the action with the highest Q-value in the current state.

        Ties are resolved in favour of the action listed last in
        ``self.actions``.
        """
        q_here = self.Q_values[self.State.state]
        # max() keeps the first maximal item it sees, so scanning the actions
        # in reverse makes the last-listed action win ties.  No numeric
        # sentinel is involved, so arbitrarily negative Q-values still work.
        return max(reversed(self.actions), key=lambda a: q_here[a])

    def get_action(self):
        """Pick a random action when a uniform draw is <= exp_rate, else the greedy one."""
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            action = self.optimal_action()
        return action

    def set_action(self, action):
        """Return a new State at the position reached by ``action``."""
        position = self.State.next_position(action)
        return State(state=position)

    def reset(self):
        """Start a new episode: clear visit flags and trajectory, new State.

        ``steps`` and ``Q_values`` are intentionally kept across episodes.
        """
        for row in self.past_all:
            row[:] = [False] * len(row)
        self.State = State()
        self.is_end = self.State.is_end
        self.states = []

    def check_all_past(self):
        """Return True when every cell has been visited in this episode."""
        return all(visited for row in self.past_all for visited in row)

    def _backpropagate(self):
        """Push the terminal reward back along the recorded trajectory."""
        reward = 1
        terminal_q = self.Q_values[self.State.state]
        for a in self.actions:
            terminal_q[a] = reward
        for state, action in reversed(self.states):
            current_q_value = self.Q_values[state][action]
            # The unrounded value is carried to the previous step; only the
            # stored table entry is rounded.
            reward = current_q_value + self.alpha * (
                self.decay_gamma * reward - current_q_value
            )
            self.Q_values[state][action] = round(reward, 4)

    def _step(self):
        """Take one action, record it, and update visit/end bookkeeping."""
        row, col = self.State.state
        # Needed for the very first cell of an episode, which is never
        # "arrived at" through a move.
        self.past_all[row][col] = True

        action = self.get_action()
        self.states.append([self.State.state, action])
        self.State = self.set_action(action)

        # Mark the new cell immediately so that an episode can end on the
        # move that visits the last unvisited cell.
        row, col = self.State.state
        self.past_all[row][col] = True
        self.State.check_end()
        self.is_end = self.State.is_end

    def train(self, rounds=2000):
        """Run ``rounds`` complete episodes, updating ``Q_values`` after each.

        The length of every finished episode is appended to ``self.steps``.
        """
        print("Training...")
        episode = 1
        while episode <= rounds:
            if self.is_end and self.check_all_past():
                self._backpropagate()
                episode += 1
                self.steps.append(len(self.states))
                self.reset()
            else:
                self._step()
        print("Training finished!")

In [5]:
# Build an agent and train it with the default number of rounds.
agent = Agent()
agent.train()

Training...
Training finished!


In [6]:
agent.Q_values

{(0, 0): {'down': 0.0765, 'left': 0.0798, 'right': 0.0906, 'up': 0.0783},
 (0, 1): {'down': 0.0982, 'left': 0.0804, 'right': 0.1004, 'up': 0.0791},
 (0, 2): {'down': 0.15, 'left': 0.0882, 'right': 0.1401, 'up': 0.1426},
 (0, 3): {'down': 0.1563, 'left': 0.1265, 'right': 0.1409, 'up': 0.147},
 (1, 0): {'down': 0.1012, 'left': 0.0768, 'right': 0.0975, 'up': 0.0785},
 (1, 1): {'down': 0.1244, 'left': 0.0914, 'right': 0.1351, 'up': 0.084},
 (1, 2): {'down': 0.4242, 'left': 0.1267, 'right': 0.2667, 'up': 0.1251},
 (1, 3): {'down': 0.762, 'left': 0.2161, 'right': 0.1389, 'up': 0.1373},
 (2, 0): {'down': 0.1277, 'left': 0.0901, 'right': 0.1779, 'up': 0.0853},
 (2, 1): {'down': 0.147, 'left': 0.1033, 'right': 0.2167, 'up': 0.1291},
 (2, 2): {'down': 0.3314, 'left': 0.1556, 'right': 0.7561, 'up': 0.1284},
 (2, 3): {'down': 1, 'left': 1, 'right': 1, 'up': 1}}