# Reinforcement Learning Practice: Grid World

***Charles Zhang***

***Jun 9***

In [1]:
import numpy as np

In [2]:
# Grid dimensions used to size the board and to bound-check moves.
BOARD_ROWS = 3
BOARD_COLS = 4
# (row, col) cell a new State starts in by default.
START = (0, 0)
# (row, col) cell that yields reward 1 and ends an episode.
END = (2, 3)

### Environment Setting

In [3]:
class State:
    """A position on the grid together with a board used for display.

    Attributes are set in ``__init__``:
      * ``board``  -- BOARD_ROWS x BOARD_COLS numpy array of zeros; cells are
        only written by ``show_board``.
      * ``state``  -- current (row, col) tuple.
      * ``is_end`` -- starts False; only ``check_end`` ever sets it to True.
    """

    def __init__(self, state=START):
        """Create a state located at ``state`` (defaults to START)."""
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state
        # Not derived from ``state`` here: callers must invoke check_end().
        self.is_end = False

    def give_reward(self):
        """Return 1 if the current position is END, otherwise 0."""
        return 1 if self.state == END else 0

    def check_end(self):
        """Set ``is_end`` to True when the current position is END."""
        if self.state == END:
            self.is_end = True

    def next_position(self, action):
        """Return the (row, col) reached by taking ``action`` from here.

        ``action`` is compared against "up", "down" and "left"; any other
        value is treated as a move to the right. If the move would leave
        the board, the current position is returned unchanged.
        """
        row, col = self.state[0], self.state[1]
        if action == "up":
            next_state = (row - 1, col)
        elif action == "down":
            next_state = (row + 1, col)
        elif action == "left":
            next_state = (row, col - 1)
        else:
            next_state = (row, col + 1)
        # boundary check
        if 0 <= next_state[0] < BOARD_ROWS and 0 <= next_state[1] < BOARD_COLS:
            return next_state
        return self.state

    def show_board(self):
        """Mark the current position on the board and print the grid.

        Cells print as '*' (value 1), 'z' (value -1) or '0' (value 0); any
        other value prints as '?' instead of leaving the token undefined.
        Note that this writes 1 into ``board`` at the current position.
        """
        self.board[self.state] = 1
        # Each cell renders as "x | " (4 chars) after a leading "|", so the
        # rule is 4 * BOARD_COLS + 1 dashes (17 for the 4-column board).
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                cell = self.board[i, j]
                if cell == 1:
                    token = '*'
                elif cell == -1:
                    token = 'z'
                elif cell == 0:
                    token = '0'
                else:
                    token = '?'
                out += token + ' | '
            print(out)
        print(separator)

In [4]:
# Build a State at its default position (START) and print its board.
start = State()
print(start.state)
start.show_board()

(0, 0)
-----------------
| * | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------


In [5]:
# Build a State positioned at END and print its board.
end = State(state=END)
print(end.state)
end.show_board()

(2, 3)
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | * | 
-----------------


### Agent Setting

In [6]:
class Agent:
    """Learns a value for every grid cell by playing episodes from START.

    Each episode walks the grid until the state reports ``is_end``; the
    values of the visited cells are then updated in reverse visiting order.
    """

    def __init__(self, alpha=0.3, exp_rate=1):
        """Set up an agent with zeroed state values.

        Args:
            alpha: step size used when moving a cell's value toward the
                value propagated back from the end of the episode.
            exp_rate: threshold compared against a uniform(0, 1) draw in
                ``get_action``; a draw <= exp_rate picks a random action.
                With the default of 1 every action is random.
        """
        self.states = []  # positions visited in the current episode
        self.actions = ["up", "down", "left", "right"]  # action space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = alpha
        self.exp_rate = exp_rate
        self.state_values = {
            (i, j): 0 for i in range(BOARD_ROWS) for j in range(BOARD_COLS)
        }

    def get_action(self):
        """Choose the next action: random when exploring, else greedy.

        The greedy choice is the action whose resulting position has the
        highest stored value; on ties the action listed last wins.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # greedy action
        # Start from -inf (not 0) so an action is still selected when every
        # reachable value is negative.
        best_value = float("-inf")
        action = ""
        for a in self.actions:
            value = self.state_values[self.State.next_position(a)]
            if value >= best_value:
                action = a
                best_value = value
        return action

    def set_action(self, action):
        """Return a new State located where ``action`` leads from here."""
        position = self.State.next_position(action)
        return State(state=position)

    def show_values(self):
        """Print the stored value of every cell as a grid."""
        for i in range(BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(BOARD_COLS):
                out += str(self.state_values[(i, j)]) + ' | '
            print(out)
        print('----------------------------------')

    def reset(self):
        """Clear the episode history and return to a fresh default State."""
        self.states = []
        self.State = State()
        self.is_end = self.State.is_end

    def train(self, rounds=500):
        """Play ``rounds`` episodes, updating ``state_values`` after each.

        A progress line is printed every 100th episode.
        """
        i = 1
        while i <= rounds:
            if self.State.is_end:
                reward = self.State.give_reward()  # 1 for the end
                self.state_values[self.State.state] = reward
                if i % 100 == 0:
                    print("Game Round {} End".format(i))
                # Walk the episode backwards; ``reward`` carries the
                # unrounded updated value of the later state into the
                # update of the state visited before it.
                for s in reversed(self.states):
                    current = self.state_values[s]
                    reward = current + self.alpha * (reward - current)
                    self.state_values[s] = round(reward, 4)
                self.reset()
                i += 1
            else:
                action = self.get_action()
                # Record the position this action leads to (not the one
                # being left).
                self.states.append(self.State.next_position(action))
                self.State = self.set_action(action)
                self.State.check_end()  # renew is_end
                self.is_end = self.State.is_end

### Training

In [7]:
# Create an agent and train it with train()'s default of 500 rounds.
agent = Agent()
print("Training...")
agent.train()
print("Training finished!")

Training...
Game Round 100 End
Game Round 200 End
Game Round 300 End
Game Round 400 End
Game Round 500 End
Training finished!


In [8]:
# Print the value stored for each cell after training.
agent.show_values()

----------------------------------
| 0.9995 | 0.9996 | 0.9997 | 0.9998 | 
----------------------------------
| 0.9996 | 0.9997 | 0.9998 | 0.9999 | 
----------------------------------
| 0.9997 | 0.9998 | 0.9999 | 1.0 | 
----------------------------------


***References:***

https://www.cs.swarthmore.edu/~bryce/cs63/s16/slides/3-21_value_iteration.pdf

https://mohitmayank.com/interactive-q-learning/

https://github.com/JaeDukSeo/reinforcement-learning-an-introduction