# Reinforcement Learning Practice: Grid World

***Charles Zhang***

***Jun 9***

![](state_values.png)

In [1]:
import numpy as np

In [2]:
# Grid dimensions: the board has BOARD_ROWS rows and BOARD_COLS columns.
BOARD_ROWS = 3
BOARD_COLS = 4
# Positions are (row, col) tuples.
START = (0, 0)    # default position of a freshly created State
END = (2, 3)      # position that yields reward 1 and marks the state as ended

### Environment Setting

In [3]:
class State:
    """A single position on the grid-world board.

    Holds the (row, col) coordinate, a board array used only for display,
    and a flag telling whether this position is the terminal one.
    """

    def __init__(self, state=START):
        """
        :param state: (row, col) tuple of the current position, START by default
        """
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state    # tuple of the coordinate
        self.is_end = False

    def give_reward(self):
        """Return 1 if the current position is END, otherwise 0."""
        return 1 if self.state == END else 0

    def check_end(self):
        """Set ``is_end`` to True when the current position is END."""
        if self.state == END:
            self.is_end = True

    def next_position(self, action):
        """Return the position reached by taking ``action`` from here.

        :param action: "up", "down", "left" or "right"; any other value is
                       treated as "right"
        :return: the neighbouring (row, col) tuple, or the current position
                 unchanged if the move would leave the board
        """
        row, col = self.state[0], self.state[1]
        if action == "up":
            next_state = (row - 1, col)
        elif action == "down":
            next_state = (row + 1, col)
        elif action == "left":
            next_state = (row, col - 1)
        else:
            next_state = (row, col + 1)
        inside_rows = 0 <= next_state[0] < BOARD_ROWS
        inside_cols = 0 <= next_state[1] < BOARD_COLS
        if inside_rows and inside_cols:
            return next_state
        # Moving off the board leaves the position unchanged.
        return self.state

    def show_board(self):
        """Print the board, marking the current position with '*'.

        Cells holding 0 print as '0', cells holding -1 as 'z'; any other
        value prints as '?'. Note that the current position is written into
        ``self.board`` as a side effect.
        """
        self.board[self.state] = 1
        # Each cell prints as 'x | ' (4 characters) after the leading '|',
        # so the rule scales with the number of columns (17 dashes for 4).
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                cell = self.board[i, j]
                if cell == 1:
                    token = '*'
                elif cell == -1:
                    token = 'z'
                elif cell == 0:
                    token = '0'
                else:
                    # Previously an unexpected value reused the prior cell's
                    # token (or raised NameError on the first cell).
                    token = '?'
                out += token + ' | '
            print(out)
        print(separator)

In [4]:
# Create a State at the default START position and display it.
start = State()
print(start.state)
start.show_board()

(0, 0)
-----------------
| * | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------


In [5]:
# Create a State located at the END position and display it.
end = State(state=END)
print(end.state)
end.show_board()

(2, 3)
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | * | 
-----------------


### Agent Setting

In [6]:
class Agent:
    """Tabular agent that walks the grid and learns value estimates.

    Every step updates both a Q-value table and a state-value table; the
    ``learning`` mode only decides which table the greedy policy consults.
    """

    def __init__(self, learning="sv", alpha=0.3, exp_rate=1, decay_gamma=0.9):
        """
        :param learning: "q" to pick greedy actions from the Q-values; any
                         other value ("sv") uses the state values
        :param alpha: learning rate applied to both value updates
        :param exp_rate: probability threshold for taking a random action;
                         with the default of 1 every action is random
        :param decay_gamma: discount factor used in the Q-value update
        """
        self.learning = learning
        self.actions = ["up", "down", "left", "right"]      # action space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = alpha
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma
        cells = [(i, j) for i in range(BOARD_ROWS) for j in range(BOARD_COLS)]
        # State values start at 0 everywhere except END, which is fixed at 1.
        self.state_values = {cell: 0 for cell in cells}
        self.state_values[END] = 1
        # Q-values start at 0 for every (position, action) pair.
        self.Q_values = {cell: {a: 0 for a in self.actions} for cell in cells}

    def get_action(self):
        """Choose the next action: random with probability ``exp_rate``,
        otherwise greedy with respect to the table selected by ``learning``.

        :return: one of ``self.actions``
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)
        # greedy action
        action = ""
        # Start from -inf so that an action is always selected, even when
        # every candidate value is negative.
        max_value = float("-inf")
        for a in self.actions:
            if self.learning == "q":
                next_value = self.Q_values[self.State.state][a]
            else:
                next_value = self.state_values[self.State.next_position(a)]
            # '>=' means the last action among equal maxima wins.
            if next_value >= max_value:
                action = a
                max_value = next_value
        return action

    def set_action(self, action):
        """Take ``action``, move to the resulting position and update both
        value tables for the position that was just left.

        :param action: the action to perform from the current position
        """
        curr_state = self.State.state
        next_state = self.State.next_position(action)
        self.State = State(state=next_state)

        # Q-value update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
        best_next_q = max(self.Q_values[next_state].values())
        delta = self.alpha*(self.State.give_reward() +
                            self.decay_gamma*best_next_q -
                            self.Q_values[curr_state][action])
        # Stored with 2 decimals, so updates smaller than the rounding step
        # are dropped.
        self.Q_values[curr_state][action] = round(self.Q_values[curr_state][action]+delta, 2)

        # State-value update: move V(s) towards V(s'), stored with 3 decimals.
        self.state_values[curr_state] += self.alpha*(self.state_values[next_state] -
                                                     self.state_values[curr_state])
        self.state_values[curr_state] = round(self.state_values[curr_state], 3)

    def show_state_values(self):
        """Print the state-value table, one board row per line."""
        # 34 dashes for a 4-column board, scaled with the column count.
        separator = '-' * (8 * BOARD_COLS + 2)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                out += str(self.state_values[(i, j)]) + ' | '
            print(out)
        print(separator)

    def reset(self):
        """Put the agent back at the default start position; the learned
        value tables are kept."""
        self.State = State()
        self.is_end = self.State.is_end

    def train(self, rounds=200):
        """Run ``rounds`` episodes, each from the start until END is reached.

        :param rounds: number of episodes to play
        """
        print("Training...")
        for _ in range(rounds):
            self.reset()
            while not self.is_end:
                action = self.get_action()
                self.set_action(action)
                self.State.check_end()
                self.is_end = self.State.is_end
        print("Training finished!")

### Training

In [7]:
# Train an agent for the default 200 rounds. The learning mode only affects
# greedy action selection; both Q-values and state values are updated.
agent = Agent(learning="q")
agent.train()

Training...
Training finished!


In [8]:
# Inspect the learned Q-table: (row, col) -> {action: value}.
agent.Q_values

{(0, 0): {'down': 0.61, 'left': 0.54, 'right': 0.61, 'up': 0.54},
 (0, 1): {'down': 0.69, 'left': 0.54, 'right': 0.69, 'up': 0.61},
 (0, 2): {'down': 0.78, 'left': 0.61, 'right': 0.78, 'up': 0.69},
 (0, 3): {'down': 0.88, 'left': 0.69, 'right': 0.78, 'up': 0.78},
 (1, 0): {'down': 0.69, 'left': 0.61, 'right': 0.69, 'up': 0.54},
 (1, 1): {'down': 0.78, 'left': 0.61, 'right': 0.78, 'up': 0.61},
 (1, 2): {'down': 0.88, 'left': 0.69, 'right': 0.88, 'up': 0.69},
 (1, 3): {'down': 0.99, 'left': 0.78, 'right': 0.88, 'up': 0.78},
 (2, 0): {'down': 0.69, 'left': 0.69, 'right': 0.78, 'up': 0.61},
 (2, 1): {'down': 0.78, 'left': 0.69, 'right': 0.88, 'up': 0.69},
 (2, 2): {'down': 0.88, 'left': 0.78, 'right': 0.99, 'up': 0.78},
 (2, 3): {'down': 0, 'left': 0, 'right': 0, 'up': 0}}

In [9]:
# Print the learned state-value table, one board row per line.
agent.show_state_values()

----------------------------------
| 0.995 | 0.996 | 0.997 | 0.998 | 
----------------------------------
| 0.996 | 0.997 | 0.998 | 0.999 | 
----------------------------------
| 0.997 | 0.998 | 0.999 | 1 | 
----------------------------------


### Observation

The ***state values*** converge after roughly 200 training rounds, whereas the ***Q values*** converge after only about 50 rounds. Therefore, the Q-learning method learns faster.

***References:***

https://www.cs.swarthmore.edu/~bryce/cs63/s16/slides/3-21_value_iteration.pdf

https://mohitmayank.com/interactive-q-learning/

https://github.com/JaeDukSeo/reinforcement-learning-an-introduction