# Reinforcement Learning Practice: Grid World

***Charles Zhang***

***Jun 9***

<img src="https://raw.githubusercontent.com/zcczhang/UAV_Coverage/master/Daily_Codes/Grid%20World/state_values.png" alt="Grid world state values" width="500" height="600"/>

In [1]:
import numpy as np

In [2]:
# Grid dimensions: number of rows and number of columns.
BOARD_ROWS, BOARD_COLS = 3, 4
# (row, col) of the default starting cell and of the terminal cell.
START, END = (0, 0), (2, 3)

### Environment Setting

In [3]:
class State:
    """A position on the grid, with helpers to move it, score it and print it."""

    def __init__(self, state=START):
        # Cell markers read by show_board(); all zeros until a cell is marked.
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        # Current position as a (row, col) tuple.
        self.state = state
        # Stays False until check_end() is called, even if state == END.
        self.is_end = False

    def give_reward(self):
        """Return 1 when the current position is END, otherwise 0."""
        return 1 if self.state == END else 0

    def check_end(self):
        """Set ``is_end`` to True when the current position is END."""
        if self.state == END:
            self.is_end = True

    def next_position(self, action):
        """Return the position reached by taking ``action`` from the current one.

        :param action: "up", "down" or "left"; any other value is treated
            as a move to the right.
        :return: the new (row, col), or the unchanged current position when
            the move would leave the board.
        """
        row, col = self.state[0], self.state[1]
        if action == "up":
            next_state = (row - 1, col)
        elif action == "down":
            next_state = (row + 1, col)
        elif action == "left":
            next_state = (row, col - 1)
        else:
            next_state = (row, col + 1)
        # boundary check: moves off the grid leave the position unchanged
        if 0 <= next_state[0] < BOARD_ROWS and 0 <= next_state[1] < BOARD_COLS:
            return next_state
        return self.state

    def show_board(self):
        """Print the board, marking the current position with '*'.

        Note that this writes 1 into ``self.board`` at the current position,
        so the mark persists on this instance after the call.
        """
        self.board[self.state] = 1
        # Each cell prints as "X | " after a leading "| ", so the visible row
        # width is 4 * BOARD_COLS + 1 (17 dashes for the 4-column board).
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                cell = self.board[i, j]
                if cell == 1:
                    token = '*'
                elif cell == -1:
                    token = 'z'
                elif cell == 0:
                    token = '0'
                else:
                    # Unknown marker: show a placeholder instead of reusing the
                    # previous cell's token (or failing on an unbound name).
                    token = '?'
                out += token + ' | '
            print(out)
        print(separator)

In [4]:
# Inspect the default starting state: print its coordinates, then draw the board.
start = State(state=START)
print(start.state)
start.show_board()

(0, 0)
-----------------
| * | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------


In [5]:
# Inspect a state placed on the terminal cell in the same way.
end = State(END)
print(end.state)
end.show_board()

(2, 3)
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | 0 | 0 | * | 
-----------------


### Agent Setting

In [6]:
class Agent:
    """Tabular learner that walks the grid and updates its values after each episode.

    Two modes are selected by the ``learning`` argument of ``get_action`` and
    ``train``: "q" works on ``Q_values`` (one value per state/action pair);
    any other value works on ``state_values`` (one value per state).
    """

    def __init__(self):
        self.states = []               # trajectory recorded during the current episode
        self.actions = ["up", "down", "left", "right"]      # action space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = 0.3               # step size used in both value updates
        # Probability of picking a random action; at 1 the uniform draw in
        # get_action() is always <= exp_rate, so the greedy branch never runs.
        self.exp_rate = 1
        self.decay_gamma = 0.9         # discount, applied only in the "q" update
        # init state values: one entry per cell, all 0
        self.state_values = {
            (i, j): 0 for i in range(BOARD_ROWS) for j in range(BOARD_COLS)
        }
        # init Q values (dict): one entry per cell and action, all 0
        self.Q_values = {
            (i, j): {a: 0 for a in self.actions}
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
        }

    def get_action(self, learning="sv"):
        """Choose the next action, randomly or greedily.

        :param learning: "q" to rank actions by their Q value in the current
            state; any other value ranks them by the state value of the
            position each action leads to.
        :return: a random action with probability ``exp_rate``, otherwise the
            highest-valued action (the last one in ``self.actions`` on ties).
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)
        # greedy action; start from -inf so that a choice is made even when
        # every candidate value is negative
        best_value = float("-inf")
        action = ""
        for a in self.actions:
            if learning == "q":
                value = self.Q_values[self.State.state][a]
            else:
                value = self.state_values[self.State.next_position(a)]
            if value >= best_value:
                action = a
                best_value = value
        return action

    def set_action(self, action):
        """Return a new State at the position reached by taking ``action``."""
        position = self.State.next_position(action)
        return State(state=position)

    def show_values(self):
        """Print ``state_values`` as a grid, one board row per line."""
        for i in range(BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(BOARD_COLS):
                out += str(self.state_values[(i, j)]) + ' | '
            print(out)
        print('----------------------------------')

    def reset(self):
        """Clear the recorded trajectory and start again from a fresh State."""
        self.states = []
        self.State = State()
        self.is_end = self.State.is_end

    def train(self, learning="sv", rounds=500):
        """Play ``rounds`` episodes, updating the value table after each one.

        :param learning: "q" updates ``Q_values``; any other value updates
            ``state_values``.
        :param rounds: number of episodes to play.
        """
        i = 1
        while i <= rounds:
            if self.State.is_end:
                reward = self.State.give_reward()  # 1 for the end
                if i % 100 == 0:
                    print("Game Round {} End".format(i))
                if learning == "q":
                    # every action in the terminal state is worth the reward
                    for a in self.actions:
                        self.Q_values[self.State.state][a] = reward
                    # walk the episode backwards, feeding each updated value
                    # into the update of the step before it
                    for s in reversed(self.states):
                        current_q_value = self.Q_values[s[0]][s[1]]
                        reward = current_q_value + self.alpha * (self.decay_gamma*reward - current_q_value)
                        self.Q_values[s[0]][s[1]] = round(reward, 4)
                else:
                    self.state_values[self.State.state] = reward
                    # same backward walk, without the discount factor
                    for s in reversed(self.states):
                        reward = self.state_values[s] + self.alpha * (reward - self.state_values[s])
                        self.state_values[s] = round(reward, 4)
                self.reset()
                i += 1
            else:
                action = self.get_action(learning=learning)
                if learning == "q":
                    # record the (state, action) pair taken before moving
                    self.states.append([(self.State.state), action])
                else:
                    # record the position the action leads to
                    self.states.append(self.State.next_position(action))
                self.State = self.set_action(action)
                self.State.check_end()  # renew is_end
                self.is_end = self.State.is_end

### Training

In [7]:
# Train a fresh agent with the default settings, spelled out explicitly:
# state-value learning for 500 rounds.
agent = Agent()
print("Training...")
agent.train(learning="sv", rounds=500)
print("Training finished!")

Training...
Game Round 100 End
Game Round 200 End
Game Round 300 End
Game Round 400 End
Game Round 500 End
Training finished!


In [8]:
# Rescale every learned state value for display: subtract 0.999, multiply by
# 1000 and round to one decimal. This overwrites agent.state_values in place.
for row in range(BOARD_ROWS):
    for col in range(BOARD_COLS):
        cell = (row, col)
        agent.state_values[cell] = round((agent.state_values[cell] - 0.999) * 1000, 1)
agent.show_values()

----------------------------------
| 0.5 | 0.6 | 0.7 | 0.8 | 
----------------------------------
| 0.6 | 0.7 | 0.8 | 0.9 | 
----------------------------------
| 0.7 | 0.8 | 0.9 | 1.0 | 
----------------------------------


In [9]:
# Train a second, independent agent in "q" mode for 50 rounds.
agent_q = Agent()
print("Training...")
agent_q.train("q", 50)
print("Training finished!")

Training...
Training finished!


In [10]:
# Bare expression so the notebook displays the learned Q table as the cell result.
agent_q.Q_values

{(0, 0): {'down': 0.0829, 'left': 0.0746, 'right': 0.1138, 'up': 0.0785},
 (0, 1): {'down': 0.1024, 'left': 0.0752, 'right': 0.1741, 'up': 0.0945},
 (0, 2): {'down': 0.1972, 'left': 0.1149, 'right': 0.2136, 'up': 0.1901},
 (0, 3): {'down': 0.3429, 'left': 0.1451, 'right': 0.1214, 'up': 0.1803},
 (1, 0): {'down': 0.112, 'left': 0.0742, 'right': 0.1258, 'up': 0.0778},
 (1, 1): {'down': 0.153, 'left': 0.0759, 'right': 0.1688, 'up': 0.106},
 (1, 2): {'down': 0.2139, 'left': 0.1181, 'right': 0.308, 'up': 0.143},
 (1, 3): {'down': 0.8999, 'left': 0.1835, 'right': 0.5249, 'up': 0.2023},
 (2, 0): {'down': 0.1087, 'left': 0.1174, 'right': 0.1715, 'up': 0.0937},
 (2, 1): {'down': 0.1067, 'left': 0.1017, 'right': 0.2454, 'up': 0.1343},
 (2, 2): {'down': 0.373, 'left': 0.1147, 'right': 0.8997, 'up': 0.1261},
 (2, 3): {'down': 1, 'left': 1, 'right': 1, 'up': 1}}

***References:***

https://www.cs.swarthmore.edu/~bryce/cs63/s16/slides/3-21_value_iteration.pdf

https://mohitmayank.com/interactive-q-learning/

https://github.com/JaeDukSeo/reinforcement-learning-an-introduction