In [1]:
import numpy as np
import random
from environment import Env
from collections import defaultdict
import jdc

In [2]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        actions: Sequence of action indices, e.g. ``[0, 1, 2, 3]``. Its length
            determines how many Q values are stored per state.
        learning_rate: Step size (alpha) used when updating Q values.
        discount_factor: Discount (gamma) applied to the next state's value.
        epsilon: Probability of taking a random action instead of a greedy one.
    """

    def __init__(self, actions, learning_rate=0.05, discount_factor=0.7,
                 epsilon=0.1):
        # actions = [0, 1, 2, 3]
        self.actions = actions
        self.learning_rate = learning_rate  # alpha
        self.discount_factor = discount_factor  # gamma
        self.epsilon = epsilon
        # The Q table is a dictionary:
        #   key   -> a hashable state identifier,
        #   value -> list holding one Q value per action for that state.
        # Rows are sized from `actions` (rather than a fixed four zeros) so the
        # table stays consistent with whatever action set the agent was given.
        n_actions = len(actions)
        self.q_table = defaultdict(lambda: [0.0] * n_actions)

In the next cell, we define a function that takes in a transition $\langle s, a, r, s' \rangle$ and updates the Q table.
<br>
* Goal 1: Get the Q value of the given action at the given state. (Remember to access the Q table as `self.q_table`; the same applies to the other attributes: `actions`, `learning_rate`, `discount_factor`, and `epsilon`.)
* Goal 2: Update the Q value of the given action at the given state using the Bellman equation:
<br>
$Q(s,a)\gets Q(s,a)+\alpha[r+\gamma \max_{a'}Q(s',a')-Q(s,a)] $

In [3]:
%%add_to QLearningAgent
def learn(self, state, action, reward, next_state):
    """Apply one Q-learning update for the transition <s, a, r, s'>.

    Implements
        Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]

    Args:
        state: Key of the current state in ``self.q_table``.
        action: Index of the action taken in ``state``.
        reward: Reward received for taking ``action``.
        next_state: Key of the resulting state in ``self.q_table``.
    """
    # Goal 1: current estimate of Q(s, a).
    current_q = self.q_table[state][action]
    # Goal 2: move Q(s, a) towards the bootstrapped target
    # r + gamma * max_a' Q(s', a') by a fraction alpha of the TD error.
    td_target = reward + self.discount_factor * max(self.q_table[next_state])
    self.q_table[state][action] += self.learning_rate * (td_target - current_q)

In [None]:
%%add_to QLearningAgent
@staticmethod
def arg_max(state_action):
    """Return the index of the largest value, breaking ties at random.

    Args:
        state_action: Non-empty sequence of Q values, one per action.

    Returns:
        An index whose value equals the maximum of ``state_action``; when
        several indices share the maximum, one is picked with
        ``random.choice``.
    """
    # First pass: find the largest value, starting from the first entry.
    best_value = state_action[0]
    for candidate in state_action:
        if candidate > best_value:
            best_value = candidate
    # Second pass: collect every index that attains that value.
    best_indices = [
        position
        for position, candidate in enumerate(state_action)
        if candidate == best_value
    ]
    return random.choice(best_indices)

The following function takes in the current state and then picks an action.
* Goal 3: complete the if-else statement for $\epsilon$-greedy search
<br>
$a=
\begin{cases}
\text{random}\quad a  & \text{ with probability } \epsilon \\
\arg\max_{a}Q(s,a)  & \text{ with probability } 1-\epsilon
\end{cases}$
<br>
Use the `self.arg_max` function above to choose $\arg\max_{a}Q(s,a)$

In [4]:
%%add_to QLearningAgent
def get_action(self, state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``self.epsilon`` a uniformly random action from
    ``self.actions`` is returned (exploration); otherwise the action with the
    highest Q value for ``state`` is returned, with ties broken at random by
    ``self.arg_max`` (exploitation).

    Args:
        state: Key of the current state in ``self.q_table``.

    Returns:
        The chosen action.
    """
    if np.random.rand() < self.epsilon:
        # Goal 3: explore by sampling uniformly from the available actions.
        action = np.random.choice(self.actions)
    else:
        # Exploit: choose the best-valued action from the Q table.
        state_action = self.q_table[state]
        action = self.arg_max(state_action)
    return action

In [None]:
if __name__ == "__main__":
    # Train a Q-learning agent in the environment for a fixed number of episodes.
    env = Env()
    agent = QLearningAgent(actions=list(range(env.n_actions)))
    total_eps = 100
    for episode in range(total_eps):
        state = env.reset()
        done = False
        # Keep stepping until the environment reports that the episode is
        # over (the goal was reached); the outer loop then starts a new one.
        while not done:
            env.render()
            # The agent picks an action; states are stringified so they can
            # be used as Q-table keys.
            action = agent.get_action(str(state))
            next_state, reward, done = env.step(action)
            # Update the Q table with the observed transition.
            agent.learn(str(state), action, reward, str(next_state))
            state = next_state
            env.print_value_all(agent.q_table)