In [1]:
"""
A simple example of reinforcement learning using the tabular (table-lookup) Q-learning method.
An agent "o" starts at the left end of a 1-dimensional world; the treasure is at the rightmost location.
Run this program to see how the agent improves its strategy for finding the treasure.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import time

import numpy as np
import pandas as pd

# Uncomment the next line to seed NumPy's global RNG so runs are repeatable.
# np.random.seed(2)
N_STATES = 6  # number of cells in the 1-D world; the last cell holds the treasure "T"
ACTIONS = ("left", "right")  # action names; also used as the Q-table column labels
FRESHTIME = 0.3  # seconds to pause after the end-of-episode summary has been blanked out

epsilon = 0.9  # probability of acting greedily; a uniform draw above this triggers a random action
alpha = 0.1  # learning rate: fraction of the TD error applied on each update
gamma = 0.9  # discount factor applied to the next state's best Q-value
max_episodes = 13  # number of training episodes run by rl()


def build_q_table(n_states, actions):
    """Create the initial Q-table with every state-action value set to zero.

    Args:
        n_states: Number of rows to create (one per state).
        actions: Sequence of action names; each one becomes a column label.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_states, len(actions))`` filled
        with zeros, using the default integer row index.
    """
    initial_values = np.zeros(shape=(n_states, len(actions)))
    return pd.DataFrame(data=initial_values, columns=actions)


def choose_actions(state, q_table):
    """Pick the next action for ``state`` using an epsilon-greedy rule.

    A random action from ``ACTIONS`` is returned when a uniform draw exceeds
    ``epsilon`` or when every Q-value for this state is still zero (nothing
    has been learned for it yet). Otherwise the action with the largest
    Q-value is returned.

    Args:
        state: Positional row index of the current state in ``q_table``.
        q_table: DataFrame of Q-values with one column per action.

    Returns:
        The name of the chosen action.
    """
    q_values = q_table.iloc[state, :]  # the whole row of action values for this state
    draw_exceeds_epsilon = np.random.uniform() > epsilon
    nothing_learned = (q_values == 0).all()

    if draw_exceeds_epsilon or nothing_learned:
        # Explore: either the random draw said so, or the table holds no
        # information for this state yet.
        return np.random.choice(ACTIONS)

    # Exploit: take the action with the highest learned value.
    return q_values.idxmax()


def get_env_feedback(S, A):
    """Return the environment's response to taking action ``A`` in state ``S``.

    Args:
        S: Current state (cell index in the 1-D world).
        A: Action name. ``"right"`` moves right; any other value is handled
            as a move to the left.

    Returns:
        A tuple ``(S_, R)`` of next state and reward. ``S_`` is the string
        ``"terminal"`` with ``R == 1`` when the agent steps right from cell
        ``N_STATES - 2`` (the cell next to the treasure); in every other case
        ``R`` is 0.
    """
    if A != "right":
        # Moving left: stay put at the left wall, otherwise step back one cell.
        return (S if S == 0 else S - 1), 0

    if S == N_STATES - 2:
        # Stepping right from here reaches the treasure: episode over, reward earned.
        return "terminal", 1

    return S + 1, 0


def update_env(S, episode, step_cnt):
    """Redraw the one-line text display of the world on stdout.

    For a regular state the world is drawn as ``N_STATES - 1`` dashes followed
    by ``"T"``, with ``"o"`` placed at index ``S``. For the ``"terminal"``
    state an episode summary is shown for 2 seconds, then the line is blanked
    and the function pauses for ``FRESHTIME`` seconds.

    Every write starts with a carriage return and ends without a newline, so
    each frame is written over the previous one on the same line.

    Args:
        S: Current state (cell index) or the string ``"terminal"``.
        episode: Zero-based episode index; displayed as ``episode + 1``.
        step_cnt: Step count to report in the episode summary.
    """
    if S != "terminal":
        track = ["-"] * (N_STATES - 1) + ["T"]
        track[S] = "o"
        print("\r{}".format("".join(track)), end="")
        return

    summary = "Episode %s: total_steps = %s" % (episode + 1, step_cnt)
    print("\r{}".format(summary), end="")
    time.sleep(2)
    # Overwrite the summary with spaces so the next frame starts on a clean line.
    print("\r                                ", end="")
    time.sleep(FRESHTIME)

def rl():
    """Train the agent with tabular Q-learning and return the learned Q-table.

    Runs ``max_episodes`` episodes. Each episode starts in state 0 and repeats
    "choose action -> get environment feedback -> update Q-value" until the
    environment reports ``"terminal"``. The display is redrawn once at the
    start of the episode and again after every step.

    Returns:
        The Q-table (as created by ``build_q_table``) after all episodes.
    """
    table = build_q_table(N_STATES, ACTIONS)
    for episode in range(max_episodes):
        state = 0
        steps_taken = 0
        update_env(state, episode, steps_taken)

        while True:
            action = choose_actions(state, table)
            next_state, reward = get_env_feedback(state, action)
            current_estimate = table.loc[state, action]

            finished = next_state == "terminal"
            if finished:
                # Nothing follows the terminal state, so the target is the reward alone.
                target = reward
            else:
                # Bootstrap from the best action value available in the next state.
                target = reward + gamma * table.iloc[next_state, :].max()

            # Move the estimate a fraction ``alpha`` of the way toward the target.
            table.loc[state, action] += alpha * (target - current_estimate)

            steps_taken += 1
            state = next_state
            update_env(state, episode, steps_taken)
            if finished:
                break
    return table

In [None]:
# Train the agent; rl() animates the world on a single stdout line while it runs.
q_table = rl()

# The animation's last write is "\r" followed by spaces with no newline, which
# leaves the cursor at the end of a blank line. Return to column 0 first so the
# table's header row is not pushed out of alignment with its data rows.
print("\r", end="")
print(q_table)

                                       left     right
0  0.000003  0.004810
1  0.000000  0.027191
2  0.000000  0.112258
3  0.001873  0.340790
4  0.000000  0.745813
5  0.000000  0.000000
