In [1]:
import numpy as np
import time

np.random.seed(2)

N_STATES = 10
ACTIONS = ["left", "right"]
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 0.9
MAX_EPISODES = 13
FRESH_TIME = 0.1


def build_q_table(n_states, actions):
    table = np.zeros((n_states, len(actions)))
    return table

def choose_action(state, q_table):
    state_actions = q_table[state, :]
    # act non-greedy or state action has no value
    if (np.random.uniform() > EPSILON) or (state_actions.all() == 0):
        ac = np.random.choice(ACTIONS)
    # act greedy
    else:
        ac = ACTIONS[np.argmax(state_actions)]
    return ac

def get_env_feedback(state, action):
    """This is how the agent - environment interaction happens"""

    if action == 'right':
        # about to walk into the treasure state
        if state == N_STATES - 2:
            S_ = 'terminal'
            R = 1
        else:
            S_ = state + 1
            R = 0
    else:
        R = 0
        S_ = max(state - 1, 0)
    return (S_, R)

def update_env(state, episode, step_counter):

    env = ['-']*(N_STATES-1) + ['T']
    if state == 'terminal':
        interaction = ('Episode: %s | total_steps: %s') % (
            episode, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')

    else:
        env[state] = 'o'
        interaction = ''.join(env)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    """Main part of Reinforcement Learning"""
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in np.arange(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            ind_action = 0
            if A == 'right':
                ind_action = 1
            q_predict = q_table[S, ind_action]
            if S_ != 'terminal':
                q_target = R + GAMMA * np.max(q_table[S_, :])
            else:
                q_target = R
                is_terminated = True

            # update rule
            q_table[S, ind_action] += ALPHA * (q_target - q_predict)
            S = S_

            update_env(S, episode, step_counter)
            step_counter += 1

    return q_table

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table\n')
    print(q_table)

                                
Q-table

[[2.06107700e-10 8.03637579e-07]
 [8.81345466e-09 1.35570462e-05]
 [3.57324939e-07 1.92406137e-04]
 [1.71527422e-05 1.00760490e-03]
 [4.66636838e-05 6.25308951e-03]
 [2.18481300e-06 3.33378389e-02]
 [1.75566233e-03 1.22425293e-01]
 [7.29000000e-05 3.63157937e-01]
 [1.79545578e-02 7.45813417e-01]
 [0.00000000e+00 0.00000000e+00]]
