<a href="https://colab.research.google.com/github/wko1014/RL_Study/blob/main/notes/Planning_and_Learning_with_Tabular_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import APIs
import gym
import numpy as np
import random

# APIs for animation
from IPython.display import clear_output
from time import sleep

In [None]:
# Build the Taxi environment, put it in a start state, and draw it.
ENV_ID = 'Taxi-v2'
# `.env` takes the environment held inside the object returned by gym.make
# (presumably the step-limit wrapper -- confirm against the installed Gym).
env = gym.make(ENV_ID).env
env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+



In [None]:
%%time
# Tabular Dyna-Q: every real step performs one direct Q-learning update, stores
# the observed transition in a model, and then replays n remembered
# transitions from that model (planning).
num_states = env.observation_space.n
num_actions = env.action_space.n
num_iterations = 1000  # number of training episodes
# Step size, exploration rate, discount factor. lambda_ is not used in this cell.
alpha, epsilon, gamma, lambda_ = .4, .1, .9, .9
n = 5  # planning updates per real step; n = 0 turns planning off
# Initialize
Q = np.zeros([num_states, num_actions])
# model[s, a] holds (reward, next state) of the latest real transition from (s, a).
model = np.zeros([num_states, num_actions, 2])

# Repeat
for episode in range(num_iterations):  # named `episode`: `iter` would shadow the builtin
  S = env.reset() # Initialize S
  # trajectory[s, a] is 1 once (s, a) has been taken in the current episode.
  trajectory = np.zeros([num_states, num_actions])
  # Loop for each step of episode
  done = False
  while not done:
    # Pin the simulator to S before stepping (env.s is presumably the
    # environment's current-state attribute -- confirm for the Gym version used).
    env.s = S
    # Epsilon-greedy action selection
    if np.random.rand() < epsilon:
      A = env.action_space.sample() # take a random action
    else:
      A = np.argmax(Q[S, :])
    S_prime, R, done, info = env.step(int(A))
    Q[S, A] += alpha * (R + gamma * np.max(Q[S_prime, :]) - Q[S, A]) # Direct RL
    model[S, A, 0], model[S, A, 1] = R, int(S_prime) # Model learning (assumes a deterministic env.)
    trajectory[S, A] = 1
    # Planning uses its own variables so the real transition (S, A, S_prime)
    # is never overwritten.
    for _ in range(n): # if n = 0, the algorithm does no planning.
      # Random state visited in this episode.
      S_plan = np.random.choice(np.flatnonzero(trajectory.sum(axis=-1)))
      # Random action actually taken in that state: sample among the indices
      # of the non-zero flags. Sampling the flag values themselves only ever
      # yields 0 or 1 and can replay a model entry that was never recorded.
      A_plan = int(np.random.choice(np.flatnonzero(trajectory[S_plan, :])))
      R_plan, S_next_plan = model[S_plan, A_plan, 0], int(model[S_plan, A_plan, 1])
      Q[S_plan, A_plan] += alpha * (R_plan + gamma * np.max(Q[S_next_plan, :]) - Q[S_plan, A_plan])
    S = S_prime

CPU times: user 5.88 s, sys: 2.88 ms, total: 5.89 s
Wall time: 5.89 s


In [None]:
# The agent has now been trained with tabular Dyna-Q.
# Roll out the greedy policy for one episode and record every frame.

# Initialize
state = env.reset()

epochs, penalties, reward = 0, 0, 0
# For animation
animation_dyna_Q = []
done = False
# Step cap: `env` was created with `.env`, so nothing here ends the episode
# except `done`; a greedy policy that cycles would otherwise loop forever.
max_steps = 200

while not done and epochs < max_steps:
  action = int(np.argmax(Q[state, :]))
  state, reward, done, info = env.step(action)

  # Taxi gives a reward of -10 for an illegal pickup/dropoff (per the Gym Taxi
  # reward description); count those as penalties.
  if reward == -10:
    penalties += 1

  animation_dyna_Q.append({"frame": env.render(mode="ansi"), "state":state,
                    "action":action, "reward":reward})
  epochs += 1

if done:
  print("The agent used {} timesteps for delivery.".format(epochs))
else:
  print("The agent did not finish within {} timesteps.".format(max_steps))
print("The agent got {} penalties.".format(penalties))

The agent used 14 timesteps for delivery.
The agent got 0 penalties.


In [None]:
%%time
# To animate, we define a function.
def animating(frames, time_per_frame):
  """Replay recorded frames in the cell output, one at a time.

  Args:
    frames: sequence of dicts with the keys "frame", "state", "action" and
      "reward". "frame" may be an object with a getvalue() method (such as
      io.StringIO) or an already-rendered string.
    time_per_frame: seconds to pause after each frame is printed.
  """
  for i, frame in enumerate(frames):
    rendered = frame["frame"]
    # Accept both a StringIO-like object and a plain string, since
    # render(mode="ansi") returns one or the other depending on Gym version.
    text = rendered.getvalue() if hasattr(rendered, "getvalue") else rendered
    clear_output(wait=True)
    print(text)
    print(f"Timesteps: {i}")
    print(f"State: {frame['state']}")
    print(f"Action: {frame['action']}")
    print(f"Reward: {frame['reward']}")
    sleep(time_per_frame)

animating(animation_dyna_Q, 0.5)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timesteps: 13
State: 0
Action: 5
Reward: 20
CPU times: user 38.5 ms, sys: 7.32 ms, total: 45.8 ms
Wall time: 7.03 s
