<a href="https://colab.research.google.com/github/wko1014/RL_Study/blob/main/notes/Dynamic_Programming2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import APIs
import gym
import numpy as np
import random

# APIs for animation
from IPython.display import clear_output
from time import sleep

In [None]:
# Create the Taxi environment. `.env` takes the inner environment object from
# what gym.make returns (presumably dropping gym's wrapper -- confirm for the
# installed gym version).
env = gym.make('Taxi-v2').env
env.reset()
env.render()

# Summarize the action space and the state space.
action_space_line = "Possible actions: {}".format(env.action_space)
state_space_line = "Possible states: {}\n".format(env.observation_space)
print(action_space_line)
print(state_space_line)

# Human-readable legends for the rendering colors, the action codes and the
# reward scheme, printed in that order.
rendering_legend = ("Rendering:\n -blue:passenger\n -magenta: destination\n -yellow:"
                    "empty taxi\n -green: full taxi\n -other letters (R, G, B, and Y):"
                    "locations for passengers and destinations\n")
action_legend = ("There are 6 discrete deterministic actions:\n"
                 " -0: move south\n -1: move north\n -2: move east\n -3: move west\n"
                 " -4: pickup passenger\n -5: dropoff passenger\n")
reward_legend = ("Rewards:\n There is a reward of -1 for each action"
                 " and an additional reward of + 20 for deliver.\n"
                 " There is a reward of -10 for executing actions pickup and dropoff illegally.")
for legend in (rendering_legend, action_legend, reward_legend):
  print(legend)

+---------+
|R: | : :[35mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

Possible actions: Discrete(6)
Possible states: Discrete(500)

Rendering:
 -blue:passenger
 -magenta: destination
 -yellow:empty taxi
 -green: full taxi
 -other letters (R, G, B, and Y):locations for passengers and destinations

There are 6 discrete deterministic actions:
 -0: move south
 -1: move north
 -2: move east
 -3: move west
 -4: pickup passenger
 -5: dropoff passenger

Rewards:
 There is a reward of -1 for each action and an additional reward of + 20 for deliver.
 There is a reward of -10 for executing actions pickup and dropoff illegally.


In [None]:
# The state space is represented by:
# (taxi_row, taxi_col, passenger_location, destination)
state = env.encode(3, 1, 2, 0)
print("State code:", state)

# Put the environment directly into that state and draw it.
env.s = state
env.render()

# Transition table of the encoded state.
# For each action 0-5, every entry is a tuple of (transition probability,
# next state, immediate reward, episode-terminated flag).
# Indexed with `state` instead of a hard-coded 328 so the table displayed
# always belongs to the state encoded at the top of this cell.
env.P[state]

State code: 328
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [None]:
# Roll out one episode with a random policy (no policy improvement yet).

# Episode statistics.
epochs = 0
penalties = 0
reward = 0
# Frames recorded for the animation.
animation = []
done = False

while True:
  # Sample an action at random from the action space and apply it.
  action = env.action_space.sample()
  state, reward, done, info = env.step(action)

  # A reward of -10 marks an illegal pickup/dropoff; count it as a penalty.
  penalties += 1 if reward == -10 else 0

  # Keep a snapshot of this step so the episode can be replayed later.
  snapshot = dict(frame=env.render(mode="ansi"), state=state,
                  action=action, reward=reward)
  animation.append(snapshot)
  epochs += 1

  # Stop once the environment reports that the episode has ended.
  if done:
    break

print("The agent used {} timesteps for delivery.".format(epochs))
print("The agent got {} penalties.".format(penalties))

The agent used 256 timesteps for delivery.
The agent got 58 penalties.


In [None]:
%%time
# To animate, we define a function.
def animating(frames, time_per_frame):
  """Replay recorded frames in the cell output, one frame at a time.

  Args:
    frames: iterable of dicts with the keys "frame", "state", "action" and
      "reward". "frame" is either a StringIO-like object exposing
      `getvalue()` or an already rendered string.
    time_per_frame: pause after each frame, in seconds (passed to `sleep`).
  """
  for i, frame in enumerate(frames):
    rendered = frame["frame"]
    # Depending on the gym release, render(mode="ansi") hands back either a
    # StringIO or the text itself; accept both instead of failing on
    # `.getvalue()` for plain strings.
    if hasattr(rendered, "getvalue"):
      rendered = rendered.getvalue()
    # wait=True delays clearing until new output arrives, avoiding flicker.
    clear_output(wait=True)
    print(rendered)
    print(f"Timesteps: {i}")
    print(f"State: {frame['state']}")
    print(f"Action: {frame['action']}")
    print(f"Reward: {frame['reward']}")
    sleep(time_per_frame)
    
# Replay the recorded random-policy episode, pausing 0.05 s after each frame.
animating(animation, 0.05)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timesteps: 255
State: 0
Action: 5
Reward: 20
CPU times: user 693 ms, sys: 187 ms, total: 880 ms
Wall time: 13.3 s


In [None]:
%%time
# Let us re-implement value iteration algorithm for this environment.
# Define hyperparameters
num_actions = env.action_space.n
num_states = env.observation_space.n
num_iterations = 1000
delta = 10**-4 # a small threshold
gamma = 0.9 # a discounted factor

# Define reward, transition, value, action-value matrices.
# R, T and `continuing` are indexed as [state, action, next_state].
R = np.zeros([num_states, num_actions, num_states])
T = np.zeros([num_states, num_actions, num_states])
# continuing[s, a, s'] is 0 where the transition ends the episode, so that no
# future value is bootstrapped past a terminal transition.
continuing = np.ones([num_states, num_actions, num_states])
V = np.zeros([num_states])
Q = np.zeros([num_states, num_actions])

for state in range(num_states):
  for action in range(num_actions):
    for trans in env.P[state][action]:
      prob, s_prime, reward, done = trans
      # If several listed outcomes share a next state, the last reward wins.
      R[state, action, s_prime] = reward
      # Accumulate rather than overwrite, for the same shared-next-state case.
      T[state, action, s_prime] += prob
      if done:
        continuing[state, action, s_prime] = 0.0
    # Normalize to a proper distribution; skip pairs with no listed outcome
    # to avoid dividing by zero.
    total = np.sum(T[state, action, :])
    if total > 0:
      T[state, action, :] /= total

# `iters` holds the index of the sweep at which the values converged, or
# num_iterations when the threshold was never reached.
iters = num_iterations
for iteration in range(num_iterations):
  V_previous = V
  # Bellman optimality backup: expected reward plus discounted next-state
  # value, with the next-state value dropped on terminal transitions.
  Q = np.einsum("ijk,ijk->ij", T, R + gamma * continuing * V)
  V = np.max(Q, axis=1)
  # Update the greedy policy before the convergence test so that it always
  # matches the latest action values, including on the sweep that stops.
  policy = np.argmax(Q, axis=1)
  if np.max(np.abs(V - V_previous)) < delta:
    iters = iteration
    break

CPU times: user 879 ms, sys: 38.6 ms, total: 918 ms
Wall time: 919 ms


In [None]:
# Roll out the greedy policy obtained from the value iteration algorithm.

# Initialize
state = env.encode(3, 1, 2, 0)
env.s = state

epochs, penalties, reward = 0, 0, 0
# For animation
animation_val_iter = []
done = False

# This loop has no step limit of its own, so a policy that never reaches a
# terminal state would spin forever and keep growing the frame list. With
# deterministic transitions, a fixed policy that has not finished after as
# many steps as there are states must have revisited a state, so it never
# will finish; cap the rollout there.
max_steps = env.observation_space.n

while not done and epochs < max_steps:
  action = policy[state]
  state, reward, done, info = env.step(action)
  # A reward of -10 marks an illegal pickup/dropoff; count it as a penalty.
  if reward == -10:
    penalties += 1

  animation_val_iter.append({"frame": env.render(mode="ansi"), "state":state,
                    "action":action, "reward":reward})
  epochs += 1

if done:
  print("The agent used {} timesteps for delivery.".format(epochs))
else:
  # Report honestly instead of claiming a delivery that did not happen.
  print("The agent did not finish the delivery within {} timesteps; "
        "the policy appears to be stuck in a loop.".format(epochs))
print("The agent got {} penalties.".format(penalties))

The agent used 10 timesteps for delivery.
The agent got 0 penalties.


In [None]:
# Replay the episode recorded under the value-iteration policy, pausing 0.5 s after each frame.
animating(animation_val_iter, .5)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timesteps: 9
State: 0
Action: 5
Reward: 20
