# Reinforcement Q-Learning from Scratch in Python with OpenAI Gym

https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

In [None]:
#!pip install cmake 'gym[atari]' scipy

In [1]:
from time import sleep

import gym
from IPython.display import clear_output

In [3]:
# Build the Taxi-v3 environment with the "human" render mode.
env = gym.make(
    "Taxi-v3",
    render_mode="human",
)

# Begin from a fresh, randomly chosen state, then draw it.
env.reset()
env.render()


In [5]:
import numpy as np

# Q-table: one row per discrete state, one column per discrete action,
# with every entry initialised to zero.
n_states = env.observation_space.n
n_actions = env.action_space.n
q_table = np.zeros((n_states, n_actions))

# Display the table dimensions as the cell output.
q_table.shape

(500, 6)

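`env.step(action)` returns the following values:

* observation: The state of the environment after the action
* reward: A number indicating whether your action was beneficial or not
* done: Indicates whether we have successfully picked up and dropped off a passenger, which ends one episode
* truncated: Indicates whether the episode was cut short before it finished (for example, by a step limit)
* info: Additional info such as performance and latency for debugging purposes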

In [7]:
# Show the action and state spaces of the environment.
print(f"\nAction Space {env.action_space}")
print(f"\nState Space {env.observation_space}")

# `env` is the object returned by gym.make(), which wraps the actual Taxi
# environment. Taxi-specific helpers and state are reached via `env.unwrapped`.
state = env.unwrapped.encode(3, 1, 2, 0)  # (taxi row, taxi column, passenger index, destination index)
print("\nState:", state)

# Assigning `env.s = state` would only create a new attribute on the outer
# wrapper and leave the real environment state untouched, so set it on the
# unwrapped environment instead.
env.unwrapped.s = state
env.render()


Action Space Discrete(6)

State Space Discrete(500)

State: 328


In [8]:
import pprint as pp

# Reward/transition table entry for state 328, keyed by action.
# NOTE(review): each tuple looks like (probability, next state, reward, done) - confirm.
# `P` belongs to the Taxi environment itself, so read it from `env.unwrapped`
# rather than relying on the gym.make() wrappers to forward the attribute.
pp.pprint(env.unwrapped.P[328])

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}


In [9]:
# Start from the illustration's state. The assignment has to target the
# unwrapped Taxi environment: `env.s = 328` would only add an attribute to the
# outer wrapper returned by gym.make(), and the run would not start from 328.
env.unwrapped.s = 328

N_STEPS = 100  # fixed step budget; the environment is reset whenever an episode ends

epochs = 0
penalties = 0

frames = []  # one record per step, replayed later for animation

for i in range(N_STEPS):
    # Act randomly: sample an action from the action space and apply it.
    action = env.action_space.sample()
    state, reward, done, truncated, info = env.step(action)

    # -10 is the penalty reward (actions 4 and 5 in the state-328 reward table).
    if reward == -10:
        penalties += 1

    # Record this step for the replay.
    # NOTE(review): env.render() presumably returns None under
    # render_mode="human", so 'frame' carries no text board - confirm.
    frames.append({
        'frame': env.render(),
        'state': state,
        'action': action,
        'reward': reward
    })

    epochs += 1
    clear_output(wait=True)
    print(f"Timestep: {i + 1}")

    # Start a new episode once the current one has finished or been cut short.
    if done or truncated:
        observation, info = env.reset()

print(f"\nTimesteps taken (epochs): {epochs}")
print(f"\nPenalties incurred: {penalties}")

Timestep: 50

Timesteps taken (epochs): 50

Penalties incurred: 12


In [10]:
def print_frames(cadres, delay=.1):
    """Replay recorded steps one at a time in the notebook output area.

    Args:
        cadres: Iterable of dicts, one per step, with the keys 'state',
            'action' and 'reward'. An optional 'frame' entry is printed
            above the step details when it is a string; any other value
            (for example None) is ignored.
        delay: Seconds to pause after each step. Defaults to 0.1, the pause
            this function used before the parameter existed.

    Returns:
        None. Output is written with print(); the previous step's output is
        cleared before each new step is shown.
    """
    for step_number, frame in enumerate(cadres, start=1):
        clear_output(wait=True)

        # Only text frames can be shown with print(); skip anything else.
        board = frame.get('frame')
        if isinstance(board, str):
            print(board)

        print(f"Timestep: {step_number}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

# Replay the steps recorded by the random-agent loop above.
# NOTE(review): the original tutorial showed an ASCII board for each step via
# env.render(mode='ansi'); presumably that requires creating the environment
# in an 'ansi' render mode so that the recorded frames contain text - confirm.
print_frames(frames)

Timestep: 50
State: 321
Action: 5
Reward: -10
