## Q-Table
- The environment is a 4x4 grid, i.e. 16 states
- 4 actions are available in each state
- Total = 16 states x 4 actions = 64 entries in the Q-table

## State | Description | Reward
- S | Agent's starting point - safe | 0
- F | Frozen surface - safe | 0
- H | Hole - game over | 0
- G | Goal - game over | 1

In [27]:
import numpy as np
import random
import time
from IPython.display import clear_output

In [28]:
import gymnasium as gym
# FrozenLake-v1 is slippery by default; the flag is spelled out so this training
# environment visibly matches the one created later for playback.
env = gym.make("FrozenLake-v1", is_slippery=True)


In [29]:
# Table dimensions are read from the environment's discrete spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
print(action_space_size, state_space_size, sep="\n")

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

4
16
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [30]:
# Training schedule: number of episodes and the per-episode step cap used by
# the training loop.
num_episodes = 10_000
max_steps_per_episode = 100

# Q-table update parameters.
learning_rate = 0.1    # weight given to the new estimate in each update
discount_rate = 0.99   # factor applied to the best next-state value

# Epsilon-greedy exploration: starts at the maximum and is recomputed after
# every episode from the min/max bounds and the decay rate.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [31]:
def to_index(x):
    """Return a plain Python int from gym/gymnasium outputs.

    Accepts a scalar, a 0-d array, an array holding at least one element, or a
    tuple/list whose first item is the value of interest (for example an
    unsplit ``(obs, info)`` pair).

    Parameters
    ----------
    x : scalar, array-like, tuple or list
        Observation or action to convert.

    Returns
    -------
    int
        The first scalar found in ``x``.

    Raises
    ------
    IndexError
        If ``x`` is an empty sequence or an empty array.
    """
    if isinstance(x, (tuple, list)):
        # e.g. an unsplit (obs, info) pair: keep only the leading item.
        x = x[0]
    x = np.asarray(x)
    if x.ndim == 0:
        return int(x.item())
    # Flatten before indexing. The previous `x.squeeze()[0]` failed on
    # shape-(1,) input because squeeze() collapses it to a 0-d array, which
    # cannot be indexed.
    return int(x.reshape(-1)[0])

In [32]:
rewards_all_episodes = []

# Q-Learning algorithm
# Runs `num_episodes` episodes of at most `max_steps_per_episode` steps each and
# updates `q_table` in place after every step. `q_table` is not re-initialised
# in this cell, so running the cell again continues from the current table.
for episode in range(num_episodes):
    # gymnasium's reset() returns (obs, info); older gym returns obs alone.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    state = to_index(state)

    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: exploit the table when the random
        # draw exceeds the current exploration rate, otherwise sample an action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # Both branches may yield numpy integers; normalise to a Python int.
        action = to_index(action)

        # Take the step; support both the 5-tuple (gymnasium) and the
        # 4-tuple (old gym) return signatures.
        step_out = env.step(action)
        if len(step_out) == 5:
            new_state, reward, terminated, truncated, info = step_out
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_out
        new_state = to_index(new_state)

        # Update Q-table for Q(s, a): blend the old value with the observed
        # reward plus the discounted best value of the next state.
        best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * (reward + discount_rate * best_next_value)
        )

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay (exponential in the episode index)
    exploration_rate = min_exploration_rate + (
        max_exploration_rate - min_exploration_rate
    ) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per thousand episodes.
# Slicing in fixed-size chunks avoids handing np.split a float section count and
# also works when num_episodes is not a multiple of 1000 (the last chunk is
# simply shorter). Taking the mean of each chunk avoids the rounding noise that
# summing 1000 pre-divided values produced.
episodes_per_report = 1000
all_rewards = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    all_rewards[start:start + episodes_per_report]
    for start in range(0, len(all_rewards), episodes_per_report)
]
count = 0
print("**************Average reward per thousand episodes*****************\n")
for r in rewards_per_thousand_episodes:
    count += len(r)  # label each line with the number of episodes seen so far
    print(count, ": ", str(r.mean()))

# Print updated Q-table
print("\n\n*********Q-table**********\n")
print(q_table)

**************Average reward per thousand episodes*****************

1000 :  0.03900000000000003
2000 :  0.21000000000000016
3000 :  0.4130000000000003
4000 :  0.5660000000000004
5000 :  0.6240000000000004
6000 :  0.6690000000000005
7000 :  0.6660000000000005
8000 :  0.6790000000000005
9000 :  0.6930000000000005
10000 :  0.6690000000000005


*********Q-table**********

[[0.5174961  0.49592153 0.49586868 0.49180099]
 [0.31730621 0.27423425 0.30512562 0.46288829]
 [0.41419516 0.41669967 0.41505115 0.42655903]
 [0.23014825 0.30342966 0.24666801 0.41899365]
 [0.54200034 0.33079424 0.32154405 0.44031955]
 [0.         0.         0.         0.        ]
 [0.29701529 0.19360101 0.19881218 0.11365408]
 [0.         0.         0.         0.        ]
 [0.40101514 0.51439608 0.44732942 0.56696412]
 [0.40224636 0.57602811 0.41140909 0.39438222]
 [0.42419878 0.46499709 0.34075778 0.35313208]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.42977405 0.64

In [34]:
# If needed (once): pip install gymnasium
# %pip install gymnasium

import time
import numpy as np
from IPython.display import clear_output
import gymnasium as gym

# ---- Environment: FrozenLake-v1 only ----
# Text ("ansi") rendering so each frame can be printed in the cell output.
env = gym.make(
    "FrozenLake-v1",
    is_slippery=True,
    render_mode="ansi",
)
#env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")

n_states, n_actions = env.observation_space.n, env.action_space.n

# Sanity check: q_table must already exist and match env dims
# assert q_table.shape == (n_states, n_actions)

# Step cap for each played episode. This rebinds the name the training cell
# used with a value of 100.
max_steps_per_episode = 200

def to_index(x):
    """Coerce obs/actions to plain Python int.

    Accepts a scalar, a 0-d array, an array holding at least one element, or a
    tuple/list whose first item is the value of interest.

    Parameters
    ----------
    x : scalar, array-like, tuple or list
        Observation or action to convert.

    Returns
    -------
    int
        The first scalar found in ``x``.

    Raises
    ------
    IndexError
        If ``x`` is an empty sequence or an empty array.
    """
    if isinstance(x, (tuple, list)):
        # Keep only the leading item of a sequence before converting.
        x = x[0]
    x = np.asarray(x)
    if x.ndim == 0:
        return int(x.item())
    # Flatten before indexing. The previous `x.squeeze()[0]` failed on
    # shape-(1,) input because squeeze() collapses it to a 0-d array, which
    # cannot be indexed.
    return int(x.reshape(-1)[0])

def render_text(env):
    """Render the environment and return the frame as one printable string.

    A list or tuple returned by ``env.render()`` is joined with newlines, each
    item being passed through ``str``; any other return value is converted
    with ``str`` directly.
    """
    rendered = env.render()
    if not isinstance(rendered, (list, tuple)):
        return str(rendered)
    return "\n".join(str(part) for part in rendered)

# ---- Play a few episodes using the learned Q-table ----
# Reads `q_table` as left by the training cell and always takes the greedy
# action. The try/finally makes sure the environment is closed even when the
# animation is interrupted part-way through.
try:
    for episode in range(3):
        state, info = env.reset()
        state = to_index(state)
        done = False

        print(f"***** EPISODE {episode+1} *****\n")
        time.sleep(0.6)

        for step in range(max_steps_per_episode):
            # Animate: replace the previous frame with the current one.
            clear_output(wait=True)
            print(render_text(env))
            time.sleep(0.25)

            # Greedy action from Q-table
            action = int(np.argmax(q_table[state]))

            # Gymnasium step signature (5-tuple)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            next_state = to_index(next_state)

            if done:
                clear_output(wait=True)
                print(render_text(env))
                # An episode can also end by truncation with no reward; that
                # is not a fall, so it gets its own message.
                if reward == 1:
                    print("***** You reached the goal! *****")
                elif terminated:
                    print("***** You fell through a hole! *****")
                else:
                    print("***** Episode truncated before reaching the goal! *****")
                time.sleep(1.2)
                break

            state = next_state
finally:
    env.close()


  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG

***** You fell through a hole! *****
