In [39]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
import matplotlib.pyplot as plt
from tqdm import tqdm

In [40]:
# CliffWalking wrapped in a TimeLimit so that an episode is cut off (truncated)
# after at most 30 steps instead of running until the agent finds the goal.
env = TimeLimit(gym.make('CliffWalking-v0'), max_episode_steps=30)
# Separate seeds: one is passed to training, the other to the evaluation runs.
train_seed = 42
test_seed = 44

In [41]:
# Seeded reset; the second return value (the info dict) is discarded.
observation, _ = env.reset(seed=test_seed)
# Display the initial observation (the integer index of the start state).
observation

36

In [42]:
def play_policy(env, policy=None):
    """
    Runs a single episode on ``env`` and reports how it went.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and an
            ``action_space`` with ``n`` and ``sample()``.
        policy: Optional table indexed by observation whose rows are action
            probabilities. When ``None``, actions are sampled from
            ``env.action_space``.

    Returns:
        tuple: ``(episode_reward, elapsed_steps)`` - the sum of all rewards
        received and the number of steps taken until the episode was
        terminated or truncated.
    """
    current_obs, _ = env.reset()
    total_reward = 0
    step_count = 0

    done = False
    while not done:
        if policy is not None:
            # Draw an action according to the row of probabilities for this observation.
            chosen = np.random.choice(env.action_space.n, p=policy[current_obs])
        else:
            chosen = env.action_space.sample()

        current_obs, reward, terminated, truncated, _ = env.step(chosen)
        total_reward += reward
        step_count += 1
        # Either a terminal state or the time limit ends the episode.
        done = terminated or truncated

    return total_reward, step_count

In [43]:
def test_policy(n, env, policy, test_seed):
    env.reset(seed=test_seed)
    rewards = [play_policy(env, policy)[0] for _ in range(n)]
    return np.mean(rewards), np.std(rewards)

In [44]:
# Inspect the attributes of the underlying (unwrapped) environment, e.g. the
# grid shape, the start state index, the cliff mask and the transition table P.
env.unwrapped.__dict__

{'shape': (4, 12),
 'start_state_index': np.int64(36),
 'nS': np.int64(48),
 'nA': 4,
 'is_slippery': False,
 '_cliff': array([[False, False, False, False, False, False, False, False, False,
         False, False, False],
        [False, False, False, False, False, False, False, False, False,
         False, False, False],
        [False, False, False, False, False, False, False, False, False,
         False, False, False],
        [False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False]]),
 'P': {0: {0: [(1.0, np.int64(0), -1, False)],
   1: [(1.0, np.int64(1), -1, False)],
   2: [(1.0, np.int64(12), -1, False)],
   3: [(1.0, np.int64(0), -1, False)]},
  1: {0: [(1.0, np.int64(1), -1, False)],
   1: [(1.0, np.int64(2), -1, False)],
   2: [(1.0, np.int64(13), -1, False)],
   3: [(1.0, np.int64(0), -1, False)]},
  2: {0: [(1.0, np.int64(2), -1, False)],
   1: [(1.0, np.int64(3), -1, False)],
   2: [(1.0, np.int64(14), -1, False)],
   3: [(1.0, np.int

In [47]:
def softmax(x):
    """
    Turns logits into probabilities along the last axis.

    Args:
        x (np.ndarray): Logits; the last axis is the one being normalised.

    Returns:
        np.ndarray: Array of the same shape whose entries along the last axis
        are non-negative and sum to 1.
    """
    # Shifting by the maximum leaves the result unchanged but keeps exp() from overflowing.
    peak = np.max(x, axis=-1, keepdims=True)
    weights = np.exp(x - peak)
    normaliser = np.sum(weights, axis=-1, keepdims=True)
    return weights / normaliser

def mc_exploring_start(env, train_seed, no_episodes=500000, gamma=1, n_start_states=37):
    """
    Monte Carlo control with exploring starts.

    Every episode begins in a uniformly drawn state with a uniformly drawn
    first action and then follows the current policy. After the episode, the
    action values of every visited (state, action) pair are updated with an
    incremental mean of the observed returns (every-visit), and the policy of
    each visited state is made greedy with respect to the updated values.

    All random draws come from a generator seeded with ``train_seed``, so two
    calls with the same arguments produce the same result regardless of
    NumPy's global random state.

    Args:
        env: Discrete environment (possibly wrapped). Its underlying
            environment must keep the current state in the attribute ``s``,
            which is overwritten to force the start state.
        train_seed (int): Seed for the environment and for all sampling done
            by this function.
        no_episodes (int): Number of training episodes.
        gamma (float): Discount factor.
        n_start_states (int): Start states are drawn from
            ``range(n_start_states)``. The default 37 matches the 4x12
            CliffWalking grid: it covers states 0-36 and thereby excludes the
            cliff cells 37-46 and the last cell 47.

    Returns:
        tuple: ``(policy, q)`` - both arrays of shape
        ``(no_states, no_actions)``. Rows of ``policy`` for visited states are
        one-hot; rows of states that were never visited stay uniform.

    Raises:
        ValueError: If ``n_start_states`` is not in ``1..no_states``.
    """
    env.reset(seed=train_seed)
    # Dedicated generator: env.reset(seed=...) does not seed NumPy's global RNG.
    rng = np.random.default_rng(train_seed)

    # Wrappers forward the spaces, so no assumption about the wrapper depth is needed.
    no_states = env.observation_space.n
    no_actions = env.action_space.n
    if not 0 < n_start_states <= no_states:
        raise ValueError("n_start_states must be between 1 and the number of states")

    policy = np.ones((no_states, no_actions)) / no_actions
    q = np.zeros(policy.shape)
    c = np.zeros(policy.shape)  # visit counts per (state, action)

    for _ in tqdm(range(no_episodes)):
        # Exploring start: random state and random first action.
        state = int(rng.integers(n_start_states))
        action = int(rng.integers(no_actions))

        # reset() clears the episode bookkeeping; the state is then forced manually.
        env.reset()
        env.unwrapped.s = state

        state_actions = []
        rewards = []
        while True:
            state_actions.append((state, action))
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break
            action = int(rng.choice(no_actions, p=policy[state]))

        # Walk the episode backwards so the return can be accumulated in one pass.
        # For truncated episodes the return only contains the rewards seen so far.
        g = 0
        for (state, action), reward in zip(reversed(state_actions), reversed(rewards)):
            g = gamma * g + reward
            c[state, action] += 1
            q[state, action] += (g - q[state, action]) / c[state, action]

            # Greedy improvement; ties are resolved in favour of the lowest action index.
            policy[state] = 0
            policy[state, q[state].argmax()] = 1
    return policy, q

In [49]:
# Train with the default settings of mc_exploring_start (500000 episodes, gamma=1).
policy, q = mc_exploring_start(env, train_seed)

100%|██████████| 500000/500000 [05:18<00:00, 1571.23it/s]


In [52]:
# Most probable action of every state, laid out as 4 grid rows.
# Action codes as implied by the transition table P shown earlier
# (from state 0: action 1 -> state 1, action 2 -> state 12): 0=up, 1=right, 2=down, 3=left.
policy.argmax(axis=1).reshape(4, -1)

array([[1, 1, 1, 1, 2, 3, 2, 2, 2, 1, 2, 2],
       [1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 2, 3],
       [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [54]:
# Evaluate the learned policy over 200 episodes; returns (mean, std) of the episode rewards.
test_policy(200, env, policy, test_seed)

(np.float64(-19.0), np.float64(0.0))