In [3]:
pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.2.2-py3-none-any.whl.metadata (10 kB)
Collecting cloudpickle>=1.2.0 (from gymnasium)
  Downloading cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.2.2-py3-none-any.whl (952 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m952.1/952.1 kB[0m [31m4.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading cloudpickle-3.1.2-py3-none-any.whl (22 kB)
Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, cloudpickle, gymnasium
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [gymnasium]/3[0m [gymnasium]
[1A[2KSuccessfully installed cloudpickle-3.1.2 farama-notifications-0.0.4 gymnasium-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import gymnasium as gym
import numpy as np

# Create environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Get number of states and actions
n_states = env.observation_space.n
n_actions = env.action_space.n

# Initialize uniform random policy
policy = np.ones((n_states, n_actions)) / n_actions

# Hyperparameters
learning_rate = 0.1
gamma = 0.99
episodes = 2000

# -------- TRAINING (Monte Carlo) --------
for _ in range(episodes):
    state, _ = env.reset()
    trajectory = []
    done = False

    # Generate one full episode
    while not done:
        action = np.random.choice(n_actions, p=policy[state])
        next_state, reward, done, _, _ = env.step(action)
        trajectory.append((state, action, reward))
        state = next_state

    # Monte Carlo return update
    G = 0
    for s, a, r in reversed(trajectory):
        G = r + gamma * G
        policy[s, a] += learning_rate * G

        # Keep probabilities valid
        policy[s] = np.maximum(policy[s], 0)
        policy[s] /= np.sum(policy[s])

# -------- TESTING --------
success = 0
for _ in range(1000):
    state, _ = env.reset()
    done = False
    while not done:
        action = np.argmax(policy[state])
        state, reward, done, _, _ = env.step(action)
        success += reward

print(f"Success Rate: {success / 1000:.2%}")

# -------- DISPLAY POLICY --------
arrows = ["<", "v", ">", "^"]
print("\nLearned Best Actions:")
for s in range(n_states):
    best_action = np.argmax(policy[s])
    print(f"State {s:02d}: {arrows[best_action]} (Prob: {policy[s, best_action]:.2f})")


Success Rate: 49.70%

Learned Best Actions:
State 00: < (Prob: 1.00)
State 01: < (Prob: 0.35)
State 02: v (Prob: 0.41)
State 03: < (Prob: 0.42)
State 04: < (Prob: 1.00)
State 05: < (Prob: 0.25)
State 06: > (Prob: 0.38)
State 07: < (Prob: 0.25)
State 08: ^ (Prob: 1.00)
State 09: v (Prob: 1.00)
State 10: v (Prob: 1.00)
State 11: < (Prob: 0.25)
State 12: < (Prob: 0.25)
State 13: > (Prob: 1.00)
State 14: > (Prob: 1.00)
State 15: < (Prob: 0.25)
