In [7]:
import gymnasium as gym
import numpy as np

# Setup
env = gym.make("FrozenLake-v1", is_slippery=True)
policy = np.ones((16, 4)) / 4  # 16 states, 4 actions, equal probability (0.25)

# Training
for episode in range(5000):
    state, _ = env.reset()
    states, actions, rewards = [], [], []
    
    # Play one episode
    done = False
    while not done:
        action = np.random.choice(4, p=policy[state])  # 0=left, 1=down, 2=right, 3=up
        next_state, reward, done, _, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state

    # Calculate returns (backwards)
    G = 0
    returns = []
    for r in reversed(rewards):
        G = r + 0.99 * G  # gamma = 0.99
        returns.insert(0, G)
        
    # Update policy
    baseline = np.mean(returns)
    for s, a, Gt in zip(states, actions, returns):
        policy[s, a] += 0.1 * (Gt - baseline)  # learning_rate = 0.1
        policy[s] = np.maximum(policy[s], 0)   # no negative probabilities
        policy[s] /= np.sum(policy[s])         # normalize to sum=1

# Test learned policy
success = 0
for _ in range(100):
    state, _ = env.reset()
    done = False
    while not done:
        action = np.argmax(policy[state])  # pick best action
        state, reward, done, _, _ = env.step(action)
        success += reward

print(f"Success rate: {success}%")

# Show learned actions
arrows = ["←", "↓", "→", "↑"]
for s in range(16):
    print(f"State {s}: {arrows[np.argmax(policy[s])]}")

Success rate: 7%
State 0: →
State 1: ←
State 2: ↑
State 3: ↓
State 4: ←
State 5: ←
State 6: ↓
State 7: ←
State 8: ↑
State 9: →
State 10: ←
State 11: ←
State 12: ←
State 13: ↓
State 14: ↓
State 15: ←
