In [4]:
import gymnasium as gym
import numpy as np
import utils

# Environment configuration: is_slippery is forwarded unchanged to gym.make below.
is_slippery = False
env = gym.make(
    "FrozenLake-v1",
    desc=None,
    map_name="4x4",
    is_slippery=is_slippery,
)
# Reset once and show what reset() returns (an (observation, info) pair).
print(env.reset())


(0, {'prob': 1})


In [14]:
def on_monte_carlo(iterations=100, epsilon=0.1, gamma=0.95, environment=None):
	"""On-policy first-visit Monte Carlo control with an epsilon-soft policy.

	Each iteration samples one episode with the current policy, walks it
	backwards accumulating the discounted return G, and for the first
	occurrence of every (state, action) pair updates the running mean
	Q[state, action] and makes the policy epsilon-greedy w.r.t. Q in that state.

	Args:
		iterations: Number of episodes to sample.
		epsilon: Exploration mass; every action keeps probability
			epsilon / n_actions and the greedy action gets the remainder.
		gamma: Discount factor applied to future rewards.
		environment: Environment to learn on. Must follow the Gymnasium API
			(reset() -> (obs, info), step() -> 5-tuple) and expose
			observation_space.n / action_space.n. Defaults to the
			notebook-level `env`.

	Returns:
		np.ndarray of shape (n_states, n_actions) holding the action
		probabilities of the learned policy. States that were never visited
		keep the initial uniform distribution.
	"""
	if environment is None:
		environment = env  # notebook-level environment created in the setup cell

	n_states = environment.observation_space.n
	n_actions = environment.action_space.n
	e_soft = epsilon / n_actions  # probability floor every action keeps

	policy = np.full((n_states, n_actions), 1.0 / n_actions)
	Q = np.zeros((n_states, n_actions))
	# Running sum / count of first-visit returns instead of storing every return.
	return_sum = np.zeros((n_states, n_actions))
	return_count = np.zeros((n_states, n_actions))

	for episode_idx in range(iterations):
		if episode_idx % 100 == 0:
			print(f"Iteration: {episode_idx}")

		episode = []
		first_visit = {}  # (state, action) -> index of its first occurrence
		state = environment.reset()[0]
		done = False
		while not done:
			action = np.random.choice(n_actions, p=policy[state])
			next_state, reward, terminated, truncated, _ = environment.step(action)
			first_visit.setdefault((state, action), len(episode))
			episode.append((state, action, reward))
			state = next_state
			# Stop on truncation as well: otherwise the episode keeps running
			# past the environment's time limit (or forever if it never terminates).
			done = terminated or truncated
			if reward == 1.0:
				print("Reached the End")

		G = 0
		for i in range(len(episode) - 1, -1, -1):
			current_state, current_action, reward = episode[i]
			G = gamma * G + reward
			# First-visit MC: only the earliest occurrence of the pair contributes.
			if first_visit[(current_state, current_action)] != i:
				continue
			return_sum[current_state, current_action] += G
			return_count[current_state, current_action] += 1
			Q[current_state, current_action] = (
				return_sum[current_state, current_action]
				/ return_count[current_state, current_action]
			)
			# Epsilon-greedy improvement; np.argmax breaks ties toward the lowest index.
			best_action = np.argmax(Q[current_state])
			policy[current_state] = e_soft
			policy[current_state, best_action] = 1 - epsilon + e_soft

	return policy


# Sample 50,000 episodes, then print the resulting per-state action probabilities.
policy = on_monte_carlo(iterations=50_000)
print(policy)


Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900
Iteration: 1000
Iteration: 1100
Iteration: 1200
Iteration: 1300
Iteration: 1400
Iteration: 1500
Iteration: 1600
Iteration: 1700
Iteration: 1800
Iteration: 1900
Iteration: 2000
Iteration: 2100
Iteration: 2200
Iteration: 2300
Iteration: 2400
Iteration: 2500
Iteration: 2600
Iteration: 2700
Iteration: 2800
Iteration: 2900
Iteration: 3000
Iteration: 3100
Iteration: 3200
Iteration: 3300
Iteration: 3400
Iteration: 3500
Iteration: 3600
Iteration: 3700
Iteration: 3800
Iteration: 3900
Iteration: 4000
Iteration: 4100
Iteration: 4200
Iteration: 4300
Iteration: 4400
Iteration: 4500
Iteration: 4600
Iteration: 4700
Iteration: 4800
Iteration: 4900
Iteration: 5000
Iteration: 5100
Iteration: 5200
Iteration: 5300
Iteration: 5400
Iteration: 5500
Iteration: 5600
Iteration: 5700
Iteration: 5800
Iteration: 5900
Iteration: 6000
Iteration: 6100
Iteration: 6200
Iter

In [15]:
# Action index -> arrow glyph used when rendering a policy.
# NOTE(review): presumably matches FrozenLake's action encoding
# (0=left, 1=down, 2=right, 3=up) - confirm against the environment docs.
arrows = {0: '←', 1: '↓', 2: '→', 3: '↑'}

def visualize_policy(policy, size=4):
    """Print the greedy action of every state as a grid of arrows.

    Args:
        policy: Array-like of shape (n_states, n_actions) with action
            probabilities per state.
        size: Number of states printed per row.

    States whose row is still uniform (every entry equals 1 / n_actions) are
    printed as '·'; these are states the policy was never updated for.
    All states are printed, in rows of `size`, so nothing is dropped when
    n_states exceeds size * size.

    Raises:
        ValueError: If `size` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    policy = np.asarray(policy)
    uniform_prob = 1.0 / policy.shape[1]  # 0.25 for the 4-action case

    grid = []
    for action_probs in policy:
        if np.allclose(action_probs, uniform_prob):
            grid.append('·')  # mark as "don't care"
        else:
            best_action = int(np.argmax(action_probs))
            # Fall back to the raw action id when no arrow glyph is defined.
            grid.append(arrows.get(best_action, str(best_action)))

    # Emit the flat list row by row.
    for start in range(0, len(grid), size):
        print(' '.join(grid[start:start + size]))
# Render the learned policy as a grid with 4 states per row.
visualize_policy(policy, size=4)

↓ ↑ ↓ ←
↓ · ↓ ·
→ ↓ ← ·
· → → ·
