In [1]:
import gymnasium as gym
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [2]:
# Build the FrozenLake-v1 environment with is_slippery=True; later cells pass
# this `env` to the value-iteration routines.
env = gym.make('FrozenLake-v1', is_slippery=True)

In [4]:
def approximate_value_iteration_fixed_iterations(env, gamma=0.95, T=100):
    """Run exactly ``T`` synchronous Bellman optimality backups starting from V = 0.

    Every sweep builds a fresh value vector from the previous one, so all
    states in a sweep are backed up against the same (old) estimate.

    Args:
        env: Tabular environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(prob, next_state, reward, <ignored>)``. ``P`` is looked up on
            ``env.unwrapped`` when that attribute exists, otherwise on ``env``.
        gamma: Discount factor applied to the value of the next state.
        T: Number of full sweeps over the state space.

    Returns:
        numpy array of length ``observation_space.n`` with the value estimate
        after ``T`` sweeps (all zeros when ``T`` is 0).
    """
    # gym.make() hands back the environment inside wrappers. Gymnasium 1.0
    # stopped forwarding unknown attributes through wrappers, so `env.P`
    # fails there; the table lives on the base environment.
    transitions = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    for _ in range(T):
        V_new = np.zeros(num_states)
        for state in range(num_states):
            # Best expected one-step return over all actions; the fourth tuple
            # element is not used by the backup.
            V_new[state] = max(
                sum(prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward, _ in transitions[state][action])
                for action in range(num_actions)
            )
        V = V_new

    return V

# Number of synchronous sweeps to run for the fixed-iteration variant.
T = 100
value_function = approximate_value_iteration_fixed_iterations(env, T=T)
print("Value Function after", T, "iterations:", value_function)

Value Function after 100 iterations: [0.18035745 0.15466    0.15340661 0.13247087 0.20886331 0.
 0.17639284 0.         0.27037339 0.37459489 0.40363285 0.
 0.         0.50893908 0.72365223 0.        ]


In [5]:
def value_iteration(env, gamma=0.95, epsilon=1e-8):
    """Run in-place value iteration until the largest per-sweep change is <= epsilon.

    Values are overwritten state by state, so later states in a sweep already
    see the updated values of earlier states.

    Args:
        env: Tabular environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(prob, next_state, reward, <ignored>)``. ``P`` is looked up on
            ``env.unwrapped`` when that attribute exists, otherwise on ``env``.
        gamma: Discount factor applied to the value of the next state.
        epsilon: Stop once no state's value changed by more than this amount
            during a full sweep.

    Returns:
        numpy array of length ``observation_space.n`` with the converged
        value estimate.
    """
    # gym.make() hands back the environment inside wrappers. Gymnasium 1.0
    # stopped forwarding unknown attributes through wrappers, so `env.P`
    # fails there; the table lives on the base environment.
    transitions = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)
    delta = float('inf')

    while delta > epsilon:
        delta = 0
        for state in range(num_states):
            previous = V[state]
            # Best expected one-step return over all actions; the fourth tuple
            # element is not used by the backup.
            V[state] = max(
                sum(prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward, _ in transitions[state][action])
                for action in range(num_actions)
            )
            # Track the largest change seen in this sweep for the stop test.
            delta = max(delta, abs(previous - V[state]))

    return V

# Run value iteration with its default gamma and epsilon and show the result.
optimal_value_function = value_iteration(env)
print("Optimal Value Function:", optimal_value_function)

Optimal Value Function: [0.1804715  0.15475666 0.1534771  0.13254839 0.20896702 0.
 0.17643077 0.         0.27045736 0.3746515  0.4036727  0.
 0.         0.50897993 0.72367363 0.        ]


In [8]:

# Same FrozenLake-v1 configuration as the environment built in an earlier cell;
# this rebinds the notebook-level `env`.
env = gym.make('FrozenLake-v1', is_slippery=True)

def value_iteration(env, gamma=0.95, epsilon=1e-8):
    """Run in-place value iteration until the largest per-sweep change is <= epsilon.

    Values are overwritten state by state, so later states in a sweep already
    see the updated values of earlier states.

    Args:
        env: Tabular environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(prob, next_state, reward, <ignored>)``. ``P`` is looked up on
            ``env.unwrapped`` when that attribute exists, otherwise on ``env``.
        gamma: Discount factor applied to the value of the next state.
        epsilon: Stop once no state's value changed by more than this amount
            during a full sweep.

    Returns:
        numpy array of length ``observation_space.n`` with the converged
        value estimate.
    """
    # gym.make() hands back the environment inside wrappers. Gymnasium 1.0
    # stopped forwarding unknown attributes through wrappers, so `env.P`
    # fails there; the table lives on the base environment.
    transitions = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)
    delta = float('inf')

    while delta > epsilon:
        delta = 0
        for state in range(num_states):
            previous = V[state]
            # Best expected one-step return over all actions; the fourth tuple
            # element is not used by the backup.
            V[state] = max(
                sum(prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward, _ in transitions[state][action])
                for action in range(num_actions)
            )
            # Track the largest change seen in this sweep for the stop test.
            delta = max(delta, abs(previous - V[state]))

    return V

def approximate_value_iteration_fixed_iterations(env, gamma=0.95, T=100):
    """Run exactly ``T`` synchronous Bellman optimality backups starting from V = 0.

    Every sweep builds a fresh value vector from the previous one, so all
    states in a sweep are backed up against the same (old) estimate.

    Args:
        env: Tabular environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(prob, next_state, reward, <ignored>)``. ``P`` is looked up on
            ``env.unwrapped`` when that attribute exists, otherwise on ``env``.
        gamma: Discount factor applied to the value of the next state.
        T: Number of full sweeps over the state space.

    Returns:
        numpy array of length ``observation_space.n`` with the value estimate
        after ``T`` sweeps (all zeros when ``T`` is 0).
    """
    # gym.make() hands back the environment inside wrappers. Gymnasium 1.0
    # stopped forwarding unknown attributes through wrappers, so `env.P`
    # fails there; the table lives on the base environment.
    transitions = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    for _ in range(T):
        V_new = np.zeros(num_states)
        for state in range(num_states):
            # Best expected one-step return over all actions; the fourth tuple
            # element is not used by the backup.
            V_new[state] = max(
                sum(prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward, _ in transitions[state][action])
                for action in range(num_actions)
            )
        V = V_new

    return V

def calculate_suboptimality(V_approx, V_star):
    """Return the largest absolute element-wise gap between two value vectors.

    Args:
        V_approx: Value estimate to evaluate.
        V_star: Reference value function to compare against.

    Returns:
        The maximum of ``|V_approx - V_star|`` over all entries.
    """
    absolute_gap = np.abs(V_approx - V_star)
    return np.max(absolute_gap)

# Run algorithms: converged value iteration as the reference, 100 fixed
# synchronous sweeps as the approximation, then the max absolute gap between them.
optimal_value_function = value_iteration(env)
approx_value_function = approximate_value_iteration_fixed_iterations(env, T=100)
suboptimality_measure = calculate_suboptimality(approx_value_function, optimal_value_function)

# NOTE(review): the message says "saved", but nothing in this cell writes the
# value anywhere; it is only printed.
print("Suboptimality Measure (ε∞) saved:", suboptimality_measure)

Suboptimality Measure (ε∞) saved: 0.00011405178932732962
