In [15]:
import torch
import gym

# Convergence tolerance: value iteration stops once the largest per-state
# change between two successive sweeps is no greater than this.
threshold = 1e-4

# Discount factor applied to the value of successor states.
gamma = 0.99

# FrozenLake environment configured with the 8x8 map.
env = gym.make("FrozenLake-v0", map_name="8x8")


def value_iteration(env, gamma, threshold):
    """Estimate optimal state values with repeated Bellman optimality backups.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``env.env.P`` such that
            ``P[state][action]`` iterates over 4-tuples of
            ``(probability, next_state, reward, _)``; the fourth element is
            ignored here.
        gamma: Discount factor multiplied with the value of the next state.
        threshold: Sweeps stop once the largest absolute change of any state
            value between two consecutive sweeps is ``<= threshold``.

    Returns:
        A 1-D tensor of length ``observation_space.n`` holding the value of
        each state after the final sweep.
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    transitions = env.env.P

    values = torch.zeros(num_states)
    converged = False
    while not converged:
        # Every entry is overwritten below, so uninitialised storage is fine.
        updated = torch.empty(num_states)
        for s in range(num_states):
            q_values = torch.zeros(num_actions)
            for a in range(num_actions):
                # Expected one-step return of taking action ``a`` in state ``s``.
                for prob, next_s, rew, _ in transitions[s][a]:
                    q_values[a] += prob * (rew + gamma * values[next_s])
            updated[s] = torch.max(q_values)
        # Compare against the previous sweep before replacing it.
        converged = torch.max(torch.abs(values - updated)) <= threshold
        values = updated.clone()
    return values


def extract_optimal_policy(env, V_optimal, gamma):
    """Pick, for every state, the action with the best one-step lookahead.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``env.env.P`` such that
            ``P[state][action]`` iterates over 4-tuples of
            ``(probability, next_state, reward, _)``; the fourth element is
            ignored here.
        V_optimal: Per-state values, indexable by state.
        gamma: Discount factor multiplied with the value of the next state.

    Returns:
        A 1-D tensor of length ``observation_space.n`` whose entry for each
        state is the index of the highest-scoring action. It is allocated with
        ``torch.zeros`` and therefore stores the indices as floats.
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    transitions = env.env.P

    policy = torch.zeros(num_states)
    for s in range(num_states):
        q_values = torch.zeros(num_actions)
        for a in range(num_actions):
            # Expected one-step return of taking action ``a`` in state ``s``.
            for prob, next_s, rew, _ in transitions[s][a]:
                q_values[a] += prob * (rew + gamma * V_optimal[next_s])
        policy[s] = torch.argmax(q_values)
    return policy


# Run value iteration to convergence and show the resulting state values.
V_optimal = value_iteration(env=env, gamma=gamma, threshold=threshold)
print('Optimal values:\n', V_optimal)


# Turn the converged values into a per-state action choice and show it.
optimal_policy = extract_optimal_policy(env=env, V_optimal=V_optimal, gamma=gamma)
print('Optimal policy:\n', optimal_policy)


def run_episode(env, policy):
    """Play one episode under a fixed policy and return the total reward.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns a 4-tuple
            ``(state, reward, is_done, info)``.
        policy: Tensor indexed by state; each entry is the action to take in
            that state.

    Returns:
        The sum of all rewards received until ``step`` reports the episode is
        done.
    """
    state = env.reset()
    total_reward = 0
    is_done = False
    while not is_done:
        # ``.item()`` returns a Python float when the policy tensor has a
        # floating dtype (e.g. one allocated with ``torch.zeros``); cast so
        # the environment always receives an integer action.
        action = int(policy[state].item())
        state, reward, is_done, _ = env.step(action)
        total_reward += reward
    return total_reward


# Roll out the policy many times and report the mean episode reward.
n_episode = 4000
total_rewards = [run_episode(env, optimal_policy) for _ in range(n_episode)]

print('Average total reward under the optimal policy:', sum(total_rewards) / n_episode)

Optimal values:
 tensor([0.4133, 0.4259, 0.4450, 0.4673, 0.4915, 0.5158, 0.5345, 0.5402, 0.4103,
        0.4199, 0.4363, 0.4573, 0.4823, 0.5127, 0.5450, 0.5567, 0.3953, 0.3925,
        0.3743, 0.0000, 0.4209, 0.4930, 0.5605, 0.5852, 0.3679, 0.3517, 0.3055,
        0.1998, 0.3001, 0.0000, 0.5684, 0.6277, 0.3306, 0.2899, 0.1965, 0.0000,
        0.2888, 0.3615, 0.5343, 0.6892, 0.3035, 0.0000, 0.0000, 0.0861, 0.2136,
        0.2724, 0.0000, 0.7717, 0.2859, 0.0000, 0.0571, 0.0472, 0.0000, 0.2504,
        0.0000, 0.8776, 0.2772, 0.1984, 0.1257, 0.0000, 0.2395, 0.4863, 0.7371,
        0.0000])
Optimal policy:
 tensor([3., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3., 3., 2., 2., 1., 3., 3.,
        0., 0., 2., 3., 2., 1., 3., 3., 3., 1., 0., 0., 2., 2., 0., 3., 0., 0.,
        2., 1., 3., 2., 0., 0., 0., 1., 3., 0., 0., 2., 0., 0., 1., 0., 0., 0.,
        0., 2., 0., 1., 0., 0., 1., 2., 1., 0.])
Average total reward under the optimal policy: 0.63575
