<a href="https://colab.research.google.com/github/ydg1021/basicRL/blob/main/example_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

def value_iteration(transition_probabilities, rewards, gamma, theta=0.0001):
    """Solve a finite MDP with in-place value iteration.

    Args:
        transition_probabilities: Array-like indexed as ``[s, a, s_prime]``
            giving the probability of landing in ``s_prime`` after taking
            action ``a`` in state ``s``. The first two axes define the
            number of states and actions.
        rewards: Array-like indexed the same way, giving the reward
            received on the transition ``(s, a, s_prime)``.
        gamma: Discount factor applied to the value of the successor state.
        theta: Convergence threshold. Sweeps stop once the largest absolute
            change of any state value within one sweep is below ``theta``.

    Returns:
        A ``(policy, V)`` tuple. ``policy`` has shape
        ``(num_states, num_actions)`` with a single 1 per row marking the
        greedy action (ties go to the lowest action index). ``V`` has shape
        ``(num_states,)`` and holds the converged state values.

    Note:
        The loop only exits when the per-sweep change drops below ``theta``;
        there is no iteration cap, so the caller must supply a problem for
        which the sweeps converge.
    """
    transition_probabilities = np.asarray(transition_probabilities, dtype=float)
    rewards = np.asarray(rewards, dtype=float)
    num_states, num_actions = transition_probabilities.shape[:2]

    # Expected immediate reward for each (state, action) pair. It does not
    # depend on V, so compute it once instead of on every sweep.
    expected_rewards = np.sum(transition_probabilities * rewards, axis=2)

    V = np.zeros(num_states)
    while True:
        delta = 0.0
        # States are updated in place, so later states in the same sweep
        # already see the refreshed values of earlier ones.
        for s in range(num_states):
            previous = V[s]
            action_values = expected_rewards[s] + gamma * (
                transition_probabilities[s] @ V
            )
            V[s] = np.max(action_values)
            delta = max(delta, abs(previous - V[s]))

        if delta < theta:
            break

    # Greedy policy with respect to the converged values, as one-hot rows.
    q_values = expected_rewards + gamma * (transition_probabilities @ V)
    policy = np.zeros((num_states, num_actions))
    policy[np.arange(num_states), np.argmax(q_values, axis=1)] = 1

    return policy, V

# Environment setup: a gridworld whose states are numbered row by row,
# so state s sits at (row, col) = divmod(s, number_of_columns).
grid_size = (4, 4)  # (rows, columns)
num_states = grid_size[0] * grid_size[1]
terminal_states = [(0, 0), (3, 3)]  # interpreted as (row, column)
gamma = 1

# Action indices: 0 = up, 1 = down, 2 = left, 3 = right.
transition_probabilities = np.zeros((num_states, 4, num_states))
rewards = np.zeros((num_states, 4, num_states))
for s in range(num_states):
    # Use (row, col) order so the lookup matches grid_size's (rows, columns)
    # convention; the two orders only coincide for cells on the diagonal.
    row, col = divmod(s, grid_size[1])
    if (row, col) in terminal_states:
        # Terminal states keep all-zero transition and reward rows.
        continue
    for a in range(4):
        # A move that would leave the grid keeps the agent where it is.
        next_s = s
        if a == 0 and row > 0:  # Up
            next_s -= grid_size[1]
        elif a == 1 and row < grid_size[0] - 1:  # Down
            next_s += grid_size[1]
        elif a == 2 and col > 0:  # Left
            next_s -= 1
        elif a == 3 and col < grid_size[1] - 1:  # Right
            next_s += 1
        # Deterministic transition with a reward of -1 per step.
        transition_probabilities[s, a, next_s] = 1
        rewards[s, a, next_s] = -1

# Run value iteration
optimal_policy, optimal_value = value_iteration(transition_probabilities, rewards, gamma)

print("Optimal policy:")
# Collapse each one-hot policy row to its action index and lay it out as the grid.
print(np.argmax(optimal_policy, axis=1).reshape(grid_size))
print("Optimal value function:")
print(optimal_value.reshape(grid_size))


Optimal policy:
[[0 2 2 1]
 [0 0 0 1]
 [0 0 1 1]
 [0 3 3 0]]
Optimal value function:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
