<a href="https://colab.research.google.com/github/ydg1021/basicRL/blob/main/example_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np

def iterative_policy_evaluation(policy, transition_probabilities, rewards, gamma,
                                theta=0.0001, terminal_mask=None):
    """Estimate the state-value function of ``policy`` by repeated sweeps.

    Every sweep applies the Bellman expectation backup to each non-terminal
    state. ``V`` is updated in place, so states visited later in a sweep
    already see the values refreshed earlier in that same sweep. Sweeping
    stops once the largest per-state change in a sweep drops below ``theta``;
    there is no cap on the number of sweeps.

    Args:
        policy: Array indexed ``[s, a]`` giving the probability of taking
            action ``a`` in state ``s``. Its shape fixes the number of
            states and actions.
        transition_probabilities: Array indexed ``[s, a, s_next]``.
        rewards: Array indexed ``[s, a, s_next]``.
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold on the largest value change in a sweep.
        terminal_mask: Optional boolean sequence with one entry per state;
            ``True`` marks a state that is skipped (its value stays 0).
            When omitted, every state without any outgoing transition
            probability is treated as terminal.

    Returns:
        1-D NumPy array with one value per state.
    """
    policy = np.asarray(policy)
    transition_probabilities = np.asarray(transition_probabilities)
    rewards = np.asarray(rewards)

    # Sizes come from the arguments so the function does not depend on
    # module-level variables being defined before it is called.
    num_states, num_actions = policy.shape

    if terminal_mask is None:
        # A state with no outgoing probability mass would back up to 0
        # anyway, so skipping it leaves the result unchanged.
        terminal_mask = ~(transition_probabilities != 0).any(axis=(1, 2))
    else:
        terminal_mask = np.asarray(terminal_mask, dtype=bool)

    V = np.zeros(num_states)

    while True:
        delta = 0
        for s in range(num_states):
            if terminal_mask[s]:
                continue  # Terminal states are skipped.

            previous_value = V[s]
            # action_values[a] = sum over s' of P(s'|s,a) * (R(s,a,s') + gamma * V[s'])
            action_values = np.sum(
                transition_probabilities[s] * (rewards[s] + gamma * V), axis=1
            )
            V[s] = np.dot(policy[s], action_values)
            delta = max(delta, abs(previous_value - V[s]))

        if delta < theta:
            break
    return V

def policy_improvement(policy, V, transition_probabilities, rewards, gamma):
    """Return the policy that is greedy with respect to ``V``.

    For every state the one-step look-ahead value of each action is computed
    from the model, and the action with the highest value is selected
    (``argmax`` keeps the lowest-indexed action on ties). The new policy is
    deterministic: each row is a one-hot vector.

    Args:
        policy: Current policy, indexed ``[s, a]``. Its shape fixes the
            number of states and actions; the current action of a state is
            taken to be the ``argmax`` of its row.
        V: State values, one per state.
        transition_probabilities: Array indexed ``[s, a, s_next]``.
        rewards: Array indexed ``[s, a, s_next]``.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        Tuple ``(new_policy, policy_stable)``. ``new_policy`` has the same
        shape and dtype as ``policy``; ``policy_stable`` is ``True`` when the
        greedy action equals the previous ``argmax`` action in every state.
    """
    policy = np.asarray(policy)
    # Sizes come from the argument so the function does not depend on
    # module-level variables being defined before it is called.
    num_states, _ = policy.shape

    # action_values[s, a] = sum over s' of P(s'|s,a) * (R(s,a,s') + gamma * V[s'])
    action_values = np.sum(
        np.asarray(transition_probabilities)
        * (np.asarray(rewards) + gamma * np.asarray(V)),
        axis=2,
    )

    previous_actions = np.argmax(policy, axis=1)
    best_actions = np.argmax(action_values, axis=1)

    # One-hot encode the greedy action of every state.
    new_policy = np.zeros_like(policy)
    new_policy[np.arange(num_states), best_actions] = 1

    policy_stable = bool(np.array_equal(previous_actions, best_actions))
    return new_policy, policy_stable

# Environment setup: a 4x4 grid world with two terminal corners and a
# reward of -1 on every move out of a non-terminal state.
grid_size = (4, 4)
num_states = grid_size[0] * grid_size[1]
num_actions = 4
terminal_states = [(0, 0), (3, 3)]
gamma = 1

transition_probabilities = np.zeros((num_states, num_actions, num_states))
rewards = np.zeros((num_states, num_actions, num_states))

n_rows, n_cols = grid_size
for s in range(num_states):
    row, col = divmod(s, n_cols)
    # Terminal states are looked up as (column, row) and get no transitions.
    if (col, row) in terminal_states:
        continue
    # Successor state per action; a move that would leave the grid keeps
    # the agent where it is.
    successors = (
        s - n_cols if row > 0 else s,           # 0: up
        s + n_cols if row < n_rows - 1 else s,  # 1: down
        s - 1 if col > 0 else s,                # 2: left
        s + 1 if col < n_cols - 1 else s,       # 3: right
    )
    for a, next_s in enumerate(successors):
        transition_probabilities[s, a, next_s] = 1
        rewards[s, a, next_s] = -1

# Initial policy: every action equally likely in every state.
policy = np.full((num_states, num_actions), 1 / num_actions)

# Policy iteration: alternate evaluation and greedy improvement until the
# greedy policy stops changing.
while True:
    V = iterative_policy_evaluation(policy, transition_probabilities, rewards, gamma)
    policy, policy_stable = policy_improvement(policy, V, transition_probabilities, rewards, gamma)
    if policy_stable:
        break

print("Optimal policy:")
print(policy.argmax(axis=1).reshape(grid_size))
print("Optimal value function:")
print(V.reshape(grid_size))

# Each number in the printed policy is the action chosen in that state
# (0: up, 1: down, 2: left, 3: right).

Optimal policy:
[[0 2 2 1]
 [0 0 0 1]
 [0 0 1 1]
 [0 3 3 0]]
Optimal value function:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
