In [6]:
import numpy as np

# Discount factor applied to the next-state value term
γ = 0.95

# Action indices (0 = don't invest, 1 = invest) and state indices
A = list(range(2))
S = list(range(3))

# Transition matrices, one per action; they are indexed as P[a][s, s_prime].
# P0 is used when not investing, P1 when investing.
P0 = np.array([
    [1, 0, 0],
    [0.1, 0.75, 0.15],
    [0.05, 0.1, 0.85],
])
P1 = np.array([
    [1, 0, 0],
    [0.05, 0.75, 0.2],
    [0.02, 0.06, 0.92],
])

# Per-state reward vectors, one per action; they are indexed as R[a][s].
R0 = np.array([0, 1, 2])
R1 = np.array([0, 0.5, 1.5])

# Collect the per-action arrays so they can be looked up by action index.
P, R = [P0, P1], [R0, R1]


def construct_Rπ(R, π, S):
    """Build the reward vector obtained by following policy π.

    Args:
        R: per-action reward vectors, indexed as R[action][state].
        π: policy, indexed as π[state] and giving the action index to use.
        S: iterable of state indices; each is also used as the output index.

    Returns:
        A float array of length len(S) whose entry s is R[π[s]][s].
    """
    policy_rewards = np.zeros(len(S))
    for state in S:
        chosen_action = π[state]
        policy_rewards[state] = R[chosen_action][state]
    return policy_rewards


def construct_Pπ(P, π, S):
    """Build the state-to-state transition matrix induced by policy π.

    Args:
        P: per-action transition matrices, indexed as P[action][s, s_prime].
        π: policy, indexed as π[state] and giving the action index to use.
        S: sequence of state indices; used for both rows and columns.

    Returns:
        A float array of shape (len(S), len(S)) whose entry [s, s_prime]
        is P[π[s]][s, s_prime].
    """
    n_states = len(S)
    columns = list(S)
    transition = np.zeros((n_states, n_states))
    for state in columns:
        # Copy the chosen action's row for this state, restricted to the
        # columns listed in S, in one indexed assignment.
        transition[state, columns] = P[π[state]][state, columns]
    return transition

In [7]:
# Initial policy: action 0 (don't invest) in each of the three states.
π = [0, 0, 0]

# Alternative (disabled): evaluate π exactly by solving the linear system
# (I - γ Pπ) Vπ = Rπ instead of iterating.
# # Solution with linear algebra
# def policy_evaluation(π):
#   Rπ = construct_Rπ(R, π, S)
#   Pπ = construct_Pπ(P, π, S)
#   I = np.eye(3)
#   Vπ = np.linalg.solve(I - γ * Pπ, Rπ)
#   return Vπ

# Iterative policy evaluation
# Policy iteration = policy evaluation + policy improvement

def policy_evaluation(π, Vπ, n_sweeps=1):
    """Refine a value estimate for policy π with iterative backups.

    Each sweep applies the update Vπ <- Rπ + γ Pπ Vπ, where Rπ and Pπ are
    the reward vector and transition matrix induced by π (built from the
    file-level R, P and S).

    Args:
        π: policy, indexed as π[state] and giving the action index to use.
        Vπ: current value estimate, one entry per state. It is not modified
            in place; a new array is returned.
        n_sweeps: number of backup sweeps to apply. The default of 1 keeps
            the single-sweep behaviour, which relies on the caller looping
            many times. With 0 the input estimate is returned unchanged.

    Returns:
        The value estimate after n_sweeps backups.
    """
    Rπ = construct_Rπ(R, π, S)
    Pπ = construct_Pπ(P, π, S)
    # The sweep count is deliberately kept small here; the outer
    # policy-iteration loop is expected to supply the repetition.
    for _ in range(n_sweeps):
        Vπ = Rπ + γ * Pπ @ Vπ
    return Vπ


def policy_improvement(Vπ):
    """Return the policy that is greedy with respect to the value estimate Vπ.

    For every state s in S and action a in A (file-level), the action value
    Qπ[s, a] = R[a][s] + γ * P[a][s] @ Vπ is computed, and the action with
    the largest value is selected for each state.

    Args:
        Vπ: value estimate, one entry per state.

    Returns:
        An int32 array of length len(S) holding the chosen action index for
        each state. Ties go to the lowest action index, as np.argmax returns
        the first maximum.
    """
    # Size the table from S and A rather than hard-coding 3 states and
    # 2 actions, so the function follows the file-level problem definition.
    n_states, n_actions = len(S), len(A)

    # Compute Qπ using Vπ
    Qπ = np.zeros((n_states, n_actions))
    for s in S:
        for a in A:
            Qπ[s, a] = R[a][s] + γ * P[a][s] @ Vπ

    # Greedy update: best action per state (row-wise argmax), kept as int32
    # to match the dtype this function has always returned.
    return np.argmax(Qπ, axis=1).astype(np.int32)

In [16]:
# Start from a zero value estimate (three entries).
Vπ = np.zeros(3)
# Alternate one policy-evaluation call with one greedy improvement step.
# NOTE: π is not initialised in this cell; it is read from an earlier cell
# and overwritten here, so re-running this cell continues from the last policy.
for iteration in range(100000): # ...and increase the number of iterations here
    Vπ = policy_evaluation(π, Vπ)
    π = policy_improvement(Vπ)
print(Vπ)

[ 0.         13.70348494 18.10395747]


In [17]:
π

array([0, 1, 1], dtype=int32)