### RMDPの実装

論文「Sample Complexity of Robust Reinforcement Learning with a Generative Model」のアルゴリズム1を実装してみました。


In [62]:
import numpy as np

# Problem size and hyper-parameters.
S = 2  # number of states
A = 2  # number of actions
gamma = 0.9  # discount factor
K = 1000  # maximum number of value-iteration sweeps
epsilon = 1  # upper bound on the KL divergence
n = 1000  # generative-model samples per (s, a) used to build the uncertainty set

# Rewards are drawn before the transition rows so the random stream is
# consumed in the same order as a row-by-row construction.
r = np.random.rand(S, A)  # reward table
P = np.zeros((S, A, S))  # true transition probabilities

# Fill every (state, action) row with a random probability vector.
for s, a in np.ndindex(S, A):
    weights = np.random.rand(S)
    P[s, a] = weights / np.sum(weights)

def generative_model(P, s, a):
    """Sample one next state from the transition kernel at ``(s, a)``.

    Args:
        P: Array indexed as ``P[s, a, s_next]``; ``P[s, a]`` must be a
            probability vector accepted by ``np.random.choice``.
        s: Index of the current state.
        a: Index of the action taken.

    Returns:
        The index of the sampled next state.
    """
    # The number of next states is read from P itself, so the sampler stays
    # correct for kernels whose size differs from any module-level constant.
    num_next_states = P.shape[-1]
    return np.random.choice(num_next_states, p=P[s, a])

def compute_emprical_P(n, P, P_hat):
    """Estimate the transition kernel from ``n`` samples per ``(s, a)`` pair.

    Next states are drawn from ``P[s, a]``, their counts are added to
    ``P_hat[s, a]``, and every row with a positive total is then normalised
    to sum to one.

    Args:
        n: Number of next-state samples drawn for each ``(s, a)`` pair.
            Non-positive values draw nothing.
        P: Array indexed as ``P[s, a, s_next]`` whose rows ``P[s, a]`` are
            probability vectors.
        P_hat: Float array with the same shape as ``P`` used as the count
            accumulator. It is modified in place.

    Returns:
        ``P_hat`` (the same object that was passed in) holding the
        normalised empirical frequencies.
    """
    # Dimensions come from P rather than module-level constants.
    num_states, num_actions, num_next_states = P.shape

    if n > 0:
        for s in range(num_states):
            for a in range(num_actions):
                # One batched draw replaces n separate single-sample calls.
                draws = np.random.choice(num_next_states, size=n, p=P[s, a])
                P_hat[s, a] += np.bincount(draws, minlength=num_next_states)

    # Normalise only rows that received mass; all-zero rows stay untouched.
    totals = P_hat.sum(axis=-1, keepdims=True)
    np.divide(P_hat, totals, out=P_hat, where=totals > 0)
    return P_hat

def KL_divergence(P, Q):
    """Return the Kullback-Leibler divergence ``KL(P || Q)``.

    Terms with ``P == 0`` contribute nothing (the ``0 * log 0 = 0``
    convention), so the result is never NaN merely because ``P`` and ``Q``
    share a zero entry. If ``Q`` is zero somewhere ``P`` is positive the
    divergence is ``inf``.

    Args:
        P: Array-like probability vector (the reference distribution).
        Q: Array-like probability vector, broadcastable against ``P``.

    Returns:
        The divergence as a floating-point scalar.
    """
    P, Q = np.broadcast_arrays(
        np.asarray(P, dtype=float), np.asarray(Q, dtype=float)
    )
    # Only the support of P contributes to the sum.
    support = P > 0
    # Division by a zero in Q on the support legitimately yields inf;
    # silence the warning instead of clipping the ratio.
    with np.errstate(divide="ignore"):
        return np.sum(P[support] * np.log(P[support] / Q[support]))

def compute_uncertainty_set(P_hat, epsilon, num_samples=10):
    """Build a finite, sampled approximation of the KL uncertainty set.

    Random transition kernels are drawn and kept when, for every ``(s, a)``
    pair, ``KL_divergence(P_hat[s, a], candidate[s, a]) <= epsilon``.

    The nominal model ``P_hat`` has zero divergence from itself, so for
    ``epsilon >= 0`` a copy of it is always included. This keeps the result
    non-empty even when every random candidate is rejected.

    Args:
        P_hat: Nominal transition kernel indexed as ``P_hat[s, a, s_next]``.
        epsilon: Upper bound on the per-``(s, a)`` KL divergence.
        num_samples: Number of random candidate kernels to draw.

    Returns:
        A list of arrays shaped like ``P_hat``: the nominal model (when
        ``epsilon >= 0``) followed by the accepted candidates.
    """
    P_hat = np.asarray(P_hat, dtype=float)
    num_states, num_actions = P_hat.shape[0], P_hat.shape[1]

    uncertainty_set = []
    if epsilon >= 0:
        # Copy so later in-place edits of P_hat cannot alter the set.
        uncertainty_set.append(P_hat.copy())

    for _ in range(num_samples):
        # Draw a whole random kernel at once and normalise each row.
        candidate = np.random.rand(*P_hat.shape)
        candidate /= candidate.sum(axis=-1, keepdims=True)

        # Stops at the first (s, a) pair that violates the bound.
        within_radius = all(
            KL_divergence(P_hat[s, a], candidate[s, a]) <= epsilon
            for s in range(num_states)
            for a in range(num_actions)
        )
        if within_radius:
            uncertainty_set.append(candidate)
    return uncertainty_set

def sigma_uncertainty(P_set, s, a, V_k):
    """Return the worst-case expected next-state value over ``P_set``.

    Args:
        P_set: Iterable of transition kernels indexed as
            ``model[s, a, s_next]``.
        s: State index.
        a: Action index.
        V_k: Value vector over next states.

    Returns:
        The minimum over the kernels of ``sum(model[s, a, :] * V_k)``.
    """
    expected_values = []
    for model in P_set:
        next_state_probs = model[s, a, :]
        expected_values.append(np.sum(next_state_probs * V_k))
    # Take the minimum expectation across all candidate kernels.
    return np.min(expected_values)

# Estimate the transition model from samples and build the uncertainty set.
P_hat = np.zeros_like(P)
P_hat = compute_emprical_P(n, P, P_hat)
uncertainty_set = compute_uncertainty_set(P_hat, epsilon)
if len(uncertainty_set) == 0:
    # Without any candidate model the worst-case expectation is undefined.
    raise ValueError("uncertainty set is empty; increase epsilon or the number of candidates")

theta = 1e-4  # convergence threshold on the sup-norm change of Q

Q = np.zeros((S, A))

# Robust Q-value iteration: back up each (s, a) against the worst model.
for k in range(K):
    V = np.max(Q, axis=1)
    Q_new = np.zeros_like(Q)

    for s in range(S):
        for a in range(A):
            Q_new[s, a] = r[s, a] + gamma * sigma_uncertainty(uncertainty_set, s, a, V)

    # Measure the change before overwriting Q, then keep the newest iterate
    # so the final Q is the most recent backup even on the converging sweep.
    delta = np.max(np.abs(Q_new - Q))
    Q = Q_new

    # Convergence check (theta = 0.0001).
    if delta < theta:
        break

# Greedy policy with respect to the final Q.
pi = np.argmax(Q, axis=1)

# Report the results.
print("最適行動価値関数 Q:")
print(Q)
print("最適方策 π:")
print(pi)


最適行動価値関数 Q:
[[5.11584115 5.01856317]
 [5.40760664 5.52351439]]
最適方策 π:
[0 1]
