In [84]:
import numpy as np

# MDPの構築
from typing import NamedTuple ,Optional
np.random.seed(10)

S = 6# 状態数
A = 3 # 行動数
S_set = np.arange(S)
A_set = np.arange(A)
gamma = 0.9 # 割引率

rew = np.random.uniform(0,1,size=(S,A)) # 報酬
rew = np.array(rew)

P = np.random.rand(S,A,S) # 遷移確率
P = P.reshape(S*A,S)
P = P/np.sum(P,axis=1,keepdims=True) # 正規化
P = P.reshape(S,A,S)
np.testing.assert_allclose(P.sum(axis=-1), 1, atol=1e-6)

class MDP(NamedTuple):
    S_set: np.ndarray
    A_set: np.ndarray
    rew: np.ndarray
    P: np.ndarray
    gamma: float
    H: int
    K:int


    optimal_V: Optional[np.ndarray] = None

    @property
    def S(self):
        return len(self.S_set)

    @property
    def A(self):
        return len(self.A_set)

H = int (1/(1-gamma) + 20)
mdp = MDP(S_set,A_set,rew,P,gamma,H,K=50)

In [85]:
def sampler(mdp: MDP, policy,s,h):
    """状態sで行動aをとり、次の状態と報酬を返す"""
    a = policy[h,s]
    a = int(a)
    # print(a)
    next_s = np.random.choice(mdp.S_set, p=mdp.P[s, a])
    rew = mdp.rew[s, a]
    return next_s, rew

In [86]:
def feature_func(n:int,delta:float,mdp:MDP):
    if n == 0:
        return 1
    return np.min([1,np.sqrt((0.52/n) * (1.4*np.log(np.log(np.max([np.e,n]))) + np.log(26*mdp.S*mdp.A*(mdp.H+1+mdp.S)/delta)))])

In [87]:
def V_h_max(h:int,mdp:MDP):
    return (mdp.H-h+1)

In [88]:
def V_std(P,V):
    # return np.sqrt(np.sum(P*V**2,axis=-1) - np.sum(P*V,axis=-1)**2)
    # return np.sqrt(np.sum(P*(V-P*V)**2,axis=-1))
    # std = np.sqrt(np.sum(P*V**2,axis=-1) - np.sum(2*(P**2) * V**2,axis=-1) + np.sum(P**3 * V**2,axis=-1))
    PV = np.sum(P*V,axis=-1)
    std = np.sqrt(np.sum(P * (V-PV)**2,axis=-1))
    return std

In [89]:
def vector_1_norm(V1, V2):
    """
    Calculate the 1-norm of the difference between two vectors V1 and V2.

    :param V1: First vector.
    :param V2: Second vector.
    :return: The 1-norm of the difference between V1 and V2.
    """
    return np.sum(np.abs(V1 - V2))

In [90]:
def gzi_upper_(mdp:MDP,h:int,feature_func,n_k,s,a,delta,P_k,V_upper,V_lower):
    gzi1 = (V_h_max(h+1,mdp) + 1) *feature_func(n_k[s,a],delta,mdp)
    gzi2 = (1 + np.sqrt(12) * np.sqrt(V_std(P_k[s,a],V_upper[h+1]) ** 2 +np.sum(P_k[s,a] * (V_upper[h+1] - V_lower[h+1]) ** 2,axis=-1)) * feature_func(n_k[s,a],delta,mdp)) + 8.13*V_h_max(h+1,mdp) * feature_func(n_k[s,a],delta,mdp) ** 2
    gzi3 = (1 + np.sqrt(12) * V_std(P_k[s,a],V_upper[h+1])) * feature_func(n_k[s,a],delta,mdp) + 1/mdp.H * np.sum(P_k[s,a] * (V_upper[h+1] - V_lower[h+1])) + (20.13 * mdp.H * vector_1_norm(V_upper[h+1],V_lower[h+1]) * feature_func(n_k[s,a],delta,mdp)) ** 2
    gzi = np.min([gzi1,gzi2,gzi3])
    # print(gzi1)
    # print(gzi2)
    # print(gzi3)
    # print(gzi)
    # print('---')
    return gzi

In [124]:
def ORLC(mdp:MDP,delta:float):
    np.random.seed(10)
    n_k = np.zeros((mdp.S,mdp.A))
    n_k_p = np.zeros((mdp.S,mdp.A,mdp.S))
    r_hat_k = np.zeros((mdp.S,mdp.A))
    r_k = np.zeros((mdp.S,mdp.A))
    P_k = np.zeros((mdp.S,mdp.A,mdp.S))
    V_lower = np.zeros((mdp.H+1,mdp.S))
    Q_lower = np.zeros((mdp.H,mdp.S,mdp.A))
    V_upper = np.zeros((mdp.H+1,mdp.S))
    Q_upper = np.zeros((mdp.H,mdp.S,mdp.A))
    gzi = np.zeros((mdp.H,mdp.S,mdp.A))
    policy = np.zeros((mdp.H,mdp.S))
    epsilon = np.zeros(mdp.K)
    experience = []
    V_lower[mdp.H] = 0
    V_upper[mdp.H] = 0
    for k in range(mdp.K):
        for h in reversed(range(mdp.H)):
            for s in range(mdp.S):
                for a in range(mdp.A):
                    if h + 1 == mdp.H:
                        gzi[h,s,a] = 45*mdp.S*mdp.H**2 * feature_func(n_k[s,a],delta,mdp) **2
                        gzi_upper = gzi_upper_(mdp,h,feature_func,n_k,s,a,delta,P_k,V_upper,V_lower)
                        Q_upper[h,s,a] = np.max([0,r_k[s,a] + np.sum(P_k[s,a,:]*V_upper[h+1,:]) + gzi_upper])
                        Q_upper[h,s,a] = np.min([V_h_max(h,mdp),Q_upper[h,s,a]])
                        Q_lower[h,s,a] = np.max([0,r_k[s,a] + np.sum(P_k[s,a]*V_lower[h+1]) - gzi[h,s,a]])
                        Q_lower[h,s,a] = np.min([V_h_max(h,mdp),Q_lower[h,s,a]])
                    else:
                        gzi[h,s,a] = (1 + np.sqrt(12)*V_std(P_k[s,a],V_upper[h+1]) * feature_func(n_k[s,a],delta,mdp)) + 45*mdp.S*mdp.H**2 * feature_func(n_k[s,a],delta,mdp) **2 + 1/mdp.H *np.sum(P_k[s,a]*(V_upper[h+1] - V_lower[h+1]))
                        gzi_upper = gzi_upper_(mdp,h,feature_func,n_k,s,a,delta,P_k,V_upper,V_lower)
                        Q_upper[h,s,a] = np.max([0,r_k[s,a] + np.sum(P_k[s,a]*V_upper[h+1]) + gzi_upper])
                        Q_upper[h,s,a] = np.min([V_h_max(h,mdp),Q_upper[h,s,a]])
                        Q_lower[h,s,a] = np.max([0,r_k[s,a] + np.sum(P_k[s,a]*V_lower[h+1]) - gzi[h,s,a]])
                        Q_lower[h,s,a] = np.min([V_h_max(h,mdp),Q_lower[h,s,a]])
                        # print(Q_upper[h,s,a])
        for h in reversed(range(mdp.H)):
            for s in range(mdp.S):
                # policy[h-1,s] = np.argmax(Q_upper[h-1,s],axis=-1)
                max_value = np.max(Q_upper[h,s],axis=-1)
                max_indeices = [i for i,value in enumerate(Q_upper[h,s]) if value == max_value]
                a = np.random.choice(max_indeices)
                policy[h,s] = a
                # print(policy[h,s],a)
                V_upper[h,s] = Q_upper[h,s,a]
                V_lower[h,s] = Q_lower[h,s,a]
        epsilon[k] = np.abs(V_upper[0,0] - V_lower[0,0])
        s = 0
        # experience = []
        for h in range(mdp.H):
            s_dash,rew = sampler(mdp,policy,s,h)
            # experience.append([s,int(policy[h,s]),rew,s_dash])
            n_k[s, a] += 1
            n_k_p[s, a, s_dash] += 1
            r_hat_k[s, a] += rew
            r_k[s,a] = r_hat_k[s,a] / n_k[s,a]
                # P_k[s, a, s_dash] = n_k_p[s, a, s_dash] / n_k[s, a]
            P_k[s,a,:] = n_k_p[s,a,:] / n_k[s,a]
            s=s_dash
    return policy,epsilon,Q_upper,Q_lower,V_upper,V_lower,r_k,P_k

In [125]:
policy,epsilon,Qu,Ql,Vu,Vl,r,P = ORLC(mdp,0.01)

In [126]:
epsilon

array([ 1., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31.,
       31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31.,
       31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31.,
       31., 31., 31., 31., 31., 31., 31., 31., 31., 31., 31.])

In [127]:
mdp.rew

array([[0.77132064, 0.02075195, 0.63364823],
       [0.74880388, 0.49850701, 0.22479665],
       [0.19806286, 0.76053071, 0.16911084],
       [0.08833981, 0.68535982, 0.95339335],
       [0.00394827, 0.51219226, 0.81262096],
       [0.61252607, 0.72175532, 0.29187607]])

In [128]:
r

array([[0.45869818, 0.48844094, 0.42214298],
       [0.50306127, 0.49078297, 0.43383418],
       [0.38370205, 0.37324716, 0.3699841 ],
       [0.5626646 , 0.53359543, 0.51441977],
       [0.48524248, 0.46369019, 0.48142501],
       [0.5626354 , 0.54145882, 0.53243794]])

In [133]:
policy[0]

array([0., 1., 2., 0., 0., 1.])

In [130]:
def Q_iteration(mdp:MDP):
    Q = np.zeros((mdp.S,mdp.A))
    for s in range(mdp.S):
        for a in range(mdp.A):
            Q[s,a] = mdp.rew[s,a] + mdp.gamma * np.sum(mdp.P[s,a,:] * np.max(Q,axis=-1))
    return Q
Q = Q_iteration(mdp)
a_index = np.argmax(Q,axis=-1)



In [131]:
a_index

array([2, 0, 1, 2, 2, 1])