# 動的計画法

強化学習でまず必要になるのは、価値関数を求めることである。  
- 小さな問題の場合: ベルマン方程式を用いて連立方程式を得て、連立方程式ソルバを使って価値関数を求められる
- 小さくない問題（状態と行動のパターンが多い）の場合: 動的計画法を用いて価値関数を評価できる

In [1]:
# Iterative policy evaluation: refine the value estimates 100 times.
# ("Updating the value function" may sound like rewriting a method's
# implementation, but here it only means overwriting the numbers stored in
# the table V.)

V = dict.fromkeys(('L1', 'L2'), 0.0)
new_V = dict(V)  # separate copy of V

for _ in range(100):
    # Both entries are computed from the previous sweep's V only, so the
    # freshly computed L1 value is NOT used while computing L2.
    new_V = {
        'L1': 0.5 * (-1 + 0.9 * V['L1']) + 0.5 * (1 + 0.9 * V['L2']),
        'L2': 0.5 * (0 + 0.9 * V['L1']) + 0.5 * (-1 + 0.9 * V['L2']),
    }

    # Promote the new estimates to "current" for the next sweep.
    V = new_V.copy()
    print(V)

{'L1': 0.0, 'L2': -0.5}
{'L1': -0.22499999999999998, 'L2': -0.725}
{'L1': -0.42749999999999994, 'L2': -0.9274999999999999}
{'L1': -0.6097499999999999, 'L2': -1.10975}
{'L1': -0.7737750000000001, 'L2': -1.273775}
{'L1': -0.9213975000000002, 'L2': -1.4213975}
{'L1': -1.05425775, 'L2': -1.55425775}
{'L1': -1.1738319750000001, 'L2': -1.6738319750000001}
{'L1': -1.2814487775, 'L2': -1.7814487775}
{'L1': -1.3783038997500001, 'L2': -1.8783038997500001}
{'L1': -1.4654735097750002, 'L2': -1.965473509775}
{'L1': -1.5439261587975002, 'L2': -2.0439261587975004}
{'L1': -1.61453354291775, 'L2': -2.11453354291775}
{'L1': -1.6780801886259753, 'L2': -2.1780801886259753}
{'L1': -1.735272169763378, 'L2': -2.2352721697633777}
{'L1': -1.7867449527870403, 'L2': -2.28674495278704}
{'L1': -1.833070457508336, 'L2': -2.3330704575083363}
{'L1': -1.8747634117575025, 'L2': -2.3747634117575025}
{'L1': -1.9122870705817523, 'L2': -2.412287070581752}
{'L1': -1.9460583635235769, 'L2': -2.446058363523577}
{'L1': -1.9764

In [2]:
# Iterative policy evaluation, in-place variant: one table is overwritten
# state by state until the largest change in a sweep drops below 0.0001.
V = {'L1': 0.0, 'L2': 0.0}
cnt = 0

converged = False
while not converged:
    # The L1 estimate written here already reflects this sweep's update ...
    previous = V['L1']
    V['L1'] = 0.5 * (-1 + 0.9 * previous) + 0.5 * (1 + 0.9 * V['L2'])
    delta = abs(V['L1'] - previous)

    # ... and is consumed immediately by the L2 update of the same sweep.
    # NOTE (author's open question): is this of little benefit unless the
    # task lets the agent pass through the same state many times?
    previous = V['L2']
    V['L2'] = 0.5 * (0 + 0.9 * V['L1']) + 0.5 * (-1 + 0.9 * previous)
    delta = max(delta, abs(V['L2'] - previous))

    cnt += 1
    converged = delta < 0.0001

# Report the final estimates and the number of sweeps that were needed.
print(V)
print(cnt)


In [2]:
# 3x4のグリッドワールドを考える

import numpy as np

UP, DOWN, LEFT, RIGHT = range(4)


class MyGridWorld:
    """A 3x4 grid world with one wall cell, a +1.0 cell and a -1.0 cell.

    States are ``(row, column)`` tuples; actions are the integer ids
    ``UP``, ``DOWN``, ``LEFT`` and ``RIGHT``.
    """

    def __init__(self):
        self.start_state = (2, 0)
        self.goal_state = (0, 3)
        self.wall_state = (1, 1)
        self.agent_state = self.start_state

        self.action_space = [UP, DOWN, LEFT, RIGHT]

        # Reward looked up by the cell being entered. The None entry sits at
        # wall_state; because of it the array is stored with dtype=object.
        self.reward_map = np.array([
            [0, 0, 0, 1.0],
            [0, None, 0, -1.0],
            [0, 0, 0, 0],
        ])

    @property
    def shape(self):
        """``(rows, columns)`` of the reward grid."""
        return self.reward_map.shape

    @property
    def height(self):
        """Number of rows in the grid."""
        return self.shape[0]

    @property
    def width(self):
        """Number of columns in the grid."""
        return self.shape[1]

    def actions(self):
        """Return the list of available action ids."""
        return self.action_space

    def states(self):
        """Yield every ``(row, column)`` cell in row-major order (wall included)."""
        yield from (
            (row, col)
            for row in range(self.height)
            for col in range(self.width)
        )

    def next_state(self, current_state, action):
        """Return the cell reached by taking ``action`` from ``current_state``.

        A move that would leave the grid or enter the wall cell leaves the
        agent where it is.
        """
        # Offsets are indexed by action id: UP, DOWN, LEFT, RIGHT.
        dy, dx = ((-1, 0), (1, 0), (0, -1), (0, 1))[action]
        candidate = (current_state[0] + dy, current_state[1] + dx)
        ny, nx = candidate

        if not (0 <= ny < self.height and 0 <= nx < self.width):
            return current_state
        if candidate == self.wall_state:
            return current_state
        return candidate

    def reward(self, current_state, action, next_state):
        """Return the reward stored for ``next_state``.

        ``current_state`` and ``action`` are unused; they are accepted only so
        the signature mirrors the r(s, a, s') notation.
        """
        return self.reward_map[next_state]



In [3]:
my_env = MyGridWorld()
# Show the grid's height, width and shape, one value per line.
print(my_env.height, my_env.width, my_env.shape, sep='\n')


3
4
(3, 4)


In [4]:
# Print every action id, a separator line, then every grid cell,
# each on its own line.
print(*my_env.actions(), sep='\n')

print('===')

print(*my_env.states(), sep='\n')

0
1
2
3
===
(0, 0)
(0, 1)
(0, 2)
(0, 3)
(1, 0)
(1, 1)
(1, 2)
(1, 3)
(2, 0)
(2, 1)
(2, 2)
(2, 3)


In [6]:
from collections import defaultdict
# Policy table: any state that has not been assigned yet gets a fresh dict
# giving each of the four action ids (0-3) probability 0.25.
pi = defaultdict(lambda: dict.fromkeys(range(4), 0.25))

state = (0, 1)
print(pi[state])

{0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}


## 方策反復法