## 值迭代

&emsp;&emsp;值迭代的算法伪代码如下所示:

<img src="../images/11-value_iteration.png" width="50%">

首先，可以用任意值初始化值估计$V$。然后反复遍历所有状态并更新值估计；当一轮更新中值估计的最大变化量$\Delta$小于一个较小的阈值$\theta$时，就认为值估计已经收敛并退出循环。最后，对每个状态选取使值估计最大的动作，作为最终的策略输出。

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class GridWorld(object):
    """Deterministic grid world with teleporting "magic squares".

    The grid has ``m`` rows and ``n`` columns. Cells are numbered row by row,
    so the cell at (row, col) is state ``row * n + col``. The last cell,
    ``m * n - 1``, is the only terminal state.

    Attributes:
        grid: ``(m, n)`` array of zeros, used only for its shape.
        stateSpace: list of all non-terminal states.
        stateSpacePlus: list of all states, terminal included.
        possibleActions: the action labels ``'U'``, ``'D'``, ``'L'``, ``'R'``.
        actionSpace: maps each action label to its state-index offset.
        magicSquares: maps a square to the square the agent is moved to when
            it enters it.
        P: transition model; keys are ``(newState, reward, state, action)``
            and every stored value is the probability ``1``.
    """

    def __init__(self, m, n, magicSquares):
        self.grid = np.zeros((m, n))
        self.m = m
        self.n = n
        self.stateSpace = [i for i in range(self.m * self.n)]
        self.stateSpace.remove(self.m * self.n - 1)
        self.stateSpacePlus = [i for i in range(self.m * self.n)]
        self.possibleActions = ['U', 'D', 'L', 'R']
        # A row holds n cells, so one step up or down shifts the index by n.
        # (The stride must agree with the `% self.n` column test used in
        # offGridMove; using m here is only right for square grids.)
        self.actionSpace = {'U': -self.n, 'D': self.n, 'L': -1, 'R': 1}
        self.P = {}
        # dict with magic squares and resulting squares
        self.magicSquares = magicSquares
        self.initP()

    def initP(self):
        """Fill ``self.P`` with one transition per (state, action) pair.

        A move that would leave the grid keeps the agent where it is.
        Every transition costs -1 except one that ends in the terminal
        state, which is rewarded 0.
        """
        for state in self.stateSpace:
            for action in self.possibleActions:
                state_ = state + self.actionSpace[action]
                # Validate the raw move before teleporting: otherwise an
                # illegal wrap-around step that happens to land on a magic
                # square would be judged by the teleport destination and
                # could slip through.
                if self.offGridMove(state_, state):
                    state_ = state
                elif state_ in self.magicSquares:
                    state_ = self.magicSquares[state_]
                reward = 0 if self.isTerminalState(state_) else -1
                self.P[(state_, reward, state, action)] = 1

    def isTerminalState(self, state):
        """Return True if ``state`` is on the grid but not in ``stateSpace``."""
        return state in self.stateSpacePlus and state not in self.stateSpace

    def offGridMove(self, newState, oldState):
        """Return True if stepping from ``oldState`` to ``newState`` is illegal.

        A step is illegal when ``newState`` is not a cell of the grid, or
        when it jumps between the leftmost and the rightmost column, i.e.
        wraps around to a neighbouring row.
        """
        # if we move into a row not in the grid
        if newState not in self.stateSpacePlus:
            return True
        # if we're trying to wrap around to next row
        elif oldState % self.n == 0 and newState % self.n == self.n - 1:
            return True
        elif oldState % self.n == self.n - 1 and newState % self.n == 0:
            return True
        else:
            return False

In [3]:


def printV(V, grid):
    """Print the state values arranged like the grid.

    Args:
        V: mapping from state index to its value estimate.
        grid: object whose ``grid`` attribute is a 2-D array; only its shape
            is used, to walk the cells row by row.

    Each value is printed with two decimals followed by a tab, each row is
    followed by a blank line, and a dashed separator closes the table.
    """
    for idx, row in enumerate(grid.grid):
        # States are numbered row by row, so the stride is the row length
        # (number of columns), not the number of rows.
        rowStart = idx * len(row)
        for idy, _ in enumerate(row):
            print('%.2f' % V[rowStart + idy], end='\t')
        print('\n')
    print('--------------------')

def printPolicy(policy, grid):
    """Print the action chosen for each state, arranged like the grid.

    Args:
        policy: mapping from state index to the value printed for it.
        grid: object providing ``grid`` (2-D array, used for its shape),
            ``isTerminalState(state)`` and ``magicSquares`` (a mapping whose
            keys are the magic squares).

    Terminal states and magic squares are shown as ``--``. Each cell is
    followed by a tab, each row by a blank line, and a dashed separator
    closes the table.
    """
    for idx, row in enumerate(grid.grid):
        # States are numbered row by row, so the stride is the row length
        # (number of columns), not the number of rows.
        rowStart = idx * len(row)
        for idy, _ in enumerate(row):
            state = rowStart + idy
            if grid.isTerminalState(state) or state in grid.magicSquares:
                print('--', end='\t')
            else:
                print('%s' % policy[state], end='\t')
        print('\n')
    print('--------------------')


def iterateValues(grid, V, policy, GAMMA, THETA):
    """Run in-place value iteration, then extract a greedy policy.

    Args:
        grid: environment providing ``stateSpace`` (iterable of states to
            update), ``actionSpace`` (iterable of actions) and ``P``, a dict
            keyed by ``(newState, reward, oldState, action)`` whose values
            are transition probabilities.
        V: dict mapping every state to its current value estimate. It is
            updated in place.
        policy: dict mapping states to actions. It is updated in place.
        GAMMA: discount factor applied to the value of the next state.
        THETA: the update loop stops after a sweep in which no state value
            changed by ``THETA`` or more.

    Returns:
        The same ``V`` and ``policy`` objects that were passed in.

    The value of an action is the probability-weighted sum over all of its
    outcomes, ``sum(p * (reward + GAMMA * V[newState]))``. Ties between
    actions go to the action that comes first in ``grid.actionSpace``.
    States that have no transition in ``grid.P`` are left untouched.
    """
    # Group the model by (state, action) once, so that every backup is a
    # dictionary lookup instead of a scan over the whole of grid.P.
    outcomes = {}
    for (newState, reward, oldState, act), prob in grid.P.items():
        outcomes.setdefault((oldState, act), []).append(
            (prob, reward, newState))

    def actionValues(state):
        """Return (action, expected return) pairs for `state`, in action order."""
        pairs = []
        for action in grid.actionSpace:
            branch = outcomes.get((state, action))
            if branch:
                expected = sum(p * (r + GAMMA * V[s]) for p, r, s in branch)
                pairs.append((action, expected))
        return pairs

    # NOTE: this counter is incremented once per state visited (in the
    # update sweeps and in the policy extraction below), so the number
    # printed at the end is a count of state backups rather than of full
    # sweeps. The message is kept as is so existing output does not change.
    i = 0
    converged = False
    while not converged:
        DELTA = 0
        for state in grid.stateSpace:
            i += 1
            candidates = actionValues(state)
            if not candidates:
                continue
            oldV = V[state]
            V[state] = max(value for _, value in candidates)
            DELTA = max(DELTA, abs(oldV - V[state]))
        converged = DELTA < THETA

    # Greedy policy with respect to the converged values.
    for state in grid.stateSpace:
        i += 1
        candidates = actionValues(state)
        if not candidates:
            continue
        # max() returns the first maximal item, i.e. the earliest action.
        policy[state] = max(candidates, key=lambda pair: pair[1])[0]
    print(i, 'sweeps of state space for value iteration')
    return V, policy

if __name__ == '__main__':
    # Each magic square is mapped to the square it connects to.
    magicSquares = {18: 54, 63: 14}
    env = GridWorld(9, 9, magicSquares)

    # Model hyperparameters.
    GAMMA = 1.0
    THETA = 1e-6  # convergence criterion

    # Start with a value estimate of 0 for every state.
    V = dict.fromkeys(env.stateSpacePlus, 0)

    # Start from the equiprobable random policy: every non-terminal state
    # is associated with the full list of possible actions.
    policy = {state: env.possibleActions for state in env.stateSpace}

    # Run value iteration twice in a row.
    for _ in range(2):
        V, policy = iterateValues(env, V, policy, GAMMA, THETA)

    printV(V, env)
    printPolicy(policy, env)



1200 sweeps of state space for value iteration
160 sweeps of state space for value iteration
-11.00	-12.00	-13.00	-12.00	-11.00	-10.00	-9.00	-8.00	-7.00	

-10.00	-11.00	-12.00	-11.00	-10.00	-9.00	-8.00	-7.00	-6.00	

-11.00	-10.00	-11.00	-10.00	-9.00	-8.00	-7.00	-6.00	-5.00	

-10.00	-11.00	-10.00	-9.00	-8.00	-7.00	-6.00	-5.00	-4.00	

-11.00	-10.00	-9.00	-8.00	-7.00	-6.00	-5.00	-4.00	-3.00	

-10.00	-9.00	-8.00	-7.00	-6.00	-5.00	-4.00	-3.00	-2.00	

-9.00	-8.00	-7.00	-6.00	-5.00	-4.00	-3.00	-2.00	-1.00	

-8.00	-7.00	-6.00	-5.00	-4.00	-3.00	-2.00	-1.00	0.00	

-7.00	-6.00	-5.00	-4.00	-3.00	-2.00	-1.00	0.00	0.00	

--------------------
D	D	D	D	D	D	D	D	D	

D	D	D	D	D	D	D	D	D	

--	L	D	D	D	D	D	D	D	

U	U	D	D	D	D	D	D	D	

U	D	D	D	D	D	D	D	D	

D	D	D	D	D	D	D	D	D	

R	D	D	D	D	D	D	D	D	

--	D	D	D	D	D	D	D	D	

R	R	R	R	R	R	R	R	--	

--------------------
