# Chapter 4 Dynamic Programming

_**Dynamic programming (DP)**_: a collection of algorithms that can
be used to compute optimal policies given a perfect model of the environment as
a Markov decision process (MDP).

In [71]:
import numpy as np
class GridWorld():
    def __init__(self, n_row = 4, n_col = 4, terminal_state = None):
        self.n_row_ = n_row
        self.n_col_ = n_col
        self.grid_ = np.zeros((self.n_row_, self.n_col_), dtype=np.float32)
        self.terminal_ = terminal_state
        for state in self.terminal_.keys():
            self.grid_[state] = self.terminal_[state]
        pass
    def __str__(self):
        repr_state = lambda s: '[%.1f]'%(self.grid_[s]) if s in self.terminal_ else ' %.1f '%(self.grid_[s])
        return '\n'.join(['  '.join([repr_state((r,c)) for c in  range(self.n_col_)]) for r in range(self.n_row_)])
    def next_states(self, state):
        if state in self.terminal_:
            return [state]
        states=[]
        r, c = state
        if r == 0: states.append(state)
        else: states.append((r-1,c))
        if c == 0: states.append(state)
        else: states.append((r,c-1))
        if r == self.n_row_-1: states.append(state)
        else: states.append((r+1,c))
        if c == self.n_col_-1: states.append(state)
        else: states.append((r,c+1))
        return states
    def policy_iter(self, in_place = False):
        if in_place:
            new_grid = self.grid_
        else:
            new_grid = np.zeros((self.n_row_, self.n_col_), dtype=np.float32)
        for r in range(self.n_row_):
            for c in  range(self.n_col_):
                if (r,c) in self.terminal_:
                    new_grid[(r,c)] = self.grid_[(r,c)]
                else:
                    new_grid[(r,c)] = np.average([self.grid_[s] for s in self.next_states((r,c))])-1
        self.grid_ = new_grid
    def value_iter(self, in_place = False):
        if in_place:
            new_grid = self.grid_
        else:
            new_grid = np.zeros((self.n_row_, self.n_col_), dtype=np.float32)
        for r in range(self.n_row_):
            for c in  range(self.n_col_):
                if (r,c) in self.terminal_:
                    new_grid[(r,c)] = self.grid_[(r,c)]
                else:
                    new_grid[(r,c)] = np.max([self.grid_[s] for s in self.next_states((r,c))])-1
        self.grid_ = new_grid
    
# The top-left and bottom-right corners are terminal states with value 0.
terminal_state = dict.fromkeys(((0, 0), (3, 3)), 0)
gw = GridWorld(n_row=4, n_col=4, terminal_state=terminal_state)
# Run ten value-iteration sweeps, printing the grid after each one.
for i in range(10):
    gw.value_iter()
    print(str(gw))


[0.0]   -1.0    -1.0    -1.0 
 -1.0    -1.0    -1.0    -1.0 
 -1.0    -1.0    -1.0    -1.0 
 -1.0    -1.0    -1.0    -1.0 
[0.0]   -1.0    -2.0    -2.0 
 -1.0    -2.0    -2.0    -2.0 
 -2.0    -2.0    -2.0    -2.0 
 -2.0    -2.0    -2.0    -2.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -3.0 
 -2.0    -3.0    -3.0    -3.0 
 -3.0    -3.0    -3.0    -3.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -4.0 
 -2.0    -3.0    -4.0    -4.0 
 -3.0    -4.0    -4.0    -4.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -4.0 
 -2.0    -3.0    -4.0    -5.0 
 -3.0    -4.0    -5.0    -5.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -4.0 
 -2.0    -3.0    -4.0    -5.0 
 -3.0    -4.0    -5.0    -6.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -4.0 
 -2.0    -3.0    -4.0    -5.0 
 -3.0    -4.0    -5.0    -6.0 
[0.0]   -1.0    -2.0    -3.0 
 -1.0    -2.0    -3.0    -4.0 
 -2.0    -3.0    -4.0    -5.0 
 -3.0    -4.0    -5.0    -6.0 
[0.0]   -1.0    