In [1]:
import math
import numpy as np
from MDPGridWorld import *

In [2]:
# Grid layout, listed top row first. Judging from the rendering printed below,
# '#' is drawn as an obstacle cell, '@' marks where the agent starts, and the
# numeric entries (+100 / -100) are drawn as reward cells.
# NOTE(review): the exact meaning of each symbol is defined in MDPGridWorld --
# confirm there.
book_grid = [
    [' ', ' ', ' ', +100],
    [' ', '#', ' ', -100],
    ['@', ' ', ' ', ' '],
]

# silent=False presumably makes the constructor print the grid (see the
# rendering right after this cell) -- TODO confirm against MDPGridWorld.
gw = MDPGridWorld(book_grid, silent=False)

_____	_____	_____	+100_	
_____	__#__	_____	-100_	
__@__	_____	_____	_____	



In [3]:
# Advance the agent by one step using the North action and store the result
# back on the world, then print the updated grid.
# This cell overwrites gw.curr_state_idx, so running it again takes another
# step from wherever the agent currently is.
gw.curr_state_idx = gw.sample_next_state(
    gw.curr_state_idx,
    MDPGridWorld.North,
)
print(gw)

_____	_____	_____	+100_	
__@__	__#__	_____	-100_	
_____	_____	_____	_____	



In [4]:
# Show the index of every state laid out on the grid. gw.nS is the number of
# states; the printed result lists indices 0..11 on the grid plus a separate
# absorbing state (12).
print("States (fused into a single grid-bc it's possible to do so here):")
gw.disp_custom_grid(range(gw.nS), formatting=str)

States (fused into a single grid-bc it's possible to do so here):
0	1	2	3	
4	5	6	7	
8	9	10	11	
Absorbing state: 12


In [5]:
# Solve the MDP with value iteration from its transition model (gw.T),
# rewards (gw.R) and discount (gw.gamma). The result exposes the state values
# as vi.V and the greedy policy as vi.policy.
vi = run_value_iteration(gw.T, gw.R, gw.gamma)

# State values, rendered with three decimals.
# NOTE(review): the obstacle cell shows a huge negative value in the recorded
# output; presumably a large penalty encodes the wall -- confirm in MDPGridWorld.
print("Optimal Value:")
gw.disp_custom_grid(vi.V, formatting=lambda x: format(x, ".3f"))

# Policy entries are action indices; map each one to its display name.
print("\nOptimal Policy:")
gw.disp_custom_grid(vi.policy, lambda x: format(gw.actions_name[x], ""))

Optimal Value:
95.118	96.481	97.712	100.000	
93.919	-9999999905.440	89.378	-100.000	
92.592	91.425	90.174	81.809	
Absorbing state: 0.000

Optimal Policy:
East	East	East	North	
North	North	West	North	
North	West	West	South	
Absorbing state: North


## Sampling Trajectories

In [6]:
# Draw one trajectory from a randomly chosen start state. Each entry prints as
# a (state index, action index, reward) triple; interpret_trajectory renders
# the same triples as grid / action name / reward.
# NOTE(review): the recorded output has max_length + 1 entries -- confirm
# whether max_length counts transitions rather than visited states.
tau = gw.sample_trajectory(
    init_state_idx="random",
    max_length=5,
)
print(tau)
gw.interpret_trajectory(tau)

[(6, 3, -0.01), (6, 3, -0.01), (10, 3, -0.01), (9, 3, -0.01), (8, 0, -0.01), (4, 0, -0.01)]
s:
_____	_____	_____	+100_	
_____	__#__	__@__	-100_	
_____	_____	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	+100_	
_____	__#__	__@__	-100_	
_____	_____	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	+100_	
_____	__#__	_____	-100_	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	+100_	
_____	__#__	_____	-100_	
_____	__@__	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	+100_	
_____	__#__	_____	-100_	
__@__	_____	_____	_____	

a:  North
r:  -0.01
s:
_____	_____	_____	+100_	
__@__	__#__	_____	-100_	
_____	_____	_____	_____	

a:  North
r:  -0.01


In [7]:
# Sample a batch of 10 trajectories (max_length=10 each) and print one per
# line, labelled T000, T001, ...
# Note: the loop variable rebinds the notebook-level name `tau`.
tau_list = gw.sample_trajectories(10, max_length=10)
for i, tau in enumerate(tau_list):
    print("T{:03d}: {}".format(i, tau))

T000: [(6, 3, -0.01), (6, 3, -0.01), (2, 1, 100.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T001: [(11, 2, -0.01), (11, 2, -0.01), (10, 3, -0.01), (9, 3, -0.01), (8, 0, -0.01), (9, 3, -0.01), (9, 3, -0.01), (8, 0, -0.01), (4, 0, -0.01), (0, 1, -0.01), (1, 1, -0.01)]
T002: [(6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (2, 1, 100.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T003: [(0, 1, -0.01), (1, 1, -0.01), (2, 1, -0.01), (6, 3, -0.01), (2, 1, 100.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T004: [(6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (10, 3, -0.01), (6, 3, -0.01), (10, 3, -0.01)]
T005: [(1, 1, -0.01), (2, 1, -0.01), (2, 1, 100.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0