In [1]:
import math
import numpy as np
from MDPGridWorld import *

In [2]:
# Seed NumPy's global RNG so the stochastic sampling cells below are repeatable.
# NOTE(review): assumes MDPGridWorld draws from np.random's global state — confirm.
np.random.seed(0)

In [3]:
# 3x4 grid layout handed to MDPGridWorld: numeric cells (+1 / -1), a '#' cell,
# an '@' cell and blank ' ' cells.
# NOTE(review): presumably numbers are terminal rewards, '#' is the obstacle and
# '@' is the agent's start position — confirm against MDPGridWorld.
book_grid = [
    [' ', ' ', ' ', +1],
    [' ', '#', ' ', -1],
    ['@', ' ', ' ', ' '],
]

# Build the grid world; with verbose=True the recorded output shows the grid
# being printed right after construction.
gw = MDPGridWorld(
    book_grid,
    action_noise_dist=[0.1, 0.8, 0.1],
    obstacle_reward=-2,
    verbose=True,
)

_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
__@__	_____	_____	_____	



In [4]:
# Sample one step North from the agent's current state, store the resulting
# state back on the grid world, and print the updated grid.
# The second return value is not needed here; it is bound to a named throwaway
# rather than `_`, because assigning to `_` in IPython/Jupyter shadows the
# built-in "last output" variable and stops it from being updated.
# Note: this cell reads and writes gw.curr_state_idx, so re-running it moves
# the agent again.
gw.curr_state_idx, _unused = gw.sample_next_state(gw.curr_state_idx, MDPGridWorld.North)
print(gw)

_____	_____	_____	_+1__	
__@__	__#__	_____	_-1__	
_____	_____	_____	_____	



In [5]:
# Render the state indices 0..nS-1 through the grid display helper so each
# cell shows its own index.
print("States (fused into a single grid-bc it's possible to do so here):")
gw.disp_custom_grid(range(gw.nS), formatting=str)

States (fused into a single grid-bc it's possible to do so here):
------------
0	1	2	3	
4	5	6	7	
8	9	10	11	
Absorbing state: 12
------------



In [6]:
# Run value iteration on the grid world's T, R and gamma
# (presumably transition model, reward and discount factor — confirm).
vi = run_value_iteration(gw.T, gw.R, gw.gamma)

# State values, shown with three decimals.
print("Optimal Value:")
gw.disp_custom_grid(vi.V, formatting="{:.3f}".format)

# Greedy action per state, shown by action name instead of action index.
print("\nOptimal Policy:")
gw.disp_custom_grid(vi.policy, formatting=lambda a: "{:}".format(gw.actions_name[a]))

Optimal Value:
------------
0.903	0.930	0.955	1.000	
0.880	-1.098	0.790	-1.000	
0.853	0.830	0.805	0.639	
Absorbing state: 0.000
------------


Optimal Policy:
------------
East	East	East	North	
North	North	West	North	
North	West	West	South	
Absorbing state: North
------------



## Sampling Trajectories

In [7]:
# Sample one trajectory with init_state_idx="random" (presumably a randomly
# chosen start state) and print it raw; the recorded output shows each entry
# as a 3-tuple that interpret_trajectory renders as state / action / reward.
# NOTE(review): the recorded output lists 6 entries for max_length=5 — confirm
# whether max_length counts transitions rather than entries.
tau = gw.sample_trajectory(init_state_idx="random", max_length=5)
print(tau)
# Replay the same trajectory step by step as rendered grids (s), actions (a)
# and rewards (r).
gw.interpret_trajectory(tau)

[(0, 1, -0.01), (1, 1, -0.01), (2, 1, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
s:
__@__	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	_____	_____	

a:  East
r:  -0.01
s:
_____	__@__	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	_____	_____	

a:  East
r:  -0.01
s:
_____	_____	__@__	_+1__	
_____	__#__	_____	_-1__	
_____	_____	_____	_____	

a:  East
r:  1.0
s:
_____	_____	_____	__@__	
_____	__#__	_____	_-1__	
_____	_____	_____	_____	

a:  North
r:  0.0


In [8]:
# Sample a batch of 10 trajectories and print one per line, each prefixed
# with a zero-padded label (T000, T001, ...).
tau_list = gw.sample_trajectories(10, max_length=10)
for i, tau in enumerate(tau_list):
    print("T{:03d}: {}".format(i, tau))

T000: [(6, 3, -0.01), (2, 1, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T001: [(8, 0, -0.01), (4, 0, -0.01), (4, 0, -0.01), (0, 1, -0.01), (1, 1, -0.01), (2, 1, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01), (6, 3, -0.01)]
T002: [(3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T003: [(1, 1, -0.01), (1, 1, -0.01), (2, 1, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T004: [(11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01)]
T005: [(1, 1, -0.01), (2, 1, -0.01), (2, 1, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T00

## Visiting obstacles

In [9]:
# Rebuild the grid world on the same layout with different action noise,
# a mild obstacle reward (-0.2) and visit_obstacles=True.
gw = MDPGridWorld(
    book_grid,
    action_noise_dist=[0.4, 0.2, 0.4],
    obstacle_reward=-0.2,
    visit_obstacles=True,
    verbose=True,
)

_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
__@__	_____	_____	_____	



In [10]:
# Display the policy returned by _get_optimal_policy; the recorded output shows
# it as a grid of action names.
# NOTE(review): _get_optimal_policy is underscore-prefixed (private by
# convention); prefer a public accessor if MDPGridWorld offers one.
gw.disp_policy(gw._get_optimal_policy())

------------
North	North	North	North	
West	North	West	North	
West	South	West	South	
Absorbing state: North
------------



In [11]:
# Sample 10 trajectories that all start from state index 6 and print one per
# line, each prefixed with a zero-padded label (T000, T001, ...).
tau_list = gw.sample_trajectories(10, init_state_idx=6, max_length=10)
for i, tau in enumerate(tau_list):
    print("T{:03d}: {}".format(i, tau))

T000: [(6, 3, -0.01), (2, 0, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T001: [(6, 3, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T002: [(6, 3, -0.2), (5, 0, -0.01), (1, 0, -0.01), (2, 0, -0.01), (1, 0, -0.01), (2, 0, -0.01), (1, 0, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T003: [(6, 3, -0.2), (5, 0, -0.01), (4, 3, -0.01), (0, 0, -0.01), (1, 0, -0.01), (1, 0, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T004: [(6, 3, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T005: [(6, 3, -0.01), (10, 3, -0.01), (10, 3, -0.01), (9, 2, -0.01), (10, 3, -0.01), (10, 3, -0.01), (9, 2, -0.01), (10, 3, -0.01), (6, 3, -0.01), (2, 0, 1.0), (3, 0, 1.0)]
T006: [(6, 3, -0.

In [12]:
# This trajectory visits the obstacle: in the recorded output one step is
# rewarded -0.2 (this grid world's obstacle_reward) and the agent is then drawn
# on the '#' cell.
# NOTE(review): index 8 was presumably picked by inspecting the seeded samples
# above; re-check it if the seed or any earlier sampling cell changes.
gw.interpret_trajectory(tau_list[8])

s:
_____	_____	_____	_+1__	
_____	__#__	__@__	_-1__	
_____	_____	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	__@__	_-1__	
_____	_____	_____	_____	

a:  West
r:  -0.2
s:
_____	_____	_____	_+1__	
_____	__@__	_____	_-1__	
_____	_____	_____	_____	

a:  North
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	__@__	_-1__	
_____	_____	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	__@__	_-1__	
_____	_____	_____	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
_____	_____	__@__	_____	

a:  West
r:  -0.01
s:
_____	_____	_____

In [13]:
# Same layout and action noise as the previous grid world, but with a much
# harsher obstacle reward (-10).
gw = MDPGridWorld(
    book_grid,
    action_noise_dist=[0.4, 0.2, 0.4],
    obstacle_reward=-10,
    visit_obstacles=True,
    verbose=True,
)

_____	_____	_____	_+1__	
_____	__#__	_____	_-1__	
__@__	_____	_____	_____	



In [14]:
# Display the policy returned by _get_optimal_policy for the grid world built
# with obstacle_reward=-10; the recorded output shows a grid of action names.
# NOTE(review): _get_optimal_policy is underscore-prefixed (private by
# convention); prefer a public accessor if MDPGridWorld offers one.
gw.disp_policy(gw._get_optimal_policy())

------------
North	North	North	North	
West	West	East	North	
West	South	South	South	
Absorbing state: North
------------



^ The policy avoids the obstacle at all costs because the obstacle reward is -10, so obstacle states are much less likely to be visited along the optimal agent's trajectories.

In [15]:
# Sample 10 trajectories that all start from state index 6 and print one per
# line, each prefixed with a zero-padded label (T000, T001, ...).
tau_list = gw.sample_trajectories(10, init_state_idx=6, max_length=10)
for i, tau in enumerate(tau_list):
    print("T{:03d}: {}".format(i, tau))

T000: [(6, 1, -0.01), (2, 0, 1.0), (3, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T001: [(6, 1, -1.0), (7, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T002: [(6, 1, -1.0), (7, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0), (12, 0, 0.0)]
T003: [(6, 1, -0.01), (10, 2, -0.01), (11, 2, -0.01), (10, 2, -0.01), (9, 2, -0.01), (10, 2, -0.01), (9, 2, -0.01), (9, 2, -0.01), (8, 3, -0.01), (4, 3, -0.01), (0, 3, -0.01)]
T004: [(6, 1, -0.01), (10, 2, -0.01), (9, 2, -0.01), (8, 3, -0.01), (8, 3, -0.01), (8, 3, -0.01), (4, 3, -0.01), (0, 0, -0.01), (1, 0, -0.01), (1, 0, -0.01), (0, 0, -0.01)]
T005: [(6, 1, -0.01), (10, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (11, 2, -0.01), (10, 2, -0.01), (9, 2, -0.01), (8, 3, -0.01), (4, 3, -0.01), (8, 3, -