In [20]:
import sys 
sys.path.append("../")

import numpy as np 
from pprint import pprint
from typing import TypeVar, Iterable, Mapping, Dict
import matplotlib.pyplot as plt 
import rl.markov_process as mp
import itertools
import rl.markov_decision_process as mdp
import rl.td as td 
# import rl.chapter3.simple_inventory_mdp_cap as simdp 
from rl.chapter3.simple_inventory_mdp_cap import InventoryState, SimpleInventoryMDPCap
import rl.monte_carlo as mc
import rl.chapter11.control_utils as control

### The original MDP used to test Tabular Monte Carlo Control with GLIE:

In [21]:
# Parameters forwarded to SimpleInventoryMDPCap below.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# gamma handed to policy_iteration_result further down in this cell.
user_gamma = 0.9

# Finite inventory MDP built from the parameters above.
si_mdp: mdp.FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

from rl.dynamic_programming import (
    policy_iteration_result,
    value_iteration_result,
)

print("MDP Policy Iteration: Optimal Value Function and Optimal Policy")
print("--------------" * 5)

# Solve the MDP with policy iteration; the returned pair is kept as the
# reference value function and policy.
opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)

# Values are rounded to 3 decimals for display only; opt_vf_pi itself is
# left unrounded.
pprint(dict((state, round(value, 3)) for state, value in opt_vf_pi.items()))
print()
print(opt_policy_pi)

MDP Policy Iteration: Optimal Value Function and Optimal Policy
----------------------------------------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.992,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.992,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.992,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.661,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.895,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.661}

For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



### Tabular Monte Carlo with GLIE:

In [41]:
# Number of items drawn from the GLIE control iterator is
# num_traces * num_episodes; only the last one is kept.
num_traces = 100
num_episodes = 100
num_updates = num_traces * num_episodes

# NOTE(review): no RNG seed is set in the visible cells, so the estimates
# below presumably differ from run to run — consider seeding in a config cell.
glie_it = control.glie_mc_finite_control_equal_wts(
    fmdp=si_mdp,
    gamma=user_gamma,
    # epsilon = 1 / k, where k is the episode count passed in by the library.
    epsilon_as_func_of_episodes=lambda k : k ** (-1.),
    episode_length_tolerance=1e-3
)

# Stream through the iterator so that intermediate approximations are
# discarded as we go instead of being collected into a throwaway list;
# after the loop `mc_vf` is the last approximation produced.
for mc_vf in itertools.islice(glie_it, num_updates):
    pass

# Group the learned Q-values by state in a single pass over the table.
# values_map is keyed by (state, action) pairs.
q_by_state = {}
for (state, action), q_value in mc_vf.values_map.items():
    q_by_state.setdefault(state, {})[action] = q_value

# Greedy read-out of the Q-table. The results get their own names so that
# the policy-iteration reference (`opt_vf_pi`, `opt_policy_pi`) computed
# earlier in the notebook is not overwritten by Monte Carlo estimates.
# Ties are resolved in favour of the action seen first in values_map.
mc_opt_policy = {
    s: max(q_by_state[s].items(), key=lambda action_q: action_q[1])[0]
    for s in si_mdp.non_terminal_states
}
# Value of each state = Q-value of its greedy action, rounded to 3 decimals.
mc_opt_vf = {
    s: round(q_by_state[s][mc_opt_policy[s]], 3)
    for s in si_mdp.non_terminal_states
}

print("Value Function from Tabular Monte Carlo Control with GLIE:")
print("--" * 20)
pprint(mc_opt_vf)
print()
pprint(mc_opt_policy)

Value Function from Tabular Monte Carlo Control with GLIE:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.906,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.003,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.012,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.695,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.117,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.675}

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): 1,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): 0,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): 0,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): 1,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): 0,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): 1}
