Implement GLIE (Greedy in the Limit with Infinite Exploration) tabular Monte Carlo (MC) control
* Sample the $k$-th episode using $\pi$: $\{S_0, A_0, R_1, S_1, A_1, \ldots, R_T, S_T\} \sim \pi$
* For each state $S_t$ and action $A_t$ in the episode, apply the following updates at the end of the episode:
\begin{equation}
\begin{aligned}
\mathrm{Count}(S_t,A_t) &\leftarrow \mathrm{Count}(S_t,A_t) + 1 \\
Q(S_t,A_t) &\leftarrow Q(S_t,A_t)+\frac{1}{\mathrm{Count}(S_t,A_t)} \cdot (G_t - Q(S_t,A_t))
\end{aligned}
\end{equation}
* Improve policy at end of episode based on updated Q-Value function:
\begin{equation}
\begin{aligned}
\epsilon &\leftarrow \frac{1}{k} \\
\pi &\leftarrow \epsilon\text{-greedy}(Q)
\end{aligned}
\end{equation}

In [1]:
import sys
import os

# Resolve the parent of the current working directory, then put *its* parent
# on the import path -- presumably so the `rl` package imports below resolve
# when the notebook is run from inside the repository.
module_path = os.path.abspath('..')
sys.path.append(
    os.path.dirname(module_path)
)

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from typing import Sequence, Iterable, Callable
from rl.function_approx import AdamGradient
from rl.function_approx import FunctionApprox, LinearFunctionApprox
from rl.distribution import Choose
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter10.prediction_utils import (
    mc_prediction_learning_rate,
    td_prediction_learning_rate
)
import numpy as np
from itertools import islice

# Discount factor passed to the value-function display further down.
gamma: float = 0.9

# Parameters handed to the finite simple-inventory MRP constructor.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

# Build the MRP instance that the remaining cells operate on.
si_mrp: SimpleInventoryMRPFinite = SimpleInventoryMRPFinite(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost,
)

In [3]:
# Show the value function of `si_mrp` for discount factor `gamma`
# (the cell output is a mapping from InventoryState to a value).
si_mrp.display_value_function(gamma=gamma)

{InventoryState(on_hand=0, on_order=0): -35.511,
 InventoryState(on_hand=1, on_order=0): -28.932,
 InventoryState(on_hand=0, on_order=1): -27.932,
 InventoryState(on_hand=0, on_order=2): -28.345,
 InventoryState(on_hand=2, on_order=0): -30.345,
 InventoryState(on_hand=1, on_order=1): -29.345}


In [4]:
# Inspect the non-terminal states exposed by the MRP; these are reused below
# as the keys of the initial value-function dictionary.
si_mrp.non_terminal_states

[InventoryState(on_hand=0, on_order=0),
 InventoryState(on_hand=0, on_order=1),
 InventoryState(on_hand=0, on_order=2),
 InventoryState(on_hand=1, on_order=0),
 InventoryState(on_hand=1, on_order=1),
 InventoryState(on_hand=2, on_order=0)]

In [9]:
from typing import Sequence, Tuple, Mapping
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.chapter2.simple_inventory_mrp import InventoryState
from rl.chapter10.prediction_utils import (
    mc_finite_equal_wts_correctness,
    mc_finite_learning_rate_correctness,
    td_finite_learning_rate_correctness,
    compare_td_and_mc
)

In [10]:
# Initial value-function estimate: every non-terminal state starts at 0.0.
initial_vf_dict: Mapping[InventoryState, float] = dict.fromkeys(
    si_mrp.non_terminal_states,
    0.
)

In [11]:
# Display the initial value-function dictionary (every state maps to 0.0).
initial_vf_dict

{InventoryState(on_hand=0, on_order=0): 0.0,
 InventoryState(on_hand=0, on_order=1): 0.0,
 InventoryState(on_hand=0, on_order=2): 0.0,
 InventoryState(on_hand=1, on_order=0): 0.0,
 InventoryState(on_hand=1, on_order=1): 0.0,
 InventoryState(on_hand=2, on_order=0): 0.0}