In [43]:
# import packages needed

import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Iterable, Callable, Mapping, TypeVar, List, Tuple, Optional
from rl.markov_decision_process import TransitionStep, Policy,MarkovDecisionProcess
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.returns import returns
from rl.function_approx import Tabular, FunctionApprox
from rl.dynamic_programming import policy_iteration_result
from rl.iterate import last
from collections import defaultdict
from copy import deepcopy

from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap, InventoryState
from rl.monte_carlo import mc_prediction, mc_control
from rl.td import td_prediction

from rl.distribution import Constant,Choose,Bernoulli,Distribution

In [44]:
# Generic type variables used in the annotations of this notebook:
# S annotates states and A annotates actions.
S = TypeVar('S')
A = TypeVar('A')

### Problem 1

In [96]:
# Tabular first
def policy_from_q(q: Mapping[Tuple[S,A],float],
                  mdp: MarkovDecisionProcess[S, A],
                  eps: float = 0.0) -> Policy[S, A]:
    """Build an epsilon-greedy policy on top of a tabular Q-function.

    Args:
        q: mapping from (state, action) pairs to action-value estimates.
            It is read every time ``act`` is called, so updates made to the
            mapping after this function returns are seen by the policy.
        mdp: provides ``is_terminal(s)`` and ``actions(s)``.
        eps: parameter handed to ``Bernoulli``; whenever a sample from it is
            truthy, ``act`` explores instead of acting greedily.

    Returns:
        A ``Policy`` whose ``act(s)`` returns
        * ``None`` if ``mdp.is_terminal(s)``;
        * ``Choose`` over ``mdp.actions(s)`` when exploring, or when ``q``
          holds no usable entry for ``s``;
        * ``Constant(a)`` otherwise, where ``a`` is the action with the
          largest recorded value for ``s`` (first one wins on ties).
    """
    explore = Bernoulli(eps)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            if explore.sample():
                return Choose(set(mdp.actions(s)))

            # Only (state, action) pairs already recorded in q compete for
            # the greedy choice; unrecorded pairs are not assumed to be 0.
            greedy = None
            max_q = float('-inf')  # builtin float: the np.Inf alias is gone in NumPy 2.0
            for (state, action), value in q.items():
                if state == s and value > max_q:
                    max_q = value
                    greedy = action

            if greedy is None:
                # Nothing is known about s yet, so there is no greedy action:
                # pick among the available actions instead of wrapping None
                # in Constant as if it were an action.
                return Choose(set(mdp.actions(s)))
            return Constant(greedy)

    return QPolicy()

def tab_mc_control(mdp: MarkovDecisionProcess[S, A],
                   states: Distribution[S],
                   weight_func: Callable[[int],float],
                   gamma: float,
                   tol: float = 1e-6,
                   maxIter: int = 1000) -> List[Mapping[Tuple[S,A],float]]:
    """Tabular Monte Carlo control with a decaying epsilon-greedy policy.

    Each iteration simulates one episode with the current policy, turns it
    into return steps with ``returns(trace, gamma, tol)`` and, for every step
    of the episode, moves the Q-value of the visited (state, action) pair
    towards the observed return:

        Q <- Q * (1 - w) + w * return,   w = weight_func(visit count)

    The first episode uses eps = 1; after episode ``n`` (0-based) the policy
    is rebuilt with eps = 1 / (n + 2).

    Args:
        mdp: process to simulate via ``mdp.simulate_actions``.
        states: distribution of start states passed to the simulation.
        weight_func: maps the visit count (starting at 1) of a
            (state, action) pair to the step size used for that update.
            It is called once per update.
        gamma: discount factor forwarded to ``returns``.
        tol: tolerance forwarded to ``returns``.
        maxIter: number of episodes to run.

    Returns:
        A list of ``maxIter + 1`` independent Q-tables: element 0 is the
        (empty) initial table and element ``i`` is the table after the
        ``i``-th episode.
    """
    curr_q = defaultdict(lambda: 0.0)
    # History holds copies: appending curr_q itself would make every entry an
    # alias of the same live table, i.e. all entries equal to the final one.
    q: List[Mapping[Tuple[S,A],float]] = [curr_q.copy()]
    # The policy reads the live table (not the history list).
    p: Policy[S,A] = policy_from_q(curr_q, mdp, 1.)
    occurrence = defaultdict(lambda: 0)

    for n in range(maxIter):
        trace: Iterable[TransitionStep[S, A]] = mdp.simulate_actions(states, p)
        for st in returns(trace, gamma, tol):
            key = (st.state, st.action)
            occurrence[key] += 1
            weight = weight_func(occurrence[key])
            curr_q[key] = curr_q[key]*(1 - weight) + weight*st.return_
        q.append(curr_q.copy())
        p = policy_from_q(curr_q, mdp, 1./(n+2))

    return q

In [97]:
# Test correctness: set up a small inventory MDP on which the Monte Carlo
# control result can be compared with policy iteration.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Passed as the gamma argument to both tab_mc_control and
# policy_iteration_result in the cells that follow.
user_gamma = 0.9

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

In [98]:
# Run tabular MC control: start states are drawn via Choose over the
# non-terminal states, and 1/n step sizes turn each Q-value into the running
# average of the returns observed for that (state, action) pair.
res_1 = tab_mc_control(
    mdp=si_mdp,
    states=Choose(si_mdp.non_terminal_states),
    weight_func=lambda n: 1./n,
    gamma=user_gamma,
)

In [99]:
# For every state, pick the action with the largest value in the final
# Q-table (res_1[-1]) and record that value as the state's value estimate.
# The "nothing seen yet" sentinel is -inf: a finite sentinel would silently
# leave a state without an action whenever all of its Q-values lie below it.
opt_policy = defaultdict(lambda:-1)
opt_vf = defaultdict(lambda: float('-inf'))
for k in res_1[-1]:
    # k is a (state, action) key of the Q-table.
    if opt_vf[k[0]] < res_1[-1][k]:
        opt_vf[k[0]] = res_1[-1][k]
        opt_policy[k[0]] = k[1]

In [100]:
# Reference solution from policy iteration. The comparison cell reads
# res_2[0][state] as the value and res_2[1].policy_map[state].value as the
# action.
res_2 = policy_iteration_result(si_mdp,user_gamma)

In [101]:
# Side-by-side comparison: Monte Carlo control (left) vs. policy iteration
# (right). Formatting is done entirely by the f-string, so a '%' inside the
# printed state cannot be misread as a format directive.
for k in opt_vf:
    print(f"State {k}: {opt_vf[k]:.4f} vs. {res_2[0][k]:.4f}")
for k in opt_policy:
    print(f"State {k}: Action {opt_policy[k]:d} vs. {res_2[1].policy_map[k].value:d}")

State InventoryState(on_hand=1, on_order=0): -28.9862 vs. -28.6610
State InventoryState(on_hand=0, on_order=0): -35.5641 vs. -34.8948
State InventoryState(on_hand=0, on_order=2): -28.4044 vs. -27.9919
State InventoryState(on_hand=0, on_order=1): -27.9477 vs. -27.6610
State InventoryState(on_hand=2, on_order=0): -30.4252 vs. -29.9919
State InventoryState(on_hand=1, on_order=1): -29.3745 vs. -28.9919
State InventoryState(on_hand=1, on_order=0): Action 1 vs. 1
State InventoryState(on_hand=0, on_order=0): Action 2 vs. 1
State InventoryState(on_hand=0, on_order=2): Action 0 vs. 0
State InventoryState(on_hand=0, on_order=1): Action 1 vs. 1
State InventoryState(on_hand=2, on_order=0): Action 0 vs. 0
State InventoryState(on_hand=1, on_order=1): Action 0 vs. 0


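Question:<br>
The greedy action matches policy iteration for every state except InventoryState(on_hand=0, on_order=0), where Monte Carlo control picks action 2 and policy iteration picks action 1. Why?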