In [30]:
# import packages needed
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import (Iterable, Callable, Mapping, TypeVar, 
                    List, Tuple, Optional,Sequence, Iterator)
from rl.markov_process import MarkovRewardProcess
from rl.markov_decision_process import Policy,MarkovDecisionProcess,FinitePolicy
from rl.function_approx import Tabular, FunctionApprox
from rl.dynamic_programming import policy_iteration_result,value_iteration_result
from rl.iterate import last,iterate
from rl.approximate_dynamic_programming import evaluate_mrp, value_iteration

from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap, InventoryState
from rl.chapter7.asset_alloc_discrete import AssetAllocDiscrete 
from rl.monte_carlo import mc_prediction, mc_control
from rl.td import td_prediction

from rl.distribution import Constant,Distribution,Choose

In [31]:
# Generic type parameters for the signatures in this notebook:
# S stands for the state type and A for the action type.
# The string passed to TypeVar must match the name it is bound to;
# type checkers reject a mismatch and error messages show the string.
S = TypeVar('S')
A = TypeVar('A')

## Problem 2

In [32]:
def policy_from_v(
        v: FunctionApprox[S],
        mdp: MarkovDecisionProcess[S, A],
        gamma: float = 1.0
) -> Policy[S, A]:
    '''Return the policy that acts greedily with respect to a V function.

    In each non-terminal state the policy picks the action with the largest
    one-step lookahead value E[r + gamma * v(s')], where (s', r) is
    distributed according to mdp.step(s, a).

    Arguments:
      v -- approximation of the V function for the MDP
      mdp -- the process for which we're generating a policy
      gamma -- discount factor applied to the value of the next state; the
               default of 1.0 keeps existing two-argument calls working
    Returns a greedy policy based on the given V function.
    '''

    def lookahead(s: S, a: A) -> float:
        '''Expected immediate reward plus discounted value of the successor.'''
        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + gamma * v.evaluate([s1]).item()

        return mdp.step(s, a).expectation(return_)

    class VPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            # v maps states (not actions) to values, so actions have to be
            # ranked through the states they lead to; calling v.argmax on
            # the action set would evaluate v on the actions themselves.
            action = max(mdp.actions(s), key=lambda a: lookahead(s, a))
            return Constant(action)

    return VPolicy()

def policy_iteration(
    mdp: MarkovDecisionProcess[S, A],
    gamma: float,
    fa: FunctionApprox[S],
    non_terminal: Distribution[S],
    num_state: int = 1000,
    num_eval_sweeps: int = 20) -> Iterator[Tuple[FunctionApprox[S], Policy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy.

    Arguments:
      mdp -- the process to solve
      gamma -- discount factor
      fa -- initial approximation of the value function
      non_terminal -- distribution from which non-terminal states are
                      sampled for each evaluation sweep
      num_state -- number of states sampled per evaluation sweep
      num_eval_sweeps -- number of evaluate_mrp updates performed for each
                         policy before it is improved (truncated policy
                         evaluation, warm-started from the previous V)

    Returns an endless iterator of (value function, greedy policy) pairs
    whose first element is (fa, policy greedy w.r.t. fa). Callers must
    truncate it (e.g. with itertools.islice) before taking the last element.

    Raises ValueError if num_eval_sweeps is negative.
    '''
    if num_eval_sweeps < 0:
        raise ValueError("num_eval_sweeps must be non-negative")

    def update(vf_policy: Tuple[FunctionApprox[S], Policy[S, A]])-> Tuple[FunctionApprox[S], Policy[S, A]]:

        vf, pi = vf_policy
        mrp: MarkovRewardProcess[S] = mdp.apply_policy(pi)
        # evaluate_mrp keeps producing refined approximations with no
        # stopping rule of its own, so `last` on the raw stream never
        # returns. Cut it off after a fixed number of sweeps; the "+ 1"
        # accounts for the stream starting with `vf` itself
        # (NOTE(review): confirm against rl.iterate.iterate).
        sweeps = itertools.islice(
            evaluate_mrp(mrp, gamma, vf, non_terminal, num_state),
            num_eval_sweeps + 1
        )
        policy_vf: FunctionApprox[S] = last(sweeps)
        improved_pi: Policy[S, A] = policy_from_v(policy_vf, mdp, gamma)

        return policy_vf, improved_pi

    pi_0 = policy_from_v(fa, mdp, gamma)
    return iterate(update, (fa, pi_0))

In [34]:
# Parameters of the capacity-constrained inventory problem.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor shared by every solver below.
user_gamma = 0.9

# Annotated with the concrete class that is imported at the top of the
# notebook. Module-level annotations are evaluated at run time, and
# FiniteMarkovDecisionProcess is never imported here, so using it as the
# annotation raises NameError on a fresh kernel.
si_mdp: SimpleInventoryMDPCap =\
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

# Deterministic policy over every state whose two components sum to at most
# the capacity: the action is the remaining room, capacity - (alpha + beta).
fdp: FinitePolicy[InventoryState, int] = FinitePolicy(
    {InventoryState(alpha, beta):
     Constant(user_capacity - (alpha + beta)) for alpha in
     range(user_capacity + 1) for beta in range(user_capacity + 1 - alpha)}
)

# Exact (tabular) dynamic programming results, used as the reference that
# the approximate methods are compared against.
print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_pi, opt_policy_pi = policy_iteration_result(
    si_mdp,
    gamma=user_gamma
)
print(opt_vf_pi)
print(opt_policy_pi)
print()

print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_vi)
print(opt_policy_vi)
print()


MDP Policy Iteration Optimal Value Function and Optimal Policy
--------------
{InventoryState(on_hand=0, on_order=0): -34.89484641847035, InventoryState(on_hand=0, on_order=1): -27.660950868477816, InventoryState(on_hand=0, on_order=2): -27.991890728243845, InventoryState(on_hand=1, on_order=0): -28.660950868477816, InventoryState(on_hand=1, on_order=1): -28.991890728243845, InventoryState(on_hand=2, on_order=0): -29.991890728243845}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 with Probability 1.000


MDP Value Iteration Optimal Value Function a

In [None]:
# The approximate solvers hand back endless iterators of successive
# approximations, so `last` must only ever see a truncated stream.
# The "+ 1" in each slice accounts for the stream starting with the
# initial approximation before any update has been applied.
NUM_STATE_SAMPLES = 10       # states sampled per sweep
NUM_VI_SWEEPS = 50           # value-iteration updates to run
NUM_PI_IMPROVEMENTS = 5      # policy-improvement steps to run

fa = Tabular()
nt = Choose(si_mdp.non_terminal_states)

# Bind each result to a name: only the final expression of a cell is
# displayed, so an unbound intermediate result would be silently dropped.
approx_vf_vi = last(itertools.islice(
    value_iteration(si_mdp, user_gamma, fa, nt, NUM_STATE_SAMPLES),
    NUM_VI_SWEEPS + 1
))
print("Approximate Value Iteration Value Function")
print("--------------")
print(approx_vf_vi)
print()

approx_vf_pi, approx_policy_pi = last(itertools.islice(
    policy_iteration(si_mdp, user_gamma, fa, nt, NUM_STATE_SAMPLES),
    NUM_PI_IMPROVEMENTS + 1
))
print("Approximate Policy Iteration Value Function")
print("--------------")
print(approx_vf_pi)
print()

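Observation: calling `last(...)` directly on the approximate value-iteration or policy-iteration iterator is very slow and never returns a result. Why? Both functions yield an endless stream of successively refined approximations, so `last` has no final element to stop on; the stream has to be truncated (for example with `itertools.islice`) or run through a convergence test before its final element is taken.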