## Solving the Simple Inventory Problem

In [1]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.abspath("."), os.pardir)))

from dataclasses import dataclass
import scipy.stats as ss
from pprint import pprint

from rl.distributions import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.dynamic_programming.policy_methods import (
    evaluate_mrp_result,
    policy_iteration_result,
)
from rl.dynamic_programming.value_methods import value_iteration_result

In [2]:
@dataclass(frozen=True)
class Inventory:
    """Immutable inventory state made of an on-hand and an on-order count.

    The dataclass is frozen, so instances are hashable and can be used as
    dictionary keys.
    """

    on_hand: int  # units currently on hand
    on_order: int  # units currently on order

    @property
    def total_inventory(self) -> int:
        """Return ``on_hand + on_order``."""
        return self.on_hand + self.on_order


class SimpleInventoryWithCapacityConstraintsMDP(
    FiniteMarkovDecisionProcess[Inventory, int]
):
    """Finite MDP for a capacity-limited inventory with Poisson demand.

    States are ``Inventory(on_hand, on_order)`` pairs satisfying
    ``on_hand + on_order <= capacity``.  The action taken in a state is the
    quantity to order, ranging from 0 up to the remaining capacity.
    """

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_costs: float,
        stockout_costs: float,
    ):
        """Store the parameters and build the full transition/reward map.

        Args:
            capacity: Upper bound on ``on_hand + on_order``.
            poisson_lambda: Mean of the Poisson demand distribution.
            holding_costs: Cost charged per unit on hand.
            stockout_costs: Cost charged per unit of expected unmet demand.

        Note:
            ``self.holding_costs`` and ``self.stockout_costs`` hold the
            *negated* arguments, so they can be used directly as rewards.
        """
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        # Costs are stored with a flipped sign: the MDP works with rewards.
        self.holding_costs: float = -holding_costs
        self.stockout_costs: float = -stockout_costs
        self.poisson_distr = ss.poisson(self.poisson_lambda)

        super().__init__(self.generate_action_transition_reward_map())

    def generate_action_transition_reward_map(
        self,
    ) -> dict[Inventory, dict[int, Categorical[tuple[Inventory, float]]]]:
        """Build the ``state -> action -> (next_state, reward)`` distribution map.

        For a state with total inventory ``t = on_hand + on_order`` and an
        order quantity ``order``:

        * every demand ``d < t`` leads to ``Inventory(t - d, order)`` with
          probability ``pmf(d)`` and reward ``holding_costs * on_hand``;
        * all demands ``d >= t`` are merged into the single next state
          ``Inventory(0, order)`` with probability ``P(demand >= t)``; its
          reward additionally charges the stockout cost on
          ``lambda - t * (1 - pmf(t) / P(demand >= t))``, which for Poisson
          demand equals ``E[demand - t | demand >= t]``.

        Returns:
            The nested mapping handed to the base-class constructor.
        """
        dist: dict[Inventory, dict[int, Categorical[tuple[Inventory, float]]]] = dict()

        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state = Inventory(on_hand=alpha, on_order=beta)
                total_inventory = state.total_inventory
                base_reward: float = self.holding_costs * alpha

                # Nothing below depends on the order quantity, so it is
                # computed once per state rather than once per action.
                demand_probs = [
                    self.poisson_distr.pmf(demand)
                    for demand in range(total_inventory)
                ]

                # P(demand >= total_inventory).  The survival function
                # evaluates the upper tail directly; ``1 - cdf`` rounds to
                # exactly 0.0 once the tail drops below machine epsilon,
                # which made the division below produce inf/NaN rewards.
                prob = self.poisson_distr.sf(total_inventory - 1)
                has_stockout = prob > 0.0
                if has_stockout:
                    reward = base_reward + self.stockout_costs * (
                        self.poisson_lambda
                        - total_inventory
                        * (1 - self.poisson_distr.pmf(total_inventory) / prob)
                    )

                sub_dist: dict[int, Categorical[tuple[Inventory, float]]] = dict()

                for order in range(self.capacity + 1 - total_inventory):
                    next_state_reward_dist = {
                        (
                            Inventory(on_hand=total_inventory - demand, on_order=order),
                            base_reward,
                        ): demand_prob
                        for demand, demand_prob in enumerate(demand_probs)
                    }

                    # A stockout branch whose probability underflowed to zero
                    # is omitted instead of being added with an undefined
                    # (0/0) reward.
                    if has_stockout:
                        next_state_reward_dist[
                            (Inventory(on_hand=0, on_order=order), reward)
                        ] = prob
                    sub_dist[order] = Categorical(next_state_reward_dist)

                dist[state] = sub_dist

        return dist

In [3]:
# Parameters of the demo problem; each is passed by keyword to the MDP
# constructor below.
capacity = 2
poisson_lambda = 1.0
holding_costs = 1.0
stockout_costs = 10.0

# Build the MDP.  The constructor generates the complete
# state -> action -> (next state, reward) map up front.
si_mdp = SimpleInventoryWithCapacityConstraintsMDP(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_costs=holding_costs,
    stockout_costs=stockout_costs,
)

In [4]:
# Deterministic policy that, in every state with on_hand + on_order <= capacity,
# orders capacity - (on_hand + on_order), i.e. tops the total inventory back up
# to capacity.
fdp = FiniteDeterministicPolicy(
    {
        Inventory(alpha, beta): capacity - (alpha + beta)
        for alpha in range(capacity + 1)
        for beta in range(capacity + 1 - alpha)
    }
)

In [5]:
# Apply the fixed policy `fdp` to the MDP, then pretty-print the result of
# evaluating the resulting process with gamma = 0.9 (one value per state).
implied_mrp = si_mdp.apply_finite_policy(fdp)
pprint(evaluate_mrp_result(mrp=implied_mrp, gamma=0.9))

{NonTerminal(state=Inventory(on_hand=0, on_order=0)): np.float64(-43.59563313047814),
 NonTerminal(state=Inventory(on_hand=0, on_order=1)): np.float64(-37.971111794412636),
 NonTerminal(state=Inventory(on_hand=0, on_order=2)): np.float64(-37.32849043566549),
 NonTerminal(state=Inventory(on_hand=1, on_order=0)): np.float64(-38.971111794412636),
 NonTerminal(state=Inventory(on_hand=1, on_order=1)): np.float64(-38.32849043566549),
 NonTerminal(state=Inventory(on_hand=2, on_order=0)): np.float64(-39.32849043566549)}


In [6]:
# Run policy iteration on the MDP with gamma = 0.9; the result unpacks into a
# value function and a policy.
optimal_vf, optimal_policy = policy_iteration_result(mdp=si_mdp, gamma=0.9)

# Show the per-state values, a blank separator line, then the policy.
pprint(optimal_vf)
print()
print(optimal_policy)

{NonTerminal(state=Inventory(on_hand=0, on_order=0)): np.float64(-43.59563313047814),
 NonTerminal(state=Inventory(on_hand=0, on_order=1)): np.float64(-37.971111794412636),
 NonTerminal(state=Inventory(on_hand=0, on_order=2)): np.float64(-37.32849043566549),
 NonTerminal(state=Inventory(on_hand=1, on_order=0)): np.float64(-38.971111794412636),
 NonTerminal(state=Inventory(on_hand=1, on_order=1)): np.float64(-38.32849043566549),
 NonTerminal(state=Inventory(on_hand=2, on_order=0)): np.float64(-39.32849043566549)}

For State Inventory(on_hand=0, on_order=0): Do Action 2
For State Inventory(on_hand=0, on_order=1): Do Action 1
For State Inventory(on_hand=0, on_order=2): Do Action 0
For State Inventory(on_hand=1, on_order=0): Do Action 1
For State Inventory(on_hand=1, on_order=1): Do Action 0
For State Inventory(on_hand=2, on_order=0): Do Action 0



In [7]:
# Run value iteration on the same MDP with gamma = 0.9.
# NOTE: this rebinds `optimal_vf` and `optimal_policy`, replacing the values
# assigned by the policy-iteration cell.
optimal_vf, optimal_policy = value_iteration_result(mdp=si_mdp, gamma=0.9)

# Show the per-state values, a blank separator line, then the policy.
pprint(optimal_vf)
print()
print(optimal_policy)

{NonTerminal(state=Inventory(on_hand=0, on_order=0)): np.float64(-43.59563313047815),
 NonTerminal(state=Inventory(on_hand=0, on_order=1)): np.float64(-37.97111179441265),
 NonTerminal(state=Inventory(on_hand=0, on_order=2)): np.float64(-37.3284904356655),
 NonTerminal(state=Inventory(on_hand=1, on_order=0)): np.float64(-38.97111179441265),
 NonTerminal(state=Inventory(on_hand=1, on_order=1)): np.float64(-38.3284904356655),
 NonTerminal(state=Inventory(on_hand=2, on_order=0)): np.float64(-39.3284904356655)}

For State Inventory(on_hand=0, on_order=0): Do Action 2
For State Inventory(on_hand=0, on_order=1): Do Action 1
For State Inventory(on_hand=0, on_order=2): Do Action 0
For State Inventory(on_hand=1, on_order=0): Do Action 1
For State Inventory(on_hand=1, on_order=1): Do Action 0
For State Inventory(on_hand=2, on_order=0): Do Action 0

