## Bicycle Inventory (with capacity constraints): Finite Markov decision processes

In [None]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.abspath("."), os.pardir)))

from dataclasses import dataclass
import scipy.stats as ss

from rl.distributions import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy

In [2]:
@dataclass(frozen=True)
class Inventory:
    """Immutable (hashable) inventory state.

    Attributes are plain unit counts: ``on_hand`` is the stock currently
    available and ``on_order`` is the stock that has been ordered.
    """

    on_hand: int
    on_order: int

    @property
    def total_inventory(self) -> int:
        """Return the sum of ``on_hand`` and ``on_order``."""
        return self.on_hand + self.on_order


class SimpleInventoryWithCapacityConstraintsMDP(
    FiniteMarkovDecisionProcess[Inventory, int]
):
    """Finite MDP for a single-item inventory with a capacity limit.

    States are ``Inventory(on_hand, on_order)`` pairs whose total does not
    exceed ``capacity``.  Actions are integer order quantities that keep the
    total within ``capacity``.  Demand is drawn from a Poisson distribution
    with rate ``poisson_lambda``.

    The ``holding_costs`` and ``stockout_costs`` arguments are stored
    *negated* on the instance, so the attributes act directly as reward
    coefficients.
    """

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_costs: float,
        stockout_costs: float,
    ):
        """Store the parameters and build the full transition/reward map.

        Args:
            capacity: Upper bound on ``on_hand + on_order``.
            poisson_lambda: Rate of the Poisson demand distribution.
            holding_costs: Cost coefficient applied to the units on hand.
            stockout_costs: Cost coefficient applied to the stock-out term.
        """
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        # Negated on purpose: rewards are the negative of costs.
        self.holding_costs: float = -holding_costs
        self.stockout_costs: float = -stockout_costs
        self.poisson_distr = ss.poisson(self.poisson_lambda)

        super().__init__(self.generate_action_transition_reward_map())

    def generate_action_transition_reward_map(
        self,
    ) -> dict[Inventory, dict[int, Categorical[tuple[Inventory, float]]]]:
        """Build the ``state -> action -> Categorical[(next_state, reward)]`` map.

        For every state and every admissible order quantity, each demand
        smaller than the total inventory leads to the next state
        ``Inventory(total - demand, order)`` with the holding reward only.
        All remaining demand is lumped into one outcome that empties the
        stock, ``Inventory(0, order)``, and additionally carries the
        stock-out term.

        Returns:
            The nested mapping passed to the base-class constructor.
        """
        mapping: dict[Inventory, dict[int, Categorical[tuple[Inventory, float]]]] = {}
        demand_distr = self.poisson_distr

        for on_hand in range(self.capacity + 1):
            for on_order in range(self.capacity + 1 - on_hand):
                current = Inventory(on_hand=on_hand, on_order=on_order)
                position = current.total_inventory
                holding_reward: float = self.holding_costs * on_hand

                # Neither quantity depends on the order size, so compute
                # them once per state.
                stockout_prob = 1.0 - demand_distr.cdf(position - 1)
                shortfall_term = self.poisson_lambda - position * (
                    1 - demand_distr.pmf(position) / stockout_prob
                )
                stockout_reward = holding_reward + self.stockout_costs * shortfall_term

                by_order: dict[int, Categorical[tuple[Inventory, float]]] = {}
                for order in range(self.capacity + 1 - position):
                    outcomes = {}
                    # Demand strictly below the inventory position: no stock-out.
                    for demand in range(position):
                        successor = Inventory(
                            on_hand=position - demand, on_order=order
                        )
                        outcomes[(successor, holding_reward)] = demand_distr.pmf(
                            demand
                        )

                    # Demand at or above the inventory position: stock is emptied.
                    emptied = Inventory(on_hand=0, on_order=order)
                    outcomes[(emptied, stockout_reward)] = stockout_prob

                    by_order[order] = Categorical(outcomes)

                mapping[current] = by_order

        return mapping

In [34]:
# Model parameters for the capacity-constrained inventory MDP.
capacity = 5  # upper bound on on_hand + on_order
poisson_lambda = 1.0  # rate of the Poisson demand distribution
holding_cost = 1.0  # cost coefficient applied to units on hand
stockout_cost = 10.0  # cost coefficient applied to the stock-out term

# Building the MDP enumerates every (state, action) pair up front.
si_mdp = SimpleInventoryWithCapacityConstraintsMDP(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_costs=holding_cost,
    stockout_costs=stockout_cost,
)

In [None]:
# Build a randomly chosen deterministic policy: for every state, draw one
# order quantity from scipy's discrete uniform distribution on
# [0, capacity + 1 - (alpha + beta)), i.e. an order that keeps the total
# inventory within capacity.
# NOTE: no random_state is passed to rvs(), so the policy (and every result
# derived from it below) can differ between runs.
fdp = FiniteDeterministicPolicy(
    {
        Inventory(on_hand=alpha, on_order=beta): ss.randint(
            0, capacity + 1 - (alpha + beta)
        ).rvs()
        for alpha in range(capacity + 1)
        for beta in range(capacity + 1 - alpha)
    }
)

In [45]:
# Fix the policy in the MDP to obtain the process it implies
# (presumably a finite Markov reward process -- confirm in the rl package).
implied_mrp = si_mdp.apply_finite_policy(policy=fdp)

In [46]:
# Display the implied process; its representation lists the transition
# probabilities out of each state.
implied_mrp

From State Inventory(on_hand=0, on_order=0):
	To State Inventory(on_hand=0, on_order=2) with Probability 1.000
From State Inventory(on_hand=0, on_order=1):
	To State Inventory(on_hand=1, on_order=3) with Probability 0.368
	To State Inventory(on_hand=0, on_order=3) with Probability 0.632
From State Inventory(on_hand=0, on_order=2):
	To State Inventory(on_hand=2, on_order=2) with Probability 0.368
	To State Inventory(on_hand=1, on_order=2) with Probability 0.368
	To State Inventory(on_hand=0, on_order=2) with Probability 0.264
From State Inventory(on_hand=0, on_order=3):
	To State Inventory(on_hand=3, on_order=0) with Probability 0.368
	To State Inventory(on_hand=2, on_order=0) with Probability 0.368
	To State Inventory(on_hand=1, on_order=0) with Probability 0.184
	To State Inventory(on_hand=0, on_order=0) with Probability 0.080
From State Inventory(on_hand=0, on_order=4):
	To State Inventory(on_hand=4, on_order=0) with Probability 0.368
	To State Inventory(on_hand=3, on_order=0) with P

In [47]:
# Compute and display the stationary distribution over states of the
# implied process.
implied_mrp.compute_stationary_distribution()

State Inventory(on_hand=0, on_order=0): 0.136
State Inventory(on_hand=0, on_order=1): 0.006
State Inventory(on_hand=0, on_order=2): 0.190
State Inventory(on_hand=0, on_order=3): 0.004
State Inventory(on_hand=0, on_order=4): -0.000
State Inventory(on_hand=0, on_order=5): -0.000
State Inventory(on_hand=1, on_order=0): 0.136
State Inventory(on_hand=1, on_order=1): 0.014
State Inventory(on_hand=1, on_order=2): 0.075
State Inventory(on_hand=1, on_order=3): 0.002
State Inventory(on_hand=1, on_order=4): -0.000
State Inventory(on_hand=2, on_order=0): 0.133
State Inventory(on_hand=2, on_order=1): 0.028
State Inventory(on_hand=2, on_order=2): 0.075
State Inventory(on_hand=2, on_order=3): -0.000
State Inventory(on_hand=3, on_order=0): 0.114
State Inventory(on_hand=3, on_order=1): 0.028
State Inventory(on_hand=3, on_order=2): -0.000
State Inventory(on_hand=4, on_order=0): 0.061
State Inventory(on_hand=4, on_order=1): -0.000
State Inventory(on_hand=5, on_order=0): -0.000

In [48]:
# Print the reward_function_vector entry paired with each non-terminal state.
# `state.state` unwraps the underlying Inventory from the state wrapper.
for state, reward in zip(
    implied_mrp.non_terminal_states,
    implied_mrp.reward_function_vector,
):
    print(f"State {state.state}: reward = {reward:.3f}")

State Inventory(on_hand=0, on_order=0): reward = -10.000
State Inventory(on_hand=0, on_order=1): reward = -3.679
State Inventory(on_hand=0, on_order=2): reward = -1.036
State Inventory(on_hand=0, on_order=3): reward = -0.233
State Inventory(on_hand=0, on_order=4): reward = -0.043
State Inventory(on_hand=0, on_order=5): reward = -0.007
State Inventory(on_hand=1, on_order=0): reward = -4.679
State Inventory(on_hand=1, on_order=1): reward = -2.036
State Inventory(on_hand=1, on_order=2): reward = -1.233
State Inventory(on_hand=1, on_order=3): reward = -1.043
State Inventory(on_hand=1, on_order=4): reward = -1.007
State Inventory(on_hand=2, on_order=0): reward = -3.036
State Inventory(on_hand=2, on_order=1): reward = -2.233
State Inventory(on_hand=2, on_order=2): reward = -2.043
State Inventory(on_hand=2, on_order=3): reward = -2.007
State Inventory(on_hand=3, on_order=0): reward = -3.233
State Inventory(on_hand=3, on_order=1): reward = -3.043
State Inventory(on_hand=3, on_order=2): reward 

In [49]:
# Print the value-function entry (discount factor gamma = 0.9) paired with
# each non-terminal state.  These are values, not one-step rewards, so they
# are labelled accordingly.
# `state.state` unwraps the underlying Inventory from the state wrapper.
for state, value in zip(
    implied_mrp.non_terminal_states,
    implied_mrp.compute_value_function_vector(gamma=0.9),
):
    print(f"State {state.state}: value = {value:.3f}")

State Inventory(on_hand=0, on_order=0): reward = -38.625
State Inventory(on_hand=0, on_order=1): reward = -35.116
State Inventory(on_hand=0, on_order=2): reward = -31.805
State Inventory(on_hand=0, on_order=3): reward = -34.674
State Inventory(on_hand=0, on_order=4): reward = -34.369
State Inventory(on_hand=0, on_order=5): reward = -34.947
State Inventory(on_hand=1, on_order=0): reward = -39.845
State Inventory(on_hand=1, on_order=1): reward = -32.805
State Inventory(on_hand=1, on_order=2): reward = -33.717
State Inventory(on_hand=1, on_order=3): reward = -35.369
State Inventory(on_hand=1, on_order=4): reward = -35.947
State Inventory(on_hand=2, on_order=0): reward = -37.994
State Inventory(on_hand=2, on_order=1): reward = -36.674
State Inventory(on_hand=2, on_order=2): reward = -36.369
State Inventory(on_hand=2, on_order=3): reward = -36.947
State Inventory(on_hand=3, on_order=0): reward = -37.674
State Inventory(on_hand=3, on_order=1): reward = -37.369
State Inventory(on_hand=3, on_o