In [3]:
using POMDPs
using QuickPOMDPs
using Distributions

using Distributions: DiscreteUniform, pdf
using POMDPTools: Deterministic, SparseCat

# Capacity limits (in bikes).
# MAX_INVENTORY bounds the state space and caps how much an order can add.
const MAX_INVENTORY = 20
# Units up to MAX_STORE are costed at the store holding rate; any excess is
# costed at the parking holding rate.
const MAX_STORE = 10
# Largest number of bikes a single order action adds to the stock.
const ORDER_SIZE = 5

# Cost parameters; the reward function returns the negated sum of these costs.
const holding_cost_store = 2     # per unit of next-state inventory held in the store
const holding_cost_parking = 4   # per unit of next-state inventory held in parking
const order_cost = 20            # charged whenever the order action (a == 1) is taken
const stockout_penalty = 50      # per unit of lost sales

# Demand is uniform over the integers 0 through 10.
const demand_dist = DiscreteUniform(0, 10)

"""
    stock_after_order(s, a)

Return the stock on hand after applying action `a` in state `s`.

Action `1` adds up to `ORDER_SIZE` units, truncated so the total never exceeds
`MAX_INVENTORY`; any other action adds nothing.
"""
stock_after_order(s, a) = s + (a == 1 ? min(ORDER_SIZE, MAX_INVENTORY - s) : 0)

"""
    expected_lost_sales(new_stock)

Return the expected number of unmet demand units, conditional on demand having
been at least `new_stock` (i.e. on the next state being 0).

Returns `0.0` when no demand value in the support of `demand_dist` can reach
`new_stock`.
"""
function expected_lost_sales(new_stock)
    shortfall_mass = 0.0   # P(demand >= new_stock)
    shortfall_units = 0.0  # E[(demand - new_stock) * 1{demand >= new_stock}]
    for d in support(demand_dist)
        d >= new_stock || continue
        p = pdf(demand_dist, d)
        shortfall_mass += p
        shortfall_units += p * (d - new_stock)
    end
    return shortfall_mass > 0 ? shortfall_units / shortfall_mass : 0.0
end

# Inventory-control MDP: the state is the stock level, the action is whether to
# place an order, and the reward is the negated ordering, holding and stockout cost.
mdp = QuickMDP(
    states = 0:MAX_INVENTORY,
    actions = [0, 1],  # 0 = no order, 1 = order up to ORDER_SIZE bikes
    discount = 0.99,
    transition = function (s, a)
        new_stock = stock_after_order(s, a)

        # Every demand >= new_stock leads to next state 0, so probabilities are
        # accumulated per next state instead of listing that state repeatedly.
        next_probs = Dict{Int,Float64}()
        for d in support(demand_dist)
            next_state = new_stock - min(d, new_stock)
            prob = pdf(demand_dist, d)
            next_probs[next_state] = get(next_probs, next_state, 0.0) + prob
        end

        # Highest stock first, matching the order in which demand 0, 1, 2, ...
        # produces next states.
        next_states = sort!(collect(keys(next_probs)); rev = true)
        return SparseCat(next_states, [next_probs[sp] for sp in next_states])
    end,
    reward = function (s, a, sp)
        new_stock = stock_after_order(s, a)

        in_store = min(sp, MAX_STORE)
        in_parking = max(sp - MAX_STORE, 0)

        # Demand cannot be recovered from (s, a, sp) once the stock hits zero, so
        # the penalty uses the expected shortfall given that the stock ran out.
        # With sp > 0 all demand was served and nothing was lost.
        lost_sales = sp == 0 ? expected_lost_sales(new_stock) : 0.0

        cost = 0.0
        cost += a == 1 ? order_cost : 0
        cost += in_store * holding_cost_store + in_parking * holding_cost_parking
        cost += lost_sales * stockout_penalty

        return -cost  # negative cost = reward
    end,
    initialstate = Deterministic(10)
)


QuickMDP{Base.UUID("6f4fc768-1c24-44aa-8d4c-d5ea7aa3a9db"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, transition::var"#15#17", reward::var"#16#18", states::UnitRange{Int64}, actions::Vector{Int64}, discount::Float64, initialstate::Deterministic{Int64}}}((stateindex = Dict(5 => 6, 16 => 17, 7 => 8, 20 => 21, 12 => 13, 8 => 9, 17 => 18, 1 => 2, 19 => 20, 0 => 1…), isterminal = false, actionindex = Dict(0 => 1, 1 => 2), transition = var"#15#17"(), reward = var"#16#18"(), states = 0:20, actions = [0, 1], discount = 0.99, initialstate = Deterministic{Int64}(10)))

In [4]:
using POMDPTools

s = 14  # inventory level to inspect
a = 1   # 1 = place an order

# Print every (next state, probability) outcome of taking `a` in `s`, together
# with the reward attached to that outcome.
foreach(transition(mdp, s, a)) do (sp, p)
    r = reward(mdp, s, a, sp)
    println("From $s --(a=$a)--> $sp  | P=$p, Reward=$r")
end


From 14 --(a=1)--> 19  | P=0.09090909090909091, Reward=-76
From 14 --(a=1)--> 18  | P=0.09090909090909091, Reward=-72
From 14 --(a=1)--> 17  | P=0.09090909090909091, Reward=-68
From 14 --(a=1)--> 16  | P=0.09090909090909091, Reward=-64
From 14 --(a=1)--> 15  | P=0.09090909090909091, Reward=-60
From 14 --(a=1)--> 14  | P=0.09090909090909091, Reward=-56
From 14 --(a=1)--> 13  | P=0.09090909090909091, Reward=-52
From 14 --(a=1)--> 12  | P=0.09090909090909091, Reward=-48
From 14 --(a=1)--> 11  | P=0.09090909090909091, Reward=-44
From 14 --(a=1)--> 10  | P=0.09090909090909091, Reward=-40
From 14 --(a=1)--> 9  | P=0.09090909090909091, Reward=-38
