In [4]:
using POMDPs
using QuickPOMDPs
using Distributions

using Distributions: DiscreteUniform, pdf
using POMDPTools: Deterministic, SparseCat

# Capacity limits (in bikes).
const MAX_INVENTORY = 20   # largest stock level in the state space
const MAX_STORE = 10       # bikes beyond this count are charged the parking rate
const ORDER_SIZE = 5       # bikes delivered by one order (before capacity clipping)

# Per-period cost parameters.
const holding_cost_store = 2     # per bike counted as in-store at the end of the period
const holding_cost_parking = 4   # per bike above MAX_STORE at the end of the period
const order_cost = 20            # charged whenever action 1 is chosen
const stockout_penalty = 50      # per unit of demand that could not be served

# Demand per period: every integer 0..10 is equally likely.
const demand_dist = DiscreteUniform(0, 10)

"""
    stock_after_order(s, a)

Return the stock available for sale after applying action `a` in state `s`.
Action `1` adds `ORDER_SIZE` bikes, clipped so the total never exceeds
`MAX_INVENTORY`; any other action leaves the stock unchanged.
"""
stock_after_order(s, a) = a == 1 ? s + min(ORDER_SIZE, MAX_INVENTORY - s) : s

"""
    expected_lost_sales(stock)

Return the expected unmet demand given that the period ended with zero stock,
i.e. the mean of `d - stock` over all demands `d >= stock`, weighted by
`demand_dist`. Returns `0.0` when no demand value can reach `stock`.
"""
function expected_lost_sales(stock)
    mass = 0.0
    shortfall = 0.0
    for d in support(demand_dist)
        d >= stock || continue
        p = pdf(demand_dist, d)
        mass += p
        shortfall += p * (d - stock)
    end
    return mass > 0 ? shortfall / mass : 0.0
end

mdp = QuickMDP(
    states = 0:MAX_INVENTORY,
    actions = [0, 1],  # 0 = no order, 1 = order 5 bikes
    discount = 0.99,
    transition = function (s, a)
        stock = stock_after_order(s, a)

        # Build a proper distribution over next states. Every demand that is at
        # least `stock` leads to next state 0, so probabilities are accumulated
        # per distinct next state instead of listing state 0 several times.
        next_states = Int[]
        probs = Float64[]
        for d in support(demand_dist)
            sp = max(stock - d, 0)
            p = pdf(demand_dist, d)
            i = findfirst(==(sp), next_states)
            if i === nothing
                push!(next_states, sp)
                push!(probs, p)
            else
                probs[i] += p
            end
        end
        return SparseCat(next_states, probs)
    end,
    reward = function (s, a, sp)
        stock = stock_after_order(s, a)

        # The realised demand cannot be recovered from `sp` once the shelf is
        # empty (every demand >= stock gives sp == 0), so the penalty uses the
        # expected shortfall conditional on selling out. With sp > 0 all demand
        # was served and nothing is lost.
        lost_sales = sp == 0 ? expected_lost_sales(stock) : 0.0

        in_store = min(sp, MAX_STORE)
        in_parking = max(sp - MAX_STORE, 0)

        cost = 0.0  # Float64 throughout so the reward type does not depend on sp
        cost += a == 1 ? order_cost : 0
        cost += in_store * holding_cost_store + in_parking * holding_cost_parking
        cost += lost_sales * stockout_penalty

        return -cost  # negative cost = reward
    end,
    initialstate = Deterministic(10)
)

QuickMDP{Base.UUID("84491737-5c7e-4634-847a-10e44092a817"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, transition::var"#12#14", reward::var"#13#15", states::UnitRange{Int64}, actions::Vector{Int64}, discount::Float64, initialstate::Deterministic{Int64}}}((stateindex = Dict(5 => 6, 16 => 17, 7 => 8, 20 => 21, 12 => 13, 8 => 9, 17 => 18, 1 => 2, 19 => 20, 0 => 1…), isterminal = false, actionindex = Dict(0 => 1, 1 => 2), transition = var"#12#14"(), reward = var"#13#15"(), states = 0:20, actions = [0, 1], discount = 0.99, initialstate = Deterministic{Int64}(10)))

In [6]:
using POMDPTools

s = 10  # inventory level to inspect
a = 0   # action under inspection: 0 = no order (1 would place an order)

# Print every reachable next state with its probability and the reward received.
foreach(transition(mdp, s, a)) do (sp, p)
    r = reward(mdp, s, a, sp)
    println("From $s --(a=$a)--> $sp  | P=$p, Reward=$r")
end

From 10 --(a=0)--> 10  | P=0.09090909090909091, Reward=-20
From 10 --(a=0)--> 9  | P=0.09090909090909091, Reward=-18
From 10 --(a=0)--> 8  | P=0.09090909090909091, Reward=-16
From 10 --(a=0)--> 7  | P=0.09090909090909091, Reward=-14
From 10 --(a=0)--> 6  | P=0.09090909090909091, Reward=-12
From 10 --(a=0)--> 5  | P=0.09090909090909091, Reward=-10
From 10 --(a=0)--> 4  | P=0.09090909090909091, Reward=-8
From 10 --(a=0)--> 3  | P=0.09090909090909091, Reward=-6
From 10 --(a=0)--> 2  | P=0.09090909090909091, Reward=-4
From 10 --(a=0)--> 1  | P=0.09090909090909091, Reward=-2
From 10 --(a=0)--> 0  | P=0.09090909090909091, Reward=0


In [1]:
using POMDPs
using QuickPOMDPs
using POMDPTools
using TabularTDLearning
using Random

In [2]:
"""
    train_q_learning_agent(mdp, alpha, epsilon; iterations=10_000)

Train a tabular Q-learning agent on `mdp` and return `(policy, q_table)`.

# Arguments
- `mdp`: the problem to solve.
- `alpha::Float64`: learning rate passed to the solver.
- `epsilon::Float64`: exploration probability of the epsilon-greedy policy.

# Keywords
- `iterations`: number of training episodes.

# Returns
The policy returned by `solve` together with its `value_table` (the learned
Q-values).
"""
function train_q_learning_agent(mdp, alpha::Float64, epsilon::Float64; iterations=10_000)
    # QLearningSolver takes an exploration policy object rather than a bare
    # epsilon, and names its other options `n_episodes` / `learning_rate`.
    # No discount is passed: the solver is expected to read it from `mdp`
    # (NOTE(review): confirm against the installed TabularTDLearning version).
    ql = QLearningSolver(
        exploration_policy=EpsGreedyPolicy(mdp, epsilon),
        n_episodes=iterations,
        learning_rate=alpha,
    )
    policy = solve(ql, mdp)
    # The learned Q-values are stored on the returned policy.
    return policy, policy.value_table
end

train_q_learning_agent (generic function with 1 method)

In [None]:
"""
    evaluate_policy(mdp, policy; num_trials=100, max_steps=100)

Estimate the performance of `policy` on `mdp` by averaging the values returned
by `simulate` over `num_trials` independent rollouts.

# Keywords
- `num_trials`: number of rollouts to average; must be positive.
- `max_steps`: step limit handed to each `RolloutSimulator`.

# Throws
- `ArgumentError` if `num_trials` is not positive.
"""
function evaluate_policy(mdp, policy; num_trials=100, max_steps=100)
    num_trials > 0 ||
        throw(ArgumentError("num_trials must be positive, got $num_trials"))
    sim = RolloutSimulator(max_steps=max_steps)
    # Sum lazily and divide by the count: avoids materialising an intermediate
    # array and does not rely on `mean` (Statistics) being in scope.
    total = sum(simulate(sim, mdp, policy) for _ in 1:num_trials)
    return total / num_trials
end