# Description du MDP et exemple d'utilisation 

In [1]:
using POMDPs
using QuickPOMDPs
using Distributions
using Random
using POMDPTools: Deterministic

const MAX_INVENTORY = 20
const MAX_STORE = 10
const ORDER_SIZE = 5

const holding_cost_store = 2
const holding_cost_parking = 4
const order_cost = 20
const stockout_penalty = 50

const demand_dist = DiscreteUniform(0, 10)

mdp = QuickMDP(
    
    states = 0:MAX_INVENTORY,
    actions = [0, 1],
    discount = 0.99,

    gen = function (s, a, rng)
        order_qty = a == 1 ? min(ORDER_SIZE, MAX_INVENTORY - s) : 0
        new_stock = s + order_qty

        d = rand(rng, demand_dist)
        sold = min(d, new_stock)
        sp = new_stock - sold

        lost_sales = max(d - new_stock, 0)

        in_store = min(sp, MAX_STORE)
        in_parking = max(sp - MAX_STORE, 0)

        cost = 0
        cost += a == 1 ? order_cost : 0
        cost += in_store * holding_cost_store
        cost += in_parking * holding_cost_parking
        cost += lost_sales * stockout_penalty

        r = -cost

        return (sp, r, d)
    end,
    initialstate = Deterministic(10)
)


QuickMDP{Base.UUID("279547a2-79cc-4c13-8ce8-0dfca7e8c6bd"), Int64, Int64, @NamedTuple{stateindex::Dict{Int64, Int64}, isterminal::Bool, actionindex::Dict{Int64, Int64}, initialstate::Deterministic{Int64}, states::UnitRange{Int64}, actions::Vector{Int64}, discount::Float64, gen::var"#11#12"}}((stateindex = Dict(5 => 6, 16 => 17, 7 => 8, 20 => 21, 12 => 13, 8 => 9, 17 => 18, 1 => 2, 19 => 20, 0 => 1â€¦), isterminal = false, actionindex = Dict(0 => 1, 1 => 2), initialstate = Deterministic{Int64}(10), states = 0:20, actions = [0, 1], discount = 0.99, gen = var"#11#12"()))

In [3]:
using Random
rng = MersenneTwister(42)
s = 8
a = 1
for i in 1:10
    sp, r, d = POMDPs.gen(mdp, s, a, rng)
    println("From $s --(a=$a)--> $sp,  Demand: $d, Reward = $r")
end


From 8 --(a=1)--> 11,  Demand: 2, Reward = -44
From 8 --(a=1)--> 4,  Demand: 9, Reward = -28
From 8 --(a=1)--> 9,  Demand: 4, Reward = -38
From 8 --(a=1)--> 6,  Demand: 7, Reward = -32
From 8 --(a=1)--> 6,  Demand: 7, Reward = -32
From 8 --(a=1)--> 6,  Demand: 7, Reward = -32
From 8 --(a=1)--> 3,  Demand: 10, Reward = -26
From 8 --(a=1)--> 5,  Demand: 8, Reward = -30
From 8 --(a=1)--> 11,  Demand: 2, Reward = -44
From 8 --(a=1)--> 12,  Demand: 1, Reward = -48


In [None]:
import Pkg; Pkg.add("TabularTDLearning")
using POMDPs
using QuickPOMDPs
using POMDPTools
using TabularTDLearning
using Random

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\mathj\.julia\environments\v1.11\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\mathj\.julia\environments\v1.11\Manifest.toml`


In [None]:
function train_q_learning_agent(mdp, alpha::Float64, epsilon::Float64; iterations=10_000)
    ql = QLearningSolver(
        max_iterations=iterations,
        alpha=alpha,
        epsilon=epsilon,
        gamma=mdp.discount
    )
    policy = solve(ql, mdp)
    return policy, ql.q
end

train_q_learning_agent (generic function with 1 method)

In [None]:
function evaluate_policy(mdp, policy; num_trials=100, max_steps=100)
    sim = RolloutSimulator(max_steps=max_steps)
    rewards = [simulate(sim, mdp, policy) for _ in 1:num_trials]
    return mean(rewards)
end

evaluate_policy (generic function with 1 method)

# The value iteration algorithm to find the optimal policy

In [None]:
function value_iteration_gen(mdp; Î³=POMDPs.discount(mdp), Î¸=1e-3, max_iter=100000, n_samples=100)
    states = collect(POMDPs.states(mdp))
    actions_per_state = Dict(s => collect(POMDPs.actions(mdp, s)) for s in states)

    V = Dict(s => 0.0 for s in states)
    Ï€ = Dict(s => first(actions_per_state[s]) for s in states)

    rng = Random.MersenneTwister(42)  # fixe pour reproductibilitÃ©

    for iter in 1:max_iter
        Î” = 0.0
        V_new = copy(V)

        for s in states
            v_old = V[s]
            best_value = -Inf
            best_action = nothing

            for a in actions_per_state[s]
                total = 0.0
                for _ in 1:n_samples
                    sp, r, _ = POMDPs.gen(mdp, s, a, rng)
                    total += r + Î³ * V[sp]
                end
                value = total / n_samples

                if value > best_value
                    best_value = value
                    best_action = a
                end
            end

            V_new[s] = best_value
            Ï€[s] = best_action
            Î” = max(Î”, abs(v_old - best_value))
        end

        V = V_new
        if Î” < Î¸
            println("Convergence atteinte (Î” < Î¸ = $Î¸).")
            break
        end
    end

    return V, Ï€
end


value_iteration_gen (generic function with 1 method)

In [None]:
V_opt, Ï€_opt = value_iteration_gen(mdp)
for s in sort(collect(keys(Ï€_opt)))
    println("Stock $s â†’ Action optimale : ", Ï€_opt[s] == 1 ? "Commander" : "Ne rien faire")
end


Stock 0 â†’ Action optimale : Commander
Stock 1 â†’ Action optimale : Commander
Stock 2 â†’ Action optimale : Commander
Stock 3 â†’ Action optimale : Commander
Stock 4 â†’ Action optimale : Commander
Stock 5 â†’ Action optimale : Commander
Stock 6 â†’ Action optimale : Commander
Stock 7 â†’ Action optimale : Commander
Stock 8 â†’ Action optimale : Commander
Stock 9 â†’ Action optimale : Commander
Stock 10 â†’ Action optimale : Commander
Stock 11 â†’ Action optimale : Commander
Stock 12 â†’ Action optimale : Commander
Stock 13 â†’ Action optimale : Commander
Stock 14 â†’ Action optimale : Ne rien faire
Stock 15 â†’ Action optimale : Ne rien faire
Stock 16 â†’ Action optimale : Ne rien faire
Stock 17 â†’ Action optimale : Ne rien faire
Stock 18 â†’ Action optimale : Ne rien faire
Stock 19 â†’ Action optimale : Ne rien faire
Stock 20 â†’ Action optimale : Ne rien faire


# The linear programming formulation to find the optimal policy

In [4]:
using JuMP
using GLPK

states = 0:MAX_INVENTORY
actions = [0, 1]
Î³ = discount(mdp)

function transition(s, a)
    outcomes = []
    order_qty = a == 1 ? min(ORDER_SIZE, MAX_INVENTORY - s) : 0
    new_stock = s + order_qty

    for d in support(demand_dist)
        sold = min(d, new_stock)
        sp = new_stock - sold
        prob = pdf(demand_dist, d)
        push!(outcomes, (sp, prob, d))
    end
    return outcomes
end

function reward(s, a, sp, d)
    order_qty = a == 1 ? min(ORDER_SIZE, MAX_INVENTORY - s) : 0
    new_stock = s + order_qty

    lost_sales = max(d - new_stock, 0)
    in_store = min(sp, MAX_STORE)
    in_parking = max(sp - MAX_STORE, 0)

    cost = 0
    cost += a == 1 ? order_cost : 0
    cost += in_store * holding_cost_store
    cost += in_parking * holding_cost_parking
    cost += lost_sales * stockout_penalty

    return -cost
end


reward (generic function with 1 method)

In [6]:
model = Model(GLPK.Optimizer)
@variable(model, v[s in states])  # v(s) pour chaque Ã©tat

# Contraintes : Bellman optimality
for s in states
    for a in actions
        expected_value = 0.0
        for (sp, prob, d) in transition(s, a)
            r = reward(s, a, sp, d)
            expected_value += prob * (r + Î³ * v[sp])
        end
        @constraint(model, v[s] â‰¥ expected_value)
    end
end

# Objectif : minimiser somme pondÃ©rÃ©e (Î±(s) = 1 âˆ€s)
@objective(model, Min, sum(v[s] for s in states))

optimize!(model)

# Valeur optimale pour chaque Ã©tat
V_lp = Dict(s => JuMP.value(v[s]) for s in states)



Dict{Int64, Float64} with 21 entries:
  5  => -4880.4
  16 => -4810.99
  20 => -4810.05
  12 => -4829.17
  8  => -4845.59
  17 => -4809.17
  1  => -4982.53
  19 => -4808.83
  0  => -5017.85
  6  => -4865.76
  11 => -4830.99
  9  => -4838.93
  14 => -4818.93
  3  => -4923.22
  7  => -4854.32
  4  => -4899.66
  13 => -4825.59
  15 => -4814.15
  2  => -4950.9
  â‹®  => â‹®

In [7]:
Ï€_lp = Dict()

for s in states
    best_value = -Inf
    best_action = nothing
    for a in actions
        total = 0.0
        for (sp, prob, d) in transition(s, a)
            r = reward(s, a, sp, d)
            total += prob * (r + Î³ * V_lp[sp])
        end
        if total > best_value
            best_value = total
            best_action = a
        end
    end
    Ï€_lp[s] = best_action
end

println("ðŸ“Š Politique optimale obtenue par programmation linÃ©aire :")
for s in states
    println("Stock $s â†’ ", Ï€_lp[s] == 1 ? "Commander" : "Ne rien faire")
end


ðŸ“Š Politique optimale obtenue par programmation linÃ©aire :
Stock 0 â†’ Commander
Stock 1 â†’ Commander
Stock 2 â†’ Commander
Stock 3 â†’ Commander
Stock 4 â†’ Commander
Stock 5 â†’ Commander
Stock 6 â†’ Commander
Stock 7 â†’ Commander
Stock 8 â†’ Commander
Stock 9 â†’ Commander
Stock 10 â†’ Commander
Stock 11 â†’ Commander
Stock 12 â†’ Commander
Stock 13 â†’ Ne rien faire
Stock 14 â†’ Ne rien faire
Stock 15 â†’ Ne rien faire
Stock 16 â†’ Ne rien faire
Stock 17 â†’ Ne rien faire
Stock 18 â†’ Ne rien faire
Stock 19 â†’ Ne rien faire
Stock 20 â†’ Ne rien faire
