## Multi-Caregiver Crying Baby
Partially Observable Markov Games

In [1]:
import Pkg

# Install the solver packages on first use.
# `Pkg.installed()` is deprecated (presumably the source of the Pkg.jl warnings this
# cell prints); the active project's direct dependencies are the supported replacement.
for pkg in ("JuMP", "Ipopt")
    haskey(Pkg.project().dependencies, pkg) || Pkg.add(pkg)
end

using JuMP, Ipopt, Random

└ @ Pkg C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.7\Pkg\src\Pkg.jl:595
└ @ Pkg C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.7\Pkg\src\Pkg.jl:595


## Init

In [2]:

# Labels for actions, observations and states.
# Declared `const` so that functions reading these globals stay type-stable.
const SING = "SING"
const CRYING = "CRYING"
const QUIET = "QUIET"
const FEED = "FEED"
const SATED = "SATED"
const HUNGRY = "HUNGRY"

# Probability that a hungry baby cries when at least one caregiver sings.
const p_cry_when_hungry_in_sing = 0.9
# Probability that a hungry baby cries when nobody sings.
const p_cry_when_hungry = 0.9
# Probability that a sated baby cries when nobody sings.
const p_cry_when_not_hungry = 0.0
# Probability that a sated, unfed baby becomes hungry in one step.
const p_become_hungry = 0.5

# Penalty both caregivers incur while the baby is hungry.
const r_hungry = 10.0
# Base cost of singing (halved for the second caregiver).
const r_sing = 0.5
# Base cost of feeding (halved for the first caregiver).
const r_feed = 5.0

"""
    POMG(γ, ℐ, 𝒮, 𝒜, 𝒪, T, O, R)

Partially observable Markov game.

The field names are the Unicode symbols the rest of this file accesses (`𝒫.γ`, `𝒫.ℐ`,
`𝒫.𝒮`, ...). `𝒜` and `𝒪` hold one collection per agent; joint actions and joint
observations are formed from them with `joint`.
"""
struct POMG
    γ  # discount factor
    ℐ  # agents
    𝒮  # state space
    𝒜  # per-agent action sets (joint action space is their product)
    𝒪  # per-agent observation sets (joint observation space is their product)
    T  # transition function, called as T(s, a, s′)
    O  # joint observation function, called as O(a, s′, o)
    R  # joint reward function, called as R(s, a)
end

"""
    ConditionalPlan(a, subplans)

Tree-shaped plan for one agent: take action `a` now, then continue with the subplan
stored under whichever observation is received.
"""
struct ConditionalPlan
    a         # action taken at the root of the plan
    subplans  # dictionary: observation => ConditionalPlan to follow next
end

"""
    SimpleGame(γ, ℐ, 𝒜, R)

Single-shot game. The field names are the Unicode symbols the rest of this file
accesses (`𝒫.ℐ`, `𝒫.𝒜`, `𝒫.R`).
"""
struct SimpleGame
    γ  # discount factor
    ℐ  # agents
    𝒜  # per-agent action sets (joint action space is their product)
    R  # joint reward function, called as R(a) with a joint action
end

"""
    NashEquilibrium()

Field-less marker type used to select the Nash-equilibrium method of `solve`.
"""
struct NashEquilibrium
end

"""
    SimpleGamePolicy(p)

Stochastic policy of one agent in a simple game (structure follows the book
*Algorithms for Decision Making*).

`p` may be
- a generator of `action => weight` pairs,
- a `Dict` mapping actions to weights, or
- any other value, taken as a single action that is played with probability 1.

Weights are normalised so the stored probabilities sum to one. They may be of any
numeric type: integer weights are divided into floating-point probabilities rather
than being normalised in place.
"""
struct SimpleGamePolicy
    p # dictionary mapping actions to probabilities

    # Build the policy from a generator of `action => weight` pairs.
    function SimpleGamePolicy(p::Base.Generator)
        return SimpleGamePolicy(Dict(p))
    end

    # Build the policy from a dictionary of weights, normalising them.
    function SimpleGamePolicy(p::Dict)
        total = sum(values(p))
        # Building a new Dict (instead of `./=` on the collected values) lets
        # integer weights become fractional probabilities.
        return new(Dict(k => v / total for (k, v) in p))
    end

    # Deterministic policy that always plays `ai`.
    SimpleGamePolicy(ai) = new(Dict(ai => 1.0))
end

# Leaf plan: take action `a` and carry no subplans.
ConditionalPlan(a) = ConditionalPlan(a, Dict())

# Calling a plan with no argument returns its root action;
# calling it with an observation returns the subplan stored for that observation.
(π::ConditionalPlan)() = π.a
(π::ConditionalPlan)(o) = π.subplans[o]

## Transition

In [3]:
"""
    transition(s, a, s′)

Probability of moving from state `s` to state `s′` under joint action `a`
(indexed as `a[1]`, `a[2]`).

- If either caregiver feeds, the baby is sated with certainty.
- Otherwise a hungry baby stays hungry.
- Otherwise a sated baby becomes hungry with probability `p_become_hungry`.
"""
function transition(s, a, s′)
    # Regardless of the current state, feeding makes the baby sated.
    if a[1] == "FEED" || a[2] == "FEED"
        return s′ == "SATED" ? 1.0 : 0.0
    end

    # Nobody fed: a hungry baby remains hungry.
    if s == "HUNGRY"
        return s′ == "HUNGRY" ? 1.0 : 0.0
    end

    # Nobody fed and the baby was not hungry: it becomes hungry with a fixed probability.
    return s′ == "SATED" ? 1.0 - p_become_hungry : p_become_hungry
end

transition (generic function with 1 method)

## Observation

In [4]:
"""
    joint_observation(a, s′, o)

Probability of the joint observation `o` (indexed as `o[1]`, `o[2]`) given joint
action `a` and resulting state `s′`.

Both caregivers always observe the same thing, so any joint observation other than
both `"CRYING"` or both `"QUIET"` has probability zero. The crying probability is

- `p_cry_when_hungry_in_sing` if the baby is hungry and at least one caregiver sang,
- `p_cry_when_hungry` if the baby is hungry and nobody sang,
- `0.0` if the baby is not hungry and at least one caregiver sang,
- `p_cry_when_not_hungry` if the baby is not hungry and nobody sang.
"""
function joint_observation(a, s′, o)
    both_crying = o[1] == "CRYING" && o[2] == "CRYING"
    both_quiet = o[1] == "QUIET" && o[2] == "QUIET"

    # The caregivers never observe different things.
    (both_crying || both_quiet) || return 0.0

    sang = a[1] == "SING" || a[2] == "SING"
    p_cry = if s′ == "HUNGRY"
        sang ? p_cry_when_hungry_in_sing : p_cry_when_hungry
    else
        # Otherwise the baby is sated; it never cries while being sung to.
        sang ? 0.0 : p_cry_when_not_hungry
    end

    return both_crying ? p_cry : 1.0 - p_cry
end

joint_observation (generic function with 1 method)

## Reward

In [5]:
"""
    joint_reward(s, a)

Two-element reward vector (one entry per caregiver) for state `s` and joint action `a`.

Each caregiver pays `r_hungry` while the baby is hungry. The first caregiver pays half
of `r_feed` for feeding and the full `r_sing` for singing; the second pays half of
`r_sing` for singing and the full `r_feed` for feeding. Any other action is free.
"""
function joint_reward(s, a)
    rewards = zeros(2)

    # A hungry baby is equally bad for both caregivers.
    if s == "HUNGRY"
        rewards .-= r_hungry
    end

    # Caregiver 1 prefers feeding: feeding is charged at half price.
    first_action = a[1]
    if first_action == "FEED"
        rewards[1] -= r_feed / 2.0
    elseif first_action == "SING"
        rewards[1] -= r_sing
    end

    # Caregiver 2 prefers singing: singing is charged at half price.
    second_action = a[2]
    if second_action == "SING"
        rewards[2] -= r_sing / 2
    elseif second_action == "FEED"
        rewards[2] -= r_feed
    end

    return rewards
end

joint_reward (generic function with 1 method)

## Evaluating Conditional Plans

In [6]:
"""
    lookahead(𝒫::POMG, U, s, a)

One-step lookahead used by `evaluate_plan`: the immediate reward `R(s, a)` plus the
discounted expectation of `U(o, s′)` over next states `s′` (weighted by `T(s, a, s′)`)
and joint observations `o` (weighted by `O(a, s′, o)`).
"""
function lookahead(𝒫::POMG, U, s, a)
    # 𝒪 is the joint observation space, built from the per-agent observation sets.
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, joint(𝒫.𝒪), 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    u′ = sum(T(s, a, s′) * sum(O(a, s′, o) * U(o, s′) for o in 𝒪) for s′ in 𝒮)
    return R(s, a) + γ * u′
end

"""
    evaluate_plan(𝒫::POMG, π, s)

Utility of executing the joint conditional plan `π` (one `ConditionalPlan` per agent)
from state `s`. The result has whatever shape `𝒫.R` returns.

When the first agent's plan has no subplans, the value is just the immediate reward.
Otherwise the value is computed by `lookahead`, recursing into the subplans selected
by each joint observation.
"""
function evaluate_plan(𝒫::POMG, π, s)
    # Joint action: the root action of every agent's plan.
    a = Tuple(πi() for πi in π)
    # Value of continuing after joint observation o in state s′:
    # every agent follows the subplan matching its own observation.
    U(o, s′) = evaluate_plan(𝒫, [πi(oi) for (πi, oi) in zip(π, o)], s′)
    return isempty(first(π).subplans) ? 𝒫.R(s, a) : lookahead(𝒫, U, s, a)
end

"""
    utility(𝒫::POMG, b, π)

Expected utility of executing joint policy `π` in `𝒫` from initial belief `b`.
`b` is paired element-wise with `𝒫.𝒮`, so its entries must follow the same state order.
"""
function utility(𝒫::POMG, b, π)
    return sum(bs * evaluate_plan(𝒫, π, s) for (bs, s) in zip(b, 𝒫.𝒮))
end

utility (generic function with 1 method)

## Nash Equilibrium

In [7]:
"""
    expand_conditional_plans(𝒫, Π)

Grow every agent's set of conditional plans by one step.

For agent `i`, each existing plan `πi` in `Π[i]` is combined with each action `ai` in
`𝒫.𝒜[i]` into a new plan that takes `ai` first and then follows `πi` after any
observation in `𝒫.𝒪[i]`. Every observation therefore maps to the same subplan.
Returns one vector of plans per agent in `𝒫.ℐ`.
"""
function expand_conditional_plans(𝒫, Π)
    ℐ, 𝒜, 𝒪 = 𝒫.ℐ, 𝒫.𝒜, 𝒫.𝒪
    return [[ConditionalPlan(ai, Dict(oi => πi for oi in 𝒪[i]))
        for πi in Π[i] for ai in 𝒜[i]] for i in ℐ]
end

# joint(X): all combinations (Cartesian product) of the collections in X,
# returned as a flat vector of tuples.
joint(X) = vec(collect(Iterators.product(X...)))
# joint(π, πi, i): copy of the joint policy π with the i-th entry replaced by πi.
joint(π, πi, i) = [i == j ? πi : πj for (j, πj) in enumerate(π)]

"""
    tensorform(𝒫::SimpleGame)

Index-based ("tensor") form of the simple game `𝒫`.

Returns `(ℐ′, 𝒜′, R′)`: the agent indices, the action index range of each agent, and
the reward `R(a)` of every joint action `a` listed in the order produced by `joint(𝒜)`.
"""
function tensorform(𝒫::SimpleGame)
    ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
    ℐ′ = eachindex(ℐ)
    𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
    R′ = [R(a) for a in joint(𝒜)]
    return ℐ′, 𝒜′, R′
end

"""
    solve(M::NashEquilibrium, 𝒫::SimpleGame)

Compute a Nash equilibrium of the simple game `𝒫` by solving a nonlinear program with
Ipopt. Returns one `SimpleGamePolicy` per agent.

The program minimises, summed over agents, the gap between `U[i]` and agent `i`'s
expected reward under the joint policy `π`, subject to:
1. `U[i]` is at least the reward agent `i` gets from any pure action `ai` while the
   other agents follow `π`;
2. each agent's action probabilities sum to one;
3. all probabilities are non-negative.
"""
function solve(M::NashEquilibrium, 𝒫::SimpleGame)
    ℐ, 𝒜, R = tensorform(𝒫)
    model = Model(Ipopt.Optimizer)
    # utility variable of each agent
    @variable(model, U[ℐ])
    # constraint 3: non-negative action probabilities
    @variable(model, π[i=ℐ, 𝒜[i]] ≥ 0)
    # objective function
    @NLobjective(model, Min,
        sum(U[i] - sum(prod(π[j,a[j]] for j in ℐ) * R[y][i]
            for (y,a) in enumerate(joint(𝒜))) for i in ℐ))
    # constraint 1: no pure-action deviation beats U[i]
    @NLconstraint(model, [i=ℐ, ai=𝒜[i]],
        U[i] ≥ sum(
            prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,a[j]] for j in ℐ)
            * R[y][i] for (y,a) in enumerate(joint(𝒜))))
    # constraint 2: probabilities of each agent sum to one
    @constraint(model, [i=ℐ], sum(π[i,ai] for ai in 𝒜[i]) == 1)
    # Model optimization
    optimize!(model)
    # Map action indices back to the game's actual actions.
    πi′(i) = SimpleGamePolicy(𝒫.𝒜[i][ai] => value(π[i,ai]) for ai in 𝒜[i])
    return [πi′(i) for i in ℐ]
end

solve (generic function with 1 method)

## Dynamic Programming

In [8]:
"""
    POMGDynamicProgramming(b, d)

Settings for the dynamic-programming method of `solve` on a `POMG`.
"""
struct POMGDynamicProgramming
    b  # initial belief over the states
    d  # depth of the conditional plans (number of expansion rounds)
end

"""
    is_dominated(𝒫::POMG, Π, i, πi)

Decide whether agent `i`'s conditional plan `πi` is dominated, given the plan sets `Π`.

Solves a linear program over beliefs `b` on (other agents' joint plan, state) pairs
that maximises `δ` subject to every plan `πi′` in `Π[i]` having an expected advantage
over `πi` of at least `δ`. Returns `true` when the optimal `δ` is non-negative.
"""
function is_dominated(𝒫::POMG, Π, i, πi)
    ℐ, 𝒮 = 𝒫.ℐ, 𝒫.𝒮
    # All joint plans of the agents other than i.
    jointΠnoti = joint([Π[j] for j in ℐ if j ≠ i])
    # Re-insert agent i's plan into a joint plan of the others.
    # NOTE(review): the j-1 shift assumes agents are the integers 1..n — confirm.
    π(πi′, πnoti) = [j==i ? πi′ : πnoti[j>i ? j-1 : j] for j in ℐ]
    # Agent i's utility for every (own plan, others' plans, state) combination.
    Ui = Dict((πi′, πnoti, s) => evaluate_plan(𝒫, π(πi′, πnoti), s)[i]
            for πi′ in Π[i], πnoti in jointΠnoti, s in 𝒮)
    model = Model(Ipopt.Optimizer)
    @variable(model, δ)
    @variable(model, b[jointΠnoti, 𝒮] ≥ 0)
    @objective(model, Max, δ)
    @constraint(model, [πi′=Π[i]],
        sum(b[πnoti, s] * (Ui[πi′, πnoti, s] - Ui[πi, πnoti, s])
        for πnoti in jointΠnoti for s in 𝒮) ≥ δ)
    @constraint(model, sum(b) == 1)
    optimize!(model)
    return value(δ) ≥ 0
end

"""
    prune_dominated!(Π, 𝒫::POMG)

Remove dominated conditional plans from `Π` in place.

Agents and their plans are visited in random order. As soon as a dominated plan is
found for an agent it is removed and that agent's scan stops. Sweeps repeat until a
full sweep removes nothing. An agent's last remaining plan is never removed.
"""
function prune_dominated!(Π, 𝒫::POMG)
    done = false
    while !done
        done = true
        for i in shuffle(𝒫.ℐ)
            for πi in shuffle(Π[i])
                if length(Π[i]) > 1 && is_dominated(𝒫, Π, i, πi)
                    filter!(πi′ -> πi′ ≠ πi, Π[i])
                    # Something changed: another full sweep is needed.
                    done = false
                    break
                end
            end
        end
    end
end

"""
    solve(M::POMGDynamicProgramming, 𝒫::POMG)

Dynamic programming for the POMG `𝒫`, given the initial belief `M.b` and depth `M.d`.

Starting from single-action plans, the plan sets are expanded and pruned of dominated
plans `M.d` times. The surviving plans form a simple game whose reward is the expected
utility under `M.b`. Its Nash equilibrium is computed, and the most probable plan of
each agent is returned as a tuple.
"""
function solve(M::POMGDynamicProgramming, 𝒫::POMG)
    ℐ, 𝒜, γ, b, d = 𝒫.ℐ, 𝒫.𝒜, 𝒫.γ, M.b, M.d
    Π = [[ConditionalPlan(ai) for ai in 𝒜[i]] for i in ℐ]
    for _ in 1:d
        Π = expand_conditional_plans(𝒫, Π)
        prune_dominated!(Π, 𝒫)
    end
    # Treat each remaining conditional plan as an "action" of a one-shot game.
    𝒢 = SimpleGame(γ, ℐ, Π, π -> utility(𝒫, b, π))
    π = solve(NashEquilibrium(), 𝒢)
    # argmax over a policy's dictionary yields its highest-probability plan.
    return Tuple(argmax(πi.p) for πi in π)
end


solve (generic function with 2 methods)

## Result

In [9]:
# Multi-caregiver crying-baby problem: two agents sharing the same state, action and
# observation sets.
multiCaregiver = POMG(
    0.9,                                                       # discount factor
    [1, 2],                                                    # agents
    ["HUNGRY", "SATED"],                                       # states
    [["FEED", "SING", "IGNORE"], ["FEED", "SING", "IGNORE"]],  # actions per agent
    [["CRYING", "QUIET"], ["CRYING", "QUIET"]],                # observations per agent
    transition,
    joint_observation,
    joint_reward,
);

# Initial belief, listed in the same order as the states above.
b = [0.5, 0.5];

# Solve with conditional plans expanded once, then show the chosen plan of each agent.
dyP = POMGDynamicProgramming(b, 1);
result = solve(dyP, multiCaregiver);
print(result)