# Model-based Methods

In [None]:
using Match, Plots
include("operators.jl")

# Definition of the environment (MDP)

State:

In [None]:
# State grid: every cell holds its own (row, column) coordinates, except cell (2,2),
# which holds the empty tuple `()` to mark it as not being a state.
S = [(i, j) == (2, 2) ? () : (i, j) for i in 1:3, j in 1:4]

Actions

In [None]:
# Action labels. The order matters: successor values are computed by mapping over A,
# and P(a) returns its probabilities in this same positional order.
A = ["↑","↓","←","→"]

Function for checking that actions are valid within our environment:

In [None]:
# Return true when `xy` is a usable cell: both coordinates are positive, lie within
# the dimensions of S, and the cell is not the one marked with `()`.
function OK(xy)
    xy[1] > 0 || return false
    xy[2] > 0 || return false
    xy[1] <= size(S,1) || return false
    xy[2] <= size(S,2) || return false
    return S[xy[1],xy[2]] != ()
end


Function returning the next state if we take a given action:

In [None]:
# Successor of state (i,j) under action `a`: the neighbouring cell in the direction of
# `a` when `OK` accepts it, otherwise the state itself (the move is blocked).
Af((i,j),a) = @match a begin
    "↑" => (OK((i-1,j)) ? (i-1,j) : (i,j))
    "↓" => (OK((i+1,j)) ? (i+1,j) : (i,j))
    "←" => (OK((i,j-1)) ? (i,j-1) : (i,j))
    "→" => (OK((i,j+1)) ? (i,j+1) : (i,j))
end

Transition Probabilities:

In [None]:
# Probability of actually moving ↑, ↓, ← or → (in that order) when action `a` is
# chosen: 0.8 for the chosen direction, 0.1 for each of the two perpendicular
# directions and 0.0 for the opposite one.
function P(a)
    return @match a begin
        "↑" => [0.8, 0.0, 0.1, 0.1]
        "↓" => [0.0, 0.8, 0.1, 0.1]
        "←" => [0.1, 0.1, 0.8, 0.0]
        "→" => [0.1, 0.1, 0.0, 0.8]
    end
end

Reward function

In [None]:
# Reward table with the same 3×4 layout as the grid: +1 at (1,4), -1 at (2,4),
# `nothing` at (2,2) and 0 in every other cell.
R = [@match (row, col) begin
         (2,2) => nothing
         (1,4) => 1.0
         (2,4) => -1.0
         _ => 0.0
     end for row in 1:3, col in 1:4]

The reward for a particular state:

In [None]:
# Look up the reward stored in R for state `s`, using the first element of `s` as the
# row and its last element as the column.
function r(s)
    row = s[1]
    col = last(s)
    return R[row, col]
end

In [None]:
# Spot check: look up the reward of cell (2,4), which R sets to -1.
r((2,4))

The discount factor ($\gamma$)

In [None]:
γ = 0.9

# State-dependent discount: 0.0 in the two terminal states (1,4) and (2,4), so nothing
# beyond them contributes to their value, and γ in every other state.
function Γ(s)
    return @match s begin
        (1,4) => 0.0 # terminal state
        (2,4) => 0.0 # terminal state
        _ => γ
    end
end


Calculating the values of the successor states ($V_{s'}$), one for each action:

In [None]:
# Values, read from table `V`, of the states reached from `s` by each action in A
# (one entry per action, in the order of A).
function Vₛ′(V, s)
    successors = [Af(s, a) for a in A]
    return [V[first(nxt), last(nxt)] for nxt in successors]
end

Calculating the quality of taking an action in a state, $q(s, a)$:

In [None]:
# Quality of taking action `a` in state `s` given the value table `V`: the
# P(a)-weighted sum of reward plus discounted successor value.
# Returns `()` when `s` is the `()` placeholder or `a` is the empty string.
function qₐ(s, a, V)
    if s == () || a == ""
        return ()
    end
    probs = P(a)
    reward = r(s)
    discount = Γ(s)
    nextvalues = Vₛ′(V, s)
    return sum(probs .* (reward .+ discount .* nextvalues))
end

**Bellman equation**

$$ v_*(s) = \max_a \Big[ R^a_s + \gamma \sum_{s'} P^a_{ss'} v_*(s') \Big], \quad \gamma \in (0,1) $$

In [None]:
# Bellman backup for state `s`: the largest action quality qₐ(s, a, V) over all
# actions in A.
function v(s, V)
    action_values = [qₐ(s, a, V) for a in A]
    return maximum(action_values)
end

# Value Iteration

Parameters:

In [None]:
Vᵥ = copy(R) # initial value table: starts as a copy of the reward table R (not all zeros)
Δᵥ = [] # value difference recorded per iteration by the main loop (untyped, so Vector{Any})
Θ = 0.00001 # convergence threshold compared against the latest value difference
k = 100 # maximum number of iterations

Main algorithm:

In [None]:
# Value iteration: repeatedly apply the Bellman backup `v` to every cell of S until the
# table stops changing (or `k` sweeps have been performed).
for i = 1:k
    Vᵥ′ = v.(S,[Vᵥ])
    # Convergence is measured by the largest ABSOLUTE change in any state value. A signed
    # maximum would report "converged" on a sweep where values only decrease.
    # The cell marked `()` in S is not a state and carries placeholder values, so it is
    # left out of the comparison.
    δ = maximum(abs(Vᵥ′[idx] - Vᵥ[idx]) for idx in eachindex(S) if S[idx] != ())
    push!(Δᵥ, δ)
    global Vᵥ = copy(Vᵥ′)
    if δ <= Θ
        break
    end
end

Convergence:

In [None]:
# Plot the value difference recorded at each iteration of the loop above.
plot(Δᵥ)

In [None]:
# Display the value table after the iterations have finished.
Vᵥ

## Policy Extraction

Quality of actions in state:

In [None]:
# Quality of every action in A (in that order) for state `s` under value table `V`.
Qπ(s,V) = [qₐ(s, a, V) for a in A]

Extracting the policy by choosing the best action in each state:

In [None]:
# Greedy policy for state `s`: the action label in A whose quality under `V` is
# largest (the first such action when several tie).
function Πᵒ(s, V)
    best = argmax(Qπ(s, V))
    return A[best]
end

In [None]:
# Apply the policy extractor to every cell of S, sharing the single value table Vᵥ
# (wrapped in a one-element vector so broadcasting does not iterate over it).
Πᵒ.(S,[Vᵥ])