In [1]:
from lib.util import *
from lib.policy import *
from lib.mdp import *
from lib.mrp import *
from lib.env import *

# $\varepsilon$-Greedy Policy Improvement

###### Theorem
For any $\varepsilon$-greedy policy $\pi$, the $\varepsilon$-greedy policy $\pi'$ with respect to $q_\pi$ is an improvement, i.e. $v_{\pi'}(s) \geq v_{\pi}(s)$ for all states $s$.

###### Proof
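Let $m = |A|$ be the number of actions. For every state $s$,
\begin{align*}
q_{\pi}(s, \pi'(s)) &= \sum_{a \in A} \pi'(a\mid s)q_{\pi}(s, a) \\
&= \frac{\varepsilon}{m} \sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\max_{a \in A}q_{\pi}(s, a) \\
&\geq  \frac{\varepsilon}{m}\sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\mathbb{E}_{a \sim (\pi(\cdot\mid s) - \varepsilon/m)/(1-\varepsilon)}[q_{\pi}(s, a)] \\
&= \frac{\varepsilon}{m}\sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\sum_{a \in A} \frac{\pi(a\mid s)-\varepsilon/m}{1-\varepsilon}q_{\pi}(s, a) \\
&= \sum_{a \in A} \pi(a\mid s)q_{\pi}(s, a) \\
&= v_\pi(s)
\end{align*}
The inequality holds because the weights $\frac{\pi(a\mid s)-\varepsilon/m}{1-\varepsilon}$ are non-negative (since $\pi$ is $\varepsilon$-greedy, $\pi(a\mid s) \geq \varepsilon/m$) and sum to one, so the weighted average cannot exceed the maximum. Since $q_{\pi}(s, \pi'(s)) \geq v_\pi(s)$ for every state $s$, the policy improvement theorem gives $v_{\pi'}(s) \geq v_{\pi}(s)$.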

# GLIE (Greedy in the Limit with Infinite Exploration)

All state-action pairs are explored infinitely many times,
$$ \lim_{k\rightarrow \infty} N_k (s, a) = \infty $$
The policy converges on a greedy policy,
$$ \lim_{k\rightarrow \infty} \pi_k (a\mid s) = \mathbb{1}[a = \text{argmax}_{a' \in A} Q_k (s, a')] $$

# SARSA

In [2]:
from lib.sarsa import *

# Q-learning

In [3]:
from lib.q_learning import *

# Example

In [4]:
# Discount factor handed to MRP, MDP, sarsa and qLearning in the cells below.
gamma = 0.9
# Number of episodes passed to both sarsa and qLearning.
n_episodes = 50
# Size argument for the matrix/reward generators and the number of MRP copies in the MDP.
n = 5

In [5]:
# Build the example problem: one MRP, an MDP assembled from it, and a starting policy.
# P and R come from the lib helpers, both sized by n
# (presumably a random transition matrix and reward vector — confirm in lib.util).
P = generate_stochastic_matrix(n)
R = generate_reward_vector(n)
mrp = MRP(P, R, gamma)
# NOTE(review): [mrp]*n repeats the SAME MRP object n times. If MDP expects one
# MRP per action, every action then shares identical dynamics and rewards —
# confirm this is intended.
mdp = MDP(gamma, [mrp]*n)
# NOTE(review): Q is a second generate_stochastic_matrix(n) result used only to
# construct the Policy; despite the name it looks like a policy matrix rather
# than an action-value table — confirm against lib.policy.
Q = generate_stochastic_matrix(n)
policy = Policy(Q)

In [6]:
# Wrap the MDP in an Env; this env object is passed to both sarsa and qLearning below.
env = Env(mdp)

In [7]:
# Run SARSA for n_episodes on env starting from policy; the returned value is
# displayed as the cell output (a defaultdict(float) keyed by integer pairs,
# presumably (state, action) -> estimated action value — confirm in lib.sarsa).
sarsa(env, n_episodes, policy, gamma)

defaultdict(float,
            {(2, 0): 5.255554505229622,
             (4, 1): 5.371815661292635,
             (4, 2): 5.37245104530005,
             (1, 3): 5.359127460985131,
             (1, 2): 5.3080752777947495,
             (0, 2): 5.370819571397378,
             (0, 4): 5.3749971396650125,
             (3, 4): 5.357324721816808,
             (3, 2): 5.30810839103836,
             (3, 1): 5.341268632384486,
             (2, 2): 5.2856603300225995,
             (1, 1): 5.329937524176479,
             (0, 0): 5.373468549409457,
             (2, 1): 5.267547396111219,
             (4, 4): 5.320989495041044,
             (1, 4): 5.296409181647264,
             (3, 0): 5.358047848344734,
             (2, 4): 5.273695777669573,
             (0, 3): 5.365308775747046,
             (4, 3): 5.298296622038616,
             (3, 3): 5.361022507904353,
             (2, 3): 5.184043081767655,
             (0, 1): 5.379814922916327,
             (1, 0): 5.366330691058405,
             (4, 0):

In [8]:
# Run Q-learning with the same env, policy, episode count and discount as the
# SARSA cell; the returned value is displayed as the cell output.
# NOTE(review): env and policy are the same objects already passed to sarsa —
# if either is mutated in place there, this run does not start from the same
# state; confirm against lib.sarsa / lib.env.
qLearning(env, n_episodes, policy, gamma)

defaultdict(float,
            {(0, 2): 5.577850141381746,
             (3, 0): 5.557718054335265,
             (3, 1): 5.559811932532149,
             (3, 2): 5.598494113753515,
             (3, 3): 5.572985907082252,
             (3, 4): 5.565204071126154,
             (4, 0): 5.577245206536512,
             (4, 1): 5.5681106283452415,
             (4, 2): 5.599632076419162,
             (4, 3): 5.562098817110833,
             (4, 4): 5.584706708824207,
             (1, 0): 5.517823104865967,
             (1, 1): 5.543793056437715,
             (1, 2): 5.5734643140125355,
             (1, 3): 5.508933520154736,
             (1, 4): 5.53873081185546,
             (2, 0): 5.46376875634113,
             (2, 1): 5.525759286084648,
             (2, 2): 5.5161474274179625,
             (2, 3): 5.221219776255717,
             (2, 4): 5.469836844432971,
             (0, 0): 5.581410916734324,
             (0, 1): 5.605840357763873,
             (0, 3): 5.596331300564556,
             (0, 4):