In [6]:
import sys 
sys.path.append("../")

import numpy as np 
from pprint import pprint
from typing import TypeVar, Iterable, Mapping, Dict, Callable, Iterator
import matplotlib.pyplot as plt 
import rl.markov_process as mp
import itertools
import rl.markov_decision_process as mdp
import rl.monte_carlo as mc
import rl.td as td 
from rl.distribution import Choose, Categorical
from rl.function_approx import LinearFunctionApprox, Tabular
import rl.chapter11.control_utils as control
import rl.iterate as iterate
import rl.policy as policy
from rl.approximate_dynamic_programming import QValueFunctionApprox, NTStateDistribution

### Implementing SARSA:

In [7]:
# S = TypeVar("S")
# A = TypeVar("A")


# def sarsa_control(mdp : mdp.MarkovDecisionProcess[S, A],
#                   start_state_distribution : NTStateDistribution[S],
#                   approx_0 : QValueFunctionApprox[S, A],
#                   gamma : float,
#                   eps_decay_func : Callable[[int], float],
#                   max_episode_length : int) -> Iterator[QValueFunctionApprox[S, A]]:
#     q : QValueFunctionApprox[S, A] = approx_0
#     num_episodes : int = 0
#     yield q

#     while True:
#         num_episodes += 1
#         eps : float = eps_decay_func(num_episodes)
#         state : S = start_state_distribution.sample()
#         action : A = td.epsilon_greedy_action(q=q, 
#                                               nt_state=state, 
#                                               actions=set(mdp.actions(state)), 
#                                               ε=eps)
        
#         steps : int = 0

#         while isinstance(state, mp.NonTerminal) and steps < max_episode_length:
#             next_state, reward = mdp.step(state, action).sample()
#             if isinstance(next_state, mp.NonTerminal):
#                 next_act : A = td.epsilon_greedy_action(q=q, 
#                                                         nt_state=next_state, 
#                                                         actions=set(mdp.actions(next_state)), 
#                                                         ε=steps)
#                 q.update([((state, action), reward + gamma * q((next_state, next_act)))])
#                 action = next_act
#             else:
#                 q.update([((state, action), reward)])
            
#             yield q
#             state = next_state
#             steps += 1

In [8]:
from rl.chapter3.simple_inventory_mdp_cap import *
from rl.dynamic_programming import policy_iteration_result

user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------" * 2)

opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)

pprint({k : round(v, 3) for k, v in opt_vf_pi.items()})
print()
print(opt_policy_pi)

MDP Policy Iteration Optimal Value Function and Optimal Policy
----------------------------
{NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.992,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.992,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.661,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.992,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.661,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.895}

For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



### Using a Tabular approximation for the Q-value function and implementing SARSA:

In [9]:
num_episodes = 1000
num_steps = 1000
# ffs = [(lambda x, s=s, a=a : 1.0 if x[0] == s and x[1] == a else 0.0) for s in si_mdp.non_terminal_states for a in si_mdp.actions(s)]
# approx_0 = LinearFunctionApprox.create(feature_functions=ffs, regularization_coeff=0.01)
approx_0 = Tabular()

# sarsa_qvf = sarsa_control(mdp=si_mdp, start_state_distribution=Choose(si_mdp.non_terminal_states), approx_0=approx_0,
#                           gamma=user_gamma, eps_decay_func=lambda k : 1. / k, max_episode_length=num_steps)
sarsa_qvf = td.glie_sarsa(mdp=si_mdp, states=Choose(si_mdp.non_terminal_states), approx_0=approx_0,
                          γ=user_gamma, ϵ_as_func_of_episodes=lambda k : 1. / k, max_episode_length=num_steps)

*_, opt_qvf = itertools.islice(sarsa_qvf, num_episodes)
opt_vf, opt_policy = control.get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=opt_qvf)

pprint({s : round(v, 3) for s, v in opt_vf.items()})
print()
print(opt_policy)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -21.543,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -14.053,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -15.139,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -14.666,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -15.537,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -17.051}

For State InventoryState(on_hand=0, on_order=0): Do Action 2
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



### Implementing Q-learning

In [14]:
num_episodes = 1000
num_steps = 1000
# ffs = [(lambda x, s=s, a=a : 1.0 if x[0] == s and x[1] == a else 0.0) for s in si_mdp.non_terminal_states for a in si_mdp.actions(s)]
# approx_0 = LinearFunctionApprox.create(feature_functions=ffs, regularization_coeff=0.01)
approx_0 = Tabular()

qlearning_qvf = td.q_learning(mdp=si_mdp, policy_from_q=lambda q, m : mc.epsilon_greedy_policy(q=q, mdp=m, ε=0.01), 
                              states=Choose(si_mdp.non_terminal_states), approx_0=approx_0, γ=user_gamma, max_episode_length=num_steps)

*_, opt_qvf = itertools.islice(qlearning_qvf, num_episodes)
opt_vf, opt_policy = control.get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=opt_qvf)

pprint({s : round(v, 3) for s, v in opt_vf.items()})
print()
print(opt_policy)

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -18.574,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -12.228,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -11.603,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -12.853,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -13.161,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -14.639}

For State InventoryState(on_hand=0, on_order=0): Do Action 2
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0

