In [1]:
import sys
sys.path.append("../")

import itertools
import numpy as np
from pprint import pprint
from collections import defaultdict
from typing import Iterable, Iterator, TypeVar, Mapping, List, Tuple, Sequence, Callable

from rl.returns import returns
from rl.td import td_prediction
from rl.distribution import Choose
from rl.monte_carlo import mc_prediction
from rl.markov_process import TransitionStep, ReturnStep
from rl.function_approx import LinearFunctionApprox, AdamGradient, FunctionApprox, Tabular
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.chapter10.prediction_utils import mc_prediction_learning_rate, td_prediction_learning_rate

# Question 1: Implementing Tabular MC for Prediction

In [2]:
S = TypeVar("S")

def TabularMCPrediction(traces : Iterable[Iterable[TransitionStep[S]]],
                        gamma : float = 0.9, episode_tol : float = 1e-6, num_traces : int = 1000) -> Mapping[S, float]:
    episodes : Iterator[ReturnStep[S]] = (returns(trace, gamma, episode_tol) for trace in traces)
    vf : Dict[S, float] = defaultdict(float)
    counts : Dict[S, int] = defaultdict(lambda : int(1))

    num_episodes = int(np.log(episode_tol) / np.log(gamma))

    for n, episode in enumerate(itertools.islice(episodes, num_traces)):
        for tr in itertools.islice(episode, num_episodes):
            vf[tr.state] += (tr.return_ - vf[tr.state]) / (counts[tr.state] * n + 1)
            counts[tr.state] += 1

    return vf

# Question 2: Implementing Tabular TD for Prediction

In [3]:
def TabularTDPrediction(traces: Iterable[TransitionStep[S]], vf : Mapping[S, float], gamma : float = 0.9,
                        init_lr : float = 0.05, half_life : float = 1000.0, exp : float = 0.5) -> Mapping[S, float]:
    counts : Dict[S, int] = defaultdict(lambda : int(1))

    for n, tr in enumerate(traces):
        lr = init_lr / (1 + ((counts[tr.state] - 1) / half_life) ** exp)
        vf[tr.state] += lr * (tr.reward + gamma * vf[tr.next_state] - vf[tr.state])
        counts[tr.state] += 1
    
    return vf

# Question 3A: testing `TabularMCPrediction` on `SimpleInventoryMRPFinite`:

In [4]:
from rl.chapter2.simple_inventory_mrp import *

user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(capacity=user_capacity,
                                  poisson_lambda=user_poisson_lambda,
                                  holding_cost=user_holding_cost,
                                  stockout_cost=user_stockout_cost)

print("Exact Value Function:")
print("--" * 20)
si_mrp.display_value_function(gamma=user_gamma)
print()

Exact Value Function:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511}



### Tabular MC Prediction using the repo's MC Prediction:

In [5]:
ffs = [(lambda x, s=s : float(x==s)) for s in si_mrp.non_terminal_states]
mc_adam : AdamGradient = AdamGradient(learning_rate=0.005, decay1=0.9, decay2=0.999)
mc_func_approx : FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs, adam_gradient=mc_adam)

mc_iterator : Iterator[FunctionApprox[InventoryState]] = mc_prediction(traces=si_mrp.reward_traces(Choose(si_mrp.non_terminal_states)),
                                                                       approx_0=mc_func_approx,
                                                                       γ=user_gamma, episode_length_tolerance=1e-6)

*_, last_mc = itertools.islice(mc_iterator, 1000)
print("MC Prediction using LinearFuncApprox:")
print("--" * 20)
pprint({s : round(last_mc.evaluate([s])[0], 3) for s in si_mrp.non_terminal_states})

MC Prediction using LinearFuncApprox:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.257,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.341,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.778,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.771,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.174,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.516}


### Tabular MC Prediction using own function:

In [6]:
traces : Iterable[Iterable[TransitionStep[InventoryState]]] = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))

mc_vf = TabularMCPrediction(traces=traces, gamma=user_gamma, episode_tol=1e-6, num_traces=10000)
print("Value Function from Tabular MC Prediction:")
print("--" * 20)
pprint({s : round(mc_vf[s], 3) for s in si_mrp.non_terminal_states})
print()

Value Function from Tabular MC Prediction:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -32.039,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -29.814,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -30.89,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -28.142,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -31.581,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -36.854}



### Tabular TD Prediction using repo's TD Prediction:

In [7]:
ffs = [(lambda x, s=s : float(x == s)) for s in si_mrp.non_terminal_states]
td_adam : AdamGradient = AdamGradient(learning_rate=0.005, decay1=0.9, decay2=0.999)
td_func_approx : FunctionApprox = LinearFunctionApprox.create(feature_functions=ffs, adam_gradient=td_adam)

num_traces = 1000
num_episodes = 1000

traces = itertools.chain.from_iterable(itertools.islice(tr, num_episodes) for tr in si_mrp.reward_traces(Choose(si_mrp.non_terminal_states)))

td_iterator : Iterator[FunctionApprox[InventoryState]] = td_prediction(transitions=traces,
                                                                       approx_0=td_func_approx,
                                                                       γ=user_gamma)

*_, last_td = itertools.islice(td_iterator, num_traces * num_episodes)
print("TD Prediction using LinearFuncApprox:")
print("--" * 20)
pprint({s : round(last_td.evaluate([s])[0], 3) for s in si_mrp.non_terminal_states})

TD Prediction using LinearFuncApprox:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.326,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.317,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.958,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.393,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.475}


### Tabular TD Prediction using own function:

In [10]:
num_traces = 1000
num_episodes = 1000

traces : Iterable[Iterable[TransitionStep[InventoryState]]] = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces = [j for i in itertools.islice(traces, num_episodes) for j in itertools.islice(i, num_traces)]

td_vf = TabularTDPrediction(traces=traces, gamma=user_gamma, vf={s : 0.0 for s in si_mrp.non_terminal_states})
print("Value Function from Tabular MC Prediction:")
print("--" * 20)
pprint({s : round(td_vf[s], 3) for s in si_mrp.non_terminal_states})
print()

Value Function from Tabular MC Prediction:
----------------------------------------
{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.583,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.465,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -29.033,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -28.038,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.496,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.665}

