In [25]:
# import packages needed
import itertools
import sys
from collections import defaultdict
from typing import Callable, Iterable, List, Mapping, TypeVar

import numpy as np

# The RL-book checkout has to be on sys.path before any `rl.*` import below.
sys.path.append('/Users/yujiehe/Desktop/Stanford/Win2021/CME241/RL-book/')

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.distribution import Constant
from rl.function_approx import Tabular
from rl.iterate import last
from rl.markov_process import TransitionStep
from rl.monte_carlo import mc_prediction
from rl.returns import returns
from rl.td import td_prediction

In [2]:
# Generic type variable standing in for the state type in the annotations of
# the prediction functions defined in this notebook.
S = TypeVar('S')

### Problem 1 & 3: Tabular MC Prediction from scratch

In [34]:
def tab_mc_prediction(trs: Iterable[Iterable[TransitionStep[S]]],
                      weight_func: Callable[[int],float],
                      gamma: float,
                      tol: float = 1e-06) -> List[Mapping[S, float]]:
    """Tabular every-visit Monte Carlo prediction, written from scratch.

    Each trace in ``trs`` is passed to ``returns(trace, gamma, tol)``; every
    item that call yields is read for its ``.state`` and ``.return_`` and
    used to move the estimate of that state towards the observed return:

        V(s) <- V(s) * (1 - w) + w * return_,   w = weight_func(n)

    where ``n`` is the number of times ``s`` has been updated so far,
    counted across all episodes.

    Args:
        trs: one iterable of transition steps per episode.
        weight_func: maps the (1-based) update count of a state to the step
            size used for that update, e.g. ``lambda n: 1. / n``.
        gamma: discount factor, forwarded to ``returns``.
        tol: tolerance, forwarded unchanged to ``returns``.

    Returns:
        A list with one value-function snapshot per episode, in episode
        order. Snapshots are independent copies; states that have not been
        updated yet read as 0. An empty ``trs`` gives an empty list.
    """
    v: List[Mapping[S, float]] = []
    occurence: Mapping[S, int] = defaultdict(int)

    # Episodes are handled one at a time instead of being collected up front.
    for trace in trs:
        # Start from a copy of the previous snapshot. The copy must stay a
        # defaultdict: a state that shows up for the first time in a later
        # episode would otherwise raise KeyError on its first update.
        curr_v = defaultdict(float, v[-1] if v else {})
        for st in returns(trace, gamma, tol):
            occurence[st.state] += 1
            # Evaluate the step size once per update.
            weight = weight_func(occurence[st.state])
            curr_v[st.state] = curr_v[st.state] * (1 - weight) + weight * st.return_
        v.append(curr_v)

    return v

#### Verify correctness against `rl.monte_carlo.mc_prediction`

In [35]:
# Parameters of the simple inventory MRP.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

# Size of the simulated data set used for the Monte Carlo comparison.
user_num_episodes = 100
user_episode_length = 1000

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

# Every episode starts from an empty inventory with nothing on order.
start = InventoryState(on_hand = 0, on_order = 0)

# Each episode is materialised as a list so the very same traces can be fed
# to more than one prediction routine.
sample = [
    list(itertools.islice(si_mrp.simulate_reward(Constant(start)), user_episode_length))
    for _ in range(user_num_episodes)
]

In [36]:
# Keep only the value-function snapshot produced after the final episode.
res_1 = tab_mc_prediction(
    trs=sample,
    weight_func=lambda n: 1./n,
    gamma=user_gamma,
)[-1]

In [37]:
# Reference estimate from the library's mc_prediction on the same `sample`,
# started from a fresh Tabular approximation.
# NOTE(review): `last` presumably returns the final item produced by
# mc_prediction; confirm against rl.iterate.
approx = Tabular()
res_2 = last(mc_prediction(sample,approx,user_gamma)).values_map

In [38]:
# Print both estimates side by side for every state in res_1.
# A single f-string is used for the whole line: feeding an f-string into
# %-formatting would misbehave if the state's text ever contained a '%'.
for k in res_1:
    print(f"State {k}: {res_1[k]:.4f} vs. {res_2[k]:.4f}")

State InventoryState(on_hand=0, on_order=0): -35.7238 vs. -35.7238
State InventoryState(on_hand=0, on_order=2): -28.5955 vs. -28.5955
State InventoryState(on_hand=1, on_order=0): -28.9087 vs. -28.9087
State InventoryState(on_hand=0, on_order=1): -27.9152 vs. -27.9152
State InventoryState(on_hand=1, on_order=1): -29.3579 vs. -29.3579
State InventoryState(on_hand=2, on_order=0): -30.6022 vs. -30.6022


### Problem 2 & 3: Tabular TD Prediction from scratch

In [39]:
def tab_td_prediction(trans: Iterable[TransitionStep[S]],
                      weight_func: Callable[[int],float],
                      gamma: float):
    """Tabular TD(0) prediction over a single stream of transitions.

    For every step the estimate of ``step.state`` is moved towards the
    one-step target ``step.reward + gamma * V(step.next_state)``:

        V(s) <- V(s) + w * (reward + gamma * V(s') - V(s)),  w = weight_func(n)

    where ``n`` is how many times ``s`` has been updated so far.

    Args:
        trans: steps exposing ``.state``, ``.reward`` and ``.next_state``.
        weight_func: maps the (1-based) update count of a state to the step
            size of that update.
        gamma: discount factor.

    Returns:
        A defaultdict from state to estimated value; unseen states read as 0.
        Looking up a successor state also registers it in the mapping.
    """
    values: Mapping[S, float] = defaultdict(lambda: 0)
    visits: Mapping[S, int] = defaultdict(int)

    for step in trans:
        state = step.state
        visits[state] += 1
        # Read the current estimate first so `state` is registered in the
        # mapping before its successor is.
        old_estimate = values[state]
        step_size = weight_func(visits[state])
        td_target = step.reward + gamma * values[step.next_state]
        values[state] = old_estimate + step_size * (td_target - old_estimate)

    return values

#### Verify correctness against `rl.td.td_prediction`

In [30]:
# Length of the single simulated trace used for the TD comparison.
user_num_transitions = 100000

start = InventoryState(on_hand = 0, on_order = 0)
# One long trace, materialised so it can be fed to more than one routine.
# Note that this rebinds `sample` to a flat list of transition steps.
sample = list(itertools.islice(si_mrp.simulate_reward(Constant(start)), user_num_transitions))

In [31]:
# From-scratch TD estimate on the simulated trace, with step size 1/n.
res_1 = tab_td_prediction(
    trans=sample,
    weight_func=lambda n: 1./n,
    gamma=user_gamma,
)

In [32]:
# Reference estimate from the library's td_prediction on the same `sample`,
# started from a fresh Tabular approximation.
# NOTE(review): `last` presumably returns the final item produced by
# td_prediction; confirm against rl.iterate.
approx = Tabular()
res_2 = last(td_prediction(sample,approx,user_gamma)).values_map

In [33]:
# Print both estimates side by side for every state in res_1.
# A single f-string is used for the whole line: feeding an f-string into
# %-formatting would misbehave if the state's text ever contained a '%'.
for k in res_1:
    print(f"State {k}: {res_1[k]:.4f} vs. {res_2[k]:.4f}")

State InventoryState(on_hand=0, on_order=0): -24.7799 vs. -24.7799
State InventoryState(on_hand=0, on_order=2): -17.6182 vs. -17.6182
State InventoryState(on_hand=1, on_order=0): -18.2559 vs. -18.2559
State InventoryState(on_hand=0, on_order=1): -17.2822 vs. -17.2822
State InventoryState(on_hand=1, on_order=1): -18.6401 vs. -18.6401
State InventoryState(on_hand=2, on_order=0): -19.6305 vs. -19.6305


### Problem 4: Extend RandomWalkMRP