In [119]:
# import packages needed
import sys
sys.path.append('/Users/yujiehe/Desktop/Stanford/Win2021/CME241/RL-book/')

import itertools
import numpy as np

from typing import Iterable, Callable, Mapping, TypeVar
from rl.markov_process import TransitionStep
from rl.returns import returns
from rl.function_approx import Tabular
from rl.iterate import last

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.monte_carlo import mc_prediction
from rl.td import td_prediction

from rl.distribution import Constant

In [120]:
S = TypeVar('S')

### Problem 1 & 3: Tabular MC Prediction from scratch

In [79]:
def tab_mc_prediction(trs: Iterable[Iterable[TransitionStep[S]]],
                      weight_func: Callable[[int],float],
                      gamma: float,
                      tol: float = 1e-06):
    
    episodes = [returns(trace,gamma,tol) for trace in trs]
    
    v: List[Mapping[S,float]] = []
    occurence: Mapping[S,int] = {}
        
    for episode in episodes:
        if len(v) == 0:
            curr_v : Mapping[S,float] = {}
        else:
            curr_v : Mapping[S,float] = {k:var for (k,var) in v[-1].items()}
        for st in episode:
            if st.state in curr_v:
                occurence[st.state] += 1
                curr_v[st.state] = curr_v[st.state]*(1-weight_func(occurence[st.state])) +\
                    weight_func(occurence[st.state])*st.return_
            else:
                occurence[st.state] = 1
                curr_v[st.state] = st.return_
        v.append(curr_v)
        
    return v

#### Verify Correctness

In [80]:
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

start = InventoryState(on_hand = 0, on_order = 0)
sample = [list(itertools.islice(si_mrp.simulate_reward(Constant(start)),1000)) for _ in range(100)]

In [81]:
res_1 = tab_mc_prediction(sample, lambda n: 1./n, user_gamma)[-1]

In [82]:
approx = Tabular()
res_2 = last(mc_prediction(sample,approx,user_gamma)).values_map

In [83]:
for k in res_1:
    print(f"State {k}: %.4f vs. %.4f"%(res_1[k],res_2[k]))

State InventoryState(on_hand=0, on_order=0): -35.4726 vs. -35.4726
State InventoryState(on_hand=0, on_order=2): -28.3194 vs. -28.3194
State InventoryState(on_hand=2, on_order=0): -30.3644 vs. -30.3644
State InventoryState(on_hand=1, on_order=0): -28.8852 vs. -28.8852
State InventoryState(on_hand=1, on_order=1): -29.2384 vs. -29.2384
State InventoryState(on_hand=0, on_order=1): -27.8437 vs. -27.8437


### Problem 2 & 3: Tabular TD Prediction from scratch

In [138]:
def tab_td_prediction(trans: Iterable[TransitionStep[S]],
                      weight_func: Callable[[int],float],
                      gamma: float,
                      tol: float = 1e-06):
    
    v: Mapping[S,float] = {}
    occurence: Mapping[S,int] = {}
        
    for st in trans:
        if st.state not in v or st.state not in occurence:
            occurence[st.state] = 1
            v[st.state] = st.reward            
        else:
            if st.next_state not in v:
                v[st.next_state] = 0
            occurence[st.state] += 1
            v[st.state] += weight_func(occurence[st.state])*(st.reward + gamma*v[st.next_state] - v[st.state])

    return v

#### Verify correctness

In [134]:
start = InventoryState(on_hand = 0, on_order = 0)
sample = list(itertools.islice(si_mrp.simulate_reward(Constant(start)),10000))

In [139]:
res_1 = tab_td_prediction(sample, lambda n: 1./n, user_gamma)

In [140]:
approx = Tabular()
res_2 = last(td_prediction(sample,approx,user_gamma)).values_map

In [141]:
for k in res_1:
    print(f"State {k}: %.4f vs. %.4f"%(res_1[k],res_2[k]))

State InventoryState(on_hand=0, on_order=0): -22.3340 vs. -22.5776
State InventoryState(on_hand=0, on_order=2): -15.2498 vs. -15.4944
State InventoryState(on_hand=1, on_order=0): -15.9268 vs. -16.1674
State InventoryState(on_hand=0, on_order=1): -14.8432 vs. -15.0843
State InventoryState(on_hand=1, on_order=1): -16.3485 vs. -16.5944
State InventoryState(on_hand=2, on_order=0): -17.2552 vs. -17.5019


Why different?

### Problem 4: Extend RandomWalkMRP