In [1]:
import sys
import os

# Resolve '..' against the current working directory, then put that
# directory's parent on sys.path (presumably so the `rl` package imported
# below can be found when the notebook runs from inside the repo).
module_path = os.path.abspath('..')
sys.path.append(os.path.dirname(module_path))

import itertools as it
import numpy as np

from typing import Iterable, Callable, Mapping, TypeVar, List
from rl.markov_process import TransitionStep
from rl.returns import returns
from rl.function_approx import Tabular
from rl.iterate import last
from collections import defaultdict

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.monte_carlo import mc_prediction
from rl.td import td_prediction

from rl.distribution import Constant

S = TypeVar('S')

# Q1 and Q3: tabular Monte Carlo prediction, compared against the library's `mc_prediction`

In [2]:
def tabular_mc_prediction(trs: Iterable[Iterable[TransitionStep[S]]],
                      weight_func: Callable[[int],float],
                      gamma: float,
                      tol: float = 1e-06) -> Mapping[S,float]:
    """Tabular every-visit Monte Carlo prediction.

    Each trace is converted into return steps with ``returns(trace, gamma,
    tol)``; every return step then updates the estimate of its state:

        v[s] <- (1 - w) * v[s] + w * G,   w = weight_func(visit count of s)

    With ``weight_func = lambda n: 1. / n`` this is the running average of
    all returns observed from ``s``.

    Args:
        trs: iterable of traces, each an iterable of transition steps.
        weight_func: maps the 1-based visit count of a state to the weight
            given to the newly observed return.
        gamma: discount factor, forwarded to ``returns``.
        tol: tolerance, forwarded unchanged to ``returns``.

    Returns:
        A single mapping (a ``defaultdict``) from state to estimated value.
        States that were never visited are absent and read as 0.0.
    """
    values: Mapping[S, float] = defaultdict(float)
    visits: Mapping[S, int] = defaultdict(int)

    for trace in trs:
        for step in returns(trace, gamma, tol):
            visits[step.state] += 1
            # Evaluate the weight once per update; it depends only on the
            # visit count of the current state.
            weight = weight_func(visits[step.state])
            values[step.state] = (1 - weight) * values[step.state] + \
                weight * step.return_
    return values

In [3]:
# Parameters handed to SimpleInventoryMRPFinite below.
capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

# Discount factor shared by every prediction run in this notebook.
gamma: float = 0.9

# The finite inventory MRP that all later cells simulate and evaluate.
si_mrp: SimpleInventoryMRPFinite = SimpleInventoryMRPFinite(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost,
)

In [6]:
# Value function reported by the MRP object itself (presumably the exact,
# model-based solution); used as the reference for the sampled estimates.
si_mrp.display_value_function(gamma=gamma)

{InventoryState(on_hand=0, on_order=2): -28.345,
 InventoryState(on_hand=2, on_order=0): -30.345,
 InventoryState(on_hand=1, on_order=1): -29.345,
 InventoryState(on_hand=1, on_order=0): -28.932,
 InventoryState(on_hand=0, on_order=1): -27.932,
 InventoryState(on_hand=0, on_order=0): -35.511}


In [10]:
# Every trace starts from the empty-inventory, nothing-on-order state.
start = InventoryState(on_hand=0, on_order=0)

# 100 independent traces, each cut off after 1000 simulated steps.
sample = [
    list(it.islice(si_mrp.simulate_reward(Constant(start)), 1000))
    for _ in range(100)
]

# Weight 1/n turns the update into a running average of observed returns.
self_func = tabular_mc_prediction(
    trs=sample,
    weight_func=lambda n: 1./n,
    gamma=gamma,
)

In [13]:
# Show the estimates produced by the hand-written tabular MC prediction.
self_func

defaultdict(float,
            {InventoryState(on_hand=0, on_order=0): -35.29706579811771,
             InventoryState(on_hand=0, on_order=2): -28.11146076434423,
             InventoryState(on_hand=1, on_order=0): -28.881851892029175,
             InventoryState(on_hand=0, on_order=1): -28.004776225243578,
             InventoryState(on_hand=1, on_order=1): -29.331205300794107,
             InventoryState(on_hand=2, on_order=0): -30.33612111279546})

In [15]:
# Run the same traces through the library's mc_prediction with a fresh
# Tabular approximation and display the final iterate's values_map, for
# comparison with self_func.
approx = Tabular()
last(mc_prediction(sample,approx,gamma)).values_map

{InventoryState(on_hand=0, on_order=0): -35.29706579811771,
 InventoryState(on_hand=0, on_order=2): -28.11146076434423,
 InventoryState(on_hand=1, on_order=0): -28.881851892029175,
 InventoryState(on_hand=0, on_order=1): -28.004776225243578,
 InventoryState(on_hand=1, on_order=1): -29.331205300794107,
 InventoryState(on_hand=2, on_order=0): -30.33612111279546}

# Q2 and Q3: tabular TD prediction, compared against the library's `td_prediction`

In [22]:
def tabular_td_prediction(trans: Iterable[TransitionStep[S]],
                      weight_func: Callable[[int],float],
                      gamma: float) -> Mapping[S,float]:
    """Tabular one-step temporal-difference (TD(0)) prediction.

    Consumes a single stream of transitions (not whole episodes) and, for
    each one, moves the estimate of its state towards the bootstrapped
    target:

        v[s] <- v[s] + w * (reward + gamma * v[next_s] - v[s]),
        w = weight_func(visit count of s)

    Args:
        trans: iterable of transitions exposing ``state``, ``next_state``
            and ``reward``.
        weight_func: maps the 1-based visit count of a state to the step
            size used for that update.
        gamma: discount factor applied to the next state's estimate.

    Returns:
        A single mapping (a ``defaultdict``) from state to estimated value.
        Looking up a state that has not been seen yields 0.0; a state that
        only ever appears as ``next_state`` is therefore stored with 0.0.
    """
    values: Mapping[S, float] = defaultdict(float)
    visits: Mapping[S, int] = defaultdict(int)

    for step in trans:
        visits[step.state] += 1
        step_size = weight_func(visits[step.state])
        # Bootstrap from the current estimate of the successor state: the
        # one-step reward is used here instead of a full sampled return.
        values[step.state] += step_size * (
            step.reward + gamma * values[step.next_state] - values[step.state]
        )

    return values

In [25]:
start = InventoryState(on_hand = 0, on_order = 0)
# One long trace of 100000 transitions. The notebook imports itertools under
# the alias `it`, so the bare name `itertools` is undefined on a fresh kernel.
sample = list(it.islice(si_mrp.simulate_reward(Constant(start)),100000))
res_1 = tabular_td_prediction(sample, lambda n: 1./n, gamma)

In [26]:
# Show the estimates produced by the hand-written tabular TD prediction.
res_1

defaultdict(float,
            {InventoryState(on_hand=0, on_order=0): -24.750359610543377,
             InventoryState(on_hand=0, on_order=2): -17.57790833807345,
             InventoryState(on_hand=1, on_order=0): -18.15714676168148,
             InventoryState(on_hand=1, on_order=1): -18.563091921139982,
             InventoryState(on_hand=0, on_order=1): -17.160240499372055,
             InventoryState(on_hand=2, on_order=0): -19.56436209341991})

In [29]:
# Run the same transitions through the library's td_prediction with a fresh
# Tabular approximation and display the final iterate's values_map, for
# comparison with res_1.
approx = Tabular()
last(td_prediction(sample,approx,gamma)).values_map

{InventoryState(on_hand=0, on_order=0): -24.750359610543335,
 InventoryState(on_hand=0, on_order=2): -17.577908338073506,
 InventoryState(on_hand=1, on_order=0): -18.157146761681386,
 InventoryState(on_hand=1, on_order=1): -18.563091921139804,
 InventoryState(on_hand=0, on_order=1): -17.160240499371913,
 InventoryState(on_hand=2, on_order=0): -19.564362093419764}

In [30]:
# Value function reported by the MRP object itself (presumably the exact,
# model-based solution), repeated here as the reference for the TD estimates.
si_mrp.display_value_function(gamma=gamma)

{InventoryState(on_hand=0, on_order=2): -28.345,
 InventoryState(on_hand=2, on_order=0): -30.345,
 InventoryState(on_hand=1, on_order=1): -29.345,
 InventoryState(on_hand=1, on_order=0): -28.932,
 InventoryState(on_hand=0, on_order=1): -27.932,
 InventoryState(on_hand=0, on_order=0): -35.511}
