# Assignment 12

In [1]:
# import packages needed
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Iterable, Callable, Mapping, TypeVar, List
from rl.markov_process import TransitionStep
from rl.returns import returns
from rl.function_approx import Tabular, FunctionApprox
from rl.dynamic_programming import evaluate_mrp_result
from rl.iterate import last
from collections import defaultdict
from copy import deepcopy

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.monte_carlo import mc_prediction
from rl.td import td_prediction

from rl.distribution import Constant

In [2]:
S = TypeVar('S')

### Problem 1

In [3]:
# Tabular First
def tab_n_bootstrap(trans: Iterable[TransitionStep[S]],
                    n: int,
                    weight_func: Callable[[int],float],
                    gamma: float) -> List[Mapping[S,float]]:
    
    v: Mapping[S,float] = defaultdict(lambda:0)
    occurence: Mapping[S,int] = defaultdict(lambda:0)

    all_v:List[Mapping[S,float]] = []
    
    data:list = list(trans)
    
    for i in range(len(data)):
        
        if i+n < len(data):
            G_tn:float = data[i].reward - v[data[i].state]
            for j in range(1,n):
                G_tn += gamma**i*data[j+i].reward
            G_tn += gamma**n*v[data[n+i].state]
            occurence[data[i].state] += 1
            v[data[i].state] += weight_func(occurence[data[i].state])*G_tn
            all_v.append(v)

    return all_v

In [4]:
# Using Function Approx
def fa_n_bootstrap(trans: Iterable[TransitionStep[S]],
                   n: int,
                   approx: FunctionApprox[S],
                   gamma: float)-> List[FunctionApprox[S]]:
    
    data:list = list(trans)
        
    all_approx:List[FunctionApprox[S]] = [approx]
    
    for i in range(len(data)):
        
        if i+n < len(data):
            G_tn:float = data[i].reward
            for j in range(1,n):
                G_tn += gamma**i*data[j+i].reward
            G_tn += gamma**n*all_approx[-1](data[n+i].state)
            all_approx.append(all_approx[-1].update([(data[i].state,G_tn)]))

    return all_approx

In [5]:
# Test above correctness
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

start = InventoryState(on_hand = 0, on_order = 0)
sample = list(itertools.islice(si_mrp.simulate_reward(Constant(start)),100000))

In [6]:
res_1 = tab_n_bootstrap(sample, 3, lambda n: 1./n, user_gamma)[-1]

In [7]:
approx = Tabular()
res_2 = fa_n_bootstrap(sample,3,approx,user_gamma)[-1].values_map

In [8]:
for k in res_1:
    print(f"State {k}: %.4f vs. %.4f"%(res_1[k],res_2[k]))

State InventoryState(on_hand=0, on_order=0): -17.5345 vs. -17.5345
State InventoryState(on_hand=0, on_order=1): -10.0553 vs. -10.0553
State InventoryState(on_hand=0, on_order=2): -8.3230 vs. -8.3230
State InventoryState(on_hand=1, on_order=1): -9.3154 vs. -9.3154
State InventoryState(on_hand=1, on_order=0): -11.0223 vs. -11.0223
State InventoryState(on_hand=2, on_order=0): -10.3309 vs. -10.3309


### Problem 2

In [9]:
# Tabular First
def tab_td_lambda(trans: Iterable[TransitionStep[S]],
                  weight_func: Callable[[int],float],
                  gamma: float,
                  lambda_: float) -> List[Mapping[S,float]]:
    
    v: Mapping[S,float] = defaultdict(lambda:0)
    occurence: Mapping[S,int] = defaultdict(lambda:0)
    e: Mapping[S,float] = defaultdict(lambda:0)
    last_t: Mapping[S,int] = defaultdict(lambda:0)

    all_v:List[Mapping[S,float]] = []
    
    count = 0
    for st in trans:
        count += 1
        occurence[st.state] += 1
        e[st.state] = (gamma*lambda_)**(count - last_t[st.state])*e[st.state] + 1
        last_t[st.state] = count
        v[st.state] += weight_func(occurence[st.state])*(st.reward + gamma*v[st.next_state]
                                                         - v[st.state])*e[st.state]
        all_v.append(v)

    return all_v

In [10]:
# Using Function Approx
# TODO: Design for LinearApprox only
def fa_td_lambda(trans: Iterable[TransitionStep[S]],
                 n: int,
                 approx: FunctionApprox[S],
                 gamma: float):
        
    class wrapper_FA(FunctionApprox[S]):
        
        def __init__(self, approx: FunctionApprox[S]):
            self.inner_approx: FunctionApprox[S] = approx
                
            self.v: Mapping[S,float] = defaultdict(lambda:0)
            self.e: Mapping[S,float] = defaultdict(lambda:0)
        
        def representational_gradient(self, x_value: X) -> FunctionApprox[X]:
            return approx.representational_gradient(X)
        
        def evaluate(self, x_values_seq: Iterable[X]) -> np.ndarray:
            return approx.evaluate(x_values_seq)
        
        def update(self,xy_vals_seq: Iterable[Tuple[X, float]]) -> FunctionApprox[X]:
            gradient: np.ndarray = self.regularized_loss_gradient(xy_vals_seq)
            for k in self.e:
                self.e[k] = gamma*lambda_*self.e[k]+gradient(k)
            
        

    all_v:list[Mapping[S,float]] = []
    
    count = 0
    for st in trans:
        count += 1
        occurence[st.state] += 1
        e[st.state] = (gamma*lambda_)**(count - last_t[st.state])*e[st.state] + 1
        last_t[st.state] = count
        v[st.state] += weight_func(occurence[st.state])*(st.reward + gamma*v[st.next_state]
                                                         - v[st.state])*e[st.state]
        all_v.append(v)

    return all_v

Question:<br>
How to construct TD($\lambda$) for function approximation without creating a new FunctionApprox sub-class? If needed, how can we construct the sub-class without knowing what type the passed-in FunctionApprox is?

### Problem 4

In [11]:
# Test above correctness
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

start = InventoryState(on_hand = 0, on_order = 0)
sample = [list(itertools.islice(si_mrp.simulate_reward(Constant(start)),1000)) 
          for _ in range(100)]
sample2 = [x for sublist in sample for x in sublist]

In [12]:
approx = Tabular()
res_mc = [x.values_map for x in mc_prediction(sample,approx,user_gamma)][-1]
res_td = [x.values_map for x in td_prediction(sample2,approx,user_gamma)][-1]
res_tdl = tab_td_lambda(sample2, lambda n: 1./n, user_gamma,0.9)[-1]
res_dp = evaluate_mrp_result(si_mrp,user_gamma)

In [13]:
for k in res_mc:
    print(f"State {k}: %.4f vs. %.4f vs. %.4f vs. %.4f"%(
        res_dp[k],res_mc[k],res_td[k],res_tdl[k]))

State InventoryState(on_hand=0, on_order=0): -35.5105 vs. -35.5298 vs. -25.2566 vs. -29.8098
State InventoryState(on_hand=0, on_order=2): -28.3450 vs. -28.3826 vs. -18.1018 vs. -22.7377
State InventoryState(on_hand=1, on_order=0): -28.9322 vs. -28.9036 vs. -18.7014 vs. -23.5237
State InventoryState(on_hand=0, on_order=1): -27.9322 vs. -27.8913 vs. -17.6830 vs. -22.8020
State InventoryState(on_hand=1, on_order=1): -29.3450 vs. -29.3663 vs. -19.0849 vs. -23.7462
State InventoryState(on_hand=2, on_order=0): -30.3450 vs. -30.4458 vs. -20.1092 vs. -24.8970


Question:
1. Why do the value functions converge from larger numbers to smaller ones?
2. How to plot convergence?

Observation:
As $\lambda$ approaches 0, TD($\lambda$) approaches TD. As $\lambda$ approaches 1, TD($\lambda$) approaches MC. 