# Assignment 13

In [53]:
# import packages needed
import itertools
from collections import defaultdict
from copy import deepcopy
from typing import Iterable, Callable, Mapping, TypeVar, List, Tuple, Optional,Sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rl.markov_decision_process import TransitionStep, Policy,MarkovDecisionProcess
from rl.markov_decision_process import FiniteMarkovDecisionProcess,policy_from_q
from rl.returns import returns
from rl.function_approx import Tabular, FunctionApprox
from rl.function_approx import DNNSpec, AdamGradient, DNNApprox
from rl.dynamic_programming import policy_iteration_result
from rl.iterate import last
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap, InventoryState
from rl.chapter7.asset_alloc_discrete import AssetAllocDiscrete 
from rl.monte_carlo import mc_prediction, mc_control
from rl.td import td_prediction
from rl.distribution import Constant,Choose,Bernoulli,Distribution,Gaussian
from rl.distribution import SampledDistribution

In [54]:
# Generic type variables used in the annotations of this notebook:
# S parameterises state types and A parameterises action types.
S = TypeVar('S')
A = TypeVar('A')

### Problem 1

In [63]:
# Tabular first: consulted RL-book implementation
def policy_from_q_st(q: Mapping[Tuple[S,A],float],
                     mdp: MarkovDecisionProcess[S, A],
                     eps: float = 0.0) -> Policy[S, A]:
    """Build an epsilon-greedy policy on top of a tabular Q-value mapping.

    Args:
        q: table keyed by ``(state, action)``. The policy keeps a reference
            to it (no copy), so later in-place updates of the table are
            visible to the policy.
        mdp: process providing ``is_terminal`` and ``actions``.
        eps: probability of taking a uniformly random action.

    Returns:
        A policy whose ``act(s)`` yields ``None`` for terminal states,
        a uniform ``Choose`` over ``mdp.actions(s)`` when exploring, and
        otherwise a ``Constant`` on the action with the largest recorded
        Q-value for ``s``.

    Notes:
        * Only actions offered by ``mdp.actions(s)`` are considered, and the
          table is probed with ``in`` so that a ``defaultdict`` is not grown
          by mere look-ups.
        * If no Q-value has been recorded for ``s`` yet, the policy falls
          back to a uniform random action instead of returning
          ``Constant(None)``.
        * Ties are broken in favour of the first maximal action in the
          order returned by ``mdp.actions(s)``.
    """
    explore = Bernoulli(eps)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            # Decide exploration first, exactly one Bernoulli draw per call.
            exploring = explore.sample()

            # Materialise once: `actions` may hand back a one-shot iterable.
            available = list(mdp.actions(s))
            if exploring:
                return Choose(set(available))

            # Per-state lookup instead of scanning every key of the table.
            scored = [(q[(s, a)], a) for a in available if (s, a) in q]
            if not scored:
                # Nothing learned for this state yet: no greedy action exists.
                return Choose(set(available))

            _, greedy = max(scored, key=lambda pair: pair[0])
            return Constant(greedy)

    return QPolicy()

def tab_mc_control(mdp: MarkovDecisionProcess[S, A],
                   states: Distribution[S],
                   weight_func: Callable[[int],float],
                   gamma: float,
                   tol: float = 1e-6,
                   maxIter: int = 10000) -> List[Mapping[Tuple[S,A],float]]:
    """Tabular Monte-Carlo control with a decaying epsilon-greedy policy.

    Each iteration simulates one trace under the current policy, turns it
    into return steps with ``returns(trace, gamma, tol)`` and moves every
    visited ``(state, action)`` entry towards the observed return:

        Q <- (1 - w) * Q + w * return,   w = weight_func(visit_count)

    The exploration rate is 1 for the first episode and ``1 / (n + 2)``
    after iteration ``n`` (0-based).

    Args:
        mdp: process to simulate.
        states: distribution of start states.
        weight_func: maps the visit count (starting at 1) of a pair to
            the step size used for that update.
        gamma: discount factor forwarded to ``returns``.
        tol: tolerance forwarded to ``returns``.
        maxIter: number of simulated episodes.

    Returns:
        A list with ``maxIter + 1`` Q-tables: the initial (empty) table
        followed by a snapshot taken after each episode. Every element is
        an independent copy, so earlier entries are not altered by later
        updates.
    """
    curr_q: Mapping[Tuple[S,A],float] = defaultdict(float)
    occurence: Mapping[Tuple[S,A],int] = defaultdict(int)

    # Snapshots (copies) so the list is a real history rather than many
    # references to one dict that keeps being mutated.
    q: List[Mapping[Tuple[S,A],float]] = [curr_q.copy()]

    # The policy must be built from the Q-table itself, not from the history.
    p: Policy[S,A] = policy_from_q_st(curr_q, mdp, 1.)

    for n in range(maxIter):
        trace: Iterable[TransitionStep[S, A]] = mdp.simulate_actions(states, p)
        for st in returns(trace, gamma, tol):
            key = (st.state, st.action)
            occurence[key] += 1
            w = weight_func(occurence[key])  # evaluated once per update
            curr_q[key] = curr_q[key] * (1 - w) + w * st.return_
        q.append(curr_q.copy())
        p = policy_from_q_st(curr_q, mdp, 1./(n+2))

    return q

In [64]:
# Test correctness: small capped-inventory MDP used to check the
# tabular MC control result against policy iteration.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor shared by the MC control run and policy iteration.
user_gamma = 0.9

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

In [65]:
# Tabular MC control on the inventory MDP: start states are drawn uniformly
# from the non-terminal states and the step size is 1 / visit-count.
res_1 = tab_mc_control(
    mdp=si_mdp,
    states=Choose(si_mdp.non_terminal_states),
    weight_func=lambda visits: 1./visits,
    gamma=user_gamma,
)

In [66]:
# Reduce the final Q-table to a greedy policy and its value function:
# for each state keep the action with the largest Q-value.
# The running maximum starts at -inf (not a finite sentinel such as -100),
# so states whose Q-values are all very negative are still recorded.
opt_policy = defaultdict(lambda:-1)
opt_vf = defaultdict(lambda: float("-inf"))
for (state, action), q_value in res_1[-1].items():
    if opt_vf[state] < q_value:
        opt_vf[state] = q_value
        opt_policy[state] = action

In [67]:
# Reference solution from policy iteration on the same MDP and discount
# factor. The comparison cell reads res_2[0][state] as the value and
# res_2[1].policy_map[state].value as the action.
res_2 = policy_iteration_result(si_mdp,user_gamma)

In [68]:
# Side-by-side comparison, MC control (left) vs. policy iteration (right):
# first the state values, then the chosen actions.
for state, mc_value in opt_vf.items():
    print(f"State {state}: %.4f vs. %.4f"%(mc_value,res_2[0][state]))
for state, mc_action in opt_policy.items():
    print(f"State {state}: Action %d vs. %d"%(mc_action,res_2[1].policy_map[state].value))

State InventoryState(on_hand=1, on_order=0): -28.9119 vs. -28.6610
State InventoryState(on_hand=0, on_order=1): -27.9186 vs. -27.6610
State InventoryState(on_hand=0, on_order=0): -35.5073 vs. -34.8948
State InventoryState(on_hand=1, on_order=1): -29.3131 vs. -28.9919
State InventoryState(on_hand=0, on_order=2): -28.3397 vs. -27.9919
State InventoryState(on_hand=2, on_order=0): -30.3331 vs. -29.9919
State InventoryState(on_hand=1, on_order=0): Action 1 vs. 1
State InventoryState(on_hand=0, on_order=1): Action 1 vs. 1
State InventoryState(on_hand=0, on_order=0): Action 2 vs. 1
State InventoryState(on_hand=1, on_order=1): Action 0 vs. 0
State InventoryState(on_hand=0, on_order=2): Action 0 vs. 0
State InventoryState(on_hand=2, on_order=0): Action 0 vs. 0


In [69]:
# Display the last Q-table in the list returned by tab_mc_control.
res_1[-1]

defaultdict(<function __main__.tab_mc_control.<locals>.<lambda>()>,
            {(InventoryState(on_hand=1, on_order=0), 0): -35.994149875802826,
             (InventoryState(on_hand=1, on_order=0), 1): -28.91185152063077,
             (InventoryState(on_hand=0, on_order=1), 0): -34.94270166698971,
             (InventoryState(on_hand=0, on_order=0), 1): -40.63495526273648,
             (InventoryState(on_hand=0, on_order=1), 1): -27.918579723898635,
             (InventoryState(on_hand=1, on_order=1), 0): -29.313114149099746,
             (InventoryState(on_hand=0, on_order=0), 0): -45.95170888025482,
             (InventoryState(on_hand=0, on_order=0), 2): -35.50732172722377,
             (InventoryState(on_hand=0, on_order=2), 0): -28.33967608248426,
             (InventoryState(on_hand=2, on_order=0), 0): -30.33308220164507})

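**Question:**<br>
The MC-control result agrees with policy iteration in every state except `InventoryState(on_hand=0, on_order=0)`, where it selects action 2 instead of action 1. Why?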

In [46]:
# Function Approx: consulted RL-book implementation
# TODO: design for AssetAllocDiscrete only!
def mc_control(mdp: MarkovDecisionProcess[S, A],
               states: Distribution[S],
               approx_0: FunctionApprox[Tuple[S, A]],
               gamma: float,
               tol: float = 1e-6,
               maxIter: int = 1000) -> List[FunctionApprox[Tuple[S, A]]]:
    """Monte-Carlo control with a function approximation of the Q-values.

    NOTE: this definition shadows the ``mc_control`` imported from
    ``rl.monte_carlo`` at the top of the notebook.

    Each iteration simulates one trace under the current policy and feeds
    all ``((state, action), return)`` pairs produced by
    ``returns(trace, gamma, tol)`` to ``curr_q.update``. The exploration
    rate passed to ``policy_from_q`` is 1 for the first episode and
    ``1 / (n + 2)`` after iteration ``n`` (0-based).

    Args:
        mdp: process to simulate.
        states: distribution of start states.
        approx_0: initial Q-value approximation.
        gamma: discount factor forwarded to ``returns``.
        tol: tolerance forwarded to ``returns``.
        maxIter: number of simulated episodes.

    Returns:
        ``[approx_0, q_1, ..., q_maxIter]`` where ``q_i`` is the value
        returned by ``update`` after episode ``i``.
    """
    curr_q: FunctionApprox[Tuple[S, A]] = approx_0
    q: List[FunctionApprox[Tuple[S, A]]] = [curr_q]

    # Build the policy from the approximation itself, not from the history list.
    p: Policy[S,A] = policy_from_q(curr_q, mdp, 1.)

    for n in range(maxIter):
        # The whole trace goes to `returns`; no debug `next(trace)` that
        # would silently drop the first transition of every episode.
        trace: Iterable[TransitionStep[S, A]] = mdp.simulate_actions(states, p)
        curr_q = curr_q.update(
            ((st.state, st.action), st.return_)
            for st in returns(trace, gamma, tol)
        )
        q.append(curr_q)
        # `policy_from_q` is imported by name; the module object
        # `markov_decision_process` is never bound in this notebook.
        p = policy_from_q(curr_q, mdp, 1/(n+2.))

    return q

In [52]:
# --- Asset-allocation problem parameters ---
# Number of time steps (one risky-return distribution and one riskless
# rate are created per step below).
steps: int = 4
# Mean and standard deviation passed to every risky-return Gaussian.
μ: float = 0.13
σ: float = 0.2
# Riskless return used at every step.
r: float = 0.07
# Coefficient of the exponential utility defined below.
a: float = 1.0
init_wealth: float = 1.0
# NOTE(review): init_wealth_var is not referenced anywhere in the visible
# cells; the initial wealth distribution below is a Constant.
init_wealth_var: float = 0.1

# Reference allocation excess / (a * var); the action grid is centred on it.
excess: float = μ - r
var: float = σ * σ
base_alloc: float = excess / (a * var)

risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _ in range(steps)]
riskless_ret: Sequence[float] = [r for _ in range(steps)]
# Exponential utility: U(x) = -exp(-a * x) / a.
utility_function: Callable[[float], float] = lambda x: - np.exp(-a * x) / a
# 11 evenly spaced allocation choices between 2/3 and 4/3 of base_alloc.
alloc_choices: Sequence[float] = np.linspace(
    2 / 3 * base_alloc,
    4 / 3 * base_alloc,
    11
)
# Features of a pair w_x: constant, w_x[0], w_x[1] and w_x[1] squared.
# presumably w_x is (wealth, allocation) — TODO confirm against AssetAllocDiscrete
feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = \
    [
        lambda _: 1.,
        lambda w_x: w_x[0],
        lambda w_x: w_x[1],
        lambda w_x: w_x[1] * w_x[1]
    ]
# No hidden layers (neurons=[]) and no bias; the output activation is
# -sign(a) * exp(-x), and its derivative is expressed via the output y as -y.
dnn: DNNSpec = DNNSpec(
    neurons=[],
    bias=False,
    hidden_activation=lambda x: x,
    hidden_activation_deriv=lambda y: np.ones_like(y),
    output_activation=lambda x: - np.sign(a) * np.exp(-x),
    output_activation_deriv=lambda y: -y
)
# Deterministic initial wealth (a Constant, not a Gaussian).
init_wealth_distr: Distribution[float] = Constant(init_wealth)

aad: AssetAllocDiscrete = AssetAllocDiscrete(
    risky_return_distributions=risky_ret,
    riskless_returns=riskless_ret,
    utility_func=utility_function,
    risky_alloc_choices=alloc_choices,
    feature_functions=feature_funcs,
    dnn_spec=dnn,
    initial_wealth_distribution=init_wealth_distr
)

**TODO:** More to be done here.

In [48]:
class AssetAllocMDPwrap(MarkovDecisionProcess[Tuple[float,int], float]):
    """Expose an ``AssetAllocDiscrete`` problem as one time-indexed MDP.

    States are ``(wealth, time)`` tuples and actions are the amounts
    allocated to the risky asset, taken from ``risky_alloc_choices``.
    """

    def __init__(self,assetalloc: AssetAllocDiscrete):
        # Kept because `step` reads the per-step return distributions
        # and riskless rates from it.
        self.assetalloc: AssetAllocDiscrete = assetalloc
        self.alloc_choices: Sequence[float] = assetalloc.risky_alloc_choices
        self.steps: int = assetalloc.time_steps()
        self.utility_f: Callable[[float], float] = assetalloc.utility_func

    def step(self,state: Tuple[float,int],action: float
            ) -> Optional[Distribution[Tuple[Tuple[float,int], float]]]:
        """Return the distribution of ``(next_state, reward)`` pairs.

        Args:
            state: ``(wealth, time)``.
            action: amount of wealth put into the risky asset.

        Returns:
            ``None`` once ``time`` has reached the horizon (no further
            transition exists); otherwise a ``SampledDistribution`` whose
            samples are ``((next_wealth, time + 1), reward)``. The reward
            is the utility of ``next_wealth`` on the last step and 0
            before that.
        """
        wealth, time_idx = state

        # Past the horizon there is no return distribution to index.
        if time_idx >= self.steps:
            return None

        distr: Distribution[float] = \
            self.assetalloc.risky_return_distributions[time_idx]
        rate: float = self.assetalloc.riskless_returns[time_idx]

        def sr_sampler_func() -> Tuple[Tuple[float,int], float]:
            next_wealth: float = action * (1 + distr.sample()) \
                + (wealth - action) * (1 + rate)
            reward: float = self.utility_f(next_wealth) \
                if time_idx == self.steps - 1 else 0.
            return ((next_wealth, time_idx + 1), reward)

        # Returned from `step` itself (not from inside the sampler).
        return SampledDistribution(
            sampler=sr_sampler_func,
            expectation_samples=1000
        )

    def actions(self, wealth: Tuple[float,int]) -> Sequence[float]:
        """Return the allocation choices, which do not depend on the state.

        The argument is the full ``(wealth, time)`` state; the parameter
        name is kept for backward compatibility.
        """
        return self.alloc_choices

In [49]:
# Wrap the AssetAllocDiscrete instance `aad` in the MDP interface.
aad_wrapped = AssetAllocMDPwrap(aad)