# Assignment 14

In [4]:
# import packages needed
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import (Iterable, Callable, Mapping, TypeVar, 
                    List, Tuple, Optional,Sequence, Iterator)
from rl.markov_decision_process import Policy,MarkovDecisionProcess
from rl.markov_decision_process import FiniteMarkovDecisionProcess,policy_from_q
from rl.markov_decision_process import TransitionStep as TransitionStepA
from rl.markov_process import TransitionStep
from rl.returns import returns
from rl.function_approx import Tabular, FunctionApprox
from rl.function_approx import DNNSpec, AdamGradient, DNNApprox
from rl.dynamic_programming import policy_iteration_result
from rl.iterate import last
from collections import defaultdict
from copy import deepcopy
from dataclasses import replace

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite, InventoryState
from rl.chapter7.asset_alloc_discrete import AssetAllocDiscrete 
from rl.monte_carlo import mc_prediction, mc_control
from rl.td import td_prediction

from rl.distribution import Constant,Choose,Bernoulli,Distribution,Gaussian

In [6]:
# Generic type variables used in the signatures below:
#   S - type of a state, A - type of an action.
# The string passed to TypeVar must match the name it is bound to
# (type checkers reject a mismatch, and it is what shows up in reprs).
S = TypeVar('S')
A = TypeVar('A')

## Problem 1

In [8]:
def LSTD(trans: Iterable[TransitionStep[S]],
         feature_functions: Sequence[Callable[[S], float]],
         gamma: float)-> Mapping[S,float]:
    """Least-Squares Temporal Difference prediction with linear features.

    Accumulates, over every transition (s, r, s') in `trans`,

        A += phi(s) (phi(s) - gamma * phi(s'))^T
        b += phi(s) * r

    then solves A w = b and evaluates phi(s) . w for each state that
    appeared as `state` in `trans`.

    Args:
        trans: transitions exposing `.state`, `.next_state` and `.reward`.
            Consumed in a single pass, so a one-shot iterator is fine.
        feature_functions: the m feature functions; phi(s) is the vector of
            their values at s.
        gamma: discount factor.

    Returns:
        A dict mapping each visited state (in first-seen order) to its
        estimated value as a plain float. Empty if `trans` is empty.

    Raises:
        numpy.linalg.LinAlgError: if the accumulated matrix A is singular
            (e.g. too few / linearly dependent samples).
    """
    m: int = len(feature_functions)
    a_mat: np.ndarray = np.zeros((m, m))
    b_vec: np.ndarray = np.zeros(m)
    # Feature vector of every visited state, kept so it is not recomputed
    # when the value function is evaluated at the end.
    phi_by_state = {}

    def phi(state):
        return np.array([f(state) for f in feature_functions], dtype=float)

    for step in trans:
        phi_s = phi(step.state)
        phi_next = phi(step.next_state)
        a_mat += np.outer(phi_s, phi_s - gamma * phi_next)
        b_vec += phi_s * step.reward
        phi_by_state[step.state] = phi_s

    # Nothing observed: there is no state to evaluate (and A would be singular).
    if not phi_by_state:
        return {}

    # Solve the linear system directly instead of forming an explicit inverse.
    w_star: np.ndarray = np.linalg.solve(a_mat, b_vec)

    # Key by the state itself and return scalars rather than 1x1 arrays.
    return {state: float(phi_s @ w_star)
            for state, phi_s in phi_by_state.items()}

## Problem 2

In [9]:
def LSPI(trans: Iterable[TransitionStepA[S,A]],
         feature_functions: Sequence[Callable[[S,A], float]],
         gamma: float,
         epsilon: float = 1e-5)-> Mapping[S,Mapping[A,float]]:
    """Single-pass least-squares Q-value estimation with a greedy target.

    For every transition (s, a, r, s') the running inverse of

        A = epsilon * I + sum  phi(s, a) (phi(s, a) - gamma * phi(s', a*))^T

    is updated with the Sherman-Morrison formula, where a* is the action
    that is greedy at s' under the current weights, chosen among the
    actions already observed in s'. b accumulates phi(s, a) * r and the
    weights are w = A^{-1} b.

    NOTE(review): this is an online, one-sweep variant. Textbook LSPI
    re-runs the least-squares solve over the whole batch until the greedy
    policy stops changing - confirm which behavior the assignment expects.

    Args:
        trans: transitions exposing `.state`, `.action`, `.reward` and
            `.next_state`. Consumed in a single pass.
        feature_functions: the m feature functions of (state, action).
            Their values are cached per (state, action) pair, so they are
            assumed to be deterministic.
        gamma: discount factor.
        epsilon: positive regularizer; the inverse starts as I / epsilon so
            that it is well defined before any data has been seen.

    Returns:
        Nested dict `q[state][action] -> float` covering every
        (state, action) pair that appeared in `trans`. Empty if `trans`
        is empty.

    Raises:
        ValueError: if `epsilon` is not strictly positive.
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be strictly positive")

    m: int = len(feature_functions)
    a_inv: np.ndarray = np.eye(m) / epsilon
    b_vec: np.ndarray = np.zeros(m)
    weights: np.ndarray = np.zeros(m)
    # state -> {action -> feature vector} for every pair seen so far.
    phi = {}

    for step in trans:
        seen_here = phi.setdefault(step.state, {})
        if step.action not in seen_here:
            seen_here[step.action] = np.array(
                [f(step.state, step.action) for f in feature_functions],
                dtype=float
            )
        u = seen_here[step.action]

        next_actions = phi.get(step.next_state)
        if next_actions:
            # Greedy action under the current weights; on ties max() keeps
            # the action that was observed first.
            best_act = max(next_actions,
                           key=lambda a: float(next_actions[a] @ weights))
            diff = u - gamma * next_actions[best_act]
        else:
            # No action known for the next state yet: its Q-value is taken
            # as 0, i.e. the next feature vector is treated as zero.
            diff = u

        # Sherman-Morrison rank-one update for A <- A + u diff^T.
        a_inv_u = a_inv @ u
        a_inv = a_inv - np.outer(a_inv_u, diff @ a_inv) / (1.0 + diff @ a_inv_u)
        b_vec = b_vec + u * step.reward
        weights = a_inv @ b_vec

    # Evaluate every observed pair with its own feature vector.
    return {state: {action: float(vec @ weights)
                    for action, vec in by_action.items()}
            for state, by_action in phi.items()}

## Problem 3