# Stanford CME 241 (Winter 2021) - Assignment 14

### LSTD Algorithm

# In [8]:
import numpy as np 

class LSTD:
    """Least-squares temporal-difference estimator with an eligibility trace.

    The estimator accumulates a matrix ``A`` and a vector ``b`` from observed
    transitions; the weight vector is obtained on demand from those two
    accumulators via the pseudo-inverse of ``A``.
    """

    def __init__(self, n, epsilon=0):
        """Create an estimator for feature vectors of length ``n``.

        ``epsilon`` scales the identity matrix used as the initial ``A``.
        """
        self.n = n
        self.reset(epsilon)

    def reset(self, epsilon=0):
        """Clear the trace and both accumulators.

        Note that ``epsilon`` defaults to 0 here, independently of the value
        that was given to the constructor.
        """
        dim = self.n
        self.z = np.zeros(dim)           # eligibility trace, length n
        self.A = epsilon * np.eye(dim)   # (n, n) accumulator
        self.b = np.zeros(dim)           # length-n accumulator

    def theta(self):
        """Return the weights ``pinv(A) . b`` for the statistics seen so far."""
        a_pinv = np.linalg.pinv(self.A)
        return a_pinv.dot(self.b)

    def update(self, x, reward, xp, gm, gm_p, lm):
        """Fold one transition into the trace, ``A`` and ``b``.

        x      -- feature vector observed at the current timestep
        reward -- reward received on the transition
        xp     -- feature vector observed at the next timestep
        gm     -- discount factor for the current state
        gm_p   -- discount factor for the next state
        lm     -- bootstrapping (lambda) parameter
        """
        # Decay the old trace and add the current features; the result is a
        # new array that replaces self.z.
        trace = gm * lm * self.z + x
        self.z = trace

        # A and b are updated in place with the refreshed trace.
        feature_diff = x - gm_p * xp
        self.A += np.outer(trace, feature_diff)
        self.b += trace * reward

### LSPI Algorithm

# In [14]:
import numpy as np

class LSPI:
    """Least-squares policy iteration on a fixed batch of sampled transitions.

    A batch of ``(s, a, r, sp)`` transitions is collected once with randomly
    sampled actions.  The weight vector of a linear Q-function over
    ``basis_functions`` is then re-fitted with LSTDQ, each time using the
    policy that is greedy w.r.t. the previous weights, until two successive
    weight vectors are closer than ``epsilon``.
    """

    # Candidate actions the greedy policy chooses between.
    # NOTE(review): hard-coded to a two-action problem; confirm this matches
    # the environment the class is used with.
    _ACTIONS = (0, 1)

    # Ridge term added to the LSTDQ matrix before solving.
    _REGULARISATION = 0.01

    def __init__(self, basis_functions, gamma, epsilon, w, env, n_trial_samples, n_timestep_samples):
        """Store the configuration used by :meth:`LSPI_func`.

        basis_functions    -- sequence of callables ``f(s, a)``, one per feature
        gamma              -- discount factor
        epsilon            -- convergence threshold on the weight change
        w                  -- initial weight vector (one entry per basis function)
        env                -- environment object used to draw samples
        n_trial_samples    -- number of episodes to sample
        n_timestep_samples -- number of steps sampled per episode
        """
        self.basis_functions = basis_functions
        self.gamma = gamma
        self.epsilon = epsilon
        self.w = w
        self.env = env
        self.n_trial_samples: int = n_trial_samples
        self.n_timestep_samples: int = n_timestep_samples

    def LSPI_func(self, basis_functions, gamma, epsilon, w, env, n_trial_samples=1000, n_timestep_samples=20):
        """Run policy iteration and return ``(w, w0)``.

        The arguments are accepted for backward compatibility only; as before,
        the values stored on the instance by ``__init__`` are the ones used.

        Returns the final weight vector and the list of first weight
        components recorded after every non-converged iteration (each of
        those is also printed).
        """
        samples = self.generate_samples(self.env, self.n_trial_samples, self.n_timestep_samples)
        w_current = np.asarray(self.w, dtype=float)
        w0 = []
        while True:
            w_new = self.LSTDQ(samples, self.basis_functions, self.gamma, w_current, self.env)
            if self.converged(w_new, w_current, self.epsilon):
                break
            w0.append(w_new[0])
            print(w_new[0])
            # Feed the new weights into the next evaluation; without this the
            # greedy policy never changes and the loop cannot make progress.
            w_current = w_new
        return w_new, w0

    def converged(self, w, w_prev, epsilon):
        """Return True when the Euclidean distance between ``w`` and ``w_prev`` is below ``epsilon``."""
        return np.linalg.norm(np.asarray(w) - np.asarray(w_prev)) < epsilon

    @staticmethod
    def _features(s, a, basis_functions):
        """Evaluate every basis function at ``(s, a)`` and return the values as an array."""
        return np.array([basis(s, a) for basis in basis_functions])

    def LSTDQ(self, samples, basis_functions, gamma, w, env):
        """Fit Q-function weights for the policy that is greedy w.r.t. ``w``.

        samples         -- iterable of ``(s, a, r, sp)`` transitions
        basis_functions -- sequence of callables ``f(s, a)``
        gamma           -- discount factor
        w               -- weights defining the policy being evaluated
        env             -- forwarded to :meth:`get_policy_action`

        Returns the weight vector solving ``A w = b`` for the accumulated
        statistics.
        """
        k = len(basis_functions)
        # Start from a small multiple of the identity so that A is less likely
        # to be singular when few samples are available.
        A = np.identity(k) * self._REGULARISATION
        b = np.zeros(k)
        for s, a, r, sp in samples:
            phi = self._features(s, a, basis_functions)
            a_p = self.get_policy_action(sp, w, basis_functions, env)
            phi_p = self._features(sp, a_p, basis_functions)
            A += np.outer(phi, phi - gamma * phi_p)
            b += phi * r
        # Solve the linear system directly rather than forming inv(A).
        return np.linalg.solve(A, b)

    def get_policy_action(self, s, w, basis_functions, env):
        """Return the action with the highest linear Q-value ``phi(s, a) . w``.

        Ties go to the action listed first.  ``env`` is currently unused.
        """
        return max(
            self._ACTIONS,
            key=lambda a: np.dot(self._features(s, a, basis_functions), w),
        )

    def generate_samples(self, env, n_samples, n_steps=100):
        """Collect transitions by taking randomly sampled actions.

        Runs ``n_samples`` episodes of ``n_steps`` steps each and returns an
        object array with one ``(s, a, r, sp)`` row per step.

        NOTE(review): stepping continues after the environment reports the
        episode as finished, and ``env.step`` is assumed to return a 4-tuple;
        confirm both against the environment API in use.
        """
        samples = []
        for _ in range(n_samples):
            env.reset()
            for _ in range(n_steps):
                # The pre-step state is read from the wrapped environment.
                s = env.env.state
                a = env.action_space.sample()
                sp, r, _done, _info = env.step(a)
                samples.append((s, a, r, sp))
        # Each row mixes arrays and scalars, so an explicit object dtype is
        # required; recent NumPy versions refuse to infer it.
        return np.array(samples, dtype=object)