In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

# custom libraries 
from envs import BasicGrid
from functools import partial
from utils import * 
from agents import Pi1Agent, LambdaR
from runners import * 
from tqdm import tqdm

# Hex colors of matplotlib's active property cycle, used as colormap anchors
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Pair every cycle color with an evenly spaced position in [0, 1]
# (first color at 0, last color at 1)
segments = [
    (position / (len(colors) - 1), shade)
    for position, shade in enumerate(colors)
]

# Continuous colormap that interpolates linearly between the anchor colors
cmap = LinearSegmentedColormap.from_list('my_colormap', segments)

%load_ext autoreload
%autoreload 2

In [None]:
# dynamic programming
def compute_LF(LR, Phi, P, pi, lambda_, gamma, max_iters, q_true=None, r=None, tol=1e-2):
    """Run in-place dynamic-programming sweeps over ``LR``.

    In each sweep, for every pair ``(s, a)``, ``LR[s, a, :]`` is replaced by::

        sum_{s', a'} pi[s', a'] * P[s, a, s'] * (
            Phi[s] * (1 + lambda_ * gamma * LR[s', a', :])
            + gamma * (1 - Phi[s]) * LR[s', a', :])

    Entries are overwritten as soon as they are computed, so pairs visited
    later in the same sweep already read the refreshed values.

    Parameters
    ----------
    LR : ndarray, shape (S, A, D)
        Initial table. It is modified in place and also returned.
    Phi : ndarray, shape (S, D)
        Per-state feature vectors; row ``s`` is used for every action in ``s``.
    P : ndarray, shape (S, A, S)
        ``P[s, a, s']`` weights, read as transition probabilities.
    pi : ndarray, shape (S, A)
        ``pi[s', a']`` weight given to action ``a'`` in next state ``s'``.
    lambda_, gamma : float
        Scalar coefficients of the update above.
    max_iters : int
        Maximum number of full sweeps.
    q_true, r : optional
        When both are given, ``mean((q_true - (LR @ r).flatten()) ** 2)`` is
        recorded after every sweep.
    tol : float
        Sweeping stops once the largest absolute change of a sweep is below
        ``tol``; a negative value therefore disables early stopping.

    Returns
    -------
    (LR, deltas, mses)
        The updated table, the per-sweep largest absolute change, and the
        per-sweep errors (empty unless ``q_true`` and ``r`` are given).
    """
    S, A, _ = P.shape
    # Only the leading (S, A) part is ever indexed by the update.
    pi_sa = np.asarray(pi)[:S, :A]
    deltas, mses = [], []
    for _ in tqdm(range(max_iters)):
        delta = 0.0
        for s in range(S):
            feat = Phi[s]
            for a in range(A):
                # Joint weight of reaching s' and then picking a' there.
                weights = P[s, a, :S][:, None] * pi_sa
                total_weight = weights.sum()
                # Weighted sum of LR[s', a', :] over all (s', a'); this replaces
                # the explicit Python double loop with one contraction.
                expected_next = np.tensordot(weights, LR[:S, :A], axes=2)
                lr = (feat * (total_weight + lambda_ * gamma * expected_next)
                      + gamma * (1 - feat) * expected_next)
                delta = max(delta, np.max(np.abs(LR[s, a, :] - lr)))
                LR[s, a, :] = lr
        deltas.append(delta)
        if q_true is not None and r is not None:
            q_pred = LR @ r
            mses.append(np.mean((q_true - q_pred.flatten())**2))
        if delta < tol:
            break
    return LR, deltas, mses

In [None]:
# Grid environment built with lambda_=0.5. `policy` is the `q` attribute of a
# Pi1Agent; the DP cells pass it to compute_LF as `pi`, where it is indexed
# as pi[state, action].
env = BasicGrid(lambda_=0.5)
policy = Pi1Agent(env._number_of_states, env.get_obs(s=env._start_state)).q

In [None]:
# Reference Q-values: sweep compute_LF for `policy` on one-hot state features
# with the environment's own lambda_, then project onto env.r
# (presumably the reward vector -- confirm against BasicGrid).
LR = np.zeros((env._number_of_states, env._number_of_actions, env._number_of_states))
LR, deltas, _ = compute_LF(
    LR,
    np.eye(env._number_of_states),
    env.P,
    policy,
    env.lambda_,
    env.discount,
    100,
    tol=5e-2,
)
q_opt = np.matmul(LR, env.r)
# Per-state maximum over actions, taken before q_opt is flattened
v_opt = q_opt.max(axis=1)
q_opt = q_opt.flatten()


In [None]:
# Run exactly 10 sweeps per lambda, each from a zero table. compute_LF's
# per-sweep delta is never negative, so tol=-5 disables its early stopping.
lambdas = [0.0, 0.5, 1.0]
dp_results = dict()
for i, lambda_ in enumerate(lambdas):
    LR = np.zeros((env._number_of_states, env._number_of_actions, env._number_of_states))
    LR, deltas, mses = compute_LF(
        LR,
        np.eye(env._number_of_states),
        env.P,
        policy,
        lambda_,
        env.discount,
        10,
        q_true=q_opt,
        r=env.r,
        tol=-5,
    )
    # Keep the final table and both per-sweep traces for this lambda
    dp_results[lambda_] = dict(LR=LR, deltas=deltas, mses=mses)

In [None]:
# tabular TD learning
# Re-create the environment and rebind `policy` to the Pi1Agent object itself;
# the TD loop reads its `.q` attribute.
# NOTE(review): the DP cells bound `policy` to the agent's `.q` array instead,
# so re-running them after this cell would pass an agent object to compute_LF.
env = BasicGrid(lambda_=0.5)
policy = Pi1Agent(env._number_of_states, env.get_obs(s=env._start_state))

In [None]:
# For each lambda, repeat the TD experiment n_repeats times and stack the
# recorded 'lambdaR_hist' entries into td_LRs_all[lambda_] (one row per repeat).
Pi = [policy]
td_LRs_all = {}
lambdas = [0.5, 1.0]
# multiple start states to get DR
start_states = [7, 8, 9, 10, 13, 14, 16, 19, 20, 21, 22, 25, 26, 27, 28]
n_repeats = 3
for lambda_ in lambdas:
    td_LRs_all[lambda_] = []
    # `repeat` is both the experiment index and the RNG seed. It has its own
    # name so the loop over policies below cannot overwrite it.
    for repeat in range(n_repeats):
        print(f"\nlambda {lambda_}, expt ", repeat)
        np.random.seed(repeat)
        LRs = []
        for pi in Pi:
            # A fresh agent per policy; it keeps learning across all the
            # episodes and start states below.
            lr_agent = LambdaR(env._layout.size, 4, env.get_obs(),
                               policy=partial(epsilon_greedy, epsilon=0.2), q=pi.q,
                               step_size=0.1, sa=True, lambda_=lambda_)
            for _ in range(100):
                for start in start_states:
                    lr_agent._state = start
                    # NOTE(review): the grid's lambda_ is fixed at 0.5 while the
                    # agent uses the loop's lambda_ -- confirm this is intended.
                    grid = BasicGrid(start_state=start, discount=env.discount, lambda_=0.5)
                    results = run_experiment_episodic(grid, lr_agent, 1, display_eps=1, respect_done=False)

                    LRs += results['lambdaR_hist']

        td_LRs_all[lambda_].append(LRs)

    td_LRs_all[lambda_] = np.stack(td_LRs_all[lambda_])