In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import tqdm
from stable_baselines3 import PPO

In [3]:
model_PPO = PPO.load("ppo_cartpole")


In [24]:
n_samples = 1000
feature_dim = 10  # Example feature dimension
repeat = 1
gamma = 0.9

In [6]:
env = gym.make('CartPole-v1')
env.reset()


(array([ 0.01502095,  0.03861119, -0.04368586,  0.01309333], dtype=float32),
 {})

In [5]:
# model = PPO('MlpPolicy', env, verbose=1)
# model.learn(total_timesteps=100000)
# model.save("ppo_cartpole")

In [12]:
obs, _ = env.reset()
obs
action = model_PPO.predict(obs)[0]
action

array(0)

In [13]:
observation, reward, terminated, truncated, info = env.step(action)


In [None]:
def policy_unif(s):
  a = env.action_space.sample()
  return a

def policy_PPO(s):
  action = model_PPO.predict(s)[0]
  return action

def rbf_random_fourier_features(state, action, feature_dim = feature_dim, length_scale=1.0):
    np.random.seed(0)
    state_array = np.array(state[0], dtype=np.float32).reshape(-1)
    action_array = np.array([float(action)])
    state_action = np.concatenate((state_array, action_array))
    dim = state_action.shape[0]
    
    # Handle even/odd feature dimensions
    if feature_dim % 2 == 0:
        d_cos = d_sin = feature_dim // 2
    else:
        d_cos = (feature_dim + 1) // 2
        d_sin = (feature_dim - 1) // 2
    
    omega = np.random.normal(scale=1.0/length_scale, size=(dim, d_cos))
    bias = np.random.uniform(0, 2 * np.pi, size=d_cos)
    z = state_action @ omega + bias
    cos_features = np.cos(z)
    sin_features = np.sin(z[:d_sin]) if d_sin > 0 else np.array([])
    feature = np.sqrt(1.0 / feature_dim) * np.concatenate([cos_features, sin_features])
    return feature

def collect_trajectory(policy, feature_dim):
    s0, _ = env.reset()
    traj_list = [s0]
    while True:
        a0 = policy(s0)
        phi_sa = rbf_random_fourier_features(s0, a0, feature_dim)
        traj_list.append(phi_sa)
        s1, r0,  terminated, truncated, _ = env.step(a0)
        traj_list.append(r0)
        traj_list.append(s1)
        s0 = s1
        if terminated or truncated:
            break
    # print(len(traj_list))
    return traj_list[:-1]  # removing the terminal state

def collect_data(n, policy, feature_dim=feature_dim):
    data = []
    while len(data) < n:
        trajectory = collect_trajectory(policy, feature_dim)
        i = 0
        while i < len(trajectory)-3:
            state = trajectory[i]
            action = policy(state)
            phi_sa = rbf_random_fourier_features(state, action, feature_dim)
            reward = trajectory[i+2]
            next_state = trajectory[i+3]
            next_action = policy(next_state)
            phi_sa_prime = rbf_random_fourier_features(next_state, next_action, feature_dim)
            
            data.append((phi_sa, reward, phi_sa_prime))
            i += 3
            if len(data) >= n:
                break

    return data[:n]  # Return exactly n samples as a single array

def Q(state, action, theta,feature_dim=feature_dim):
    phi_sa = rbf_random_fourier_features(state, action, feature_dim)
    return np.dot(theta, phi_sa)

def policy_eval_LSTD(theta_init,data, feature_dim=feature_dim, alpha=0.01):
    '''Use TD(0) which converges to the solution of LSTD'''
    theta_lstd = np.copy(theta_init)
    for phi_sa, reward, phi_sa_prime in data:
        Q_sa = np.dot(theta_lstd, phi_sa)
        Q_sa_prime = np.dot(theta_lstd, phi_sa_prime)
        td_error = reward + gamma * Q_sa_prime - Q_sa
        theta_lstd += alpha * td_error * phi_sa
    
    # def Q(state, action):
    #     phi_sa = rbf_random_fourier_features(state, action, feature_dim)
    #     return np.dot(theta_lstd, phi_sa)
    
    return theta_lstd

def policy_eval_BRM(theta_init, data,  feature_dim=feature_dim, learning_rate=0.1):
    theta_BRM = np.copy(theta_init)
    for phi_sa, reward, phi_sa_prime in data:
        x_sa = phi_sa - gamma * phi_sa_prime
        gradient = -2 * (reward - np.dot(x_sa, theta_BRM)) * x_sa
        theta_BRM -= learning_rate * gradient
        
    # def Q(state, action):
    #     phi_sa = rbf_random_fourier_features(state, action, feature_dim)
    #     return np.dot(theta_BRM, phi_sa)
    
    return theta_BRM

In [30]:
offline_data = collect_data(n_samples, policy_PPO, feature_dim)


In [31]:
state, _ = env.reset()
action = env.action_space.sample()
print(state, action)

[-0.01382144  0.01666469  0.04897724  0.01250008] 1


In [None]:
theta_init = np.zeros(feature_dim)
Q_lstd = policy_eval_LSTD(theta_init, offline_data)
Q_BRM = policy_eval_BRM(theta_init, offline_data)
estimated_value_lstd_gradient = Q_lstd(state, action)
estimated_value_BRM = Q_BRM(state, action)


3.231422191006378 5.327932805150623


In [59]:
def index_to_state_action(i, n_grid_points=5):
    """
    Maps index i (0 to 2*n_grid_points^4-1) to a state-action pair
    
    Returns:
    - state: np.array of shape (4,)
    - action: int (0 or 1)
    """
    # State bounds
    state_bounds = [
        [-4.8, 4.8],     # cart position
        [-10.0, 10.0],   # cart velocity
        [-0.418, 0.418], # pole angle
        [-10.0, 10.0]    # pole angular velocity
    ]
    
    # Total states per dimension
    n_states = n_grid_points**4
    
    # Determine action (0 for first half indices, 1 for second half)
    action = 1 if i >= n_states else 0
    
    # Get state index (map back to state space)
    state_idx = i % n_states
    
    # Convert to grid coordinates
    idx_4 = state_idx % n_grid_points
    idx_3 = (state_idx // n_grid_points) % n_grid_points
    idx_2 = (state_idx // (n_grid_points**2)) % n_grid_points
    idx_1 = state_idx // (n_grid_points**3)
    
    # Convert grid coordinates to actual state values
    state = np.array([
        np.linspace(state_bounds[0][0], state_bounds[0][1], n_grid_points)[idx_1],
        np.linspace(state_bounds[1][0], state_bounds[1][1], n_grid_points)[idx_2],
        np.linspace(state_bounds[2][0], state_bounds[2][1], n_grid_points)[idx_3],
        np.linspace(state_bounds[3][0], state_bounds[3][1], n_grid_points)[idx_4]
    ])
    
    return state, action

# Example usage:
total_pairs = 2 * 5**4  # 1250 total state-action pairs
for i in range(total_pairs):
    state, action = index_to_state_action(i)
    # Use these for Q-value estimation

In [62]:
def create_grid_evaluation_pairs(n_grid_points=5):
    """
    Create uniform grid of state-action pairs and estimate Q-values
    """
    # Step 1: Define state bounds
    state_bounds = [
        [-4.8, 4.8],     # cart position
        [-10.0, 10.0],   # cart velocity 
        [-0.418, 0.418], # pole angle
        [-10.0, 10.0]    # pole angular velocity
    ]
    
    # Step 2: Create grid points for each dimension
    grids = []
    for bound in state_bounds:
        grids.append(np.linspace(bound[0], bound[1], n_grid_points))
    
    # Step 3: Create all combinations using meshgrid
    cart_pos, cart_vel, pole_ang, pole_vel = np.meshgrid(*grids)
    
    # Step 4: Reshape to get state matrix (n_grid_points^4 x 4)
    states = np.column_stack((
        cart_pos.flatten(),
        cart_vel.flatten(),
        pole_ang.flatten(),
        pole_vel.flatten()
    ))
    
    # Step 5: Create state-action pairs (duplicate states for both actions)
    n_states = len(states)
    
    # Step 6: Estimate Q-values using Monte Carlo sampling
    n_eval_episodes = 50
    gamma = 0.99
    Q_values = np.zeros(2 * n_states)
    env = gym.make('CartPole-v1')
    
    for i in range(2*n_states):
        returns = []
        state, action = index_to_state_action(i)
        
        for _ in range(n_eval_episodes):
            env.reset()
            env.state = state
            next_state, reward, terminated, truncated, _ = env.step(int(action))
            
            # First reward from taking action a
            G = reward
            curr_state = next_state
            step = 0
            
            # Follow policy until episode end
            while not terminated and not truncated and step < 500:
                policy_action = policy_PPO(curr_state)
                curr_state, reward, terminated, truncated, _ = env.step(policy_action)
                G += (gamma ** (step + 1)) * reward
                step += 1
            
            returns.append(G)
        
        Q_values[i] = np.mean(returns)
    
    env.close()
    return  Q_values.reshape(-1, 1)

# Usage
Q_values = create_grid_evaluation_pairs()
print(Q_values.shape)  # (1250, 1)

KeyboardInterrupt: 

In [None]:
iter = int( n / 50 )
loss_LSTD = [0] * int(n / iter)
loss_BRM = [0] * int(n / iter)

l2_norm_diff_BRM_list = []
l2_norm_diff_LSTD_list = []
l2_norm_diff_FQI_list = []

theta_init = np.zeros(feature_dim)
theta_lstd = np.copy(theta_init)
theta_BRM = np.copy(theta_init)
for m in range(iter, n + 1, iter):
    
    offline_data = collect_data(iter, policy_PPO, feature_dim)
    theta_lstd = policy_eval_LSTD(theta_lstd, offline_data)
    theta_BRM = policy_eval_BRM(theta_BRM, offline_data)
    
    