In [18]:
import gymnasium as gym
import numpy as np

In [33]:
# Build the CartPole-v1 environment through Gymnasium's registry.
env = gym.make("CartPole-v1")

In [36]:
# Start an episode; reset() yields a pair and only the first element (the
# initial observation) is kept.
obs, _ = env.reset()

In [None]:
import numpy as np

class Bin2D:
    """Axis-aligned rectangular bin (one tile of a tiling).

    The bin covers the half-open box
    ``[posx, posx + size) x [posy, posy + size_y)``.

    Args:
        posx: Lower x edge of the bin.
        posy: Lower y edge of the bin.
        size: Extent of the bin along x (and along y when ``size_y`` is
            omitted, which gives a square bin).
        size_y: Optional extent along y for rectangular bins.
    """

    def __init__(self, posx, posy, size, size_y=None):
        self.posx = posx
        self.posy = posy
        self.size = size  # extent along x; also y unless size_y is given
        self.size_y = size if size_y is None else size_y

    def check(self, x, y):
        """Return 1 if the point ``(x, y)`` lies inside the bin, else 0."""
        inside_x = self.posx <= x < self.posx + self.size
        inside_y = self.posy <= y < self.posy + self.size_y
        return 1 if inside_x and inside_y else 0


class Tiling2D:
    """Single ``n_bins x n_bins`` grid of bins laid over a rectangle.

    Args:
        offset_x: x coordinate of the grid's lower-left corner.
        offset_y: y coordinate of the grid's lower-left corner.
        width: Total extent of the grid along x.
        height: Total extent of the grid along y.
        n_bins: Number of bins along each axis.

    ``setup()`` must be called before ``check()``; it is what creates
    ``self.bins``.
    """

    def __init__(self, offset_x, offset_y, width, height, n_bins):
        self.offset_x = offset_x
        self.offset_y = offset_y
        self.width = width
        self.height = height
        self.n_bins = n_bins

    def setup(self):
        """Create grid of bins for this tiling.

        Bins are stored x-major: the bin in column ``i`` and row ``j`` sits
        at index ``i * n_bins + j`` of ``self.bins``.
        """
        self.bins = []
        cell_w = self.width / self.n_bins
        cell_h = self.height / self.n_bins

        for i in range(self.n_bins):
            for j in range(self.n_bins):
                x = self.offset_x + i * cell_w
                y = self.offset_y + j * cell_h
                # Each bin must be cell_w wide AND cell_h tall; using cell_w
                # for both axes leaves gaps (or overlaps) along y whenever
                # width != height.
                self.bins.append(Bin2D(x, y, cell_w, cell_h))

    def check(self, x, y):
        """Return binary vector for this tiling.

        The list has one 0/1 entry per bin, in the order of ``self.bins``.
        """
        return [b.check(x, y) for b in self.bins]


class Tile2D:
    """Tile coder over a 2-D box, built from several staggered grids.

    Args:
        x_range: ``(min, max)`` pair for the first coordinate.
        y_range: ``(min, max)`` pair for the second coordinate.
        n_tilings: Number of grids to stack.
        n_bins: Number of bins per axis in every grid.

    Call ``setup()`` before ``check()``; it is what creates ``self.tilings``.
    """

    def __init__(self, x_range, y_range, n_tilings, n_bins):
        self.x_range, self.y_range = x_range, y_range
        self.n_tilings, self.n_bins = n_tilings, n_bins

    def setup(self):
        """Build ``n_tilings`` grids, each shifted a little further.

        Grid ``k`` starts ``k / n_tilings`` of one cell to the right of and
        above the range minimum, so every grid after the first leaves a thin
        strip at the lower edges of the range uncovered.
        """
        self.tilings = []
        (x_lo, x_hi), (y_lo, y_hi) = self.x_range, self.y_range
        span_x = x_hi - x_lo
        span_y = y_hi - y_lo

        for k in range(self.n_tilings):
            shift = k / self.n_tilings  # fraction of one cell
            grid = Tiling2D(
                x_lo + shift * (span_x / self.n_bins),
                y_lo + shift * (span_y / self.n_bins),
                span_x,
                span_y,
                self.n_bins,
            )
            grid.setup()
            self.tilings.append(grid)

    def check(self, x, y):
        """Return the float32 concatenation of every grid's binary vector."""
        flat = [bit for grid in self.tilings for bit in grid.check(x, y)]
        return np.array(flat, dtype=np.float32)


def x_of_s_a(s, a, tile2d, n_actions=3):
    """Build the state-action feature vector x(s, a).

    The state features from ``tile2d.check(*s)`` are copied into the slot
    belonging to action ``a`` of an otherwise all-zero vector that has one
    equally sized slot per action.

    Args:
        s: Two-component state, unpacked into ``tile2d.check``.
        a: Index of the action whose slot receives the features.
        tile2d: Object exposing ``check(x, y)`` that returns a feature sequence.
        n_actions: Number of action slots in the output.

    Returns:
        1-D array of length ``n_actions * len(tile2d.check(*s))``.
    """
    state_feats = tile2d.check(*s)
    slot = len(state_feats)
    stacked = np.zeros(n_actions * slot)
    begin = a * slot
    stacked[begin:begin + slot] = state_feats
    return stacked

# (min, max) ranges covered by the tile coders, one per observation component.
# NOTE(review): +/-2.4 and +/-0.2095 look like CartPole's cart-position and
# pole-angle termination limits, and the +/-4 velocity bounds look hand-picked
# -- confirm against the environment documentation.
pos_range = (-2.4, 2.4)
pol_angle_range = (-0.2095, 0.2095)
cart_velocity = (-4, 4)
pol_ang_vel = (-4, 4)

# One 2-D tile coder per pair of components, each with 4 tilings of 4x4 bins.
tile1 = Tile2D(pos_range, cart_velocity, n_tilings=4, n_bins=4)
tile2 = Tile2D(pol_angle_range, pol_ang_vel, n_tilings=4, n_bins=4)

# setup() has to run before check(): it is what creates the tilings.
tile1.setup()
tile2.setup()

# Order matters: create_feature_vector_ntiles feeds obs[:2] to tiles[0] and
# obs[2:] to tiles[1].
tiles = [tile1, tile2]

def create_feature_vector_ntiles(obs, action):
    """Return the joint state-action feature vector for ``obs`` and ``action``.

    The first two observation components are encoded with ``tiles[0]`` and the
    remaining ones with ``tiles[1]``; each part is expanded to two action
    slots by ``x_of_s_a`` and the two parts are joined end to end.
    """
    front = x_of_s_a(obs[:2], action, tiles[0], 2)
    back = x_of_s_a(obs[2:], action, tiles[1], 2)
    return np.concatenate((front, back))

# Linear value-function weights: 256 random entries in [0, 1) for the tile
# features (2 coders x 2 actions x 4 tilings x 16 bins), followed by one
# trailing bias weight initialised to 1 (getQValue adds weights[-1] as a
# constant term).
weights = np.random.random(256)
weights = np.append(weights, 1)

EPS = 0.05  # probability of picking a random action in getEGreedyAction
DF = 0.99  # discount factor applied to the next value estimate in cartpole
LAMBDA = 0.8  # decay rate of the eligibility trace z in cartpole
LR = 10e-4  # learning rate; note that 10e-4 equals 1e-3

def getQValue(feature_vect):
    """Return the linear action-value estimate for ``feature_vect``.

    The estimate is the dot product of the feature vector with every weight
    except the last, plus the last weight, which acts as a bias.
    """
    feature_weights = weights[:-1]
    bias = weights[-1]
    return np.dot(feature_weights, feature_vect) + bias

def getEGreedyAction(obs):
    """Pick an action for ``obs`` with an epsilon-greedy rule.

    With probability ``EPS`` a uniformly random action from {0, 1} is
    returned; otherwise the action with the larger estimated Q-value is
    chosen (ties resolve to action 0, as ``np.argmax`` returns the first
    maximum).
    """
    explore = np.random.random() <= EPS
    if explore:
        return np.random.randint(0, 2)

    q_values = [getQValue(create_feature_vector_ntiles(obs, a)) for a in (0, 1)]
    return np.argmax(q_values)

def _features_with_bias(obs, action):
    """Return x(obs, action) with a constant 1.0 appended for the bias weight."""
    return np.append(create_feature_vector_ntiles(obs, action), 1.0)


def cartpole(env, n_episodes):
    """Train the global ``weights`` with true online SARSA(lambda).

    Each episode starts from ``env.reset()`` with a fresh eligibility trace
    and runs until the environment reports termination or truncation.
    Actions come from ``getEGreedyAction``; the hyperparameters are the
    module-level ``DF``, ``LAMBDA`` and ``LR``.

    Args:
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(obs, info)`` and whose ``step()`` returns
            ``(obs, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to run.

    Returns:
        List with the undiscounted sum of rewards of every episode.

    Side effects:
        Updates ``weights`` in place and closes ``env`` when training ends.
    """
    global weights
    episode_returns = []

    for _ in range(n_episodes):
        obs, _info = env.reset()
        action = getEGreedyAction(obs)
        x = _features_with_bias(obs, action)
        # Trace and previous estimate belong to one episode only, so they are
        # re-initialised here; the trace has the same length as the weights.
        z = np.zeros_like(x)
        q_old = 0.0
        total_reward = 0.0
        done = False

        while not done:
            obs, r, terminated, truncated, _info = env.step(action)
            total_reward += r
            # Either flag ends the episode.
            done = terminated or truncated

            if terminated:
                # A terminal state has value 0: use an all-zero feature vector.
                x_next = np.zeros_like(x)
            else:
                # On truncation the state is not terminal, so still bootstrap
                # from the next state-action pair.
                action = getEGreedyAction(obs)
                x_next = _features_with_bias(obs, action)

            q = np.dot(weights, x)
            q_next = np.dot(weights, x_next)
            td_error = r + DF * q_next - q

            # Dutch-trace update followed by the true online weight update.
            z = DF * LAMBDA * z + (1 - LR * DF * LAMBDA * np.dot(z, x)) * x
            weights += LR * (td_error + q - q_old) * z - LR * (q - q_old) * x

            q_old = q_next
            x = x_next

        episode_returns.append(total_reward)

    env.close()
    return episode_returns
