### This example demonstrates the use of experience replay with XCSF to solve the cart-pole problem from the OpenAI Gym.

In [1]:
from __future__ import annotations

import random
from collections import deque
from typing import Final

import gym
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams

import xcsf

### Initialise OpenAI Gym problem environment

In [2]:
env = gym.make("CartPole-v0")
X_DIM: Final[int] = env.observation_space.shape[0]
N_ACTIONS: Final[int] = env.action_space.n

### Initialise XCSF

In [3]:
# constructor = (x_dim, y_dim, n_actions)
xcs: xcsf.XCS = xcsf.XCS(X_DIM, N_ACTIONS, 1)  # Supervised: i.e, single action

xcs.OMP_NUM_THREADS = 8  # number of CPU cores to use
xcs.POP_INIT = False  # use covering to initialise
xcs.MAX_TRIALS = 1  # one trial per fit
xcs.POP_SIZE = 200  # maximum population size
xcs.E0 = 0.001  # target error
xcs.BETA = 0.05  # classifier parameter update rate
xcs.ALPHA = 1  # accuracy offset
xcs.NU = 5  # accuracy slope
xcs.EA_SUBSUMPTION = False
xcs.SET_SUBSUMPTION = False
xcs.THETA_EA = 100  # EA invocation frequency
xcs.THETA_DEL = 100  # min experience before fitness used for deletion

condition_layers: Final[dict] = {
    "layer_0": {  # hidden layer
        "type": "connected",
        "activation": "selu",
        "evolve_weights": True,
        "evolve_neurons": True,
        "n_init": 1,
        "n_max": 100,
        "max_neuron_grow": 1,
    },
    "layer_1": {  # output layer
        "type": "connected",
        "activation": "linear",
        "evolve_weights": True,
        "n_init": 1,
    },
}

xcs.condition("neural", condition_layers)  # neural network conditions
xcs.action("integer")  # (dummy) integer actions
xcs.prediction("rls_quadratic")  # Quadratic RLS

GAMMA: Final[float] = 0.95  # discount rate for delayed reward
epsilon: float = 1  # initial probability of exploring
EPSILON_MIN: Final[float] = 0.1  # the minimum exploration rate
EPSILON_DECAY: Final[float] = 0.98  # the decay of exploration after each batch replay
REPLAY_TIME: Final[int] = 1  # perform replay update every n episodes

print(xcs.json_parameters())

{
	"version":	"1.1.6",
	"x_dim":	4,
	"y_dim":	2,
	"n_actions":	1,
	"omp_num_threads":	8,
	"pop_init":	false,
	"max_trials":	1,
	"perf_trials":	1000,
	"pop_size":	200,
	"loss_func":	"mae",
	"set_subsumption":	false,
	"theta_sub":	100,
	"e0":	0.001,
	"alpha":	1,
	"nu":	5,
	"beta":	0.05,
	"delta":	0.1,
	"theta_del":	100,
	"init_fitness":	0.01,
	"init_error":	0,
	"m_probation":	10000,
	"stateful":	true,
	"compaction":	false,
	"ea":	{
		"select_type":	"roulette",
		"theta_ea":	100,
		"lambda":	2,
		"p_crossover":	0.8,
		"err_reduc":	1,
		"fit_reduc":	0.1,
		"subsumption":	false,
		"pred_reset":	false
	},
	"condition":	{
		"type":	"neural",
		"args":	{
			"layer_0":	{
				"type":	"connected",
				"activation":	"selu",
				"n_inputs":	4,
				"n_init":	1,
				"evolve_weights":	true,
				"evolve_functions":	false,
				"evolve_connect":	false,
				"evolve_neurons":	true,
				"n_max":	100,
				"max_neuron_grow":	1,
				"sgd_weights":	false
			},
			"layer_1":	{
				"type":	"connected",
				"activ

### Execute experiment

In [4]:
total_steps: int = 0  # total number of steps performed
MAX_EPISODES: Final[int] = 2000  # maximum number of episodes to run
N: Final[int] = 100  # number of episodes to average performance
memory: deque[tuple[np.ndarray, int, float, np.ndarray, bool]] = deque(maxlen=50000)
scores: deque[float] = deque(maxlen=N)  # used to calculate moving average


def replay(replay_size: int = 5000) -> None:
    """Performs experience replay updates"""
    batch_size: Final[int] = min(len(memory), replay_size)
    batch = random.sample(memory, batch_size)
    for state, action, reward, next_state, done in batch:
        y_target = reward
        if not done:
            prediction_array = xcs.predict(next_state.reshape(1, -1))[0]
            y_target += GAMMA * np.max(prediction_array)
        target = xcs.predict(state.reshape(1, -1))[0]
        target[action] = y_target
        xcs.fit(state.reshape(1, -1), target.reshape(1, -1), True)


def egreedy_action(state: np.ndarray) -> int:
    """Selects an action using an epsilon greedy policy"""
    if np.random.rand() < epsilon:
        return random.randrange(N_ACTIONS)
    prediction_array = xcs.predict(state.reshape(1, -1))[0]
    return int(np.argmax(prediction_array))


def episode(episode_nr: int) -> tuple[float, int]:
    """Executes a single episode, saving to memory buffer"""
    episode_score: int = 0
    episode_steps: int = 0
    state: np.ndarray = env.reset()
    while True:
        action = egreedy_action(state)
        next_state, reward, done, _ = env.step(action)
        episode_steps += 1
        episode_score += reward
        memory.append((state, action, reward, next_state, done))
        if done:
            break
        state = next_state
    return episode_score, episode_steps


# learning episodes
for ep in range(MAX_EPISODES):
    # execute a single episode
    ep_score, ep_steps = episode(ep)
    # perform experience replay updates
    if ep % REPLAY_TIME == 0:
        replay()
    # display performance
    total_steps += ep_steps
    scores.append(ep_score)
    mean_score = np.mean(scores)
    print(
        f"episodes={ep} "
        f"steps={total_steps} "
        f"score={mean_score:.2f} "
        f"epsilon={epsilon:.5f} "
        f"error={xcs.error():.5f} "
        f"msize={xcs.mset_size():.2f}"
    )
    # is the problem solved?
    if ep > N and mean_score > env.spec.reward_threshold:
        print(
            f"solved after {ep} episodes: "
            f"mean score {mean_score:.2f} > {env.spec.reward_threshold:.2f}"
        )
        break
    # decay the exploration rate
    if epsilon > EPSILON_MIN:
        epsilon *= EPSILON_DECAY

episodes=0 steps=22 score=22.00 epsilon=1.00000 error=0.45890 msize=0.96
episodes=1 steps=64 score=32.00 epsilon=0.98000 error=0.74623 msize=1.00
episodes=2 steps=78 score=26.00 epsilon=0.96040 error=0.95108 msize=3.00
episodes=3 steps=95 score=23.75 epsilon=0.94119 error=0.63647 msize=5.00
episodes=4 steps=117 score=23.40 epsilon=0.92237 error=0.47357 msize=7.00
episodes=5 steps=140 score=23.33 epsilon=0.90392 error=0.50257 msize=10.82
episodes=6 steps=153 score=21.86 epsilon=0.88584 error=0.40620 msize=13.00
episodes=7 steps=166 score=20.75 epsilon=0.86813 error=0.37299 msize=16.99
episodes=8 steps=194 score=21.56 epsilon=0.85076 error=0.36840 msize=20.98
episodes=9 steps=212 score=21.20 epsilon=0.83375 error=0.27203 msize=25.00
episodes=10 steps=236 score=21.45 epsilon=0.81707 error=0.28501 msize=29.00
episodes=11 steps=249 score=20.75 epsilon=0.80073 error=0.27686 msize=34.96
episodes=12 steps=279 score=21.46 epsilon=0.78472 error=0.24191 msize=40.07
episodes=13 steps=294 score=21.

episodes=106 steps=13972 score=138.19 epsilon=0.11748 error=0.10366 msize=190.23
episodes=107 steps=14172 score=140.06 epsilon=0.11513 error=0.37629 msize=185.12
episodes=108 steps=14372 score=141.78 epsilon=0.11283 error=0.10355 msize=175.23
episodes=109 steps=14572 score=143.60 epsilon=0.11057 error=0.13927 msize=185.08
episodes=110 steps=14772 score=145.36 epsilon=0.10836 error=0.06223 msize=183.87
episodes=111 steps=14972 score=147.23 epsilon=0.10619 error=0.08393 msize=187.36
episodes=112 steps=15172 score=148.93 epsilon=0.10407 error=0.08879 msize=190.52
episodes=113 steps=15372 score=150.78 epsilon=0.10199 error=0.06556 msize=192.29
episodes=114 steps=15572 score=152.59 epsilon=0.09995 error=0.11410 msize=190.35
episodes=115 steps=15772 score=153.78 epsilon=0.09995 error=0.13014 msize=185.98
episodes=116 steps=15972 score=155.31 epsilon=0.09995 error=0.11234 msize=190.39
episodes=117 steps=16172 score=157.04 epsilon=0.09995 error=0.08933 msize=194.99
episodes=118 steps=16372 sco

### Final exploit episode

In [5]:
epsilon = 0
ep_score, ep_steps = episode(ep)
print(f"score = {ep_score}, steps = {ep_steps}")

# close Gym
env.close()

score = 200.0, steps = 200
