### This example demonstrates the use of experience replay with XCSF to solve the cart-pole problem from the OpenAI Gym.

In [7]:
from __future__ import annotations

import random
from collections import deque
from typing import Final

import gym
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams

import xcsf

### Initialise OpenAI Gym problem environment

In [8]:
env = gym.make("CartPole-v0")
X_DIM: Final[int] = env.observation_space.shape[0]
N_ACTIONS: Final[int] = env.action_space.n

### Initialise XCSF

In [10]:
# constructor = (x_dim, y_dim, n_actions)
xcs: xcsf.XCS = xcsf.XCS(X_DIM, N_ACTIONS, 1)  # Supervised: i.e, single action

xcs.OMP_NUM_THREADS = 8  # number of CPU cores to use
xcs.POP_INIT = False  # use covering to initialise
xcs.MAX_TRIALS = 1  # one trial per fit
xcs.POP_SIZE = 200  # maximum population size
xcs.E0 = 0.001  # target error
xcs.BETA = 0.05  # classifier parameter update rate
xcs.ALPHA = 1  # accuracy offset
xcs.NU = 5  # accuracy slope
xcs.EA_SUBSUMPTION = False
xcs.SET_SUBSUMPTION = False
xcs.THETA_EA = 100  # EA invocation frequency
xcs.THETA_DEL = 100  # min experience before fitness used for deletion

condition_layers: Final[dict] = {
    "layer_0": {  # hidden layer
        "type": "connected",
        "activation": "selu",
        "evolve_weights": True,
        "evolve_neurons": True,
        "n_init": 1,
        "n_max": 100,
        "max_neuron_grow": 1,
    },
    "layer_1": {  # output layer
        "type": "connected",
        "activation": "linear",
        "evolve_weights": True,
        "n_init": 1,
    },
}

xcs.condition("neural", condition_layers)  # neural network conditions
xcs.action("integer")  # (dummy) integer actions
xcs.prediction("rls_quadratic")  # Quadratic RLS

GAMMA: Final[float] = 0.95  # discount rate for delayed reward
epsilon: float = 1  # initial probability of exploring
EPSILON_MIN: Final[float] = 0.1  # the minimum exploration rate
EPSILON_DECAY: Final[float] = 0.98  # the decay of exploration after each batch replay
REPLAY_TIME: Final[int] = 1  # perform replay update every n episodes

print(xcs.json_parameters())

{
	"version":	"1.1.6",
	"x_dim":	4,
	"y_dim":	2,
	"n_actions":	1,
	"omp_num_threads":	8,
	"pop_init":	false,
	"max_trials":	1,
	"perf_trials":	1000,
	"pop_size":	200,
	"loss_func":	"mae",
	"set_subsumption":	false,
	"theta_sub":	100,
	"e0":	0.001,
	"alpha":	1,
	"nu":	5,
	"beta":	0.05,
	"delta":	0.1,
	"theta_del":	100,
	"init_fitness":	0.01,
	"init_error":	0,
	"m_probation":	10000,
	"stateful":	true,
	"compaction":	false,
	"ea":	{
		"select_type":	"roulette",
		"theta_ea":	100,
		"lambda":	2,
		"p_crossover":	0.8,
		"err_reduc":	1,
		"fit_reduc":	0.1,
		"subsumption":	false,
		"pred_reset":	false
	},
	"condition":	{
		"type":	"neural",
		"args":	{
			"layer_0":	{
				"type":	"connected",
				"activation":	"selu",
				"n_inputs":	4,
				"n_init":	1,
				"evolve_weights":	true,
				"evolve_functions":	false,
				"evolve_connect":	false,
				"evolve_neurons":	true,
				"n_max":	100,
				"max_neuron_grow":	1,
				"sgd_weights":	false
			},
			"layer_1":	{
				"type":	"connected",
				"activ

### Execute experiment

In [11]:
total_steps: int = 0  # total number of steps performed
MAX_EPISODES: Final[int] = 2000  # maximum number of episodes to run
N: Final[int] = 100  # number of episodes to average performance
memory: deque[tuple[np.ndarray, int, float, np.ndarray, bool]] = deque(maxlen=50000)
scores: deque[float] = deque(maxlen=N)  # used to calculate moving average


def replay(replay_size: int = 5000) -> None:
    """Performs experience replay updates"""
    batch_size: Final[int] = min(len(memory), replay_size)
    batch = random.sample(memory, batch_size)
    for state, action, reward, next_state, done in batch:
        y_target = reward
        if not done:
            prediction_array = xcs.predict(next_state.reshape(1, -1))[0]
            y_target += GAMMA * np.max(prediction_array)
        target = xcs.predict(state.reshape(1, -1))[0]
        target[action] = y_target
        xcs.fit(state.reshape(1, -1), target.reshape(1, -1), True)


def egreedy_action(state: np.ndarray) -> int:
    """Selects an action using an epsilon greedy policy"""
    if np.random.rand() < epsilon:
        return random.randrange(N_ACTIONS)
    prediction_array = xcs.predict(state.reshape(1, -1))[0]
    return int(np.argmax(prediction_array))


def episode(episode_nr: int) -> tuple[float, int]:
    """Executes a single episode, saving to memory buffer"""
    episode_score: int = 0
    episode_steps: int = 0
    state: np.ndarray = env.reset()
    while True:
        action = egreedy_action(state)
        next_state, reward, done, _ = env.step(action)
        episode_steps += 1
        episode_score += reward
        memory.append((state, action, reward, next_state, done))
        if done:
            break
        state = next_state
    return episode_score, episode_steps


# learning episodes
for ep in range(MAX_EPISODES):
    # execute a single episode
    ep_score, ep_steps = episode(ep)
    # perform experience replay updates
    if ep % REPLAY_TIME == 0:
        replay()
    # display performance
    total_steps += ep_steps
    scores.append(ep_score)
    mean_score = np.mean(scores)
    print(
        f"episodes={ep} "
        f"steps={total_steps} "
        f"score={mean_score:.2f} "
        f"epsilon={epsilon:.5f} "
        f"error={xcs.error():.5f} "
        f"msize={xcs.mset_size():.2f}"
    )
    # is the problem solved?
    if ep > N and mean_score > env.spec.reward_threshold:
        print(
            f"solved after {ep} episodes: "
            f"mean score {mean_score:.2f} > {env.spec.reward_threshold:.2f}"
        )
        break
    # decay the exploration rate
    if epsilon > EPSILON_MIN:
        epsilon *= EPSILON_DECAY

episodes=0 steps=22 score=22.00 epsilon=1.00000 error=0.54950 msize=1.85
episodes=1 steps=35 score=17.50 epsilon=0.98000 error=0.55636 msize=1.97
episodes=2 steps=68 score=22.67 epsilon=0.96040 error=0.70579 msize=3.95
episodes=3 steps=92 score=23.00 epsilon=0.94119 error=0.57417 msize=5.82
episodes=4 steps=104 score=20.80 epsilon=0.92237 error=0.45187 msize=7.89
episodes=5 steps=126 score=21.00 epsilon=0.90392 error=0.47313 msize=9.91
episodes=6 steps=139 score=19.86 epsilon=0.88584 error=0.46032 msize=11.94
episodes=7 steps=162 score=20.25 epsilon=0.86813 error=0.34541 msize=16.00
episodes=8 steps=179 score=19.89 epsilon=0.85076 error=0.27983 msize=19.96
episodes=9 steps=204 score=20.40 epsilon=0.83375 error=0.25015 msize=23.44
episodes=10 steps=220 score=20.00 epsilon=0.81707 error=0.35286 msize=28.00
episodes=11 steps=232 score=19.33 epsilon=0.80073 error=0.40911 msize=32.00
episodes=12 steps=244 score=18.77 epsilon=0.78472 error=0.38815 msize=37.96
episodes=13 steps=268 score=19.1

episodes=106 steps=8477 score=83.38 epsilon=0.11748 error=0.12337 msize=197.23
episodes=107 steps=8677 score=85.15 epsilon=0.11513 error=0.21461 msize=188.66
episodes=108 steps=8877 score=86.98 epsilon=0.11283 error=0.14292 msize=183.42
episodes=109 steps=9077 score=88.73 epsilon=0.11057 error=0.12542 msize=194.72
episodes=110 steps=9277 score=90.57 epsilon=0.10836 error=0.13055 msize=182.89
episodes=111 steps=9477 score=92.45 epsilon=0.10619 error=0.11763 msize=193.31
episodes=112 steps=9677 score=94.33 epsilon=0.10407 error=0.28106 msize=198.06
episodes=113 steps=9877 score=96.09 epsilon=0.10199 error=0.59713 msize=187.25
episodes=114 steps=10077 score=97.86 epsilon=0.09995 error=0.09568 msize=198.77
episodes=115 steps=10277 score=99.73 epsilon=0.09995 error=0.17027 msize=196.88
episodes=116 steps=10477 score=101.53 epsilon=0.09995 error=0.13757 msize=188.89
episodes=117 steps=10677 score=103.23 epsilon=0.09995 error=0.13998 msize=181.94
episodes=118 steps=10877 score=105.09 epsilon=

### Final exploit episode

In [12]:
epsilon = 0
ep_score, ep_steps = episode(ep)
print(f"score = {ep_score}, steps = {ep_steps}")

# close Gym
env.close()

score = 200.0, steps = 200
