# Dependencies

In [1]:
import os

import imageio
import gymnasium as gym
import numpy as np
import torch
from agilerl.algorithms.ppo import PPO
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.training.train_on_policy import train_on_policy
from agilerl.utils.utils import create_population, make_vect_envs
from tqdm import trange

# Defining Hyperparameters

In [2]:
# Starting hyperparameters applied to every agent in the initial population
INIT_HP = dict(
    POP_SIZE=4,  # How many agents make up the population
    DISCRETE_ACTIONS=True,  # Actions are discrete
    BATCH_SIZE=128,  # Size of each training batch
    LR=0.001,  # Learning rate
    LEARN_STEP=1024,  # How often learning happens
    GAMMA=0.99,  # Reward discount factor
    GAE_LAMBDA=0.95,  # Lambda used in general advantage estimation
    ACTION_STD_INIT=0.6,  # Starting standard deviation for actions
    CLIP_COEF=0.2,  # Clipping coefficient for the surrogate objective
    ENT_COEF=0.01,  # Coefficient on the entropy term
    VF_COEF=0.5,  # Coefficient on the value function term
    MAX_GRAD_NORM=0.5,  # Upper bound on the gradient norm when clipping
    TARGET_KL=None,  # Threshold on the target KL divergence
    UPDATE_EPOCHS=4,  # How many epochs each policy update runs
    # When True, image observations are moved from [H, W, C] to [C, H, W]
    CHANNELS_LAST=False,  # Enable for RGB image states
    TARGET_SCORE=200.0,  # Score at which the environment counts as beaten
    MAX_STEPS=150000,  # Cap on the steps an agent takes in an environment
    EVO_STEPS=10000,  # How often evolution happens
    EVAL_STEPS=None,  # Evaluation steps per episode
    EVAL_LOOP=3,  # How many episodes are used for evaluation
    TOURN_SIZE=2,  # Size of each selection tournament
    ELITISM=True,  # Whether tournament selection uses elitism
)

# Settings that control how agents are mutated between generations
MUT_P = dict(
    # Probability assigned to each kind of mutation
    NO_MUT=0.4,  # Leave the agent unchanged
    ARCH_MUT=0.2,  # Mutate the architecture
    NEW_LAYER=0.2,  # Add a new layer
    PARAMS_MUT=0.2,  # Mutate the network parameters
    ACT_MUT=0.2,  # Mutate the activation layer
    RL_HP_MUT=0.2,  # Mutate a learning hyperparameter
    # Which learning hyperparameters may be mutated
    RL_HP_SELECTION=["lr", "batch_size", "learn_step"],
    MUT_SD=0.1,  # Strength of a mutation
    RAND_SEED=42,  # Seed for randomness
    # Lower and upper bounds applied when RL hyperparameters are mutated
    MIN_LR=0.0001,
    MAX_LR=0.01,
    MIN_BATCH_SIZE=8,
    MAX_BATCH_SIZE=1024,
    MIN_LEARN_STEP=256,
    MAX_LEARN_STEP=8192,
)

# Create the Environment

In [3]:
num_envs = 8  # Number of environment copies requested from make_vect_envs
env = make_vect_envs("CartPole-v1", num_envs=num_envs)  # Create environment

# Observation space: a space exposing `.n` is treated as discrete (and one-hot
# encoded); otherwise fall back to its `.shape`. Only AttributeError is caught
# so that unrelated failures are not silently reinterpreted as "continuous".
try:
    state_dim = env.single_observation_space.n  # Discrete observation space
    one_hot = True  # Requires one-hot encoding
except AttributeError:
    state_dim = env.single_observation_space.shape  # Continuous observation space
    one_hot = False  # Does not require one-hot encoding

# Action space: same probe, `.n` for discrete, first entry of `.shape` otherwise.
try:
    action_dim = env.single_action_space.n  # Discrete action space
except AttributeError:
    action_dim = env.single_action_space.shape[0]  # Continuous action space

if INIT_HP["CHANNELS_LAST"]:
    # Adjust dimensions for PyTorch API (C, H, W), for envs with RGB image states
    state_dim = (state_dim[2], state_dim[0], state_dim[1])

# Create a Population of Agents

In [4]:
# Choose where tensors live: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Network configuration: a simple MLP with two hidden layers of 64 nodes each
net_config = dict(arch="mlp", hidden_size=[64, 64])

# Build the starting population of PPO agents from the settings defined above
pop = create_population(
    algo="PPO",  # Which algorithm the agents use
    state_dim=state_dim,  # Dimension of the observations
    action_dim=action_dim,  # Dimension of the actions
    one_hot=one_hot,  # Whether observations are one-hot encoded
    net_config=net_config,  # Network layout
    INIT_HP=INIT_HP,  # Starting hyperparameters
    population_size=INIT_HP["POP_SIZE"],  # Number of agents to create
    num_envs=num_envs,
    device=device,
)

# Creating Mutations and Tournament objects

In [5]:
# Positional arguments, in order: tournament size, elitism flag,
# population size, evaluation loop count, all read from INIT_HP.
tournament = TournamentSelection(
    *(INIT_HP[key] for key in ("TOURN_SIZE", "ELITISM", "POP_SIZE", "EVAL_LOOP"))
)

In [6]:
# Configure the mutation operator from MUT_P. Each min_*/max_* pair below must
# read the matching MIN_*/MAX_* key so the mutation range is not degenerate.
mutations = Mutations(
    algo="PPO",
    # Probabilities for each kind of mutation
    no_mutation=MUT_P["NO_MUT"],
    architecture=MUT_P["ARCH_MUT"],
    new_layer_prob=MUT_P["NEW_LAYER"],
    parameters=MUT_P["PARAMS_MUT"],
    activation=MUT_P["ACT_MUT"],
    rl_hp=MUT_P["RL_HP_MUT"],
    # Learning hyperparameters eligible for mutation, and their bounds
    rl_hp_selection=MUT_P["RL_HP_SELECTION"],
    min_lr=MUT_P["MIN_LR"],
    max_lr=MUT_P["MAX_LR"],
    # Lower bound comes from MIN_BATCH_SIZE; using MAX_BATCH_SIZE here would
    # make the lower and upper batch-size bounds identical.
    min_batch_size=MUT_P["MIN_BATCH_SIZE"],
    max_batch_size=MUT_P["MAX_BATCH_SIZE"],
    min_learn_step=MUT_P["MIN_LEARN_STEP"],
    max_learn_step=MUT_P["MAX_LEARN_STEP"],
    mutation_sd=MUT_P["MUT_SD"],
    arch=net_config["arch"],
    rand_seed=MUT_P["RAND_SEED"],
    device=device,
)

# Training and Saving an Agent

In [7]:
# Define a save path for our trained agent
save_path = "PPO_cartpole_trained_agent.pt"

# Train the population with AgileRL's on-policy training function.
# Returns the trained population and its fitness values.
trained_pop, pop_fitnesses = train_on_policy(
    env=env,
    # Must name the environment that `env` was built from (CartPole-v1)
    env_name="CartPole-v1",
    algo="PPO",
    pop=pop,
    INIT_HP=INIT_HP,
    MUT_P=MUT_P,
    swap_channels=INIT_HP["CHANNELS_LAST"],
    max_steps=INIT_HP["MAX_STEPS"],
    evo_steps=INIT_HP["EVO_STEPS"],
    eval_steps=INIT_HP["EVAL_STEPS"],
    eval_loop=INIT_HP["EVAL_LOOP"],
    tournament=tournament,
    mutation=mutations,
    wb=False,  # Boolean flag to record run with Weights & Biases
    save_elite=True,  # Boolean flag to save the elite agent in the population
    elite_path=save_path,
)


Training...


  7%|6         | 10496/150000 [  01:57<  35:47, 64.97step/s] 


                --- Global Steps 40960 ---
                Fitness:		['277.00', '263.92', '364.33', '417.00']
                Score:		[58.21656050955414, 68.65413533834587, 73.79508196721312, 72.10483870967742]
                5 fitness avgs:	['417.00', '277.00', '277.00', '417.00']
                10 score avgs:	['218.10', '232.30', '232.30', '218.10']
                Agents:		[3, 4, 5, 6]
                Steps:		[10240, 10240, 10240, 10240]
                Mutations:		['None', 'bs', 'param', 'None']
                

 14%|#3        | 20736/150000 [  03:37<  40:40, 52.97step/s] 


                --- Global Steps 81920 ---
                Fitness:		['247.75', '401.50', '491.00', '259.79']
                Score:		[290.39285714285717, 332.64, 354.96, 319.0689655172414]
                5 fitness avgs:	['384.00', '338.40', '338.40', '384.00']
                10 score avgs:	['375.70', '215.30', '215.30', '375.70']
                Agents:		[5, 7, 8, 9]
                Steps:		[20480, 20480, 20480, 20480]
                Mutations:		['None', 'param', 'param', 'arch']
                

 21%|##        | 30976/150000 [  05:34<  32:44, 60.58step/s] 


                --- Global Steps 122880 ---
                Fitness:		['499.25', '462.54', '277.12', '500.00']
                Score:		[491.0, 297.07142857142856, 301.3, 447.6666666666667]
                5 fitness avgs:	['422.67', '422.67', '422.42', '379.78']
                10 score avgs:	['405.80', '405.80', '490.20', '348.00']
                Agents:		[9, 10, 11, 12]
                Steps:		[30720, 30720, 30720, 30720]
                Mutations:		['bs', 'None', 'arch', 'arch']
                

 27%|##7       | 41216/150000 [  06:45<  12:16, 147.69step/s]


                --- Global Steps 163840 ---
                Fitness:		['497.12', '438.42', '470.17', '163.29']
                Score:		[495.25, 448.4117647058824, 464.05882352941177, 52.53896103896104]
                5 fitness avgs:	['441.28', '434.35', '434.35', '441.28']
                10 score avgs:	['492.40', '467.80', '467.80', '492.40']
                Agents:		[9, 13, 14, 15]
                Steps:		[40960, 40960, 40960, 40960]
                Mutations:		['None', 'bs', 'None', 'None']
                

 34%|###4      | 51456/150000 [  07:47<  14:27, 113.54step/s]


                --- Global Steps 204800 ---
                Fitness:		['487.62', '162.75', '439.12', '444.46']
                Score:		[500.0, 421.0, 356.84, 443.77777777777777]
                5 fitness avgs:	['450.55', '435.31', '441.92', '441.92']
                10 score avgs:	['500.00', '333.60', '400.60', '400.60']
                Agents:		[9, 16, 17, 18]
                Steps:		[51200, 51200, 51200, 51200]
                Mutations:		['arch', 'None', 'arch', 'None']
                

 41%|####1     | 61696/150000 [  08:52<  12:47, 115.08step/s]


                --- Global Steps 245760 ---
                Fitness:		['500.00', '376.12', '323.04', '154.08']
                Score:		[390.35, 336.4642857142857, 340.5, 238.85365853658536]
                5 fitness avgs:	['495.15', '455.13', '455.13', '495.15']
                10 score avgs:	['459.60', '198.10', '198.10', '459.60']
                Agents:		[9, 19, 20, 21]
                Steps:		[61440, 61440, 61440, 61440]
                Mutations:		['param', 'param', 'None', 'param']
                

 48%|####7     | 71936/150000 [  10:16<  13:58, 93.15step/s] 


                --- Global Steps 286720 ---
                Fitness:		['428.17', '500.00', '356.54', '470.88']
                Score:		[390.3636363636364, 368.09090909090907, 303.6896551724138, 491.5]
                5 fitness avgs:	['491.12', '482.58', '491.12', '491.12']
                10 score avgs:	['486.40', '259.70', '486.40', '486.40']
                Agents:		[21, 22, 23, 24]
                Steps:		[71680, 71680, 71680, 71680]
                Mutations:		['arch', 'bs', 'None', 'param']
                

 55%|#####4    | 82176/150000 [  11:09<  12:17, 92.00step/s] 


                --- Global Steps 327680 ---
                Fitness:		['454.79', '490.54', '358.75', '500.00']
                Score:		[404.23809523809524, 388.8181818181818, 479.8125, 398.55]
                5 fitness avgs:	['491.12', '491.12', '482.08', '491.12']
                10 score avgs:	['355.10', '355.10', '430.80', '355.10']
                Agents:		[24, 25, 26, 27]
                Steps:		[81920, 81920, 81920, 81920]
                Mutations:		['bs', 'None', 'None', 'None']
                

 62%|######1   | 92416/150000 [  12:04<  09:22, 102.35step/s]


                --- Global Steps 368640 ---
                Fitness:		['453.12', '418.58', '500.00', '498.54']
                Score:		[422.85, 477.6470588235294, 426.7894736842105, 366.45454545454544]
                5 fitness avgs:	['491.41', '491.41', '491.41', '482.32']
                10 score avgs:	['328.00', '328.00', '328.00', '345.70']
                Agents:		[27, 28, 29, 30]
                Steps:		[92160, 92160, 92160, 92160]
                Mutations:		['param', 'None', 'None', 'lr']
                

 68%|######8   | 102656/150000 [  12:51<  07:52, 100.28step/s]


                --- Global Steps 409600 ---
                Fitness:		['428.83', '500.00', '467.54', '500.00']
                Score:		[292.5806451612903, 500.0, 459.6666666666667, 470.52941176470586]
                5 fitness avgs:	['493.88', '493.88', '484.80', '487.39']
                10 score avgs:	['500.00', '500.00', '493.60', '447.20']
                Agents:		[28, 31, 32, 33]
                Steps:		[102400, 102400, 102400, 102400]
                Mutations:		['None', 'arch', 'None', 'None']
                

 75%|#######5  | 112896/150000 [  13:43<  06:45, 91.49step/s] 


                --- Global Steps 450560 ---
                Fitness:		['483.17', '500.00', '500.00', '500.00']
                Score:		[500.0, 477.47058823529414, 479.8235294117647, 468.6666666666667]
                5 fitness avgs:	['493.88', '487.39', '487.39', '487.39']
                10 score avgs:	['461.70', '443.60', '443.60', '443.60']
                Agents:		[31, 34, 35, 36]
                Steps:		[112640, 112640, 112640, 112640]
                Mutations:		['arch', 'lr', 'None', 'param']
                

 82%|########2 | 123136/150000 [  14:36<  04:54, 91.09step/s] 


                --- Global Steps 491520 ---
                Fitness:		['500.00', '472.75', '489.88', '447.04']
                Score:		[478.8235294117647, 492.4375, 500.0, 500.0]
                5 fitness avgs:	['499.71', '487.77', '491.19', '499.71']
                10 score avgs:	['464.00', '487.90', '500.00', '464.00']
                Agents:		[31, 37, 38, 39]
                Steps:		[122880, 122880, 122880, 122880]
                Mutations:		['param', 'None', 'None', 'lr']
                

 89%|########8 | 133376/150000 [  15:27<  03:04, 89.88step/s] 


                --- Global Steps 532480 ---
                Fitness:		['459.29', '434.67', '449.25', '500.00']
                Score:		[400.65, 414.05, 500.0, 456.05555555555554]
                5 fitness avgs:	['499.71', '499.71', '474.70', '491.57']
                10 score avgs:	['475.20', '475.20', '424.30', '469.20']
                Agents:		[39, 40, 41, 42]
                Steps:		[133120, 133120, 133120, 133120]
                Mutations:		['bs', 'lr', 'None', 'None']
                

 96%|#########5| 143616/150000 [  16:17<  01:13, 86.60step/s] 


                --- Global Steps 573440 ---
                Fitness:		['500.00', '500.00', '500.00', '386.54']
                Score:		[476.11764705882354, 500.0, 470.6111111111111, 371.4782608695652]
                5 fitness avgs:	['500.00', '500.00', '500.00', '500.00']
                10 score avgs:	['459.40', '459.40', '500.00', '500.00']
                Agents:		[39, 43, 44, 45]
                Steps:		[143360, 143360, 143360, 143360]
                Mutations:		['None', 'None', 'None', 'None']
                

|          | 153600/? [  17:08<  00:00, 149.27step/s]         


                --- Global Steps 614400 ---
                Fitness:		['356.08', '447.54', '483.54', '500.00']
                Score:		[495.9375, 491.5625, 435.8421052631579, 497.375]
                5 fitness avgs:	['500.00', '489.51', '489.51', '500.00']
                10 score avgs:	['495.80', '486.50', '486.50', '495.80']
                Agents:		[45, 46, 47, 48]
                Steps:		[153600, 153600, 153600, 153600]
                Mutations:		['None', 'None', 'None', 'None']
                




In [8]:
# Hand-written evolutionary training loop over `pop`, using `env`, `num_envs`,
# `tournament`, `mutations` and `save_path` defined in earlier cells.
# NOTE(review): this looks like a manual counterpart of the train_on_policy
# call in the previous cell and reuses the same `pop` and `env` — confirm.

# Environment steps taken across all agents and all parallel environments
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(INIT_HP["MAX_STEPS"], unit="step")
# Keep going while every agent's latest step count is below MAX_STEPS
while np.less([agent.steps[-1] for agent in pop], INIT_HP["MAX_STEPS"]).all():
    pop_episode_scores = []  # One list of finished-episode scores per agent
    for agent in pop:  # Loop through population
        state, info = env.reset()  # Reset environment at start of episode
        scores = np.zeros(num_envs)  # Running score for each parallel environment
        completed_episode_scores = []
        steps = 0  # Steps this agent takes during the current generation

        # -(a // -b) is ceiling division: number of learn cycles needed to
        # cover at least EVO_STEPS steps with this agent's learn_step
        for _ in range(-(INIT_HP["EVO_STEPS"] // -agent.learn_step)):

            # Rollout buffers, rebuilt for every learn cycle
            states = []
            actions = []
            log_probs = []
            rewards = []
            dones = []
            values = []

            learn_steps = 0

            # Ceiling division again: each env.step advances num_envs steps,
            # so this many iterations collect at least learn_step steps
            for idx_step in range(-(agent.learn_step // -num_envs)):
                if INIT_HP["CHANNELS_LAST"]:
                    # Move the last axis to third-from-last (channels first)
                    state = np.moveaxis(state, [-1], [-3])

                # Get next action from agent
                action, log_prob, _, value = agent.get_action(state)

                # Act in environment
                next_state, reward, terminated, truncated, info = env.step(action)

                total_steps += num_envs
                steps += num_envs
                learn_steps += num_envs

                states.append(state)
                actions.append(action)
                log_probs.append(log_prob)
                rewards.append(reward)
                # Only `terminated` is stored as the done flag; `truncated` is
                # used below solely for episode-score bookkeeping
                dones.append(terminated)
                values.append(value)

                state = next_state
                scores += np.array(reward)

                # Record and reset the score of every environment whose
                # episode ended (terminated or truncated) on this step
                for idx, (d, t) in enumerate(zip(terminated, truncated)):
                    if d or t:
                        completed_episode_scores.append(scores[idx])
                        agent.scores.append(scores[idx])
                        scores[idx] = 0

            # Advance the bar by this cycle's steps divided by the population size
            pbar.update(learn_steps // len(pop))

            if INIT_HP["CHANNELS_LAST"]:
                next_state = np.moveaxis(next_state, [-1], [-3])

            # Collected rollout plus the final next_state, in the order passed
            # to agent.learn
            experiences = (
                states,
                actions,
                log_probs,
                rewards,
                dones,
                values,
                next_state,
            )
            # Learn according to agent's RL algorithm
            agent.learn(experiences)

        agent.steps[-1] += steps
        pop_episode_scores.append(completed_episode_scores)

    # Evaluate population
    fitnesses = [
        agent.test(
            env,
            swap_channels=INIT_HP["CHANNELS_LAST"],
            max_steps=INIT_HP["EVAL_STEPS"],
            loop=INIT_HP["EVAL_LOOP"],
        )
        for agent in pop
    ]
    # Mean training score per agent; a placeholder string is used when the
    # agent finished no episodes during this generation
    mean_scores = [
        (
            np.mean(episode_scores)
            if len(episode_scores) > 0
            else "0 completed episodes"
        )
        for episode_scores in pop_episode_scores
    ]

    print(f"--- Global steps {total_steps} ---")
    print(f"Steps {[agent.steps[-1] for agent in pop]}")
    print(f"Scores: {mean_scores}")
    print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
    # NOTE(review): presumably agent.test() appends to agent.fitness — confirm
    print(
        f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
    )

    # Tournament selection and population mutation
    elite, pop = tournament.select(pop)
    pop = mutations.mutation(pop)

    # Update step counter
    for agent in pop:
        agent.steps.append(agent.steps[-1])

# Save the trained algorithm
# NOTE(review): `elite` is only assigned inside the while loop above; if the
# loop body never runs (all agents already at MAX_STEPS) this line would raise
# NameError unless `elite` is defined elsewhere — confirm.
elite.save_checkpoint(save_path)

pbar.close()
env.close()

Training...


  7%|███████████▍                                                                                                                                                        | 10496/150000 [01:46<34:25, 67.54step/s]

--- Global steps 40960 ---
Steps [20480, 20480, 20480, 20480]
Scores: [251.9142857142857, 253.6969696969697, 334.16, 207.7906976744186]
Fitnesses: ['359.67', '311.88', '434.38', '218.00']
5 fitness avgs: ['318.33', '287.90', '399.35', '317.50']


 14%|██████████████████████▋                                                                                                                                             | 20736/150000 [03:37<34:32, 62.37step/s]

--- Global steps 81920 ---
Steps [30720, 30720, 30720, 30720]
Scores: [346.1111111111111, 247.05882352941177, 412.95238095238096, 415.15]
Fitnesses: ['238.71', '339.29', '387.62', '384.92']
5 fitness avgs: ['345.81', '324.76', '395.44', '394.54']


 21%|█████████████████████████████████▊                                                                                                                                  | 30976/150000 [05:21<28:56, 68.55step/s]

--- Global steps 122880 ---
Steps [40960, 40960, 40960, 40960]
Scores: [404.7142857142857, 309.58620689655174, 403.9, 350.92]
Fitnesses: ['357.88', '197.54', '298.38', '252.38']
5 fitness avgs: ['386.05', '345.97', '371.18', '359.68']


 27%|████████████████████████████████████████████▊                                                                                                                      | 41216/150000 [06:45<15:12, 119.18step/s]

--- Global steps 163840 ---
Steps [51200, 51200, 51200, 51200]
Scores: [338.0, 238.16666666666666, 393.76190476190476, 340.0]
Fitnesses: ['437.00', '456.71', '323.00', '254.33']
5 fitness avgs: ['396.24', '368.12', '361.54', '359.71']


 34%|████████████████████████████████████████████████████████▎                                                                                                           | 51456/150000 [08:36<27:12, 60.36step/s]

--- Global steps 204800 ---
Steps [61440, 61440, 61440, 61440]
Scores: [490.5625, 466.29411764705884, 443.94736842105266, 450.77777777777777]
Fitnesses: ['495.21', '464.12', '494.46', '474.33']
5 fitness avgs: ['422.42', '416.20', '422.27', '418.24']


 41%|███████████████████████████████████████████████████████████████████▍                                                                                                | 61696/150000 [10:57<20:57, 70.22step/s]

--- Global steps 245760 ---
Steps [71680, 71680, 71680, 71680]
Scores: [363.7916666666667, 433.3809523809524, 452.5, 490.5]
Fitnesses: ['500.00', '274.67', '500.00', '500.00']
5 fitness avgs: ['435.54', '390.48', '435.54', '435.54']


 48%|██████████████████████████████████████████████████████████████████████████████▋                                                                                     | 71936/150000 [13:13<33:59, 38.27step/s]

--- Global steps 286720 ---
Steps [81920, 81920, 81920, 81920]
Scores: [466.5882352941176, 326.15384615384613, 460.3333333333333, 456.22222222222223]
Fitnesses: ['416.21', '477.71', '500.00', '500.00']
5 fitness avgs: ['441.26', '408.49', '458.02', '458.02']


 55%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 81920/150000 [16:37<23:09, 49.01step/s]

--- Global steps 327680 ---
Steps [92160, 92160, 92160, 92160]
Scores: [395.5, 471.70588235294116, 486.1875, 477.94117647058823]
Fitnesses: ['363.17', '500.00', '363.33', '335.75']
5 fitness avgs: ['459.07', '486.44', '459.11', '453.59']


 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 92416/150000 [20:32<25:29, 37.65step/s]

--- Global steps 368640 ---
Steps [102400, 102400, 102400, 102400]
Scores: [436.4117647058824, 385.3333333333333, 485.47058823529414, 324.32142857142856]
Fitnesses: ['469.42', '373.58', '495.12', '285.58']
5 fitness avgs: ['492.93', '440.91', '498.07', '428.82']


 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 102656/150000 [23:33<21:28, 36.74step/s]

--- Global steps 409600 ---
Steps [112640, 112640, 112640, 112640]
Scores: [83.5137614678899, 315.74074074074076, 473.0625, 423.05]
Fitnesses: ['238.96', '489.29', '428.29', '380.21']
5 fitness avgs: ['446.82', '439.73', '484.68', '417.91']


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 112640/150000 [25:06<01:36, 387.35step/s]

--- Global steps 450560 ---
Steps [122880, 122880, 122880, 122880]
Scores: [433.8333333333333, 252.57575757575756, 441.1111111111111, 266.0571428571429]
Fitnesses: ['497.42', '487.83', '485.00', '267.75']
5 fitness avgs: ['484.17', '444.38', '481.68', '400.37']


 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 122880/150000 [27:39<04:31, 100.05step/s]

--- Global steps 491520 ---
Steps [133120, 133120, 133120, 133120]
Scores: [477.11764705882354, 461.4375, 52.592592592592595, 479.1764705882353]
Fitnesses: ['483.75', '464.79', '236.88', '425.54']
5 fitness avgs: ['480.92', '474.64', '431.54', '469.27']


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 133120/150000 [30:31<02:46, 101.64step/s]

--- Global steps 532480 ---
Steps [143360, 143360, 143360, 143360]
Scores: [347.2692307692308, 435.94444444444446, 412.76190476190476, 500.0]
Fitnesses: ['433.88', '438.25', '451.12', '495.88']
5 fitness avgs: ['467.69', '456.93', '464.87', '480.09']


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 143360/150000 [33:16<02:22, 46.59step/s]

--- Global steps 573440 ---
Steps [153600, 153600, 153600, 153600]
Scores: [471.5, 462.55555555555554, 472.5882352941176, 480.11764705882354]
Fitnesses: ['394.42', '500.00', '500.00', '498.25']
5 fitness avgs: ['459.95', '468.67', '481.07', '468.32']


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 143360/150000 [33:26<01:32, 71.44step/s]
