In [8]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete
import numpy as np
import random

from stable_baselines3 import PPO, DQN

In [38]:
# An environment over a 4*1 array (e.g. [4,3,2,1] or [1,2,4,3]).
class ArrayEnv(Env):
    """Gym environment in which the agent sorts a 4-element array by swapping pairs.

    Each episode starts from four random integers in [0, 1000). An action picks
    one of the six index pairs (x, y) with x < y and swaps the two entries. The
    episode is done once the array equals its ascending-sorted copy.

    Reward for a step:
      * ``-100`` when the swap does not move a larger value toward the end of
        the array (i.e. the value at x was not greater than the value at y,
        which includes swapping two equal values);
      * otherwise ``0.9 * (fraction of entries in their sorted position) + 0.1``.
    """

    # Index pairs exchanged by each discrete action, listed in action order
    # (01, 02, 03, 12, 13, 23).
    _SWAP_PAIRS = ((0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3))

    # Penalty for a swap that does not move a larger value down the array.
    _BAD_SWAP_REWARD = -100

    def __init__(self):
        # Number of entries in the array being sorted.
        self.game_array_size = 4

        # One action per possible swap.
        self.action_space = Discrete(len(self._SWAP_PAIRS))

        # Bounds are built with the same dtype as the space so that the
        # observations returned by reset()/step() are members of the space.
        low = np.zeros(self.game_array_size, dtype=np.int16)
        high = np.full(self.game_array_size, 1000, dtype=np.int16)
        self.observation_space = Box(low, high, dtype=np.int16)

        self._start_episode()

    def _start_episode(self):
        """Draw a fresh random array and remember its sorted form as the goal."""
        # dtype matches observation_space; values 0..999 fit in int16.
        self.state = np.random.randint(
            1000, size=self.game_array_size, dtype=np.int16
        )
        self.end_array = np.sort(self.state)

    def step(self, action):
        """Apply the swap selected by ``action``.

        Args:
            action: integer in ``[0, 6)`` indexing the pair to swap.

        Returns:
            ``(state, reward, done, info)``; ``state`` is the environment's own
            array (mutated in place), ``info`` is an empty dict.

        Raises:
            ValueError: if ``action`` is not a valid swap index.
        """
        action_index = int(action)
        if not 0 <= action_index < len(self._SWAP_PAIRS):
            raise ValueError(
                "action must be in [0, {}), got {!r}".format(
                    len(self._SWAP_PAIRS), action
                )
            )
        x_indice, y_indice = self._SWAP_PAIRS[action_index]

        # Scalar indexing returns copies, so these keep the pre-swap values.
        x_original = self.state[x_indice]
        y_original = self.state[y_indice]

        # Perform the swap.
        self.state[x_indice] = y_original
        self.state[y_indice] = x_original

        # Number of entries that now sit in their sorted position.
        correct_position = np.count_nonzero(self.state == self.end_array)

        # Every pair in _SWAP_PAIRS has x before y, so only the value
        # comparison decides whether the move was desirable.
        if x_original > y_original:
            # Scaled so a fully sorted array yields 0.9, plus 0.1 for the
            # swap itself moving a larger value down the array.
            reward = correct_position * (0.9 / 4)
            reward += 0.1
        else:
            reward = self._BAD_SWAP_REWARD

        # The game is over once the array matches the sorted target.
        done = bool(np.array_equal(self.state, self.end_array))

        # Placeholder for info.
        info = {}

        return self.state, reward, done, info

    def render(self):
        """Print the current array."""
        print(self.state)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self._start_episode()
        return self.state
    
    

In [35]:
# An environment over a game_size*1 array (e.g. [4,3,2,1,6,7,8,9,29,201] for game_size=10).
class ArrayEnv2(Env):
    """Gym environment in which the agent sorts a ``game_size``-element array by swaps.

    Each episode starts from ``game_size`` random integers in [0, 1000). An
    action is a pair ``(x, y)`` of indices; the two entries are always swapped,
    and the episode is done once the array equals its ascending-sorted copy.

    Reward for a step:
      * ``-100`` unless ``x < y`` and the value at x was greater than the value
        at y before the swap (so equal values, reversed index order and
        ``x == y`` are all penalised);
      * otherwise ``0.9 * (fraction of entries in their sorted position) + 0.1``.

    Args:
        game_size: number of entries in the array.
    """

    # Penalty for any swap that is not "larger value moves toward the end".
    _BAD_SWAP_REWARD = -100

    def __init__(self, game_size):
        self.game_array_size = game_size

        # game_size possible xs and game_size possible ys.
        self.action_space = MultiDiscrete(
            [self.game_array_size, self.game_array_size]
        )

        # Bounds are built with the same dtype as the space so that the
        # observations returned by reset()/step() are members of the space.
        low = np.zeros(self.game_array_size, dtype=np.int16)
        high = np.full(self.game_array_size, 1000, dtype=np.int16)
        self.observation_space = Box(low, high, dtype=np.int16)

        self._start_episode()

    def _start_episode(self):
        """Draw a fresh random array and remember its sorted form as the goal."""
        # dtype matches observation_space; values 0..999 fit in int16.
        self.state = np.random.randint(
            1000, size=self.game_array_size, dtype=np.int16
        )
        self.end_array = np.sort(self.state)

    def step(self, action):
        """Swap the two entries selected by ``action``.

        Args:
            action: indexable pair ``(x, y)`` of array indices.

        Returns:
            ``(state, reward, done, info)``; ``state`` is the environment's own
            array (mutated in place), ``info`` is an empty dict.
        """
        x_indice = int(action[0])
        y_indice = int(action[1])

        # Scalar indexing returns copies, so these keep the pre-swap values.
        x_original = self.state[x_indice]
        y_original = self.state[y_indice]

        # Perform the swap.
        self.state[x_indice] = y_original
        self.state[y_indice] = x_original

        # Number of entries that now sit in their sorted position.
        correct_position = np.count_nonzero(self.state == self.end_array)

        # Only reward when x comes before y (to simplify learning) and a
        # larger value was moved down the array.
        if x_indice < y_indice and x_original > y_original:
            # Scaled so a fully sorted array yields 0.9, plus 0.1 for the
            # swap itself being a correct movement.
            reward = correct_position * (0.9 / self.game_array_size)
            reward += 0.1
        else:
            reward = self._BAD_SWAP_REWARD

        # The game is over once the array matches the sorted target.
        done = bool(np.array_equal(self.state, self.end_array))

        # Placeholder for info.
        info = {}

        return self.state, reward, done, info

    def render(self):
        """Print the current array."""
        print(self.state)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self._start_episode()
        return self.state

In [47]:
# Train PPO on the 5-element sorting environment and save the resulting policy.
env = ArrayEnv2(5)

model = PPO(
    policy="MlpPolicy",
    env=env,
    tensorboard_log="./PPO_array_5/",
    verbose=1,
)
model.learn(log_interval=4, total_timesteps=10000000)
model.save("test5-1")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./PPO_array_5/PPO_6
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 169          |
|    ep_rew_mean          | -1.3e+04     |
| time/                   |              |
|    fps                  | 746          |
|    iterations           | 4            |
|    time_elapsed         | 10           |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0067355293 |
|    clip_fraction        | 0.0249       |
|    clip_range           | 0.2          |
|    entropy_loss         | -3.17        |
|    explained_variance   | -0.000128    |
|    learning_rate        | 0.0003       |
|    loss                 | 6.78e+05     |
|    n_updates            | 30           |
|    policy_gradient_loss | -0.00929     |
|    value_loss           | 1.37e+06     |
---------------

In [61]:
# Roll out the trained model for a few episodes, printing the array after
# every move and a per-episode summary.
env = ArrayEnv2(5)

episodes = 10
# The environment never ends an episode on its own unless the array is sorted,
# and the policy can oscillate between the same few states forever, so each
# episode is cut off after this many moves.
max_moves = 1000
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0
    moves = 0
    while not done and moves < max_moves:
        moves += 1
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)
        score += reward
        env.render()
    if not done:
        print("Episode:{} stopped after {} moves without sorting the array".format(episode, moves))
    print("Episode:{} \tScore:{} \tMoves:{}".format(episode, score, moves))

[377 851 887 888 480]
[377 851 887 480 888]
[377 480 887 851 888]
[377 480 851 887 888]
Episode:1 	Score:2.38 	Moves:4
[ 44 188 852 847 897]
[ 44 188 897 847 852]
[ 44 188 852 847 897]
[ 44 188 847 852 897]
Episode:2 	Score:-97.72 	Moves:4
[190   2 682 699 800]
[190 699 682   2 800]
[  2 699 682 190 800]
[  2 190 682 699 800]
Episode:3 	Score:-97.72 	Moves:4
[ 83 829 200 564 892]
[ 83 200 829 564 892]
[ 83 200 564 829 892]
Episode:4 	Score:2.1 	Moves:3
[  8  66 342 957  30]
[  8  66 342  30 957]
[  8  66  30 342 957]
[  8  30  66 342 957]
Episode:5 	Score:2.38 	Moves:4
[ 44 163 149 514 930]
[ 44 149 163 514 930]
Episode:6 	Score:1.64 	Moves:2
[187 690 729 102 893]
[102 690 729 187 893]
[102 187 729 690 893]
[102 187 690 729 893]
Episode:7 	Score:2.38 	Moves:4
[401 629 642 582 976]
[401 629 582 642 976]
[401 642 582 629 976]
[401 642 629 582 976]
[401 642 582 629 976]
[401 642 629 582 976]
[401 642 582 629 976]
[401 629 582 642 976]
[401 642 582 629 976]
[401 629 582 642 976]
[401 642 5

In [None]:
# Drive the environment with 1000 random actions, printing the array after
# each one and starting a new episode whenever the array ends up sorted.
env = ArrayEnv2(5)
for x in range(1000):
    obs, reward, done, info = env.step(env.action_space.sample())
    env.render()
    if not done:
        continue
    print("cool")
    obs = env.reset()
env.close()

In [70]:
#copied Code

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
#from pettingzoo.butterfly import pistonball_v4
import supersuit as ss
from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
import optuna
import os
import ray
from pathlib import Path
import gym
from ray.tune.suggest import ConcurrencyLimiter

# Hyperparameter search space handed to Optuna; each entry names a PPO
# setting (or the number of parallel envs) and the distribution it is
# sampled from.
space = dict(
    n_epochs=optuna.distributions.IntUniformDistribution(3, 50),
    gamma=optuna.distributions.LogUniformDistribution(0.9, 0.999),
    ent_coef=optuna.distributions.LogUniformDistribution(0.001, 0.1),
    learning_rate=optuna.distributions.LogUniformDistribution(5e-6, 5e-4),
    vf_coef=optuna.distributions.UniformDistribution(0.1, 1),
    gae_lambda=optuna.distributions.UniformDistribution(0.8, 1),
    max_grad_norm=optuna.distributions.LogUniformDistribution(0.01, 10),
    n_steps=optuna.distributions.CategoricalDistribution(
        [128, 256, 512, 1024, 2048, 4096]
    ),
    # Larger batch sizes (512, 1024, 2048, 4096) are currently left out.
    batch_size=optuna.distributions.CategoricalDistribution([32, 64, 128, 256]),
    n_envs=optuna.distributions.CategoricalDistribution([2, 4, 8]),
    clip_range=optuna.distributions.UniformDistribution(0.1, 5),
)

# Search algorithm that maximises the "mean_reward" reported by each trial.
optuna_search = OptunaSearch(space, mode="max", metric="mean_reward")


def make_env(n_envs):
    """Build the 5-element sorting environment.

    Args:
        n_envs: ``None`` for a single plain environment; otherwise the number
            of copies to wrap into a stable-baselines3 vector environment.

    Returns:
        The plain ``ArrayEnv2`` or its vectorised wrapper.
    """
    env = ArrayEnv2(5)
    if n_envs is not None:
        env = ss.stable_baselines3_vec_env_v0(env, n_envs, multiprocessing=False)
    return env


def evaluate_all_policies(name):
    """Evaluate the most recent checkpoint saved for the run called ``name``.

    Looks in ``~/policy_logs/<name>/`` for checkpoint files, loads the one with
    the highest step count and returns its mean total reward per episode.

    Args:
        name: run name, as produced by ``gen_filename``.

    Returns:
        Total reward summed over all evaluation episodes, divided by the
        number of episodes.
    """
    NUM_RESETS = 100
    # A deterministic policy that revisits a state would repeat the same
    # moves forever, so every evaluation episode is cut off after this many
    # steps.
    MAX_STEPS_PER_EPISODE = 1000

    def evaluate_policy(env, model):
        """Run NUM_RESETS deterministic episodes and return the mean reward."""
        total_reward = 0

        for _ in range(NUM_RESETS):
            obs = env.reset()
            done = False
            steps = 0
            while not done and steps < MAX_STEPS_PER_EPISODE:
                act = model.predict(obs, deterministic=True)[0]
                # Feed the observation returned by this step into the next
                # prediction instead of relying on the env reusing one array.
                obs, reward, done, info = env.step(act)
                total_reward += reward
                steps += 1

        return total_reward / NUM_RESETS

    env = make_env(None)
    policy_folder = str(Path.home())+'/policy_logs/'+name+'/'
    policy_files = os.listdir(policy_folder)
    # Characters [9:-10] of the file name are parsed as the step count; this
    # assumes names of the form "rl_model_<steps>_steps.zip" (TODO confirm
    # against the CheckpointCallback naming in use).
    policy_file = max(policy_files, key=lambda x: int(x[9:-10]))
    model = PPO.load(policy_folder+policy_file)

    return evaluate_policy(env, model)


def gen_filename(params):
    """Build a run name from a hyperparameter mapping.

    Each entry contributes ``<key>_<first five characters of str(value)>``;
    the entries are joined with underscores in mapping order and every ``.``
    is then removed from the result.

    Args:
        params: mapping from string keys to values.

    Returns:
        The run name; an empty string for an empty mapping.
    """
    pieces = [key + '_' + str(value)[:5] for key, value in params.items()]
    return '_'.join(pieces).replace('.', '')


def train(parameterization):
    """Train PPO with one sampled hyperparameter set and report its score.

    Checkpoints go to ``~/policy_logs/<run name>/`` every 400 callback steps
    and tensorboard logs to ``~/tensorboard_logs/<run name>/``. After training,
    the latest checkpoint is evaluated and its mean reward is reported to Tune.

    Args:
        parameterization: mapping holding ``n_envs`` plus the PPO settings
            listed in ``ppo_keys`` below.
    """
    run_name = gen_filename(parameterization)
    home = str(Path.home())
    checkpoint_dir = home + '/policy_logs/' + run_name + '/'
    tensorboard_dir = home + '/tensorboard_logs/' + run_name + '/'

    checkpoint_callback = CheckpointCallback(save_freq=400, save_path=checkpoint_dir)
    env = make_env(parameterization['n_envs'])

    # Settings forwarded to PPO unchanged, under the same names.
    ppo_keys = (
        'gamma',
        'n_steps',
        'ent_coef',
        'learning_rate',
        'vf_coef',
        'max_grad_norm',
        'gae_lambda',
        'batch_size',
        'clip_range',
        'n_epochs',
    )
    ppo_settings = {key: parameterization[key] for key in ppo_keys}

    model = PPO(
        "MlpPolicy",
        env,
        tensorboard_log=tensorboard_dir,
        policy_kwargs={"net_arch": [256, 256]},
        **ppo_settings,
    )
    # Time steps of each agent; was 4 million.
    model.learn(total_timesteps=2000000, callback=checkpoint_callback)

    tune.report(mean_reward=evaluate_all_policies(run_name))


# Attach to the already-running Ray cluster.
ray.init(address='auto')

# Launch 100 trials of `train`, with at most 10 running concurrently and each
# trial given 1 GPU and 5 CPUs.
analysis = tune.run(
    train,
    search_alg=ConcurrencyLimiter(optuna_search, max_concurrent=10),
    num_samples=100,
    resources_per_trial={"gpu": 1, "cpu": 5},
    verbose=2,
)

AssertionError: Optuna must be installed! Run `pip install optuna`.