In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete
import numpy as np
import random

from stable_baselines3 import PPO, DQN

In [38]:
# this will be an environment of an array 4*1 (eg. [4,3,2,1] or [1,2,4,3]
class ArrayEnv(Env):
    
    def __init__(self):

        self.game_array_size = 100

        self.action_space = Discrete(6) #6 possible types of swaps (01,02,03,12,13,23)

        high = np.array([1000] * 4)
        low = np.array([0] * 4)

        self.observation_space = Box(low, high, dtype=np.int16)

        #create an array of 100 random numbers between 0 and 1000
        self.state = np.random.randint(1000, size=(4))
        #game legnth of 100, shouldn't take more than 100 swaps

        self.end_array = np.copy(self.state)
        self.end_array.sort()

    #results when action taken
    def step(self, action):
        #print(self.state)
        #get the two values to swap
        if action == 0:
            x_indice = 0
            y_indice = 1
            
        elif action == 1:
            x_indice = 0
            y_indice = 2
            
        elif action == 2:
            x_indice = 0
            y_indice = 3
            
        elif action == 3:
            x_indice = 1
            y_indice = 2
            
        elif action == 4:
            x_indice = 1
            y_indice = 3
            
        elif action == 5:
            x_indice = 2
            y_indice = 3
        else:
            print("NO")
        

        #save the original values in order for ease of reading
        x_original = self.state[x_indice]
        y_original = self.state[y_indice]

        #perform the swap
        temp = self.state[x_indice]
        self.state[x_indice] = self.state[y_indice]
        self.state[y_indice] = temp


        #to calculate reward we first need to know how many elements are in the right spot
        correct_position = np.count_nonzero(self.state == self.end_array)
        # let's only reward if x comes before y in the array to simplify learning
        if x_indice < y_indice:
            #reward is set to the amount of things in correct position with size relative to
            #0.9/100 so that when everything is in place, the reward == 0.9 and then may be added to if the
            # movement itself is correct
            reward = correct_position*(0.9/4)
            #check if value at x is greater than value at y
            if x_original > y_original:
                #if a large x value is moving down the array
                reward +=0.1
            else:
                #undesirable action i.e. swapping two equal values or moving a large value up in the array
                reward = -100

        #check if game is over by comparing the current state to the final intended array
        if (self.state == self.end_array).all() == True:
            done = True
        else:
            done = False

        #set placeholder for info
        info = {}

        #return all data
        return self.state, reward, done, info        


    #implement printing the array here
    def render(self):
        #print (np.count_nonzero(self.state == self.end_array))
        print(self.state)

    #reset/setup the environment
    def reset(self):
        #reset array to random numbers
        self.state = np.random.randint(1000, size=(4))

        #create a sorted array for our final state
        self.end_array = np.copy(self.state)
        self.end_array.sort()
        #reset game length

        return self.state
    
    

In [2]:
# this will be an environment of an array 10*1 (eg. [4,3,2,1,6,7,8,9,29,201]
class ArrayEnv2(Env):
    
    def __init__(self, game_size):

        self.game_array_size = game_size

        self.action_space = MultiDiscrete([self.game_array_size, self.game_array_size]) #10 possible xs and 10 possible ys

        high = np.array([1000] * self.game_array_size)
        low = np.array([0] * self.game_array_size)

        self.observation_space = Box(low, high, dtype=np.int16)

        #create an array of 100 random numbers between 0 and 1000
        self.state = np.random.randint(1000, size=(self.game_array_size))
        #game legnth of 100, shouldn't take more than 100 swaps

        self.end_array = np.copy(self.state)
        self.end_array.sort()

    #results when action taken
    def step(self, action):

        
        x_indice = action[0]
        y_indice = action[1]
        #print("From: {}\tX: {}\tY: {}".format(action, x_indice, y_indice))
        #save the original values in order for ease of reading
        x_original = self.state[x_indice]
        y_original = self.state[y_indice]

        #perform the swap
        temp = self.state[x_indice]
        self.state[x_indice] = self.state[y_indice]
        self.state[y_indice] = temp


        #to calculate reward we first need to know how many elements are in the right spot
        correct_position = np.count_nonzero(self.state == self.end_array)
        # let's only reward if x comes before y in the array to simplify learning
        if x_indice < y_indice:
            #reward is set to the amount of things in correct position with size relative to
            #0.9/100 so that when everything is in place, the reward == 0.9 and then may be added to if the
            # movement itself is correct
            reward = correct_position*(0.9/self.game_array_size)
            #check if value at x is greater than value at y
            if x_original > y_original:
                #if a large x value is moving down the array
                reward +=0.1
            else:
                #undesirable action i.e. swapping two equal values or moving a large value up in the array
                reward = -100
        else:
            reward = -100

        #check if game is over by comparing the current state to the final intended array
        if (self.state == self.end_array).all() == True:
            done = True
        else:
            done = False

        #set placeholder for info
        info = {}

        #return all data
        return self.state, reward, done, info        


    #implement printing the array here
    def render(self):
        #print (np.count_nonzero(self.state == self.end_array))
        print(self.state)

    #reset/setup the environment
    def reset(self):
        #reset array to random numbers
        self.state = np.random.randint(1000, size=(self.game_array_size))

        #create a sorted array for our final state
        self.end_array = np.copy(self.state)
        self.end_array.sort()
        #reset game length

        return self.state

In [15]:
env = ArrayEnv2(5)

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log="./PPO_array_5/", learning_rate = 0.00003)
model.learn(total_timesteps=3000000, log_interval=4)
model.save("test5-03")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./PPO_array_5/PPO_9
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 169           |
|    ep_rew_mean          | -1.35e+04     |
| time/                   |               |
|    fps                  | 786           |
|    iterations           | 4             |
|    time_elapsed         | 10            |
|    total_timesteps      | 8192          |
| train/                  |               |
|    approx_kl            | 3.2612734e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -3.22         |
|    explained_variance   | 0.000194      |
|    learning_rate        | 3e-05         |
|    loss                 | 8.33e+05      |
|    n_updates            | 30            |
|    policy_gradient_loss | -0.00034      |
|    value_loss           | 1.64e+06   

In [17]:
#env = ArrayEnv2(5)
obs = env.reset()
#model = PPO("MlpPolicy", env, verbose=1, tensorboard_log="./PPO_array_5/")
#model.load("test5")

episodes = 100
negatives = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    #print("--- original array ---")
    #env.render()
    #print("--- beginning sort ---")
    moves = 0
    while not done:
        moves+=1
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)
        score+=reward
        #env.render()
    if score < 0:
        negatives+=1
    print("Episode: {} \tScore: {} \tMoves: {}".format(episode, score, moves))

print("Negatives: {}\t % Neg: {}".format(negatives, negatives/episodes)

SyntaxError: invalid syntax (<ipython-input-17-8d4bc8146b5b>, line 22)

In [None]:
env = ArrayEnv2(5)
for x in range(1000):
    new_action = env.action_space.sample()
    obs, reward, done, info = env.step(new_action)
    env.render()
    if done:
        print("cool")
        obs = env.reset()
env.close()