In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random

from stable_baselines3 import PPO, DQN

In [6]:
# Train a PPO agent on CartPole, round-trip it through disk, then watch it play.
env = gym.make("CartPole-v0")

model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(log_interval=4, total_timesteps=10000)
model.save("dqn_cartpole")

# Drop the in-memory agent so the rollout below demonstrably uses the saved copy.
del model

model = PPO.load("dqn_cartpole")

obs = env.reset()
for frame in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    # Begin a fresh episode as soon as the current one has terminated.
    obs = env.reset() if done else obs
env.close()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 48.6         |
|    ep_rew_mean          | 48.6         |
| time/                   |              |
|    fps                  | 1103         |
|    iterations           | 4            |
|    time_elapsed         | 7            |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0106117055 |
|    clip_fraction        | 0.0966       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.638       |
|    explained_variance   | 0.201        |
|    learning_rate        | 0.0003       |
|    loss                 | 20.5         |
|    n_updates            | 30           |
|    policy_gradient_loss | -0.0202      |
|    value_loss           | 55           |
------------------------------------------


In [33]:
# Environment whose state is a 4-element integer array (e.g. [4, 3, 2, 1] or [1, 2, 4, 3]).
class ArrayEnv(Env):
    """Gym environment in which an agent sorts a small integer array by pairwise swaps.

    The state is an array of 4 random integers drawn from ``[0, 1000)``. Each of
    the 6 discrete actions swaps one pair of positions (see ``_SWAP_PAIRS``). An
    episode ends once the state equals its ascending-sorted copy, ``end_array``.

    The environment follows the classic Gym API used elsewhere in this notebook:
    ``reset()`` returns an observation and ``step()`` returns the 4-tuple
    ``(observation, reward, done, info)``.
    """

    # Positions (x, y) swapped by each action, in action order; x < y for every pair.
    _SWAP_PAIRS = ((0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3))
    # Number of elements in the array being sorted.
    _ARRAY_LENGTH = 4
    # Exclusive upper bound of the random values placed in the array.
    _VALUE_UPPER_BOUND = 1000

    def __init__(self):
        """Create the action/observation spaces and draw the first random array."""
        # Retained for backward compatibility; nothing in this class reads it.
        self.game_array_size = 100

        # One action per swappable pair: (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
        self.action_space = Discrete(len(self._SWAP_PAIRS))

        # The bounds must actually contain the generated states (0..999); the
        # previous 0..4 range excluded essentially every observation.
        self.observation_space = Box(
            low=0,
            high=self._VALUE_UPPER_BOUND - 1,
            shape=(self._ARRAY_LENGTH,),
            dtype=np.int16,
        )

        # Populates self.state and self.end_array.
        self.reset()

    def step(self, action):
        """Swap the pair selected by ``action`` and score the move.

        Args:
            action: Integer-like index in ``[0, 6)`` selecting a pair from
                ``_SWAP_PAIRS``.

        Returns:
            A tuple ``(observation, reward, done, info)`` where ``observation``
            is a copy of the array after the swap, ``reward`` is a float,
            ``done`` is True when the array is sorted, and ``info`` is an empty
            dict.

        Raises:
            ValueError: If ``action`` is outside the action space.
        """
        action_index = int(action)
        if not 0 <= action_index < len(self._SWAP_PAIRS):
            # Previously this printed "NO" and then failed with an
            # UnboundLocalError; fail with an explicit message instead.
            raise ValueError(
                "invalid action {!r}: expected an integer in [0, {})".format(
                    action, len(self._SWAP_PAIRS)
                )
            )
        x_indice, y_indice = self._SWAP_PAIRS[action_index]

        # Remember the values before the swap; the reward depends on them.
        x_original = self.state[x_indice]
        y_original = self.state[y_indice]

        # Perform the swap.
        self.state[x_indice] = y_original
        self.state[y_indice] = x_original

        if x_original > y_original:
            # A larger value moved towards the end of the array: reward 0.9/4
            # per element now in its final position, plus 0.1 for the move
            # itself, so a move that completes the sort scores 1.0.
            correct_position = np.count_nonzero(self.state == self.end_array)
            reward = correct_position * (0.9 / self._ARRAY_LENGTH) + 0.1
        else:
            # Undesirable move: swapped two equal values, or moved a larger
            # value towards the front of the array.
            reward = -100

        # The episode is over once the array matches its sorted copy.
        done = bool(np.array_equal(self.state, self.end_array))

        # Placeholder; no diagnostic information is produced.
        info = {}

        # Return a copy so later swaps do not mutate observations held by the caller.
        return self.state.copy(), float(reward), done, info

    def render(self, mode="human"):
        """Print the current array to stdout. ``mode`` is accepted but ignored."""
        print(self.state)

    def reset(self):
        """Draw a new random array plus its sorted target and return the observation.

        Arrays that are already sorted are re-drawn so that an episode never
        starts in its terminal state.

        Returns:
            A copy of the new unsorted state array (dtype int16).
        """
        while True:
            # dtype matches observation_space so observations are valid members of it.
            self.state = np.random.randint(
                self._VALUE_UPPER_BOUND, size=self._ARRAY_LENGTH, dtype=np.int16
            )
            # Ascending-sorted copy: the target the agent must reach.
            self.end_array = np.sort(self.state)
            if not np.array_equal(self.state, self.end_array):
                break

        return self.state.copy()

In [34]:
# Train a PPO agent on the array-sorting environment and persist it to disk.
env = ArrayEnv()

model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(log_interval=4, total_timesteps=50_000)
model.save("test4")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 15.5        |
|    ep_rew_mean          | -679        |
| time/                   |             |
|    fps                  | 1161        |
|    iterations           | 4           |
|    time_elapsed         | 7           |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.008183256 |
|    clip_fraction        | 0.0373      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.76       |
|    explained_variance   | 2.47e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 9.01e+04    |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.02       |
|    value_loss           | 2.12e+05    |
-----------------------------------------
------------------------

In [36]:
# Roll out the trained agent on fresh random arrays and report one line per episode.
env = ArrayEnv()
obs = env.reset()

episodes = 100
for episode in range(1, episodes + 1):
    state, done = env.reset(), False
    # Running totals for this episode: accumulated reward and number of swaps.
    score = moves = 0
    # Call env.render() here, and again after each step, to trace the sort.
    while True:
        moves += 1
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print("Episode:{} \tScore:{} \tMoves:{}".format(episode, score, moves))

--- original array ---
[542 659  33 589]
--- beginning sort ---
Episode:1 Score:1.65
--- original array ---
[589 261 917   9]
--- beginning sort ---
Episode:2 Score:2.425
--- original array ---
[216 340  96 279]
--- beginning sort ---
Episode:3 Score:1.65
--- original array ---
[945 281 507 651]
--- beginning sort ---
Episode:4 Score:1.875
--- original array ---
[501   4 678 628]
--- beginning sort ---
Episode:5 Score:1.55
--- original array ---
[195 339 278 142]
--- beginning sort ---
Episode:6 Score:1.75
--- original array ---
[394 632 150 312]
--- beginning sort ---
Episode:7 Score:1.55
--- original array ---
[248 214 169 688]
--- beginning sort ---
Episode:8 Score:1.875
--- original array ---
[911 817 275 849]
--- beginning sort ---
Episode:9 Score:1.55
--- original array ---
[239 935 870 472]
--- beginning sort ---
Episode:10 Score:1.875
--- original array ---
[606 265  81 847]
--- beginning sort ---
Episode:11 Score:1.0
--- original array ---
[622 692  61 365]
--- beginning sort 

In [None]:
# Baseline: drive the environment with uniformly random swaps for 1000 steps.
env = ArrayEnv()
for step_index in range(1000):
    new_action = env.action_space.sample()
    obs, reward, done, info = env.step(new_action)
    env.render()
    if not done:
        continue
    # The random policy happened to sort the array; announce it and start over.
    print("cool")
    obs = env.reset()
env.close()