In [None]:
import gym
import numpy as np
from gym import spaces
from random import randrange

SIZE = 10
REWARD_COUNT = 1
MAX_STEPS = 2500

class PathFindingEnvironment(gym.Env):
    """Simple Pathfinding Environment"""
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(PathFindingEnvironment, self).__init__()
        self.reward_range = (0, 1)
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.MultiDiscrete([3] * SIZE * SIZE)
        self.num_rounds = 0
        self.reset()

    def step(self, action):
        old_player_location = [self.player_location[0], self.player_location[1]]
        if action == 0:
            self.player_location[0] = min(self.player_location[0] + 1, SIZE - 1)
        elif action == 1:
            self.player_location[0] = max(self.player_location[0] - 1, 0)
        elif action == 2:
            self.player_location[1] = min(self.player_location[1] + 1, SIZE - 1)
        elif action == 3:
            self.player_location[1] = max(self.player_location[1] - 1, 0)
        
        done = False
        reward = 0
                
        if self.map[self.player_location[0]][self.player_location[1]] == 1:
            reward = 1 - min(self.num_steps / MAX_STEPS, 0.9)
            self.reward_count -= 1
            if self.reward_count == 0:
                done = True
        
        if self.num_steps == MAX_STEPS:
            done = True
        
     
        self.map[old_player_location[0]][old_player_location[1]] = 0
        self.map[self.player_location[0]][self.player_location[1]] = 2
        
        self.num_steps += 1
        return self.get_obs(), reward, done, {}
    
    def get_obs(self):
        out = []
        for row in self.map:
            for col in row:
                out.append(col)
        return np.array(out)

    
    def reset(self):
        # Reset the state of the environment to an initial state
        self.map = [[0 for j in range(SIZE)] for i in range(SIZE)]
        for i in range(REWARD_COUNT):
            reward_location = [randrange(SIZE), randrange(SIZE)]
            while (self.map[reward_location[0]][reward_location[1]] == 1):
                reward_location = [randrange(SIZE), randrange(SIZE)]
            self.map[reward_location[0]][reward_location[1]] = 1
        self.reward_count = REWARD_COUNT
        self.num_rounds += 1
        self.num_steps = 0
        self.player_location = [randrange(SIZE), randrange(SIZE)]
        while (self.map[self.player_location[0]][self.player_location[1]] == 1):
             self.player_location = [randrange(SIZE), randrange(SIZE)]
        self.map[self.player_location[0]][self.player_location[1]] = 2
        return self.get_obs()

    def render(self, mode='human', close=False):
        # Render the environment to the screen    
        for row in self.map:
            display = ""
            for col in row:
                if col == 0:
                    display += ' '
                elif col == 1:
                    display += '*'
                elif col == 2:
                    display += 'X'
            print(display)


In [3]:
from stable_baselines.common.policies import MlpPolicy, MlpLnLstmPolicy, FeedForwardPolicy, CnnPolicy
from stable_baselines.common import make_vec_env
from stable_baselines.common.env_checker import check_env
from stable_baselines import PPO2, DQN, ACER
from IPython.display import display, clear_output
import tensorflow as tf

tf.compat.v1.disable_eager_execution()
# The algorithms require a vectorized environment to run

# # Custom MLP policy of three layers of size 128 each
# class CustomPolicy(FeedForwardPolicy):
#     def __init__(self, *args, **kwargs):
#         super(CustomPolicy, self).__init__(*args, **kwargs,
#                                            net_arch=[dict(pi=[64, 64, 64, 64, 64, 64],
#                                                           vf=[64, 64, 64, 64, 64, 64])],
#                                            feature_extraction="mlp")

env = make_vec_env(PathFindingEnvironment, n_envs=8)
model = ACER(CnnPolicy, env, tensorboard_log="./tensorboard/")
model.setup_model()
model.learn(total_timesteps=2000000)
env.get_attr('num_rounds')



IndexError: list index out of range

In [16]:
import time
from stable_baselines.common.evaluation import evaluate_policy



# # Evaluate the agent
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=1000)
done = 0
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
#     action=[randrange(4)]
    obs, rewards, dones, info = env.step(action)
    if (dones[0]):
        done += 1
        print(rewards)
        print(i)
    clear_output(wait=True)
    print(i)
    env.envs[0].render()
    time.sleep(0.5)


done

38
      
      
      
 X    
*     
      


KeyboardInterrupt: 