In [1]:
import gym
import numpy as np
import pygame
from gym import spaces
import random
import math
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env


In [4]:

class CustomEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(CustomEnv, self).__init__()
        self.screen_width = 1240
        self.screen_height = 800
        self.player_speed = 5
        self.goal_speed = 0
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(low=np.array([0, 0]), high=np.array([self.screen_width, self.screen_height]), dtype=np.float32)
        self.goal_radius = 20
        self.border_width = 10
        self.state = None
        self.steps_beyond_done = None
        self.screen = None
        self.clock = pygame.time.Clock()

    def step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        player_x, player_y, goal_x, goal_y = self.state
        player_x += (action == 3) * -self.player_speed + (action == 4) * self.player_speed
        player_y += (action == 1) * -self.player_speed + (action == 2) * self.player_speed
        player_x = np.clip(player_x, self.border_width, self.screen_width - self.border_width)
        player_y = np.clip(player_y, self.border_width, self.screen_height - self.border_width)
        self.state = (player_x, player_y, goal_x, goal_y)
        done = math.sqrt((player_x - goal_x) ** 2 + (player_y - goal_y) ** 2) < self.goal_radius
        reward = 1 if done else 0
        info = {}
        return np.array(self.state), reward, done, info

    def reset(self):
        self.state = (self.screen_width // 2, self.screen_height // 2, random.randint(self.border_width, self.screen_width - self.border_width), random.randint(self.border_width, self.screen_height - self.border_width))
        self.steps_beyond_done = None
        return np.array(self.state)

    def render(self, mode='human'):
        if self.screen is None:
            pygame.init()
            self.screen = pygame.display.set_mode((self.screen_width, self.screen_height))
        self.screen.fill((0, 0, 0))
        player_x, player_y, goal_x, goal_y = self.state
        pygame.draw.circle(self.screen, (255, 0, 0), (int(player_x), int(player_y)), 10)
        pygame.draw.circle(self.screen, (0, 0, 255), (int(goal_x), int(goal_y)), self.goal_radius)
        pygame.display.flip()
        self.clock.tick(60)

    def close(self):
        if self.screen is not None:
            pygame.quit()
            self.screen = None
            
    def seed(self, seed=None):
        self._seed = seed
        random.seed(seed)
        np.random.seed(seed)

# Vectorize the environment
env = make_vec_env(lambda: CustomEnv(), n_envs=1)

# Instantiate the agent
model = PPO('MlpPolicy', env, verbose=1)



Using cpu device


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [5]:
# Train the agent
model.learn(total_timesteps=100)

# Save the agent
model.save("ppo_custom_env")

ValueError: could not broadcast input array from shape (4,) into shape (2,)

In [None]:
# Load and train further
model = PPO.load("ppo_custom_env")
model.learn(total_timesteps=10000)

In [None]:

# Test the trained agent
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()
    if dones:
        obs = env.reset()
env.close()