In [3]:
import gymnasium as gym 
import pygame
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random, time
import os
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
print('done')

done


In [24]:
class MowerEnv(Env):
    """Square grid world in which an agent has to drive over every cell.

    The grid is ``len x len``. Each cell of ``self.state`` holds one code:
    0 = visited, 1 = not visited, 2 = wall (reserved, never placed yet),
    3 = current position of the mower ("car").

    Actions are the four moves listed in ``self.direct``, applied to
    ``(row, col)``: 0 = row-1, 1 = col+1, 2 = row+1, 3 = col-1.

    Rewards per step:
      * -5 for trying to leave the grid (the mower stays in place),
      * +1 for entering a not-yet-visited cell,
      * -1 for re-entering a visited cell,
      * ``3 * len * len`` (replacing the step reward) once no unvisited
        cell is left, which also ends the episode.

    An episode also ends after ``(len + 4) ** 2`` steps.

    Args:
        len: Side length of the square grid. (The name shadows the builtin;
            it is kept because it is part of the constructor's interface.)
        render_mode: When ``True`` a 720x720 pygame window is opened and
            ``render()`` draws into it; any other value disables rendering.
    """

    # Side length of the (square) pygame window in pixels.
    WINDOW_SIZE = 720

    # RGB colour per cell code; unknown codes fall back to WALL_COLOR.
    CELL_COLORS = {
        0: (255, 255, 255),  # visited
        1: (255, 255, 0),    # not visited
        3: (255, 0, 0),      # mower
    }
    WALL_COLOR = (128, 128, 128)

    def __init__(self, len, render_mode = True):
        self.len = len
        self.render_mode = render_mode
        self.screen = None
        # Deliberately ``== True`` (not truthiness) so that string modes such
        # as "human" keep behaving exactly as before (no window).
        if render_mode == True:
            pygame.init()
            self.screen = pygame.display.set_mode((self.WINDOW_SIZE, self.WINDOW_SIZE))
        # Four moves: up, right, down, left (see ``self.direct``).
        self.action_space = Discrete(4)
        # 0=visited, 1=notvisited, 2=wall, 3=car (not one-hot encoded)
        self.observation_space = Box(0, 4, shape=(self.len, self.len), dtype=np.float32)
        # (d_row, d_col) for each action index.
        self.direct = [[-1, 0], [0, 1], [1, 0], [0, -1]]
        self._reset_grid()

    def _reset_grid(self):
        """Put the mower at (0, 0) on an all-unvisited grid and refill the step budget."""
        self.state = np.ones((self.len, self.len), dtype=np.float32)
        self.state[0, 0] = 3
        self.pos = [0, 0]
        # Number of steps allowed per episode.
        self.running_length = (self.len + 4) * (self.len + 4)

    def step(self, action):
        """Apply one move.

        Args:
            action: Index into ``self.direct``; anything ``int()`` accepts
                (Python int, numpy scalar, 0-d array from ``model.predict``).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``; running out of steps is
            reported through ``terminated`` as well, which existing callers
            that only inspect ``terminated`` rely on.
        """
        # Plain int so that list indexing does not depend on the numpy type.
        action = int(action)
        reward = 0.0

        # Apply action (no walls for now)
        d_row, d_col = self.direct[action]
        x = self.pos[0] + d_row
        y = self.pos[1] + d_col
        if x < 0 or x >= self.len or y < 0 or y >= self.len:
            # Bumped into the border: penalise and stay put.
            reward = -5.0
        else:
            target = self.state[x, y]
            if target == 1:
                reward = 1.0
            elif target == 0:
                reward = -1.0
            self.state[self.pos[0], self.pos[1]] = 0
            self.state[x, y] = 3
            self.pos = [x, y]

        # Consume one step of the episode budget.
        self.running_length -= 1
        done = self.running_length <= 0

        # Everything mowed: no cell is still marked "not visited".
        if not np.any(self.state == 1):
            reward = float(self.len * self.len * 3)
            done = True

        info = {}

        # Hand out a copy so later steps cannot mutate an observation the
        # caller is still holding.
        return (self.state.copy(), reward, done, False, info)

    def render(self):
        """Draw the grid into the pygame window (no-op unless ``render_mode`` is True)."""
        if self.render_mode != True or self.screen is None:
            return
        # Drain the event queue so the OS does not flag the window as frozen.
        pygame.event.get()
        # Iterate over cells, not pixels: stepping through pixel offsets
        # yields too many indices whenever WINDOW_SIZE is not a multiple of
        # ``len``. ``max(1, ...)`` keeps grids larger than the window drawable.
        gap = max(1, self.WINDOW_SIZE // self.len)
        for row in range(self.len):
            for col in range(self.len):
                color = self.CELL_COLORS.get(int(self.state[row, col]), self.WALL_COLOR)
                pygame.draw.rect(self.screen, color, (col * gap, row * gap, gap, gap))
        pygame.display.update()

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to ``Env.reset`` so ``self.np_random`` is seeded.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)
        self._reset_grid()
        return self.state.copy(), {}

    def close(self):
        """Intentionally a no-op.

        The pygame window is left open so that rendering keeps working when
        ``close()`` is called between episodes; call ``pygame.quit()`` to
        tear the window down.
        """
        pass

array([[2, 3, 0, 1, 0, 3, 0, 0],
       [1, 3, 5, 0, 5, 2, 3, 5],
       [5, 1, 1, 1, 2, 4, 2, 2],
       [1, 4, 1, 1, 3, 4, 2, 1],
       [0, 3, 5, 4, 5, 4, 5, 0],
       [4, 1, 4, 5, 0, 3, 0, 1],
       [3, 5, 3, 3, 4, 3, 0, 3],
       [0, 5, 2, 3, 0, 1, 4, 5]], dtype=int64)

In [25]:
# 36x36 lawn, with the pygame window enabled.
env = MowerEnv(len=36, render_mode=True)

In [26]:
# Run Stable-Baselines3's environment checker on the custom env before training.
check_env(env)

In [28]:
# Alternative algorithm, kept for reference:
# model = DQN("MlpPolicy", env, verbose=1)
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    gamma=0.999,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [29]:
# Train the agent; the step count is written with digit grouping for readability.
model.learn(total_timesteps=400_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.6e+03  |
|    ep_rew_mean     | -976     |
| time/              |          |
|    fps             | 466      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.6e+03     |
|    ep_rew_mean          | -1.39e+03   |
| time/                   |             |
|    fps                  | 356         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.017953143 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -0.0106     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x225bbcc1c90>

In [32]:
# Roll the trained policy out for a few rendered episodes and report the
# mean episode length and mean episode return.
N_EVAL_EPISODES = 5

average = 0  # running sum of rewards over all episodes (averaged when printed)
length = 0   # running count of steps over all episodes
for i in range(N_EVAL_EPISODES):
    obs, _ = env.reset()
    while True:
        # predict() returns (action, recurrent_state); only the action is needed.
        action, _ = model.predict(obs)

        obs, reward, terminated, truncated, info = env.step(action)
        average += reward
        length += 1

        # Rendering the game (drop these two lines for a headless evaluation).
        env.render()
        time.sleep(1 / 600)  # cap the frame rate at roughly 600 FPS

        # An episode is over on either flag; ignoring `truncated` would spin
        # forever with an env that reports time-outs through it.
        if terminated or truncated:
            break

# Close once, after all episodes, rather than between episodes.
env.close()
print(length / N_EVAL_EPISODES)
print(average / N_EVAL_EPISODES)

1600.0
-1330.0


In [15]:
# Shut pygame down; MowerEnv.close() is a no-op, so the window opened by
# MowerEnv is only torn down here.
pygame.quit()