## 1. Import dependencies

In [1]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import vec_frame_stack
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Types of spaces

In [3]:
Discrete(3).sample()

np.int64(2)

In [6]:
Box(1, 2, shape=(3, 3)).sample()

array([[1.1809281, 1.4727439, 1.1300299],
       [1.1918077, 1.0165738, 1.7082382],
       [1.6306828, 1.6222956, 1.8615623]], dtype=float32)

In [7]:
Tuple((Discrete(3), Box(1, 2, shape=(3, 3)))).sample()

(np.int64(2),
 array([[1.8700967, 1.8984876, 1.9108628],
        [1.6352034, 1.5672063, 1.7939303],
        [1.0130398, 1.2071084, 1.0129813]], dtype=float32))

In [8]:
Dict({'height':Discrete(2), "speed":Box(0,100, shape=(1,))}).sample()

{'height': np.int64(0), 'speed': array([43.863068], dtype=float32)}

In [14]:
MultiBinary(4).sample()

array([0, 1, 1, 0], dtype=int8)

In [20]:
MultiDiscrete([15, 10, 5]).sample() # [varies between 0:14, 0:9, 0:4]

array([10,  2,  1])

## 3. Building and Env

In [21]:
# Main 4 functions in any environment
class CustomEnv(Env):
    def __init__(self):
        pass
    def step(self, action):
        pass
    def render(self):
        pass
    def reset (self):
        pass

In [30]:
import numpy as np
import random
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

class ShowerEnv(Env):
    def __init__(self):
        # Actions we can take: down (0), stay (1), up (2)
        self.action_space = Discrete(3)
        
        # Temperature array (1D array of a single float)
        self.observation_space = Box(
            low=np.array([0.0]), # Use float for consistency
            high=np.array([100.0]), # Use float for consistency
            dtype=np.float32)
        
        # Initial state: 1D array containing a random starting temperature
        self.state = np.array([38.0 + random.randint(-3, 3)]).astype(np.float32)
        
        # Set shower length -> episode length
        self.shower_length = 60
        
    # -----------------------------------------------------------
    def step(self, action):
        # Apply action
        # 0 - 1 = -1  -> decrease temp
        # 1 - 1 = 0   -> hold
        # 2 - 1 = 1   -> increase tmep
        
        # Note: self.state is a numpy array, so we must add a scalar to the first element
        self.state[0] += action - 1 
        
        # Reduce shower length (episode) by 1 second
        self.shower_length -= 1 
        
        # Get the scalar temperature value for reward calculation
        current_temp = self.state[0]
        
        # Calculate reward
        if current_temp >= 37.0 and current_temp <= 39.0: 
            reward = 5.0 
        else: 
            reward = -1.0 
        
        # Check if shower is done
        terminated = self.shower_length <= 0
        truncated = False # Typically used for time limits, set to False here
        
        # Set placeholder for info
        info = {}
        
        # Return step information (observation, reward, terminated, truncated, info)
        return self.state, reward, terminated, truncated, info

    # -----------------------------------------------------------
    def render(self):
        # Implement viz
        pass
    
    # -----------------------------------------------------------
    def reset(self, seed=None, options=None):
        # *** FIX 1: The correct method signature and super call are needed ***
        # Handle seeding (important for SB3 compatibility)
        super().reset(seed=seed)
        
        # Reset shower temperature: Ensure the state is a np.array and the correct dtype
        self.state = np.array([38.0 + random.randint(-3, 3)]).astype(np.float32)
        
        # Reset shower time
        self.shower_length = 60 
        
        # *** FIX 2: Must return both the observation (self.state) AND an info dictionary ***
        info = {}
        return self.state, info

In [31]:
env = ShowerEnv()

  gym.logger.warn(
  gym.logger.warn(


In [28]:
env.observation_space

Box(0.0, 100.0, (1,), float32)

In [27]:
env.observation_space.sample()

array([56.140285], dtype=float32)

In [29]:
env.action_space

Discrete(3)

In [30]:
env.action_space.sample()

np.int64(1)

## 4. Test env

In [27]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    terminated = False
    truncated = False
    score = 0
    
    while not done:
        # select a random action from the action space
        action = env.action_space.sample()
        
        # perform that action
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        # get score
        score += reward
        
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-54.0
Episode:2 Score:-26.0
Episode:3 Score:-48.0
Episode:4 Score:-22.0
Episode:5 Score:-32.0


## 5. Train model

In [32]:
model = PPO('MlpPolicy', env, verbose=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
model.learn(total_timesteps=50000)

In [35]:
model.save('../Training/Saved_Models/PPO_Custom_ENV')

In [37]:
evaluate_policy(model, env, n_eval_episodes=10)

(np.float64(156.0), np.float64(176.36326148038881))