In [None]:
!pip install stable-baselines3 sb3-contrib

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sb3-contrib
  Downloading sb3_contrib-2.3.0-py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.3/80.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecti

In [None]:
# GPU version
import gymnasium as gym
import numpy as np
from sb3_contrib import TQC
from stable_baselines3.common.callbacks import BaseCallback
import torch

# Define custom environment class
class InjectionMoldingEnv(gym.Env):
    """
    Custom environment for injection molding process control.

    The dynamics and reward are illustrative placeholders:

    * Observation: a 5-element float32 state vector, kept inside
      ``observation_space`` (``Box(0, 255)``).
    * Action: a 3-element float32 vector in ``[-1, 1]`` that is added to the
      last three state entries.
    * Reward: the negative sum of absolute deviations of the last three state
      entries from the target value 1.
    * Episode end: the episode is *truncated* (time limit) after ``max_steps``
      steps; there is no natural terminal state, so ``terminated`` is always
      ``False``.
    """

    def __init__(self, max_steps=200):
        """
        Args:
            max_steps: Number of steps after which an episode is truncated.
                Must be at least 1.

        Raises:
            ValueError: If ``max_steps`` is smaller than 1.
        """
        super().__init__()

        if max_steps < 1:
            raise ValueError(f"max_steps must be at least 1, got {max_steps}")
        self.max_steps = int(max_steps)

        # Define observation space
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(5,), dtype=np.float32)

        # Define action space
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)

        # Initialize state variables
        self.state = np.zeros(5, dtype=np.float32)
        self.time = 0

    def reset(self, seed=None, options=None):
        """
        Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: Unused; accepted for Gymnasium API compatibility.

        Returns:
            A tuple ``(observation, info)`` with a copy of the initial state
            and an empty info dictionary.
        """
        super().reset(seed=seed)
        # Draw from the environment's own generator so that `seed` actually
        # makes the initial state reproducible (global np.random ignores it).
        self.state = self.np_random.random(5).astype(np.float32)  # Example: Random initial state
        self.time = 0
        # Return a copy: step() mutates self.state in place.
        return self.state.copy(), {}

    def step(self, action):
        """
        Apply ``action`` and advance the episode by one step.

        Args:
            action: Array-like with 3 entries, added to ``state[2:]``.

        Returns:
            A tuple ``(observation, reward, terminated, truncated, info)``.
        """
        # Apply action (replace with action effects on state and reward)
        action = np.asarray(action, dtype=self.action_space.dtype)
        self.state[2:] += action  # Example: Action directly affects state

        # Keep the observation inside the declared observation space.
        np.clip(
            self.state,
            self.observation_space.low,
            self.observation_space.high,
            out=self.state,
        )

        # Example: Reward for maintaining optimal state values (target is 1).
        # Converted to a plain Python float.
        reward = -float(np.sum(np.abs(self.state[2:] - 1)))

        # Count this step first so the episode lasts exactly max_steps steps.
        self.time += 1

        # Hitting the step budget is a time limit, not a terminal state of the
        # process, so it is reported through `truncated`.
        terminated = False
        truncated = self.time >= self.max_steps

        # Return a copy so callers never hold a reference to the mutable state.
        return self.state.copy(), reward, terminated, truncated, {}

# Create a callback to track training progress
class TrainingCallback(BaseCallback):
    """
    Periodic checkpoint callback.

    Saves the model being trained to ``save_path`` every ``check_freq`` calls
    of the callback. The same path is used for every save.

    Args:
        check_freq: Save interval, in callback calls.
        save_path: Path handed to ``model.save``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq: int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _on_step(self) -> bool:
        """Save the model when the call counter hits a multiple of ``check_freq``.

        Returns:
            Always ``True``; this callback never asks for training to stop.
        """
        # self.n_calls and self.model are not set here; they come from BaseCallback.
        checkpoint_due = (self.n_calls % self.check_freq) == 0
        if checkpoint_due:
            self.model.save(self.save_path)
        return True

# Create the custom environment. This single instance is handed to the TQC model
# for training and is also stepped directly by the evaluation loop further down.
env = InjectionMoldingEnv()

# Define learning rate schedule (optional, can be a constant value)
def linear_lr_schedule(current_progress_remaining):
    """Linearly scaled learning rate.

    Args:
        current_progress_remaining: Scaling factor applied to the base rate
            (presumably the fraction of training left, going from 1 to 0 —
            confirm against the library's schedule contract).

    Returns:
        The base rate of 0.001 multiplied by ``current_progress_remaining``.
    """
    base_learning_rate = 0.001  # Adjust learning rate decay as needed
    return base_learning_rate * current_progress_remaining

# Create TQC model
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = TQC("MlpPolicy", env, learning_rate=linear_lr_schedule, verbose=1, device=device)

# Define the total number of training timesteps
total_timesteps = 100000  # Adjust based on experiment requirements

# Define the callback for saving the model during training
callback = TrainingCallback(check_freq=1000, save_path="./tqc_injection_molding_model")

# Train the model
model.learn(total_timesteps=total_timesteps, callback=callback)

# Evaluate the trained model with the deterministic policy
num_episodes = 10
rewards = []

for _ in range(num_episodes):
    observation, _info = env.reset()  # Reset returns a tuple (observation, info)
    episode_reward = 0.0
    terminated = False
    truncated = False
    # An episode is over when the environment reports either flag.
    while not (terminated or truncated):
        action, _states = model.predict(observation, deterministic=True)
        # env.step() converts the action to the action-space dtype itself,
        # so no extra conversion is needed here.
        observation, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
    rewards.append(episode_reward)

print(f"Average reward over {num_episodes} episodes: {np.mean(rewards)}")

# Close the environment
env.close()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 202      |
|    ep_rew_mean     | -803     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 78       |
|    time_elapsed    | 10       |
|    total_timesteps | 808      |
| train/             |          |
|    actor_loss      | 8.96     |
|    critic_loss     | 0.145    |
|    ent_coef        | 0.698    |
|    ent_coef_loss   | -1.21    |
|    learning_rate   | 0.000193 |
|    n_updates       | 707      |
---------------------------------
Average reward over 100 episodes: -69.74166525602341
