In [1]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random


In [2]:
class KnapsackEnv(gym.Env):
    """
    A Gymnasium environment for the 0-1 Knapsack Problem.

    Each episode is a freshly sampled problem instance with between 1 and
    ``max_items`` items. At every step the agent picks one item index; the
    item is added to the knapsack if it fits, and is removed from the pool of
    available items either way. The episode terminates once no available item
    fits into the remaining capacity.

    Observation layout (``N = max_items``, vector of length ``2N + 4``):
        * ``[0]``            number of items still available
        * ``[1 : N+1]``      values of the available items (0 elsewhere)
        * ``[N+1 : 2N+1]``   weights of the available items (0 elsewhere)
        * ``[2N+1]``         remaining knapsack capacity
        * ``[2N+2]``         summed value of the available items
        * ``[2N+3]``         summed weight of the available items
    """
    # Gymnasium looks up the supported modes under 'render_modes'
    # ('render.modes' is the legacy Gym spelling).
    metadata = {'render_modes': ['human']}

    def __init__(self, max_items=10,
                 value_range=(1, 10),
                 weight_range=(1, 10),
                 capacity_ratio=0.5,
                 undefined_action_penalty=-10.0,
                 heavy_item_penalty=-1.0,
                 reward_scale=1.0):
        """
        Initialize the Knapsack environment.

        Args:
            max_items (int): Maximum number of items in any problem instance (N)
            value_range (tuple): Range of possible item values (min, max)
            weight_range (tuple): Range of possible item weights (min, max)
            capacity_ratio (float): Knapsack capacity as a fraction of the total item weight
            undefined_action_penalty (float): Reward returned when the chosen index does not
                refer to an available item (out of range, padding slot, or already used)
            heavy_item_penalty (float): Reward returned when the chosen item exceeds the
                remaining capacity
            reward_scale (float): Multiplier applied to the item value when an item is packed
                (penalties are not scaled)

        Raises:
            ValueError: If ``max_items`` is smaller than 1.
        """
        super().__init__()

        if max_items < 1:
            raise ValueError("max_items must be at least 1")

        self.max_items = max_items  # N in the problem description
        self.value_range = value_range
        self.weight_range = weight_range
        self.capacity_ratio = capacity_ratio
        self.undefined_action_penalty = undefined_action_penalty
        self.heavy_item_penalty = heavy_item_penalty
        self.reward_scale = reward_scale

        # Action space: selecting one of the N item slots
        self.action_space = spaces.Discrete(max_items)

        # State space: 2N + 4 features (see the class docstring for the layout)
        self.observation_space = spaces.Box(
            low=0,
            high=float('inf'),
            shape=(2 * max_items + 4,),
            dtype=np.float32
        )

        # Sample a first problem instance so the object is usable right away
        self.reset()

    def reset(self, seed=None, options=None):
        """
        Reset the environment with a new problem instance.

        All randomness is drawn from ``self.np_random``, so passing ``seed``
        makes the generated instance reproducible.

        Args:
            seed (int | None): Seed forwarded to ``gym.Env.reset``.
            options (dict | None): Accepted for API compatibility; unused.

        Returns:
            observation (np.array): Initial state vector
            info (dict): Additional information
        """
        super().reset(seed=seed)
        rng = self.np_random

        # Number of real items in this instance, 1..max_items inclusive
        # (Generator.integers has an exclusive upper bound).
        self.current_items = int(rng.integers(1, self.max_items + 1))

        # Random values and weights for the real items
        self.values = rng.uniform(
            self.value_range[0],
            self.value_range[1],
            self.current_items
        ).astype(np.float32)

        self.weights = rng.uniform(
            self.weight_range[0],
            self.weight_range[1],
            self.current_items
        ).astype(np.float32)

        # Knapsack capacity is a fixed fraction of the total item weight
        self.initial_capacity = float(np.sum(self.weights)) * self.capacity_ratio
        self.remaining_capacity = self.initial_capacity

        # Slots beyond current_items are padding and never available
        self.available_items = np.ones(self.max_items, dtype=bool)
        self.available_items[self.current_items:] = False

        # Current (empty) solution
        self.selected_items = []
        self.current_value = 0.0
        self.current_weight = 0.0

        return self._get_state(), self._get_info()

    def step(self, action):
        """
        Take an action (select an item) and return the next state, reward, etc.

        Args:
            action (int): The index of the item to select (0 to N-1)

        Returns:
            observation (np.array): The next state
            reward (float): The reward for the action
            terminated (bool): Whether the episode is done
            truncated (bool): Always False; this environment has no time limit
            info (dict): Additional information
        """
        # Normalise NumPy integers so bookkeeping holds plain Python ints
        action = int(action)

        # A negative index must be rejected explicitly: NumPy would otherwise
        # wrap it around and silently consume a real item.
        in_range = 0 <= action < self.max_items

        if not in_range or not self.available_items[action]:
            # Non-existent, padding, or already consumed item
            reward = self.undefined_action_penalty
        else:
            item_weight = float(self.weights[action])

            if item_weight <= self.remaining_capacity:
                # Item fits: pack it and pay out its (scaled) value
                item_value = float(self.values[action])
                self.selected_items.append(action)
                self.current_value += item_value
                self.current_weight += item_weight
                self.remaining_capacity -= item_weight
                reward = item_value * self.reward_scale
            else:
                # Item is too heavy for the remaining capacity
                reward = self.heavy_item_penalty

            # The item leaves the pool whether or not it was packed
            self.available_items[action] = False

        # The episode ends when nothing is left or nothing left can fit.
        # Strict '<' mirrors the fit test above: an item whose weight equals
        # the remaining capacity is still selectable.
        candidates = self.available_items[:len(self.weights)]
        if candidates.any():
            lightest = float(self.weights[candidates].min())
            terminated = bool(self.remaining_capacity < lightest)
        else:
            terminated = True

        return self._get_state(), float(reward), terminated, False, self._get_info()

    def _get_info(self):
        """
        Build the info dictionary returned by ``reset`` and ``step``.

        Returns:
            info (dict): Instance size, capacities and the current solution,
                all as plain Python types.
        """
        return {
            'num_items': self.current_items,
            'capacity': self.initial_capacity,
            'selected_items': self.selected_items.copy(),
            'current_value': float(self.current_value),
            'current_weight': float(self.current_weight),
            'remaining_capacity': float(self.remaining_capacity)
        }

    def _get_state(self):
        """
        Create the state representation as described in the class docstring.

        Returns:
            state (np.array): The float32 state vector of size 2N + 4
        """
        n_slots = self.max_items
        state = np.zeros(2 * n_slots + 4, dtype=np.float32)

        # Indices of real items that are still available
        open_idx = np.flatnonzero(self.available_items[:len(self.values)])
        open_values = self.values[open_idx]
        open_weights = self.weights[open_idx]

        # Number of remaining items
        state[0] = np.count_nonzero(self.available_items)

        # Values and weights of available items; consumed/padding slots stay 0
        state[1 + open_idx] = open_values
        state[1 + n_slots + open_idx] = open_weights

        # Remaining capacity
        state[2 * n_slots + 1] = self.remaining_capacity

        # Total value and weight of all available items (empty sums are 0)
        state[2 * n_slots + 2] = open_values.sum()
        state[2 * n_slots + 3] = open_weights.sum()

        return state

    def render(self, mode='human'):
        """
        Render the current state of the environment.

        Args:
            mode (str): The rendering mode; only 'human' (print to stdout) is handled
        """
        if mode == 'human':
            print("\n==== Knapsack Environment State ====")
            print(f"Capacity: {self.remaining_capacity:.2f}/{self.initial_capacity:.2f}")
            print(f"Current solution value: {self.current_value:.2f}")
            print(f"Current solution weight: {self.current_weight:.2f}")
            print(f"Selected items: {self.selected_items}")
            print(f"Available items: {np.where(self.available_items)[0]}")
            print("====================================\n")

    def get_optimal_solution(self, resolution=100):
        """
        Compute a reference solution for the current instance using dynamic programming.
        This can be used to evaluate the RL agent's performance.

        Weights are real-valued, so they are discretised onto a grid of
        ``1 / resolution``. Item weights are rounded up and the capacity is
        rounded down, which guarantees that the returned item set fits into
        the real knapsack. The result is optimal for the discretised problem;
        its value can be marginally below the true optimum.

        Args:
            resolution (int): Number of grid steps per unit of weight

        Returns:
            optimal_value (float): The best total value found
            optimal_items (list): The indices of the items in that solution

        Raises:
            ValueError: If ``resolution`` is not positive.
        """
        if resolution <= 0:
            raise ValueError("resolution must be positive")

        n = len(self.values)
        # Clamp at 0 so a non-positive capacity yields the empty solution
        capacity = max(0, int(np.floor(self.initial_capacity * resolution)))
        weights = [int(np.ceil(float(w) * resolution)) for w in self.weights]
        values = [float(v) for v in self.values]

        # dp[i][w]: best value using the first i items within capacity w
        dp = [[0.0] * (capacity + 1) for _ in range(n + 1)]

        for i in range(1, n + 1):
            item_w = weights[i - 1]
            item_v = values[i - 1]
            prev_row = dp[i - 1]
            row = dp[i]
            for w in range(capacity + 1):
                best = prev_row[w]
                if item_w <= w:
                    best = max(best, item_v + prev_row[w - item_w])
                row[w] = best

        # Backtrack: a change between consecutive rows means item i-1 was taken
        optimal_value = dp[n][capacity]
        optimal_items = []

        w = capacity
        for i in range(n, 0, -1):
            if dp[i][w] != dp[i - 1][w]:
                optimal_items.append(i - 1)
                w -= weights[i - 1]

        return float(optimal_value), optimal_items


In [3]:

# Example of creating the environment and running an episode with random actions
def example_random_episode():
    """
    Play one episode with uniformly sampled actions and print the outcome.

    Builds a 5-slot KnapsackEnv, renders the state after every step, and
    finally prints the dynamic-programming reference solution next to the
    value the random policy collected.
    """
    env = KnapsackEnv(max_items=5)
    env.reset()

    print("Starting a new episode with random actions")
    env.render()

    episode_return = 0
    while True:
        # Sample an item index uniformly from the action space
        choice = env.action_space.sample()
        _, step_reward, terminated, truncated, _ = env.step(choice)

        episode_return += step_reward
        print(f"Action: {choice}, Reward: {step_reward:.2f}")
        env.render()

        if terminated or truncated:
            break

    print(f"Episode finished with total reward: {episode_return:.2f}")

    # Put the random policy's result next to the DP reference
    optimal_value, optimal_items = env.get_optimal_solution()
    print(f"Optimal solution value: {optimal_value:.2f}")
    print(f"Optimal solution items: {optimal_items}")

    # Guard against dividing by a zero reference value
    ratio = env.current_value / optimal_value if optimal_value > 0 else 0
    print(f"Agent performance ratio: {ratio:.2f}")


# Run the demo when this cell/script is executed directly; the guard keeps the
# episode from running if this code is imported as a module instead.
if __name__ == "__main__":
    example_random_episode()

Starting a new episode with random actions

==== Knapsack Environment State ====
Capacity: 9.70/9.70
Current solution value: 0.00
Current solution weight: 0.00
Selected items: []
Available items: [0 1 2 3 4]

Action: 3, Reward: 8.97

==== Knapsack Environment State ====
Capacity: 0.70/9.70
Current solution value: 8.97
Current solution weight: 9.00
Selected items: [np.int64(3)]
Available items: [0 1 2 4]

Episode finished with total reward: 8.97
Optimal solution value: 19.80
Optimal solution items: [4, 2, 0]
Agent performance ratio: 0.45
