In [7]:
from knapsackgym import KnapsackEnv
from DP_Knapsack import solve_knapsack_dp, solve_KP_instances_with_DP
from AbstractKnapsackPolicy import AbstractKnapsackPolicy
from A2C import KnapsackA2C
from StateAggregator import StateAggregator
from typing import List, Callable, Optional, Union, Tuple, Dict, Any

## Main KnapsackDRLSolver

In [8]:

class KnapsackDRLSolver:
    """DRL-based Knapsack Solver using A2C algorithm"""
    
    def __init__(self, env, KPsolver:AbstractKnapsackPolicy, use_state_aggregation=False, gamma=0.99, lr_policy=0.001, lr_value=0.001, verbose=True):
        """
        Initialize the DRL solver.
        
        Args:
            env: Gym environment for the knapsack problem
            N (int): Maximum number of items in a problem instance
            use_state_aggregation (bool): Whether to use state aggregation
            gamma (float): Discount factor
            lr_policy (float): Learning rate for policy network
            lr_value (float): Learning rate for value network
        """
        self.env:KnapsackEnv = env
        self.N = KPsolver.N
        self.use_state_aggregation = use_state_aggregation
        
        
        # Initialize state aggregator if needed
        self.state_aggregator = StateAggregator() if use_state_aggregation else None
        self.KPsolver = KPsolver
        self.verbose = verbose
        
    def process_state(self, state):
        """
        Process state with optional aggregation.
        
        Args:
            state (numpy.ndarray): Original state
            
        Returns:
            numpy.ndarray: Processed state
        """
        if self.use_state_aggregation and self.state_aggregator is not None:
            return self.state_aggregator.aggregate(state)
        return state
    
    def train(self, problem_instances, t_max=None):
        """
        Train the DRL solver on multiple problem instances with progress tracking.
        
        Args:
            problem_instances (List[Dict]): List of problem instances
            t_max (int): Maximum number of training steps
            
        Returns:
            List[float]: Values of solutions for each problem instance
        """
        assert len(problem_instances) is not None or len(problem_instances) > 0

        if t_max is None:
            t_max = 3 * self.N * 10000  # As specified in the pseudocode
        
        val = [0] * len(problem_instances)  # Initialize solution values
        
        print(f"Training on {len(problem_instances)} KP Instances, with N={self.N}, t_max={t_max}")
        for t in range(t_max):

            # Select a problem instance (line 6 in pseudocode)
            # P_idx = np.random.randint(0, len(problem_instances))
            P_idx = t % len(problem_instances)
            P = problem_instances[P_idx]
            
            assert len(P['values'] )<= self.N, f"Problem Instance has too many items. KnapsackEnv is configuered to accept no more than <= {self.N}."

            # Change the environment to use this problem instance
            self.env.change_problem_instance(P)
            
            # Reset environment and get initial state
            state = self.env.reset()
            
            # Initialize for this episode
            done = False
            ow = 0  # Total weight of selected items
            ov = 0  # Total value of selected items
            
            # Create a copy of the problem instance P for modification
            n_P_prime = len(P['values'])
            W_P_prime = P['capacity']
            
            # Store states, actions, rewards for batch update
            states:List[np.ndarray] = []
            actions:List[int] = []
            rewards:List[float] = []
            next_states:List[np.ndarray] = []
            dones:List[bool] = []
            
            # Track episode rewards
            episode_rewards:List[float] = []
            
            # Solve the knapsack problem for this instance
            while ow < W_P_prime and n_P_prime > 0 and not done:
                # Process state if needed
                processed_state = self.process_state(state)
                states.append(processed_state)
                
                # Get available actions (indices of remaining items)
                available_actions = list(range(len(self.env.items)))
                
                # Get action according to policy (line 12 in pseudocode)
                action = self.KPsolver.get_action(processed_state, available_actions)
                actions.append(action)
                
                # Take action and observe reward and next state
                next_state, reward, done, info = self.env.step(action)
                rewards.append(reward)
                episode_rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)
                
                # Check if item fits in knapsack (line 13 in pseudocode)
                if action < n_P_prime and info['current_weight'] - ow <= W_P_prime:
                    # Update totals (line 14 in pseudocode)
                    ow = info['current_weight']
                    ov = info['current_value']
                    W_P_prime = P['capacity'] - ow
                
                # Update P_prime (line 16 in pseudocode)
                n_P_prime -= 1
                
                # Update state
                state = next_state
            
            # Update parameters using collected trajectories
            self.KPsolver.update_parameters(states, actions, rewards, next_states, dones)
            
            if self.verbose and t % 1000 == 0:
                print(f"Iteration [{t}/{t_max}], Training KP Instance {P_idx}, Reward: {sum(rewards) / len(rewards)}")


            # Update best value if needed (lines 20-22 in pseudocode)
            if ov > val[P_idx]:
                val[P_idx] = ov
            
        return val
    
    
    def solve(self, problem_instance):
        """
        Solve a single knapsack problem instance using the trained policy.
        
        Args:
            problem_instance (Dict): A problem instance
            
        Returns:
            Tuple[float, List[int]]: Total value and list of selected item indices
        """
        # Set environment to use this problem instance
        self.env.change_problem_instance(problem_instance)
        
        # Reset environment and get initial state
        state = self.env.reset()
        
        done = False
        total_value = 0
        total_weight = 0
        selected_items = []
        
        while not done:
            # Process state if needed
            processed_state = self.process_state(state)
            
            # Get available actions (indices of remaining items)
            available_actions = list(range(len(self.env.items)))
            
            if not available_actions:
                break
                
            # Get action according to policy
            action = self.KPsolver.get_action(processed_state, available_actions)
            
            # Get original item index (before any removal)
            original_item_idx = self.env.items[action][2]
            
            # Take action and observe reward and next state
            next_state, reward, done, info = self.env.step(action)
            
            # If item was added (positive reward means item fit)
            if reward > 0:
                selected_items.append(original_item_idx)
            
            # Update value and weight
            total_value = info['current_value']
            total_weight = info['current_weight']
            
            # Update state
            state = next_state
        
        return total_value, selected_items

# Training Loop

In [9]:

def train_knapsack_solver(env, problem_instances:List[Dict[str, Any]], KPsolver, use_state_aggregation=False, t_max=None, verbose=True):
    """
    Train a DRL-based knapsack solver on multiple problem instances.
    
    Args:
        env: Gym environment for knapsack problem
        problem_instances (List[Dict]): List of problem instances
        N (int): Maximum number of items in a problem instance
        use_state_aggregation (bool): Whether to use state aggregation
        gamma (float): Discount factor
        lr_policy (float): Learning rate for policy network
        lr_value (float): Learning rate for value network
        t_max (int): Maximum number of training steps
        
    Returns:
        Tuple[KnapsackDRLSolver, List[float]]: Trained solver and solution values
    """
    # Initialize solver
    solver = KnapsackDRLSolver(
        env=env,
        KPsolver=KPsolver,
        use_state_aggregation=use_state_aggregation,
        verbose=verbose
    )
    
    # Train solver
    solution_values = solver.train(problem_instances, t_max)

    return solver, solution_values


def evaluate_knapsack_solver(solver:KnapsackDRLSolver, test_instances:list[dict]):
    """
    Evaluate the trained solver on test instances.
    
    Args:
        solver (KnapsackDRLSolver): Trained solver
        test_instances (List[Dict]): List of test problem instances
        
    Returns:
        List[Dict]: Evaluation results for each test instance
    """
    results = []
    
    for i, instance in enumerate(test_instances):
        # Solve instance
        value, selected_items = solver.solve(instance)
        
        # Calculate actual total weight
        total_weight = sum(instance['weights'][idx] for idx in selected_items)
        
        # Check if solution respects capacity constraint
        is_valid = total_weight <= instance['capacity']

        optimal_value, optimal_items = solve_knapsack_dp(instance)
        
        # Store results
        results.append({
            'instance_idx': i,
            'total_value': value,
            'total_weight': total_weight,
            'capacity': instance['capacity'],
            'selected_items': selected_items,
            'is_valid': is_valid,
            "optimal_sol": optimal_value,
            "optimal_items": optimal_items,
            "optimality_ratio": value /  optimal_value 
        })
    
    return results

## Testing stuff

In [10]:
# Example usage
if __name__ == "__main__":
    
    # Create problem instances
    problem_instances = [
        {
            'values': [10, 5, 15, 7, 6, 18, 3, 20],
            'weights': [2, 3, 5, 7, 1, 4, 1, 5],
            'capacity': 15
        },
        {
            'values': [20, 30, 15, 25, 10, 14, 2, 6],
            'weights': [5, 10, 8, 12, 4, 2, 5, 20],
            'capacity': 20
        }
    ]

    N = 20
    gamma = 0.99
    t_max = 10000
    env:KnapsackEnv = KnapsackEnv(problem_instance=None, N=N)

    KPSolver = KnapsackA2C(N=N, gamma=gamma, lr_policy=0.001, lr_value=0.001, verbose=True)
    # Train solver
    solver, solution_values = train_knapsack_solver(
        env=env,
        problem_instances=problem_instances,
        KPsolver=KPSolver,
        use_state_aggregation=False,
        t_max=t_max, # Set a smaller value for testing
        verbose=True
    )

    optimal_sol = solve_KP_instances_with_DP(problem_instances)
    
    print("Trained solution values:", solution_values)
    print("Optimal solution values:", optimal_sol)
    
    # Evaluate on test instances
    test_instances = [
        {
            'values': [10, 20, 30],
            'weights': [5, 10, 15],
            'capacity': 20
        }
    ]
    
    solver.verbose = False
    results = evaluate_knapsack_solver(solver, test_instances)
    print("Evaluation results:")
    for res in results:
        print(res)

Training on 2 KP Instances, with N=20, t_max=10000
Iteration [0/10000], Training KP Instance 0, Reward: 0.19999999999999998


  states_tensor = torch.FloatTensor(states)
  value_loss = F.mse_loss(state_values, returns_tensor)


Iteration [1000/10000], Training KP Instance 0, Reward: 0.29166666666666663
Iteration [2000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [3000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [4000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [5000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [6000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [7000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [8000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Iteration [9000/10000], Training KP Instance 0, Reward: 0.32499999999999996
Trained solution values: [54.0, 64.0]
Optimal solution values: [np.float64(59.0), np.float64(64.0)]
Evaluation results:
{'instance_idx': 0, 'total_value': 40.0, 'total_weight': 20, 'capacity': 20, 'selected_items': [2, 0], 'is_valid': True, 'optimal_sol': np.float64(40.0), 'optimal_items': [0, 2], 'optimality_ratio