In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Define a simple environment
class InventoryEnv:
    def __init__(self, max_inventory=10, max_demand=5):
        self.max_inventory = max_inventory
        self.max_demand = max_demand
        self.reset()

    def reset(self):
        self.inventory = random.randint(0, self.max_inventory)
        self.demand = random.randint(0, self.max_demand)
        return torch.tensor([self.inventory, self.demand], dtype=torch.float32)

    def step(self, action):
        # action = order amount (0, 1, 2)
        self.inventory += action

        # compute reward
        reward = 0
        if self.inventory >= self.demand:
            reward = self.demand * 2 - (self.inventory - self.demand)  # reward high demand match, penalize overstock
        else:
            reward = - (self.demand - self.inventory) * 2  # penalty for shortage

        # update inventory
        self.inventory -= self.demand
        self.inventory = max(0, min(self.inventory, self.max_inventory))

        # new demand
        self.demand = random.randint(0, self.max_demand)

        next_state = torch.tensor([self.inventory, self.demand], dtype=torch.float32)
        done = False
        return next_state, torch.tensor(reward, dtype=torch.float32), done, {}

# Q-Network
class QNet(nn.Module):
    def __init__(self, state_size=2, action_size=3, hidden_size=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size)
        )

    def forward(self, x):
        return self.net(x)

# Train RL agent
env = InventoryEnv()
qnet = QNet()
optimizer = optim.Adam(qnet.parameters(), lr=0.01)
gamma = 0.9
epsilon = 0.2
num_episodes = 10000

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    for t in range(20):  # each episode = 20 steps
        # Epsilon-greedy
        if random.random() < epsilon:
            action = random.randint(0, 2)
        else:
            with torch.no_grad():
                q_values = qnet(state)
                action = torch.argmax(q_values).item()

        next_state, reward, done, _ = env.step(action)
        total_reward += reward.item()

        # Compute target
        with torch.no_grad():
            target = reward + gamma * torch.max(qnet(next_state))

        # Compute loss
        pred = qnet(state)[action]
        loss = (pred - target) ** 2

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state
        if done:
            break

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode+1}: Total reward = {total_reward:.2f}")

print("Training complete ✅")

Episode 100: Total reward = 1.00
Episode 200: Total reward = -3.00
Episode 300: Total reward = -16.00
Episode 400: Total reward = -15.00
Episode 500: Total reward = -46.00
Episode 600: Total reward = -13.00
Episode 700: Total reward = 10.00
Episode 800: Total reward = -18.00
Episode 900: Total reward = 13.00
Episode 1000: Total reward = -52.00
Episode 1100: Total reward = -3.00
Episode 1200: Total reward = 0.00
Episode 1300: Total reward = 1.00
Episode 1400: Total reward = -11.00
Episode 1500: Total reward = -9.00
Episode 1600: Total reward = -27.00
Episode 1700: Total reward = -5.00
Episode 1800: Total reward = 14.00
Episode 1900: Total reward = 12.00
Episode 2000: Total reward = 3.00
Episode 2100: Total reward = -2.00
Episode 2200: Total reward = 19.00
Episode 2300: Total reward = 6.00
Episode 2400: Total reward = 23.00
Episode 2500: Total reward = -16.00
Episode 2600: Total reward = -21.00
Episode 2700: Total reward = 1.00
Episode 2800: Total reward = -7.00
Episode 2900: Total rewar

In [2]:
# ---- TESTING PHASE ----
print("\n--- Testing trained policy ---")

test_env = InventoryEnv()
total_test_reward = 0

for episode in range(5):  # run 5 test episodes
    state = test_env.reset()
    episode_reward = 0

    for t in range(10):  # each episode = 10 steps
        with torch.no_grad():
            q_values = qnet(state)
            action = torch.argmax(q_values).item()  # always pick best action

        next_state, reward, done, _ = test_env.step(action)
        episode_reward += reward.item()
        state = next_state

        print(f"Step {t+1}: State={state.numpy()} | Action={action} | Reward={reward.item():.2f}")

    print(f"Episode {episode+1} total reward: {episode_reward:.2f}")
    total_test_reward += episode_reward

print(f"\nAverage test reward: {total_test_reward / 5:.2f}")



--- Testing trained policy ---
Step 1: State=[4. 2.] | Action=0 | Reward=-2.00
Step 2: State=[4. 4.] | Action=2 | Reward=0.00
Step 3: State=[2. 2.] | Action=2 | Reward=6.00
Step 4: State=[2. 0.] | Action=2 | Reward=2.00
Step 5: State=[3. 4.] | Action=1 | Reward=-3.00
Step 6: State=[1. 4.] | Action=2 | Reward=7.00
Step 7: State=[0. 1.] | Action=2 | Reward=-2.00
Step 8: State=[1. 4.] | Action=2 | Reward=1.00
Step 9: State=[0. 4.] | Action=2 | Reward=-2.00
Step 10: State=[0. 0.] | Action=2 | Reward=-4.00
Episode 1 total reward: 3.00
Step 1: State=[0. 0.] | Action=2 | Reward=6.00
Step 2: State=[2. 1.] | Action=2 | Reward=-2.00
Step 3: State=[3. 2.] | Action=2 | Reward=-1.00
Step 4: State=[3. 3.] | Action=2 | Reward=1.00
Step 5: State=[2. 0.] | Action=2 | Reward=4.00
Step 6: State=[3. 4.] | Action=1 | Reward=-3.00
Step 7: State=[1. 5.] | Action=2 | Reward=7.00
Step 8: State=[0. 1.] | Action=2 | Reward=-4.00
Step 9: State=[1. 3.] | Action=2 | Reward=1.00
Step 10: State=[0. 4.] | Action=2 | 