In [16]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

In [17]:
from enum import IntEnum


class Action(IntEnum):
    """Discrete actions the warehouse robot can take.

    The integer values are the indices used by the environment's
    ``Discrete`` action space, so they must stay contiguous from 0.
    """

    # Movement actions (grid deltas are defined by the environment).
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3
    # Item-handling actions.
    PICKUP = 4
    DROPOFF = 5


# Display label for every action, keyed by the enum member. Because the
# members are ints, a plain integer (e.g. a sampled action) works as a key too.
ACTION_NAMES = {member: member.name for member in Action}

In [18]:
class WarehouseRobotEnv(gym.Env):
    """Deterministic 6x6 grid world: fetch an item and deliver it.

    The robot starts at ``(0, 0)``, must execute ``PICKUP`` on the pickup
    cell ``(2, 3)`` and then ``DROPOFF`` on the drop-off cell ``(5, 5)``.
    Three shelf cells act as obstacles that cannot be entered.

    Observation
        ``np.int8`` array ``[row, col, carrying]`` where ``carrying`` is 0/1.

    Actions
        The six members of ``Action`` (four moves, ``PICKUP``, ``DROPOFF``).

    Rewards
        * every step starts from ``STEP_COST``;
        * moving into an obstacle yields ``OBSTACLE_PENALTY`` *instead of*
          the step cost, and the robot stays where it is;
        * the first successful pickup in an episode adds ``PICKUP_REWARD``;
        * a successful drop-off adds ``DELIVERY_REWARD`` and ends the episode.

    The environment never truncates; ``step`` always returns
    ``truncated=False``.
    """

    # Gymnasium reads the "render_modes" key; the legacy "render.modes" key is
    # retained so any existing lookup of it keeps working.
    metadata = {"render_modes": ["human"], "render.modes": ["human"]}

    STEP_COST = -1
    OBSTACLE_PENALTY = -20
    PICKUP_REWARD = 25
    DELIVERY_REWARD = 100

    # (row delta, column delta) for each movement action.
    DIRECTIONS = {
        Action.UP: (-1, 0),
        Action.DOWN: (1, 0),
        Action.LEFT: (0, -1),
        Action.RIGHT: (0, 1),
    }

    def __init__(self):
        """Set up the fixed layout, the spaces, and the initial episode state."""
        super().__init__()
        self.grid_size = 6
        self.obstacles = {(1, 1), (1, 2), (3, 4)}  # shelves
        self.pickup_point = (2, 3)
        self.dropoff_point = (5, 5)
        self.start_state = (0, 0)

        self.action_space = spaces.Discrete(len(Action))
        self.observation_space = spaces.MultiDiscrete([self.grid_size, self.grid_size, 2])

        # Initialise robot_pos / carrying so render() and step() are usable
        # even if the caller forgets to reset first.
        self.reset()

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` with the robot at ``start_state``, not
            carrying anything, and an empty ``info`` dict.
        """
        super().reset(seed=seed)
        self.robot_pos = self.start_state
        self.carrying = False
        self._picked_once = False
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the current state as an ``np.int8`` array ``[row, col, carrying]``."""
        return np.array((*self.robot_pos, int(self.carrying)), dtype=np.int8)

    def step(self, action):
        """Apply one action and return the Gymnasium 5-tuple.

        Args:
            action: A member of ``Action`` (or its integer value).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False`` and ``info`` is always empty.

        Raises:
            AssertionError: If ``action`` is not contained in ``action_space``.
        """
        assert self.action_space.contains(action), f"Invalid action {action}"

        r, c = self.robot_pos
        reward = self.STEP_COST
        terminated = False

        if action in self.DIRECTIONS:
            dr, dc = self.DIRECTIONS[action]
            # Clamp to the grid: walking into a wall leaves the robot in place.
            target = (max(0, min(self.grid_size - 1, r + dr)),
                      max(0, min(self.grid_size - 1, c + dc)))

            if target in self.obstacles:
                # The penalty replaces (does not add to) the step cost.
                reward = self.OBSTACLE_PENALTY
            else:
                self.robot_pos = target

        elif action == Action.PICKUP:
            if self.robot_pos == self.pickup_point and not self.carrying:
                self.carrying = True
                # The bonus is granted at most once per episode.
                if not self._picked_once:
                    reward += self.PICKUP_REWARD
                    self._picked_once = True

        elif action == Action.DROPOFF:
            if self.robot_pos == self.dropoff_point and self.carrying:
                reward += self.DELIVERY_REWARD
                terminated = True

        obs = self._get_obs()
        return obs, reward, terminated, False, {}

    def render(self):
        """Print the grid as text.

        Legend: ``.`` free cell, ``#`` obstacle, ``P`` pickup point,
        ``D`` drop-off point, ``R`` robot, ``R*`` robot carrying the item.
        The robot marker is drawn last, so it hides ``P``/``D`` when the
        robot stands on them.
        """
        # Plain nested lists rather than a NumPy ``dtype=str`` array: such an
        # array filled with "." stores one-character strings, which would
        # silently truncate the two-character "R*" marker to "R".
        grid = [["."] * self.grid_size for _ in range(self.grid_size)]
        for r, c in self.obstacles:
            grid[r][c] = "#"
        pr, pc = self.pickup_point
        dr, dc = self.dropoff_point
        grid[pr][pc] = "P"
        grid[dr][dc] = "D"

        rr, rc = self.robot_pos
        grid[rr][rc] = "R*" if self.carrying else "R"

        print("\n".join(" ".join(row) for row in grid))
        print()


In [19]:
# Smoke test: drive the deterministic environment with ten uniformly sampled
# actions, logging each transition and the running episode return.
env = WarehouseRobotEnv()
obs, _ = env.reset()

cumulative_reward = 0

for t in range(10):
    # Unpack the state *before* stepping so the log shows where the action
    # was taken from, not where it led.
    row, col, carrying = obs
    action = env.action_space.sample()
    action_name = ACTION_NAMES[action]

    next_obs, reward, done, _, _ = env.step(action)
    cumulative_reward = cumulative_reward + reward

    status_line = (
        f"Step {t+1}: Curr State=({row},{col}) Carrying={bool(carrying)} "
        f"| Chosen Action={action_name} ({action}) -> Reward={reward} "
        f"| Cumulative Reward={cumulative_reward} "
        f"| Done={done}"
    )
    print(status_line)
    env.render()

    obs = next_obs
    if not done:
        continue

    # Episode finished: report its return and start a fresh one.
    print(f"Delivery completed — total episode reward: {cumulative_reward}\n")
    obs, _ = env.reset()
    cumulative_reward = 0

Step 1: Curr State=(0,0) Carrying=False | Chosen Action=DROPOFF (5) -> Reward=-1 | Cumulative Reward=-1 | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 2: Curr State=(0,0) Carrying=False | Chosen Action=UP (0) -> Reward=-1 | Cumulative Reward=-2 | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 3: Curr State=(0,0) Carrying=False | Chosen Action=RIGHT (3) -> Reward=-1 | Cumulative Reward=-3 | Done=False
. R . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 4: Curr State=(0,1) Carrying=False | Chosen Action=LEFT (2) -> Reward=-1 | Cumulative Reward=-4 | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 5: Curr State=(0,0) Carrying=False | Chosen Action=DROPOFF (5) -> Reward=-1 | Cumulative Reward=-5 | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 6: Curr State=(0,0) Carrying=False | Chosen Action=LEFT

In [20]:
class StochasticWarehouseEnv(WarehouseRobotEnv):
    """Warehouse environment in which movement actions can fail.

    With probability ``slip_prob`` a movement action has no effect: the
    robot stays in place and only the ordinary step cost is charged.
    ``PICKUP`` and ``DROPOFF`` are never affected.

    Randomness is drawn from ``self.np_random``, so an episode is
    reproducible when started with ``reset(seed=...)``.
    """

    def __init__(self, slip_prob=0.2):
        """Create the environment.

        Args:
            slip_prob: Probability in ``[0, 1]`` that a movement action is
                ignored. Defaults to 0.2.

        Raises:
            ValueError: If ``slip_prob`` is outside ``[0, 1]``.
        """
        if not 0.0 <= slip_prob <= 1.0:
            raise ValueError(f"slip_prob must be in [0, 1], got {slip_prob}")
        super().__init__()
        self.slip_prob = slip_prob

    def step(self, action):
        """Apply ``action``, possibly dropping it if it is a movement.

        Args:
            action: A member of ``Action`` (or its integer value).

        Returns:
            ``(observation, reward, terminated, truncated, info)``. On a
            slip the observation is unchanged, the reward is ``STEP_COST``
            and both flags are ``False``; otherwise the result of the
            parent class's ``step``.

        Raises:
            AssertionError: If ``action`` is not contained in ``action_space``.
        """
        # Validate up front so an invalid action is rejected every time,
        # not only on the draws where the slip branch is skipped.
        assert self.action_space.contains(action), f"Invalid action {action}"

        # Use the environment's own seeded generator rather than the global
        # NumPy state, otherwise reset(seed=...) cannot reproduce a rollout.
        if action in self.DIRECTIONS and self.np_random.random() < self.slip_prob:
            return self._get_obs(), self.STEP_COST, False, False, {}

        return super().step(action)



# Same random-policy smoke test as above, but against the stochastic variant;
# the log additionally shows the observation returned by each step.
env = StochasticWarehouseEnv()
obs, _ = env.reset()
cumulative_reward = 0

for t in range(10):
    # State the action is taken from (logged alongside the resulting state).
    row, col, carrying = obs
    action = env.action_space.sample()
    action_name = ACTION_NAMES[action]

    next_obs, reward, done, _, _ = env.step(action)
    cumulative_reward = cumulative_reward + reward

    status_line = (
        f"Step {t+1}: Pos=({row},{col}) Carrying={bool(carrying)} "
        f"| Action={action_name} ({action}) -> Reward={reward} "
        f"| Cumulative Reward={cumulative_reward} "
        f"| Next={next_obs} | Done={done}"
    )
    print(status_line)
    env.render()

    obs = next_obs
    if not done:
        continue

    # Episode finished: report its return and start a fresh one.
    print(f"Delivery completed — total episode reward: {cumulative_reward}\n")
    obs, _ = env.reset()
    cumulative_reward = 0


Step 1: Pos=(0,0) Carrying=False | Action=DOWN (1) -> Reward=-1 | Cumulative Reward=-1 | Next=[1 0 0] | Done=False
. . . . . .
R # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 2: Pos=(1,0) Carrying=False | Action=UP (0) -> Reward=-1 | Cumulative Reward=-2 | Next=[0 0 0] | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 3: Pos=(0,0) Carrying=False | Action=UP (0) -> Reward=-1 | Cumulative Reward=-3 | Next=[0 0 0] | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 4: Pos=(0,0) Carrying=False | Action=UP (0) -> Reward=-1 | Cumulative Reward=-4 | Next=[0 0 0] | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 5: Pos=(0,0) Carrying=False | Action=UP (0) -> Reward=-1 | Cumulative Reward=-5 | Next=[0 0 0] | Done=False
R . . . . .
. # # . . .
. . . P . .
. . . . # .
. . . . . .
. . . . . D

Step 6: Pos=(0,0) Carrying=False | Action=PICKUP (4) -> Reward=-1 | 