In [1]:
import numpy as np
from tqdm import tqdm
import gymnasium as gym
from gymnasium.wrappers import FlattenObservation, RecordEpisodeStatistics
import torch
from torch.utils.tensorboard import SummaryWriter
from matplotlib import pyplot as plt
import matplotlib

from clinic_environment import ClinicEnv
from clinic_agent import ClinicDQNAgent, ReplayMemory, Transition

In [2]:
# True when the name of the active matplotlib backend contains "inline".
# NOTE(review): presumably this is meant to detect a Jupyter/IPython inline
# session — confirm.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # Imported only in that case; none of the visible cells use `display`.
    from IPython import display

# Turn on pyplot's interactive mode. Kept as the cell's last expression, so the
# object it returns is what the notebook echoes as this cell's output.
plt.ion()

<contextlib.ExitStack at 0x767ff15b0ef0>

In [3]:
# --- Training budget -------------------------------------------------------
# Number of episodes run by the logged training loop.
n_episodes = 50_000
# Step size handed to the agent as `learning_rate`.
learning_rate = 1e-4

# --- Exploration schedule (reduce the exploration over time) ---------------
# Epsilon starts at `start_epsilon` and is bounded below by `final_epsilon`.
start_epsilon = 1.0
final_epsilon = 0.1
# Sized so that start_epsilon / epsilon_decay == n_episodes / 2, i.e. a linear
# per-episode decrement would use up the whole starting value halfway through.
# NOTE(review): how the agent actually applies this value is not visible
# here — confirm against ClinicDQNAgent.decay_epsilon.
epsilon_decay = start_epsilon / (n_episodes / 2)

In [4]:
# TensorBoard writer shared by the agent (passed to it as `writer=`) and by the
# training loop, which logs per-episode scalars through it. No log directory is
# given, so the library's default location is used.
writer = SummaryWriter()

In [5]:
# Scenario definition: two clinics, two patients, two nurses.
# One capacity entry per clinic.
clinic_capacity = np.array([1, 2])
# Clinic-to-clinic travel times (row/column = clinic index, zero diagonal).
# NOTE(review): presumably minutes, like the `*_minutes_left` fields in the
# environment's info dict — confirm.
clinic_travel_times = np.array(
    [
        [0, 10],
        [10, 0],
    ]
)
# One treatment time per patient.
patient_times = np.array([30, 40])
num_nurses = 2

# Raw environment; kept under its own name because it is also used directly.
unwrapped_clinic_env = ClinicEnv(
    clinic_capacity,
    clinic_travel_times,
    patient_times,
    num_nurses,
)

# Training view of the same environment: observations are flattened first,
# then episode statistics are recorded by the outer wrapper.
clinic_env = RecordEpisodeStatistics(
    FlattenObservation(unwrapped_clinic_env)
)

# Make the base environment's `get_valid_actions` reachable from the wrapped
# object, so code that only holds `clinic_env` can still call it.
clinic_env.get_valid_actions = unwrapped_clinic_env.get_valid_actions

In [6]:
# Build the DQN agent on the wrapped environment using the hyperparameters
# defined above.
agent = ClinicDQNAgent(
    clinic_env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
    # NOTE(review): presumably the horizon of the learning-rate schedule that
    # the training loop advances via agent.update_lr() — confirm.
    n_iter=n_episodes,
    # Use the GPU when one is available, otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
    writer=writer,
)

In [10]:
# Reset the raw (unwrapped) environment to inspect its initial state.
obs, info = unwrapped_clinic_env.reset()
# End the cell with the tuple so the notebook displays it; a bare assignment
# produces no output.
obs, info

(0,
 {'nurse_turn': 0,
  'nurses': ({'location': 1,
    'operating_minutes_left': 0.0,
    'traveling_minutes_left': 0.0,
    'status': <NurseStatus.IDLE: 1>},
   {'location': 1,
    'operating_minutes_left': 0.0,
    'traveling_minutes_left': 0.0,
    'status': <NurseStatus.IDLE: 1>}),
  'patients': ({'status': 1,
    'treatment_time': 30.0,
    'minutes_in_treatment': 0.0,
    'treated_at': 0},
   {'status': 1,
    'treatment_time': 40.0,
    'minutes_in_treatment': 0.0,
    'treated_at': 0}),
  'clinics': ({'capacity': 1.0, 'num_patients': 0.0},
   {'capacity': 2.0, 'num_patients': 0.0})})

In [7]:
def play_episode(env, agent, randomize: bool = True, learn: bool = True):
    """Run one full episode of ``env`` with ``agent`` and return its total reward.

    Args:
        env: Environment exposing ``reset() -> (obs, info)`` and
            ``step(action) -> (obs, reward, terminated, truncated, info)``.
        agent: Object exposing ``get_action(obs, randomize=...)``, whose result
            has an ``.item()`` method, and
            ``update(obs, action, reward, terminated, next_obs)``.
        randomize: Forwarded unchanged to ``agent.get_action``.
        learn: When True (the default, matching the previous behaviour),
            ``agent.update`` is called after every step. Pass False to roll
            out an episode without handing any transition to the agent.

    Returns:
        The sum of the rewards returned by ``env.step`` over the episode.
    """
    obs, _ = env.reset()
    done = False

    total_reward = 0
    while not done:
        action = agent.get_action(obs, randomize=randomize)
        # The environment receives the plain value; the agent keeps the
        # original action object for its update.
        next_obs, reward, terminated, truncated, _ = env.step(action.item())

        if learn:
            # Only `terminated` is handed to the agent; truncation ends the
            # loop below but is not reported to the agent.
            agent.update(obs, action, reward, terminated, next_obs)

        done = terminated or truncated
        obs = next_obs
        total_reward += reward

    return total_reward

In [8]:
# Number of stored transitions the replay memory must exceed before the logged
# training run starts.
warmup_transitions = 5_000

# Total reward of every logged training episode, in order.
reward_per_episode = []

# Warm-up: keep playing episodes until the agent's memory holds more than
# `warmup_transitions` entries. These episodes are not logged and do not
# advance the epsilon / learning-rate schedules.
while len(agent.memory) <= warmup_transitions:
    play_episode(clinic_env, agent)

try:
    for i in tqdm(range(n_episodes)):
        total_reward = play_episode(clinic_env, agent)
        reward_per_episode.append(total_reward)
        writer.add_scalar("total reward", total_reward, i)

        # Schedules advance once per episode, not once per environment step.
        agent.decay_epsilon()
        agent.update_lr()

        writer.add_scalar("Learning rate", agent.scheduler.get_last_lr()[0], i)
finally:
    # Push buffered events to disk even if the run is interrupted part-way
    # (e.g. by KeyboardInterrupt), so TensorBoard shows everything logged so far.
    writer.flush()

  7%|███▊                                                   | 3411/50000 [01:19<18:09, 42.74it/s]


KeyboardInterrupt: 

In [None]:
# Play one more episode and echo its total reward as the cell output.
# `randomize=True` is forwarded to agent.get_action, and play_episode calls
# agent.update on every step, so the agent is still updated during this episode.
# NOTE(review): if this is meant as an evaluation of the trained policy,
# randomize=False is presumably what is wanted — confirm.
play_episode(clinic_env, agent, randomize=True)