In [11]:
# Fix the Gym deprecation warning by using Gymnasium
import gymnasium as gym
from gym_sepsis.envs.sepsis_env import SepsisEnv

# Import necessary libraries
import warnings
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


warnings.filterwarnings('ignore')

# Step 1: Gather data for offline RL training
def collect_sepsis_data(num_episodes=1000, max_steps_per_episode=1e6, 
                       action_policy='random', verbose=False):
    """
    Collect data from the SepsisEnv environment for offline RL training.
    
    Args:
        num_episodes (int): Number of episodes to simulate
        max_steps_per_episode (int): Maximum steps per episode
        action_policy (str): Policy for action selection ('random', 'uniform', or custom function)
        verbose (bool): Whether to print detailed progress information
    
    Returns:
        dict: Dictionary containing all collected data
    """
    # Create the environment
    env = SepsisEnv(verbose=verbose)
    
    # Initialize lists to store the dataset
    observations = []
    actions = []
    rewards = []
    terminateds = []
    truncateds = []
    infos = []
    
    print(f"Starting sepsis data collection for {num_episodes} episodes...")
    print(f"Environment specs:")
    print(f"  - Observation space: {env.observation_space}")
    print(f"  - Action space: {env.action_space}")
    print(f"  - Max steps per episode: {max_steps_per_episode}")
    
    for episode in range(num_episodes):
        print(f"Episode {episode + 1} of {num_episodes}")
        observation, info = env.reset()
        done = False
        step_count = 0
    
        
        while not done and step_count < max_steps_per_episode:
            # Choose action based on policy
            if action_policy == 'random':
                action = env.action_space.sample()
            elif action_policy == 'uniform':
                # Uniform distribution over actions
                action = np.random.randint(0, env.action_space.n)
            else:
                # Custom policy function
                action = action_policy(observation, step_count, episode)
            
            # Take a step in the environment
            next_observation, reward, terminated, truncated, info = env.step(action)
            
            # Store the results
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            terminateds.append(terminated)
            truncateds.append(truncated)
            infos.append(info)
            
            observation = next_observation
            done = terminated or truncated
            step_count += 1
        
    # Convert lists to NumPy arrays for easier manipulation
    observations_np = np.array(observations)
    actions_np = np.array(actions)
    rewards_np = np.array(rewards)
    terminateds_np = np.array(terminateds)
    truncateds_np = np.array(truncateds)
    terminals = (terminateds_np | truncateds_np).astype(int)
    
    # Calculate statistics
    total_transitions = len(observations)
    
    return {"observations": observations_np, "actions": actions_np, "rewards": rewards_np, "terminals": terminals}
    


In [12]:
data_dict = collect_sepsis_data(num_episodes=1000, max_steps_per_episode=1e6, 
                       action_policy='random', verbose=False)

Starting sepsis data collection for 1000 episodes...
Environment specs:
  - Observation space: Box(-inf, inf, (46,), float32)
  - Action space: Discrete(24)
  - Max steps per episode: 1000000.0
Episode 1 of 1000
Episode 2 of 1000
Episode 3 of 1000
Episode 4 of 1000
Episode 5 of 1000
Episode 6 of 1000
Episode 7 of 1000
Episode 8 of 1000
Episode 9 of 1000
Episode 10 of 1000
Episode 11 of 1000
Episode 12 of 1000
Episode 13 of 1000
Episode 14 of 1000
Episode 15 of 1000
Episode 16 of 1000
Episode 17 of 1000
Episode 18 of 1000
Episode 19 of 1000
Episode 20 of 1000
Episode 21 of 1000
Episode 22 of 1000
Episode 23 of 1000
Episode 24 of 1000
Episode 25 of 1000
Episode 26 of 1000
Episode 27 of 1000
Episode 28 of 1000
Episode 29 of 1000
Episode 30 of 1000
Episode 31 of 1000
Episode 32 of 1000
Episode 33 of 1000
Episode 34 of 1000
Episode 35 of 1000
Episode 36 of 1000
Episode 37 of 1000
Episode 38 of 1000
Episode 39 of 1000
Episode 40 of 1000
Episode 41 of 1000
Episode 42 of 1000
Episode 43 of 100

In [13]:
import d3rlpy
dataset = d3rlpy.dataset.MDPDataset(
    **data_dict
)

2025-10-05 20:28.27 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(46,)]) reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)])
2025-10-05 20:28.27 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-10-05 20:28.27 [info     ] Action size has been automatically determined. action_size=24


In [15]:
# Save the dataset and the data dictionary
from datetime import datetime
import pickle
num_episodes=1000
action_policy='random'

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
dataset_filename = f"sepsis_data_set_{num_episodes}ep_{action_policy}_{timestamp}.pkl"
datadict_filename = f"sepsis_data_dict_{num_episodes}ep_{action_policy}_{timestamp}.pkl"
with open(dataset_filename, 'wb') as f:   
    pickle.dump(dataset, f)
    print(f"Data saved to: {dataset_filename}")
    
with open(datadict_filename, 'wb') as f:
    pickle.dump(data_dict, f)
    print(f"Data saved to: {datadict_filename}")


Data saved to: sepsis_data_set_1000ep_random_20251005_202927.pkl
Data saved to: sepsis_data_dict_1000ep_random_20251005_202927.pkl
