In [19]:
# Filter tensorflow version warnings
import os
# https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints/40426709
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import warnings
# https://stackoverflow.com/questions/15777951/how-to-suppress-pandas-future-warning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)
import logging
tf.get_logger().setLevel(logging.ERROR)

import gym
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.deepq import DQN
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, A2C


from env.SepsisEnv import SepsisEnv
from load_data import load_data
from add_reward import add_reward_df, add_end_episode_df
import pandas as pd
import numpy as np

from tqdm import tqdm

### Load Data

In [2]:
df = load_data()
df = add_reward_df(df)
df = add_end_episode_df(df)

In [3]:
df = df.reset_index()

In [7]:
total_timesteps = 20
iterations = 200

In [8]:
def train_model(env, model, total_timesteps, iterations):
    model.learn(total_timesteps=total_timesteps)
    reward_list = []
    obs = env.reset()
    patient_count = 0
    for _ in tqdm(range(iterations)):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        reward_list.append(rewards)
        if done:
            patient_count += 1
            obs = env.reset()
        # env.render()
    print('Model: ', model.__class__)
    print('Policy: ', model.policy)
    print('Total patients: ', patient_count)
    print('Total reward:', sum(reward_list))

---

In [15]:
env = DummyVecEnv([lambda: SepsisEnv(df)])

In [16]:
models = [
    PPO2(MlpPolicy, env, verbose=0),
    PPO2(MlpLstmPolicy, env, nminibatches=1, verbose=0),
    PPO2(MlpLnLstmPolicy, env, nminibatches=1, verbose=0),
    A2C(MlpPolicy, env, lr_schedule='constant'),
    A2C(MlpLstmPolicy, env, lr_schedule='constant'),
    DQN(env=env,
        policy=DQN_MlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        ),
    DQN(env=env,
        policy=LnMlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        )
]

In [9]:
for model in models:
    env = DummyVecEnv([lambda: SepsisEnv(df)])
    train_model(env=env, model=model, total_timesteps=total_timesteps, iterations=iterations)

100%|██████████| 200/200 [00:00<00:00, 330.21it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Model:  <class 'stable_baselines.ppo2.ppo2.PPO2'>
Policy:  <class 'stable_baselines.common.policies.MlpPolicy'>
Total patients:  5
Total reward: [-4.6499996]


100%|██████████| 200/200 [00:00<00:00, 233.42it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Model:  <class 'stable_baselines.ppo2.ppo2.PPO2'>
Policy:  <class 'stable_baselines.common.policies.MlpLstmPolicy'>
Total patients:  5
Total reward: [-5.500003]


100%|██████████| 200/200 [00:01<00:00, 101.75it/s]


Model:  <class 'stable_baselines.ppo2.ppo2.PPO2'>
Policy:  <class 'stable_baselines.common.policies.MlpLnLstmPolicy'>
Total patients:  5
Total reward: [-4.75]


100%|██████████| 200/200 [00:00<00:00, 372.32it/s]


Model:  <class 'stable_baselines.a2c.a2c.A2C'>
Policy:  <class 'stable_baselines.common.policies.MlpPolicy'>
Total patients:  5
Total reward: [-5.4000025]


100%|██████████| 200/200 [00:00<00:00, 344.02it/s]

Model:  <class 'stable_baselines.a2c.a2c.A2C'>
Policy:  <class 'stable_baselines.common.policies.MlpLstmPolicy'>
Total patients:  5
Total reward: [-5.250002]





---

#### 2) Random Model

In [17]:
def train_baseline_models(df, iterations, constant=False):
    reward_list = []
    env = DummyVecEnv([lambda: SepsisEnv(df)])
    obs = env.reset()
    patient_count = 0
    for _ in tqdm(range(iterations)): 
        if constant:
            obs, rewards, done, info = env.step(np.array([0]))
        else:
            action = np.random.choice([0,1], size=1)
            obs, rewards, done, info = env.step(action)
        reward_list.append(rewards)
        if done:
            patient_count += 1
            obs = env.reset()
    if constant:
        print('Model: All Non-sepsis')
    else:
        print('Model: Random')
    print('Total patients: ', patient_count)
    print('Total reward:', sum(reward_list))

In [21]:
train_baseline_models(df, iterations=iterations, constant=False)
train_baseline_models(df, iterations=iterations, constant=True)

100%|██████████| 200/200 [00:00<00:00, 547.31it/s]
 57%|█████▊    | 115/200 [00:00<00:00, 569.71it/s]

Model: Random
Total patients:  5
Total reward: [-5.2000017]


100%|██████████| 200/200 [00:00<00:00, 564.67it/s]

Model: All Non-sepsis
Total patients:  5
Total reward: [0.]



