In [1]:
# module imports
import numpy as np

from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
from src.taskgym import HaydenRiskTrial

In [2]:
# build the HaydenRiskTrial task environment with the offer amounts,
# hyperactivity penalty and choice-made reward used throughout this notebook
# (the exact meaning of each parameter is defined in src.taskgym)
env = HaydenRiskTrial(
    offer_amounts=(1, 5, 10),
    penalty_hyperactive=-5,
    reward_choice_made=0.5,
)

In [3]:
# reset the environment; reset() returns a pair that is unpacked here into the
# initial observation (obs) and an accompanying info object
obs, info = env.reset()

In [4]:
# create the recurrent PPO agent ("MlpLstmPolicy") on top of the task
# environment; verbose=1 turns on the setup/training log output
model = RecurrentPPO(
    policy="MlpLstmPolicy",
    env=env,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [5]:
# train for a budget of 5000 environment timesteps (the argument counts
# timesteps, not episodes or runs - see total_timesteps in the training log)
model.learn(total_timesteps=5000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 67       |
|    ep_rew_mean     | 21.6     |
| time/              |          |
|    fps             | 201      |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 128      |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 65            |
|    ep_rew_mean          | 6.6           |
| time/                   |               |
|    fps                  | 139           |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 256           |
| train/                  |               |
|    approx_kl            | 4.8835296e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.1          |
|    explained_variance   | 0.00394       |


<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x7f3c6398f410>

In [6]:
# adjustable evaluation run
#
# Rolls out the trained recurrent policy for `episode_counts` episodes and
# collects, per episode, the total reward and a step-by-step trial history in
# `results` (a list of dicts with keys "episode", "total_reward", "trials").
episode_counts = 100  # Number of episodes for evaluation
episode_rewards = []  # not filled in this cell; kept defined in case later cells reference it (TODO confirm)
results = []

for episode in range(episode_counts):
    obs, info = env.reset()
    done = False
    # every episode starts from a fresh LSTM state, and its first step is
    # flagged as an episode start for the recurrent policy
    lstm_states = None
    episode_start = np.ones((1,), dtype=bool)
    episode_reward = 0
    trial_history = []

    while not done:
        # sample an action (deterministic=False) while carrying the LSTM state
        # forward from one step to the next
        action, lstm_states = model.predict(
            obs,
            state=lstm_states,
            deterministic=False,
            episode_start=episode_start,
        )

        obs, reward, terminated, truncated, info = env.step(action)
        # the episode is over when it terminates OR is truncated; checking
        # only the termination flag would keep stepping a truncated episode
        done = bool(terminated or truncated)
        episode_start = np.array([done])

        # append step to trial history
        # note: "obs" is the observation returned by this step, i.e. the one
        # that follows the recorded action
        trial_history.append({
            "obs": obs,
            "action": action,
            "reward": reward,
            "info": info
        })
        episode_reward += reward

    results.append({
        "episode": episode,
        "total_reward": episode_reward,
        "trials": trial_history
    })
    print(f"Episode {episode}: Reward = {episode_reward}")

Episode 0: Reward = -4.100000000000001
Episode 1: Reward = -4.5
Episode 2: Reward = -3.900000000000002
Episode 3: Reward = 1.8999999999999981
Episode 4: Reward = -4.5
Episode 5: Reward = -3.300000000000002
Episode 6: Reward = -3.100000000000002
Episode 7: Reward = 46.1
Episode 8: Reward = -4.799999999999999
Episode 9: Reward = -3.100000000000002
Episode 10: Reward = 21.599999999999998
Episode 11: Reward = 21.699999999999996
Episode 12: Reward = -3.700000000000002
Episode 13: Reward = -4.300000000000001
Episode 14: Reward = -4.300000000000001
Episode 15: Reward = -3.300000000000002
Episode 16: Reward = -4.100000000000001
Episode 17: Reward = -4.000000000000002
Episode 18: Reward = -3.5000000000000018
Episode 19: Reward = 21.9
Episode 20: Reward = -3.6000000000000014
Episode 21: Reward = 21.1
Episode 22: Reward = -3.400000000000002
Episode 23: Reward = -4.4
Episode 24: Reward = -4.699999999999999
Episode 25: Reward = -4.999999999999998
Episode 26: Reward = -3.800000000000002
Episode 27: 

### Let's visualize how our agent behaved in this situation. First, let's draw a simple bar graph showing the percentage of episodes in which the agent picked the reward with the higher expected value (EV).