In [1]:
# module imports
import numpy as np

from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
from src.taskgym import HaydenRiskTrial

In [2]:
# build the task environment, offering the three amounts 1, 5 and 10
env = HaydenRiskTrial(offer_amounts=(1, 5, 10))

In [3]:
# reset the environment once to obtain an initial observation and info dict
obs, info = env.reset()

In [4]:
# create a recurrent PPO agent (MLP + LSTM policy) bound to the task environment;
# verbose=1 makes the library print setup and training progress
model = RecurrentPPO(policy="MlpLstmPolicy", env=env, verbose=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [5]:
# train for a budget of 5000 environment timesteps (timesteps, not episodes);
# the call is left as the cell's last expression, so the model repr is displayed
model.learn(total_timesteps=5000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 68       |
|    ep_rew_mean     | 46.4     |
| time/              |          |
|    fps             | 174      |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 128      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 68           |
|    ep_rew_mean          | 12.9         |
| time/                   |              |
|    fps                  | 118          |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 3.240863e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | 0.000634     |
|    learning_r

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x7fbc5eb7d4d0>

In [6]:
# adjustable evaluation run
# Roll out the trained recurrent policy for `episode_counts` episodes, sampling
# actions stochastically (deterministic=False), and record every step so the
# agent's choices can be analysed afterwards.
episode_counts = 100  # Number of episodes for evaluation
episode_rewards = []  # total reward of each episode, in episode order
results = []  # one dict per episode: index, total reward, per-step history

for episode in range(episode_counts):
    obs, info = env.reset()
    done = False
    # Each episode starts without recurrent memory: no LSTM state is carried
    # over, and the first prediction is flagged as an episode start.
    lstm_states = None
    episode_start = np.ones((1,), dtype=bool)
    episode_reward = 0
    trial_history = []

    while not done:
        action, lstm_states = model.predict(
            obs,
            state=lstm_states,
            deterministic=False,
            episode_start=episode_start,
        )

        obs, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it terminates OR is truncated; checking only
        # the termination flag would loop forever on a truncated episode.
        done = bool(terminated or truncated)
        episode_start = np.array([done])

        # append step to trial history
        # NOTE: "obs" is the observation returned by this step, i.e. the one
        # seen *after* taking "action".
        trial_history.append({
            "obs": obs,
            "action": action,
            "reward": reward,
            "info": info
        })
        episode_reward += reward

    episode_rewards.append(episode_reward)
    results.append({
        "episode": episode,
        "total_reward": episode_reward,
        "trials": trial_history
    })
    print(f"Episode {episode}: Reward = {episode_reward}")

Episode 0: Reward = -2.3000000000000007
Episode 1: Reward = -2.1000000000000005
Episode 2: Reward = -2.700000000000001
Episode 3: Reward = -2.3000000000000007
Episode 4: Reward = -2.2000000000000006
Episode 5: Reward = -2.2000000000000006
Episode 6: Reward = -2.700000000000001
Episode 7: Reward = -2.500000000000001
Episode 8: Reward = -2.1000000000000005
Episode 9: Reward = -2.400000000000001
Episode 10: Reward = -2.0000000000000004
Episode 11: Reward = -2.3000000000000007
Episode 12: Reward = -2.2000000000000006
Episode 13: Reward = -2.500000000000001
Episode 14: Reward = -3.2000000000000015
Episode 15: Reward = -3.1000000000000014
Episode 16: Reward = -2.800000000000001
Episode 17: Reward = -2.3000000000000007
Episode 18: Reward = -2.3000000000000007
Episode 19: Reward = -1.8000000000000005
Episode 20: Reward = 3.7
Episode 21: Reward = -2.800000000000001
Episode 22: Reward = -2.2000000000000006
Episode 23: Reward = -1.8000000000000005
Episode 24: Reward = -1.5000000000000002
Episode 

### Let's visualize how our agent behaved in this situation. First, let's draw a simple bar graph showing the percentage of episodes in which the agent picked the reward with the higher expected value (EV).