In [1]:
# module imports
import numpy as np

from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
from src.taskgym import HaydenRiskTrial

In [2]:
# Build the task environment.
# NOTE(review): judging by the keyword names, the agent chooses among three
# offer amounts, earns `reward_choice_made` when it commits to a choice and
# `penalty_no_choice` when it does not — confirm against src/taskgym.
env = HaydenRiskTrial(
    offer_amounts=(5, 10, 20),
    reward_choice_made=10,
    penalty_no_choice=-20,
)

In [3]:
# Reset the environment to the start of an episode, keeping the initial
# observation and the auxiliary info dict that reset() returns.
obs, info = env.reset()

In [4]:
# Create a recurrent (LSTM) PPO agent bound to the task environment.
# The library wraps `env` in Monitor / DummyVecEnv internally (see the cell
# output); `env` itself stays available for manual stepping later on.
model = RecurrentPPO(
    policy="MlpLstmPolicy",
    env=env,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
# Train the agent for a budget of 50,000 environment timesteps
# (the argument counts timesteps, not episodes or training runs).
model.learn(total_timesteps=50_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 64       |
|    ep_rew_mean     | -49.3    |
| time/              |          |
|    fps             | 190      |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 128      |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 64            |
|    ep_rew_mean          | -76.5         |
| time/                   |               |
|    fps                  | 133           |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 256           |
| train/                  |               |
|    approx_kl            | 2.4525449e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.1          |
|    explained_variance   | -0.000107     |


In [None]:
# Adjustable evaluation run: roll the trained agent through `episode_counts`
# episodes on the raw (unwrapped) environment and record every step.
episode_counts = 100  # Number of episodes for evaluation
episode_rewards = []  # total reward of each episode, in episode order
results = []          # one record per episode: index, total reward, step history

for episode in range(episode_counts):
    # Every episode starts from a fresh environment and a fresh LSTM state.
    obs, info = env.reset()
    lstm_states = None
    # Tells the recurrent policy that its hidden state must be reset on this step.
    episode_start = np.ones((1,), dtype=bool)
    done = False
    episode_reward = 0
    trial_history = []

    while not done:
        # deterministic=False samples actions from the policy distribution, so
        # repeated evaluation runs are stochastic.
        action, lstm_states = model.predict(
            obs,
            state=lstm_states,
            deterministic=False,
            episode_start=episode_start,
        )

        obs, reward, terminated, truncated, info = env.step(action)
        # An episode ends on termination OR truncation; watching only the
        # first flag would loop forever on an episode cut off by a time limit.
        done = bool(terminated or truncated)
        episode_start = np.array([done])

        # Append step to trial history. Note that "obs" is the observation
        # returned by this step, i.e. the one observed *after* the action.
        trial_history.append({
            "obs": obs,
            "action": action,
            "reward": reward,
            "info": info
        })
        episode_reward += reward

    episode_rewards.append(episode_reward)
    results.append({
        "episode": episode,
        "total_reward": episode_reward,
        "trials": trial_history
    })
    print(f"Episode {episode}: Reward = {episode_reward}")

### Let's visualize how our agent behaved in this situation. First, let's draw a simple bar graph showing the percentage of episodes in which the agent picked the reward with the higher expected value (EV).