In [1]:
import functools
import itertools
import random
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
import gymnasium as gym
import torch

import turingpoint.gymnasium_utils as tp_gym_utils
import turingpoint.sb3_utils as tp_sb3_utils
import turingpoint.utils as tp_utils
import turingpoint as tp

from tqdm import trange

In [2]:
%reload_ext mermaid

In [3]:
# Gymnasium environment id; passed to gym.make(...) both in main() and in the rendering cell further down.
env_name = 'CartPole-v1'

In [4]:
def evaluate(env, agent, num_episodes: int) -> float:
  """Return the average total reward per episode for `agent` acting on `env`.

  A turingpoint assembly is launched `num_episodes` times. Every launch first
  resets the environment and then keeps cycling through: deterministic
  prediction by the agent, an environment step, reward collection and
  `check_done` (presumably what ends the launch once the episode is over).

  Args:
    env: environment handed to `call_reset` / `call_step`.
    agent: agent handed to `call_predict`.
    num_episodes: how many times the assembly is launched.

  Returns:
    The sum of every collected reward divided by `num_episodes`.
  """

  collector = tp_utils.Collector(['reward'])

  def participants():
    reset_env = functools.partial(tp_gym_utils.call_reset, env=env)
    choose_action = functools.partial(
        tp_sb3_utils.call_predict, agent=agent, deterministic=True
    )
    step_env = functools.partial(tp_gym_utils.call_step, env=env)

    # A single reset up front, then the same four participants over and over.
    yield reset_env
    yield from itertools.cycle(
        [choose_action, step_env, collector, tp_gym_utils.check_done]
    )

  assembly = tp.Assembly(participants)

  # The collector is intentionally never cleared between launches, so it ends up
  # holding the rewards of all episodes.
  for _episode in range(num_episodes):
    assembly.launch()

  reward_sum = 0
  for entry in collector.get_entries():
    reward_sum += entry['reward']

  return reward_sum / num_episodes


def train(agent, total_timesteps):
  """Train `agent` for `total_timesteps` steps with a progress bar displayed.

  The agent learns from the environment it holds internally. A loop built from
  participants could be used for training as well; that is not shown here.
  """
  learn_options = {'total_timesteps': total_timesteps, 'progress_bar': True}
  agent.learn(**learn_options)


def main():
  """Evaluate a fresh PPO agent, train it briefly, evaluate it again and save it.

  Python's `random`, NumPy and PyTorch are seeded with 1, and the environment's
  first reset is seeded with 1 as well. The mean reward over 100 evaluation
  episodes is printed before and after 1_000 training timesteps, and the agent
  is then saved under the name "ppo_cartpole".
  """

  random.seed(1)
  np.random.seed(1)
  torch.manual_seed(1)

  env = gym.make(env_name)

  try:
    env.reset(seed=1)

    agent = PPO(MlpPolicy, env, verbose=0) # use verbose=1 for debugging

    mean_reward_before_train = evaluate(env, agent, 100)
    print("before training")
    print(f'{mean_reward_before_train=}')

    train(agent, total_timesteps=1_000)

    mean_reward_after_train = evaluate(env, agent, 100)
    print("after training")
    print(f'{mean_reward_after_train=}')

    agent.save("ppo_cartpole")
  finally:
    # Release the environment even when evaluation, training or saving raises;
    # previously it was never closed.
    env.close()

In [5]:
# Entry point: run the evaluate / train / evaluate / save sequence in main()
# only when this code executes as the main module.
if __name__ == "__main__":
  main()

Output()

before training
mean_reward_before_train=9.11


after training
mean_reward_after_train=154.38


In [6]:
# New environment instance requested with render_mode="human"; used by the rollout cells that follow.
env = gym.make(env_name, render_mode="human")

In [7]:
# Load the model that main() saved as "ppo_cartpole" and attach the rendering environment to it.
agent = PPO.load("ppo_cartpole", env=env)

In [8]:
# Play 10 episodes with the loaded agent; trange displays progress over the episodes.
for _ in trange(10):
    obs, info = env.reset()
    episode_over = False
    while not episode_over:
        # Only the observation is passed to predict() (info is not used). `deterministic`
        # is left at its default here, whereas evaluate() passes deterministic=True.
        action, _state = agent.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
    
        # The episode is over as soon as the step reports termination or truncation.
        episode_over = terminated or truncated

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.32it/s]


In [9]:
# Close the rendering environment now that the rollouts are finished.
env.close()

# How can we learn, and what should we learn?
# How shall we use what we learn, to make smart decisions?

```mermaid
graph TD
    A[Agent] -->|Takes action| B[Environment]
    B -->|Returns new state and reward| A
```

## Idea: learn a Policy

```mermaid
graph LR
    Start(( )) -->|Observation| A[Policy]
    A -->|Action| End(( ))
```

### Example: *Given the board and castling rights, what is the best Chess move?*
### Example (continues): *Given the current sensor readings, what angle should the steering wheel be in?*
###
### Can also be **non-deterministic**: for example with discrete set of actions
### (In the drawing below Softmax activation is assumed. Alternatively, replace "prob." with "logits" and go from there).

```mermaid
graph LR
    Start(( )) -->|Observation| A[Policy]
    A -->|"Action 1 (prob.)"| End1(( ))
    A -->|"Action 2 (prob.)"| End2(( ))
    A -->|"..."| End(( ))
    A -->|"Action n (prob.)"| Endn(( ))
```

[REINFORCE Turingpoint example (cart-pole)](https://github.com/zbenmo/turingpoint/blob/main/examples/cart-pole-gym-torch-reinforce.py)

### But also for continuous actions (assumed to be Gaussian distributed):

```mermaid
graph LR
    Start(( )) -->|Observation| A[Policy]
    A -->|"Action μ"| End1(( ))
    A -->|"Action σ"| End2(( ))
```

## Idea: learn a State-Value Function

```mermaid
graph LR
    Start(( )) -->|Observation| A[Value]
    A -->|"Expected Total (discounted) Reward"| End(( ))
```

### Example: given the board and all possible moves, evaluate which of those moves leads to a board with the highest total reward?
### (Pay attention: Unless we've just won, it is now the other player's turn).
### Note: A value of a (Chess) state depends very much on the player:
### Myself against Magnus Carlsen or against Gukesh Dommaraju, probably -1 for almost any game.

## Idea: learn an Action-Value Function (Q-Learning)

```mermaid
graph LR
    Start1(( )) -->|Observation| A[Q]
    Start2(( )) -->|Action| A
    A -->|"Expected Total (discounted) Reward"| End(( ))
```

### Example: Compare "No-op", "Left", "Right", "Fire". Which leads to the best total reward?

### When the action space is discrete, it may make sense to have multiple concurrent "heads" (note: no Softmax):

```mermaid
graph LR
    Start1(( )) -->|Observation| A[Q]
    A -->|"Expected Total (discounted) Reward (for action 1)"| End1(( ))
    A -->|"Expected Total (discounted) Reward (for action 2)"| End2(( ))
    A -->|"..."| End3(( ))
    A -->|"Expected Total (discounted) Reward (for action n)"| Endn(( ))
```


## Idea: learn an Actor and a Critic

### Ex. a Policy and a State-Value Function

### Why is it any better than just learning the State-Value Function? We're going to choose the maximum anyhow..
### Well, if we train (the Actor) using gradient descent, it helps if we calculate, for the sake of the loss, the "Advantage" of an action in a state. So it helps to center the values around zero.

```mermaid
graph LR
    Start1(( )) -->|Observation| A[Critic]
    A -->|"Expected Total (discounted) Reward"| C[Calc. Advantages]
    Start2(( )) -->|Observation| B[Actor]
    B -->|"Expected Total (discounted) Reward"| End1(( )) --> C
    B -->|"Expected Total (discounted) Reward"| End2(( )) --> C
    B -->|"..."| End3(( )) --> C
    B -->|"Expected Total (discounted) Reward"| Endn(( )) --> C
```


## Idea: learn a State-Value Function but also a Model of the Environment's transitions
## (What happens if we take action _a_ at state _s_?)
## Use the Model, for example for Planning. 

```mermaid
graph LR
    Start1(( )) -->|Observation| A[Model]
    Start2(( )) -->|"Action 1"| A
    A -->|Expected next Observation| B[Value] --> |"Expected Total (discounted) Reward (Action 1)"| End1(( ))
    Start1 -->|Observation| A2[Model]
    Start4(( )) -->|"Action 2"| A2
    A2 -->|Expected next Observation| B2[Value] --> |"Expected Total (discounted) Reward (Action 2)"| End2(( ))
```