## Define the environment

In [4]:
import numpy as np 
from pbo.environment.linear_quadratic import LinearQuadraticEnv
import jax


key = jax.random.PRNGKey(0)

env = LinearQuadraticEnv(key, max_init_state=5)

Transition: s' = As + Ba
Transition: s' = -0.44256162643432617s + 0.37337517738342285a
Reward: Qs² + Ra² + 2 Ssa
Reward: -0.41404926776885986s² + -0.48342466354370117a² + 0.8196666240692139sa


## Test random agent

In [5]:
initial_state = np.array([3])
state = env.reset(state=initial_state)
terminal = False
step = 0
sum_reward = 0

while not terminal and step < 10:
    action = np.random.uniform(-1, 1, size=1)
    next_state, reward, terminal, _ = env.step(action)

    print(f"State: {np.around(state[0], 2)}", end=" ")
    print(f"Action: {np.around(action[0], 2)}", end=" ")
    print(f"Reward: {np.around(reward, 4)}")

    state = next_state
    sum_reward += reward
    step += 1

print(f"Sum reward: {sum_reward}")

State: 3 Action: -0.95 Reward: [-6.4953]
State: -1.6799999475479126 Action: -0.45 Reward: [-0.6468]
State: 0.5799999833106995 Action: -0.7 Reward: [-0.7091]
State: -0.5199999809265137 Action: -0.94 Reward: [-0.14109999]
State: -0.11999999731779099 Action: 0.66 Reward: [-0.2857]
State: 0.29999998211860657 Action: 0.45 Reward: [-0.0239]
State: 0.029999999329447746 Action: -0.38 Reward: [-0.07889999]
State: -0.14999999105930328 Action: 0.54 Reward: [-0.2224]
State: 0.26999998092651367 Action: 0.43 Reward: [-0.0241]
State: 0.03999999910593033 Action: -0.24 Reward: [-0.0375]
Sum reward: [-8.664504]


## Test optimal agent

In [6]:
initial_state = np.array([3])
state = env.reset(state=initial_state)
terminal = False
step = 0
sum_reward = 0


while not terminal and step < 10:
    action = env.optimal_action()
    next_state, reward, terminal, _ = env.step(action)

    print(f"State: {np.around(state[0], 2)}", end=" ")
    print(f"Action: {np.around(action[0], 2)}", end=" ")
    print(f"Reward: {np.around(reward, 4)}")

    state = next_state
    sum_reward += reward
    step += 1
print(f"Sum reward: {sum_reward}")

State: 3 Action: 2.559999942779541 Reward: [-0.59959996]
State: -0.3700000047683716 Action: -0.3199999928474426 Reward: [-0.0092]
State: 0.04999999701976776 Action: 0.03999999910593033 Reward: [-1.e-04]
State: -0.009999999776482582 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
State: 0.0 Action: 0.0 Reward: [0.]
Sum reward: [-0.6089273]
