In [18]:
import numpy as np 

from pbo.environment.linear_quadratic import LinearQuadraticEnv
import control

mdp = LinearQuadraticEnv(
    A=2 * np.random.random(size=(1, 1)) - 1,
    B=2 * np.random.random(size=(1, 1)) - 1,
    Q=2 * np.random.random(size=(1, 1)) - 1,
    R=2 * np.random.random(size=(1, 1)) - 1,
    S=np.zeros((1, 1)),
    max_pos=10,
    max_action=10,
    initial_state=np.array([3]),
)

# dlqr minimises J = \\Sum_0^\\infty (s[n]' Q_given s[n] + a[n]' R_given a[n] + + 2 s[n]' S_given a[n])
# here we want to maximises the quantity 
# this is why we need to have Q_given = -Q, R_given = -R, S_given = -S
K, S, E = control.dlqr(mdp.A, mdp.B, -mdp.Q, -mdp.R, -mdp.S)

print("Transition: s' = As + Ba")
print(f"Transition: s' = {np.around(mdp.A[0, 0], 2)}s + {np.around(mdp.A[0, 0], 2)}a")
print("Reward: Qs² + Ra² + 2 Sas")
print(f"Reward: {np.around(mdp.Q[0, 0], 2)}s² + {np.around(mdp.R[0, 0], 2)}a² + 2 {np.around(mdp.S[0, 0], 2)}as")

Transition: s' = As + Ba
Transition: s' = -0.66s + -0.66a
Reward: Qs² + Ra² + 2 Sas
Reward: 0.42s² + -0.14a² + 2 0.0as


In [19]:
S_hat = mdp.S + mdp.B * S * mdp.A
R_hat = mdp.R + mdp.B * S * mdp.B
K_hat = S_hat[0, 0] / R_hat[0, 0]
print("K from theory:", K_hat)
print("K given:", K[0, 0])

K from theory: 2.4123148502450835
K given: 1.377419857182332


In [62]:
import control

K, S, E = control.dlqr(mdp.A, mdp.B, mdp.Q, mdp.R)
# K = np.array([[0]])

state = mdp.reset()
terminal = False
step = 0

while not terminal and step < 10:
    next_state, reward, terminal, _ = mdp.step(- K @ state)

    print(f"State: {np.around(state[0], 2)}", end=" ")
    print(f"Action: {np.around((- K @ state)[0], 2)}", end=" ")
    print(f"Reward: {np.around(reward, 4)}")

    state = next_state
    step += 1

State: 3 Action: 0.01 Reward: -0.9793
State: 0.06 Action: 0.0 Reward: -0.0004
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0
State: 0.0 Action: 0.0 Reward: -0.0


In [67]:
S

array([[0.10885098]])

In [64]:
K[0, 0]

-0.0021193163077151203

In [65]:
mdp.A * 3 + mdp.B * 1.91

array([[-1.41677715]])

In [66]:
(mdp.A[0, 0] * S[0, 0] * mdp.B[0, 0]) / (mdp.R[0, 0] + mdp.B[0, 0] ** 2)

-0.0012833809110176309