In [11]:
import numpy as np
import scipy.linalg as sc_linalg

# Sample a random scalar linear-quadratic (LQ) problem
#     transition: s' = A s + B a
#     reward:     Q s² + R a² + 2 S s a
# and keep re-sampling until the Riccati solution P yields a usable linear
# policy a = -K s.  A sample is accepted only when
#   * the solver returned without error,
#   * R_hat = R + B² P is strictly negative (reward strictly concave in the
#     action, so the greedy action is a maximiser),
#   * P really satisfies the discrete algebraic Riccati equation, and
#   * the closed loop s' = (A - B K) s is contractive.
SEED = 0  # set to None to draw a different system on every run
rng = np.random.default_rng(SEED)

# Loop flag: becomes True once a sample passes every acceptance check below.
controllable = False

while not controllable:
    A = rng.uniform(-10, 10)
    B = rng.uniform(-10, 10)
    Q = rng.uniform(-10, 0)  # state coefficient is drawn from the negative range only
    R = rng.uniform(-10, 10)
    S = rng.uniform(-5, 5)

    try:
        # `e` is passed explicitly as None (its default): no descriptor matrix.
        P = sc_linalg.solve_discrete_are(A, B, Q, R, e=None, s=S)[0, 0]
    except sc_linalg.LinAlgError:
        # The solver could not produce a finite solution for this draw; retry.
        continue

    S_hat = S + A * B * P
    R_hat = R + B ** 2 * P

    # Written as `not (... < 0)` so that a NaN or zero R_hat is rejected
    # before it is used as a divisor.
    if not R_hat < 0:
        continue
    K = S_hat / R_hat

    # Scalar DARE residual: A² P - P - S_hat² / R_hat + Q should vanish.
    # The random Q/R draws are not guaranteed to admit a real stabilising
    # solution, so the returned P is verified instead of trusted.
    residual = A ** 2 * P - P - S_hat * K + Q
    scale = abs(A ** 2 * P) + abs(P) + abs(S_hat * K) + abs(Q)
    solves_dare = abs(residual) <= 1e-8 * scale

    # With a = -K s the closed loop is s' = (A - B K) s.
    stable = abs(A - B * K) < 1

    if solves_dare and stable:
        controllable = True

        print("Transition: s' = As + Ba")
        print(f"Transition: s' = {np.around(A, 2)}s + {np.around(B, 2)}a")
        print("Reward: Qs² + Ra² + 2 Ssa")
        print(f"Reward: {np.around(Q, 2)}s² + {np.around(R, 2)}a² + {np.around(2 * S, 2)}sa")

Transition: s' = As + Ba
Transition: s' = -8.05s + -1.84a
Reward: Qs² + Ra² + 2 Ssa
Reward: -9.46s² + -7.43a² + 4.19sa


In [12]:
# Roll out the linear policy a = -K s for a fixed number of steps from a
# fixed start state, printing every transition and the accumulated reward.
def _lq_step(s):
    """Return (action, next state, reward) for one step taken from state `s`."""
    a = -K * s
    r = s ** 2 * Q + a ** 2 * R + 2 * s * a * S
    return a, A * s + B * a, r


state, sum_reward = 4, 0

for i in range(15):
    action, next_state, reward = _lq_step(state)

    print(
        "State:", np.around(state, 2),
        "action:", np.around(action, 2),
        "next state:", np.around(next_state, 2),
        "reward:", np.around(reward, 2),
    )
    sum_reward = sum_reward + reward
    state = next_state

print(f"Sum reward: {sum_reward}")

State: 4 action: -17.24 next state: -0.44 reward: -2647.11
State: -0.44 action: 1.91 next state: 0.05 reward: -32.37
State: 0.05 action: -0.21 next state: -0.01 reward: -0.4
State: -0.01 action: 0.02 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
State: -0.0 action: 0.0 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
State: -0.0 action: 0.0 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
State: -0.0 action: 0.0 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
State: -0.0 action: 0.0 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
State: -0.0 action: 0.0 next state: 0.0 reward: -0.0
State: 0.0 action: -0.0 next state: -0.0 reward: -0.0
Sum reward: -2679.8792130815004
