In [6]:
import matplotlib, time, copy
import matplotlib.pyplot as plt
import autograd.numpy as np
import autograd.scipy.stats as sps_autograd
from autograd import grad, hessian
from statsmodels.tsa.arima_process import ArmaProcess
from scipy.optimize import minimize
from scipy.linalg import toeplitz
import pandas as pd

# Simulate ARMA data 

In [7]:
"""
Simulate ARMA(1, 1) model
"""

# Define AR and MA coefficients
ar = np.array([1, -0.5])  
ma = np.array([1, 0.4])        

# Create ARMA process object
arma_process = ArmaProcess(ar, ma)

# Simulate 10000 samples
N = 1000
y = arma_process.generate_sample(nsample=N)

# Karman Filter 

In [8]:
def initialize_FGHQ(a, b):
    """
    Construct the state-space matrices F, G, H for an ARMA(p, q) model.

    Parameters:
    - a: list or np.array of AR coefficients [a1, a2, ..., ap]
    - b: list or np.array of MA coefficients [b1, b2, ..., bq]

    Returns:
    - F: state transition matrix of shape (k, k)
    - G: noise coefficient matrix of shape (k, 1)
    - H: observation matrix of shape (1, k)
    - Q: covariance identity matrix of shape (k, k)
    - dF: derivative of F with respect to theta of shape (l, k, k)
    - dG: derivative of G with respect to theta of shape (l, k, 1)
    """
    p = len(a)
    q = len(b)
    k = max(p, q + 1)  # dimension of the state vector
    F = np.zeros((k, k))
    G = np.zeros((k, 1))
    H = np.zeros((1, k))

    # Fill the first column of F with AR coefficients
    for i in range(p):
        F[i, 0] = a[i]

    # Fill the lower subdiagonal of F with 1s (shifting the state)
    for i in range(k - 1):
        F[i, i + 1] = 1

    # Fill G with negative MA coefficients, first element = 1
    for i in range(q):
        G[i+1, 0] = -b[i]
    G[0, 0] = 1  # first element always set to 1

    # Matrix H: only first element is 1
    H[0, 0] = 1

    # Initialize covariance matrix Q as identity matrix
    Q = np.eye(k)

    # Compute derivatives of F and G with respect to theta
    dF = np.zeros((k, k, k))
    dF[0, 0, 0] = 1 # ARMA(1,1)

    dG = np.zeros((k, k, 1))
    dG[1, 1, 0] = -1 # ARMA(1,1)

    return F, G, H, Q, dF, dG 

In [9]:
def log_likelihood(sigma2_hat, r):
    N = len(r)
    return -0.5 * (N * np.log(2 * np.pi) 
                   + N * np.log(sigma2_hat) 
                   + np.sum(np.log(r)) + N)

In [10]:
def karman_filter_arma(theta):
    p = 1   # AR order
    q = 1   # MA order
    a = theta[:p]
    b = theta[p:p+q]
    k = max(p, q + 1)
    
    F, G, H, Q, dF, dG = initialize_FGHQ(a, b)

    # Initialize values
    x = np.zeros((k, 1))
    V = np.eye(k) * 100
    e = np.zeros((N, 1))
    r = np.zeros((N, 1))

    # Implement Kalman filter
    for t in range(N):
        # Predict one-step-ahead state predictive density of x_{t}
        x_predict = F @ x
        V_predict = F @ V @ F.T + G @ G.T

        # Compute forecast error and one-step-ahead predictive variance
        e[t] = y[t] - (H @ x_predict).item()
        r[t] = (H @ V_predict @ H.T).item()

        # Kalman gain
        K = V_predict @ H.T / r[t]

        # Update current state and covariance
        x = x_predict + K * e[t]
        V = (np.eye(k) - K @ H) @ V_predict

    sigma2_hat = np.sum(e**2 / r) / N

    return sigma2_hat, r

In [11]:
def obj_func_likelihood(theta):
    sigma2_hat, r = karman_filter_arma(theta)
    log_lik = log_likelihood(sigma2_hat, r)

    return -log_lik  

In [12]:
"""
Test the log-likelihood function
"""

theta_start = [0.1, 0.1]

# Minimize negative log-likelihood
result = minimize(obj_func_likelihood, theta_start, method='BFGS')

# Print results
print("Estimated parameters:", result.x)
print("Negative log-likelihood:", result.fun)

Estimated parameters: [ 0.49753443 -0.40805501]
Negative log-likelihood: 1383.1985150906685


# Algorithm for the gradient of the model ARMA(1,1)

In [13]:
def compute_initial_V_and_dV(a, b, sigma2):
    """
    Compute the initial V and the initial dV/d(theta).
    Implement the equations from Section 3.3.2 of the document.
    Parameters:
        a: float
        b: float
        sigma2: float (randomly chosen)
    Returns:
        V: numpy array of shape (k, k)
        dV: numpy array of shape (k, k, k)
    """
    p, q = 1, 1
    k = max(p, q + 1)  

    # 1. Compute initial V 
    g = np.zeros(k + 1)
    g[0] = 1.0
    g[1] = a - b
    g[2] = a * g[1]

    C = np.zeros(k + 1)
    C[0] = sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)
    C[1] = a * C[0] - sigma2 * b
    C[2] = a * C[1]

    V = np.zeros((k, k))
    V[0, 0] = C[0]
    V[0, 1] = V[1, 0] = - b * g[0]
    V[1, 1] = b**2 * sigma2

    dV = np.zeros((k, k, k))
    dV[0, 0, 0] = (2 * sigma2 * (a-b) * (1 - a*b)) / (1 - a**2)**2
    dV[0, 0, 1] = 0
    dV[0, 1, 0] = 0
    dV[0, 1, 1] = 0
    dV[1, 0, 0] = 2 * sigma2 * (b-a) / (1 - a**2)
    dV[1, 0, 1] = -1
    dV[1, 1, 0] = -1
    dV[1, 1, 1] = 2 * b * sigma2

    return V, dV

In [None]:
def grad_obj_func_likelihood(theta):
    """
    Implement the gradient of the log-likelihood function for ARMA(1,1) model.
    """
    
    a, b = theta
    k = 2

    """ 
    # Initialize by function
    # Initialize the state-space matrices
    F, G, H, Q, dF, dG = initialize_FGHQ(np.array([a]), np.array([b]))
    
    # Initialize the x and V matrices
    V, dV = compute_initial_V_and_dV(a, b, 100)
    """

    # Initialize by hand
    sigma2 = 100
    # Simple operations that autograd can handle
    F = np.array([[a, 1.0], [0.0, 0.0]])
    G = np.array([[1.0], [-b]])
    H = np.array([[1.0, 0.0]])
    dF = np.array([
        [[1, 0], [0, 0]],  # First parameter (AR coefficient)
        [[0, 0], [0, 0]]   # Second parameter (MA coefficient)
        ])
    dF.reshape(2, 2, 2)
    dG = np.array([
        [[0], [0]],  # First parameter (AR coefficient)
        [[0], [-1]]   # Second parameter (MA coefficient)
        ])
    dG.reshape(2, 2, 1)

    g = np.array([1.0, a - b, a * (a - b)])

    C = np.array([
        sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2),
        a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b,
        a * (a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b)
    ])

    V = np.array([
        [C[0], -b * g[0]],
        [-b * g[0], b**2 * sigma2]
    ])

    dV = np.array([
        [[(2 * sigma2 * (a-b) * (1 - a*b)) / (1 - a**2)**2, 0],
        [0, 0]],
        [[2 * sigma2 * (b-a) / (1 - a**2), -1],
        [-1, 2 * b * sigma2]]
    ])



    x = np.zeros((k, 1))
    dx = np.zeros((2, 2, 1))
    e = np.zeros((N, 1))
    r = np.zeros((N, 1))
    de = np.zeros((N, 2))
    dr = np.zeros((N, 2))

    # 
    for t in range(N):
        # 1. Predict
        # Predict one-step-ahead state predictive density of x_{t}
        x_predict = F @ x
        V_predict = F @ V @ F.T + G @ G.T

        # Compute forecast error and one-step-ahead predictive variance
        e[t] = y[t] - (H @ x_predict)[0, 0]
        r[t] = (H @ V_predict @ H.T)[0, 0]

        GdGT = np.array([G @ dG.T[0][i].reshape(1,2) for i in range(2)])

        # Kalman filter for gradient
        dx_predict = F @ dx + dF @ x
        dV_predict = F @ dV @ F.T + dF @ V @ F.T + F @ V @ dF.T + dG @ G.T + GdGT

        # Calculate de_t and dr_t as tensor(2,1,1)
        de_t = -H @ dx_predict
        dr_t = H @ dV_predict @ H.T


        # 2. Update
        # Kalman gain
        K = V_predict @ H.T / r[t]

        # Update current state and covariance
        x = x_predict + K * e[t]
        V = (np.eye(k) - K @ H) @ V_predict

        # dK = (dV_predict @ H.T / r[t]) - np.einsum('ki,p->pki', (V_predict @ H.T / r[t]**2), dr_t)
        dK = (dV_predict @ H.T / r[t]) - (V_predict @ H.T / r[t]**2) @ dr_t
        dx = dx_predict + K @ de_t + dK * e[t]
        dV = dV_predict - dK @ H @ V_predict - K @ H @ dV_predict

        # Store value de and dr into vectors
        de[t] = de_t.flatten()
        dr[t] = dr_t.flatten()

    # Compute sigma2_hat and gradient of the log-likelihood
    sigma2_hat = np.sum(e**2 / r) / N
    
    grad = -0.5 * sum(dr / r) - (1/sigma2_hat) * sum(de * e / r) + (1/(2*sigma2_hat)) * sum(dr * e**2 / r**2)
    
    return grad

In [17]:
"""
Test the log-likelihood function
"""

theta_start = [0.1, 0.1]

# Minimize negative log-likelihood
result = minimize(obj_func_likelihood, theta_start, method='BFGS', jac=grad_obj_func_likelihood,
                  options={'gtol': 1e-04, 'maxiter': 1000, 'disp': True})

# Print results
print("Estimated parameters:", result.x)
print("Negative log-likelihood:", result.fun)

         Current function value: 1748.331548
         Iterations: 0
         Function evaluations: 56
         Gradient evaluations: 44
Estimated parameters: [0.1 0.1]
Negative log-likelihood: 1748.3315475368995


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


# Manual test 

In [35]:
a = 0.3
b = 0.5
k = 2

# Initialize the state-space matrices
F, G, H, Q, dF, dG = initialize_FGHQ(np.array([a]), np.array([b]))

# Initialize the x and V matrices
V, dV = compute_initial_V_and_dV(a, b, 100)
x = np.zeros((k, 1))
dx = np.zeros((2, 2, 1))
e = np.zeros((N, 1))
r = np.zeros((N, 1))
de = np.zeros((N, 2))
dr = np.zeros((N, 2))


# 
t = 0
    
# 1. Predict
# Predict one-step-ahead state predictive density of x_{t}
x_predict = F @ x
V_predict = F @ V @ F.T + G @ G.T

# Compute forecast error and one-step-ahead predictive variance
e[t] = y[t] - (H @ x_predict).item()
r[t] = (H @ V_predict @ H.T).item()

GdGT = np.array([G @ dG.T[0][i].reshape(1,2) for i in range(2)])

# Kalman filter for gradient
dx_predict = F @ dx + dF @ x
dV_predict = F @ dV @ F.T + dF @ V @ F.T + F @ V @ dF.T + dG @ G.T + GdGT

# Calculate de_t and dr_t as tensor(2,1,1)
de_t = -H @ dx_predict
dr_t = H @ dV_predict @ H.T


# 2. Update
# Kalman gain
K = V_predict @ H.T / r[t]

# Update current state and covariance
x = x_predict + K * e[t]
V = (np.eye(k) - K @ H) @ V_predict

# dK = (dV_predict @ H.T / r[t]) - np.einsum('ki,p->pki', (V_predict @ H.T / r[t]**2), dr_t)
dK = (dV_predict @ H.T / r[t]) - (V_predict @ H.T / r[t]**2) @ dr_t
dx = dx_predict + K @ de_t + dK * e[t]
dV = dV_predict - dK @ H @ V_predict - K @ H @ dV_predict

# Store value de and dr 
de[t] = de_t.flatten()
dr[t] = dr_t.flatten()

# Compute the gradient of the log-likelihood function
sigma2_hat = np.sum(e**2 / r) / N

grad = -0.5 * sum(dr / r) - (1/sigma2_hat) * sum(de * e / r) + (1/(2*sigma2_hat)) * sum(dr * e**2 / r**2)


  sigma2_hat = np.sum(e**2 / r) / N
  grad = -0.5 * sum(dr / r) - (1/sigma2_hat) * sum(de * e / r) + (1/(2*sigma2_hat)) * sum(dr * e**2 / r**2)


In [95]:
dK[0, 1, 0]

0.02352118561015082

In [51]:
dK

array([[[2.22044605e-16],
        [2.35211856e-02]],

       [[4.44089210e-16],
        [1.34630170e-02]]])

In [37]:
F

array([[0.3, 1. ],
       [0. , 0. ]])

In [38]:
G

array([[ 1. ],
       [-0.5]])

In [39]:
H

array([[1., 0.]])

In [80]:
K

array([[ 1.       ],
       [-0.0142468]])

# Auto grad

In [60]:
import autograd.numpy as np

def test(a, b, sigma2):
    """Version using array indexing instead of .at"""
    b = 0.5

    # Create arrays using autograd-compatible operations
    g = np.array([1.0, a - b, a * (a - b)])
    
    sigma2 = 100
    C = np.array([
        sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2),
        a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b,
        a * (a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b)
    ])

    V = np.array([
        [C[0], -b * g[0]],
        [-b * g[0], b**2 * sigma2]
    ])
    
    # Simple operations that autograd can handle
    F = np.array([[a, 1.0], [0.0, 0.0]])
    G = np.array([[1.0], [-b]])
    H = np.array([[1.0, 0.0]])
    
    x = np.zeros((2, 1))
    
    # One step of Kalman filter
    x_predict = F @ x
    V_predict = F @ V @ F.T + G @ G.T
    
    # Use autograd-compatible operations
    e_t = y[0] - (H @ x_predict)[0, 0]
    r_t = (H @ V_predict @ H.T)[0, 0]
    
    K = V_predict @ H.T / r_t
    
    # Return a scalar value
    return K[1, 0]

# Test
from autograd import grad
test_grad = lambda a: test(a, b, sigma2=100)
grad_func = grad(test_grad)
result = grad_func(0.3)
print("Autograd gradient:", result)

Autograd gradient: 0.023521185610150815


# Function to compare derivatives

In [101]:
"""
Implement the gradient for AR.
"""
def a_grad(a, b):
    k = 2
    sigma2 = 100

    # Simple operations that autograd can handle
    F = np.array([[a, 1.0], [0.0, 0.0]])
    G = np.array([[1.0], [-b]])
    H = np.array([[1.0, 0.0]])
    dF = np.array([
        [[1, 0], [0, 0]],  # First parameter (AR coefficient)
        [[0, 0], [0, 0]]   # Second parameter (MA coefficient)
        ])
    dF.reshape(2, 2, 2)
    dG = np.array([
        [[0], [0]],  # First parameter (AR coefficient)
        [[0], [-1]]   # Second parameter (MA coefficient)
        ])
    dG.reshape(2, 2, 1)

    g = np.array([1.0, a - b, a * (a - b)])

    C = np.array([
        sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2),
        a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b,
        a * (a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b)
    ])

    V = np.array([
        [C[0], -b * g[0]],
        [-b * g[0], b**2 * sigma2]
    ])

    dV = np.array([
        [[(2 * sigma2 * (a-b) * (1 - a*b)) / (1 - a**2)**2, 0],
        [0, 0]],
        [[2 * sigma2 * (b-a) / (1 - a**2), -1],
        [-1, 2 * b * sigma2]]
    ])

    # Initialize the x and stored values
    x = np.zeros((k, 1))
    dx = np.zeros((2, 2, 1))

    # test
    dict = {
        'K1': [],
        'dK1': [],
        'K2': [],
        'dK2': [],
        'x1': [],
        'x2': [],
        'dx1': [],
        'dx2': [],
        'V11': [],
        'V12': [],
        'V21': [],
        'V22': [],
        'dV11': [],
        'dV12': [],
        'dV21': [],
        'dV22': []
    }

    # 
    for t in range(N):
        # 1. Predict
        # Predict one-step-ahead state predictive density of x_{t}
        x_predict = F @ x
        V_predict = F @ V @ F.T + G @ G.T

        # Compute forecast error and one-step-ahead predictive variance
        e_t = y[t] - (H @ x_predict)[0, 0]
        r_t = (H @ V_predict @ H.T)[0, 0]

        GdGT = np.array([G @ dG.T[0][i].reshape(1,2) for i in range(2)])

        # Kalman filter for gradient
        dx_predict = F @ dx + dF @ x
        dV_predict = F @ dV @ F.T + dF @ V @ F.T + F @ V @ dF.T + dG @ G.T + GdGT

        # Calculate de_t and dr_t as tensor(2,1,1)
        de_t = -H @ dx_predict
        dr_t = H @ dV_predict @ H.T


        # 2. Update
        # Kalman gain
        K = V_predict @ H.T / r_t

        # Update current state and covariance
        x = x_predict + K * e_t
        V = (np.eye(k) - K @ H) @ V_predict

        dK = (dV_predict @ H.T / r_t) - (V_predict @ H.T / r_t**2) @ dr_t
        dx = dx_predict + K @ de_t + dK * e_t
        dV = dV_predict - dK @ H @ V_predict - K @ H @ dV_predict
        
        # Append 
        dict['K1'].append(K[0, 0])
        dict['dK1'].append(dK[0, 0, 0])
        dict['K2'].append(K[1, 0])
        dict['dK2'].append(dK[0, 1, 0])
        dict['x1'].append(x[0, 0])
        dict['x2'].append(x[1, 0])
        dict['dx1'].append(dx[0, 0, 0])
        dict['dx2'].append(dx[0, 1, 0])
        dict['V11'].append(V[0, 0])
        dict['V12'].append(V[0, 1])
        dict['V21'].append(V[1, 0])
        dict['V22'].append(V[1, 1])
        dict['dV11'].append(dV[0, 0, 0])
        dict['dV12'].append(dV[0, 0, 1])
        dict['dV21'].append(dV[0, 1, 0])
        dict['dV22'].append(dV[0, 1, 1])

    return dict


# Compare analytical and autograd gradient
a_test = 0.1
b_test = 0.1

# Analytical gradient
analytical_grad = a_grad(a_test, b_test)

for t in range(6):
    grad_func = lambda a: a_grad(a, b=b_test)['x2'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)

    print(f"Autograd gradient t={t}:", result)
    print(f"Analytical gradient t={t}:", analytical_grad['dx2'][t])
    print("--------------------------------")

Autograd gradient t=0: 0.06070110904023112
Analytical gradient t=0: 0.06070110904023111
--------------------------------
Autograd gradient t=1: 0.03453919699603452
Analytical gradient t=1: 0.03453919699603451
--------------------------------
Autograd gradient t=2: 0.07177758330309453
Analytical gradient t=2: 0.07177758330309453
--------------------------------
Autograd gradient t=3: -0.08938646998851754
Analytical gradient t=3: -0.08938646998851753
--------------------------------
Autograd gradient t=4: -0.16890745483048453
Analytical gradient t=4: -0.1689074548304845
--------------------------------
Autograd gradient t=5: -0.11145823716143788
Analytical gradient t=5: -0.11145823716143786
--------------------------------


# Compare Auto and Analytical Gradient for a

In [120]:
def a_grad(a, b, N):
    k = 2
    sigma2 = 100

    # Simple operations that autograd can handle
    F = np.array([[a, 1.0], [0.0, 0.0]])
    G = np.array([[1.0], [-b]])
    H = np.array([[1.0, 0.0]])
    dF = np.array([
        [[1, 0], [0, 0]],  # First parameter (AR coefficient)
        [[0, 0], [0, 0]]   # Second parameter (MA coefficient)
        ])
    dF.reshape(2, 2, 2)
    dG = np.array([
        [[0], [0]],  # First parameter (AR coefficient)
        [[0], [-1]]   # Second parameter (MA coefficient)
        ])
    dG.reshape(2, 2, 1)

    g = np.array([1.0, a - b, a * (a - b)])

    C = np.array([
        sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2),
        a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b,
        a * (a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b)
    ])

    V = np.array([
        [C[0], -b * g[0]],
        [-b * g[0], b**2 * sigma2]
    ])

    dV = np.array([
        [[(2 * sigma2 * (a-b) * (1 - a*b)) / (1 - a**2)**2, 0],
        [0, 0]],
        [[2 * sigma2 * (b-a) / (1 - a**2), -1],
        [-1, 2 * b * sigma2]]
    ])

    # Initialize the x and dx
    x = np.zeros((k, 1))
    dx = np.zeros((2, 2, 1))

    # Store the value into dictionary
    dict = {
        'K1': [],
        'dK1': [],
        'K2': [],
        'dK2': [],
        'x1': [],
        'x2': [],
        'dx1': [],
        'dx2': [],
        'V11': [],
        'V12': [],
        'V21': [],
        'V22': [],
        'dV11': [],
        'dV12': [],
        'dV21': [],
        'dV22': []
    }

    # 
    for t in range(N):
        # 1. Predict
        # Predict one-step-ahead state predictive density of x_{t}
        x_predict = F @ x
        V_predict = F @ V @ F.T + G @ G.T

        # Compute forecast error and one-step-ahead predictive variance
        e_t = y[t] - (H @ x_predict)[0, 0]
        r_t = (H @ V_predict @ H.T)[0, 0]

        GdGT = np.array([G @ dG.T[0][i].reshape(1,2) for i in range(2)])

        # Kalman filter for gradient
        dx_predict = F @ dx + dF @ x
        dV_predict = F @ dV @ F.T + dF @ V @ F.T + F @ V @ dF.T + dG @ G.T + GdGT

        # Calculate de_t and dr_t as tensor(2,1,1)
        de_t = -H @ dx_predict
        dr_t = H @ dV_predict @ H.T


        # 2. Update
        # Kalman gain
        K = V_predict @ H.T / r_t

        # Update current state and covariance
        x = x_predict + K * e_t
        V = (np.eye(k) - K @ H) @ V_predict

        dK = (dV_predict @ H.T / r_t) - (V_predict @ H.T / r_t**2) @ dr_t
        dx = dx_predict + K @ de_t + dK * e_t
        dV = dV_predict - dK @ H @ V_predict - K @ H @ dV_predict
        
        # Append 
        dict['K1'].append(K[0, 0])
        dict['dK1'].append(dK[0, 0, 0]) # Analytic gradient
        dict['K2'].append(K[1, 0])
        dict['dK2'].append(dK[0, 1, 0])
        dict['x1'].append(x[0, 0])
        dict['x2'].append(x[1, 0])
        dict['dx1'].append(dx[0, 0, 0])
        dict['dx2'].append(dx[0, 1, 0])
        dict['V11'].append(V[0, 0])
        dict['V12'].append(V[0, 1])
        dict['V21'].append(V[1, 0])
        dict['V22'].append(V[1, 1])
        dict['dV11'].append(dV[0, 0, 0])
        dict['dV12'].append(dV[0, 0, 1])
        dict['dV21'].append(dV[0, 1, 0])
        dict['dV22'].append(dV[0, 1, 1])

    return dict


# Compare analytical and autograd gradient
a_test = 0.1
b_test = 0.2

# Dataframe to store the differences
df = pd.DataFrame(columns=['dK1', 'dK2', 'dx1', 'dx2', 'dV11', 'dV12', 'dV21', 'dV22'])

# Analytical gradient
analytical_grad = a_grad(a_test, b_test, N=20)

# threshold = 1e-5
roundup = 6

for t in range(20):

    # 1. Difference between dK1
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['K1'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dK1'] = round(result - analytical_grad['dK1'][t], roundup)

    # 2. Difference between dK2
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['K2'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dK2'] = round(result - analytical_grad['dK2'][t], roundup)

    # 3. Difference between dx1
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['x1'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dx1'] = round(result - analytical_grad['dx1'][t], roundup)

    # 4. Difference between dx2
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['x2'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dx2'] = round(result - analytical_grad['dx2'][t], roundup)

    # 5. Difference between dV11
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['V11'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dV11'] = round(result - analytical_grad['dV11'][t], roundup)

    # 6. Difference between dV12
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['V12'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dV12'] = round(result - analytical_grad['dV12'][t], roundup)

    # 7. Difference between dV21
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['V21'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dV21'] = round(result - analytical_grad['dV21'][t], roundup)

    # 8. Difference between dV22
    grad_func = lambda a: a_grad(a, b=b_test, N=t+1)['V22'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(a_test)
    df.loc[t, 'dV22'] = round(result - analytical_grad['dV22'][t], roundup)


# Print result
df

Unnamed: 0,dK1,dK2,dx1,dx2,dV11,dV12,dV21,dV22
0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0
3,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0
4,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0
5,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0
6,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
9,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0


# Compare Auto and Analytical Gradient for b

In [None]:
def b_grad(b, a, N):
    k = 2
    sigma2 = 100

    # Simple operations that autograd can handle
    F = np.array([[a, 1.0], [0.0, 0.0]])
    G = np.array([[1.0], [-b]])
    H = np.array([[1.0, 0.0]])
    dF = np.array([
        [[1, 0], [0, 0]],  # First parameter (AR coefficient)
        [[0, 0], [0, 0]]   # Second parameter (MA coefficient)
        ])
    dF.reshape(2, 2, 2)
    dG = np.array([
        [[0], [0]],  # First parameter (AR coefficient)
        [[0], [-1]]   # Second parameter (MA coefficient)
        ])
    dG.reshape(2, 2, 1)

    g = np.array([1.0, a - b, a * (a - b)])

    C = np.array([
        sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2),
        a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b,
        a * (a * (sigma2 * (1 - 2 * a * b + b**2) / (1 - a**2)) - sigma2 * b)
    ])

    V = np.array([
        [C[0], -b * g[0]],
        [-b * g[0], b**2 * sigma2]
    ])

    dV = np.array([
        [[(2 * sigma2 * (a-b) * (1 - a*b)) / (1 - a**2)**2, 0],
        [0, 0]],
        [[2 * sigma2 * (b-a) / (1 - a**2), -1],
        [-1, 2 * b * sigma2]]
    ])

    # Initialize the x and dx
    x = np.zeros((k, 1))
    dx = np.zeros((2, 2, 1))

    # Store the value into dictionary
    dict = {
        'K1': [],
        'dK1': [],
        'K2': [],
        'dK2': [],
        'x1': [],
        'x2': [],
        'dx1': [],
        'dx2': [],
        'V11': [],
        'V12': [],
        'V21': [],
        'V22': [],
        'dV11': [],
        'dV12': [],
        'dV21': [],
        'dV22': []
    }

    # 
    for t in range(N):
        # 1. Predict
        # Predict one-step-ahead state predictive density of x_{t}
        x_predict = F @ x
        V_predict = F @ V @ F.T + G @ G.T

        # Compute forecast error and one-step-ahead predictive variance
        e_t = y[t] - (H @ x_predict)[0, 0]
        r_t = (H @ V_predict @ H.T)[0, 0]

        GdGT = np.array([G @ dG.T[0][i].reshape(1,2) for i in range(2)])

        # Kalman filter for gradient
        dx_predict = F @ dx + dF @ x
        dV_predict = F @ dV @ F.T + dF @ V @ F.T + F @ V @ dF.T + dG @ G.T + GdGT

        # Calculate de_t and dr_t as tensor(2,1,1)
        de_t = -H @ dx_predict
        dr_t = H @ dV_predict @ H.T


        # 2. Update
        # Kalman gain
        K = V_predict @ H.T / r_t

        # Update current state and covariance
        x = x_predict + K * e_t
        V = (np.eye(k) - K @ H) @ V_predict

        dK = (dV_predict @ H.T / r_t) - (V_predict @ H.T / r_t**2) @ dr_t
        dx = dx_predict + K @ de_t + dK * e_t
        dV = dV_predict - dK @ H @ V_predict - K @ H @ dV_predict
        
        # Append 
        dict['K1'].append(K[0, 0])
        dict['dK1'].append(dK[1, 0, 0]) # Analytic gradient
        dict['K2'].append(K[1, 0])
        dict['dK2'].append(dK[1, 1, 0])
        dict['x1'].append(x[0, 0])
        dict['x2'].append(x[1, 0])
        dict['dx1'].append(dx[1, 0, 0])
        dict['dx2'].append(dx[1, 1, 0])
        dict['V11'].append(V[0, 0])
        dict['V12'].append(V[0, 1])
        dict['V21'].append(V[1, 0])
        dict['V22'].append(V[1, 1])
        dict['dV11'].append(dV[1, 0, 0])
        dict['dV12'].append(dV[1, 0, 1])
        dict['dV21'].append(dV[1, 1, 0])
        dict['dV22'].append(dV[1, 1, 1])

    return dict


# Compare analytical and autograd gradient
a_test = -0.1
b_test = 0.3

# Dataframe to store the differences
df = pd.DataFrame(columns=['dK1', 'dK2', 'dx1', 'dx2', 'dV11', 'dV12', 'dV21', 'dV22'])

# Analytical gradient
analytical_grad = b_grad(b_test, a_test, N=20)

# threshold = 1e-5
roundup = 6

for t in range(20):

    # 1. Difference between dK1
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['K1'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dK1'] = round(result - analytical_grad['dK1'][t], roundup)

    # 2. Difference between dK2
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['K2'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dK2'] = round(result - analytical_grad['dK2'][t], roundup)

    # 3. Difference between dx1
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['x1'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dx1'] = round(result - analytical_grad['dx1'][t], roundup)

    # 4. Difference between dx2
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['x2'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dx2'] = round(result - analytical_grad['dx2'][t], roundup)

    # 5. Difference between dV11
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['V11'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dV11'] = round(result - analytical_grad['dV11'][t], roundup)

    # 6. Difference between dV12
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['V12'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dV12'] = round(result - analytical_grad['dV12'][t], roundup)

    # 7. Difference between dV21
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['V21'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dV21'] = round(result - analytical_grad['dV21'][t], roundup)

    # 8. Difference between dV22
    grad_func = lambda b: b_grad(b, a=a_test, N=t+1)['V22'][t]
    auto_grad = grad(grad_func)
    result = auto_grad(b_test)
    df.loc[t, 'dV22'] = round(result - analytical_grad['dV22'][t], roundup)


# Print result
df

Unnamed: 0,dK1,dK2,dx1,dx2,dV11,dV12,dV21,dV22
0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0
1,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0
2,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0
4,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0
5,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0
6,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0
7,0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0
8,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0
9,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
