# HW5

The dataset is from https://www.kaggle.com/datasets/willianoliveiragibin/healthcare-insurance?resource=download

In [1]:
import pymc as pm
import numpy as np
from scipy import stats
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt
from scipy.linalg import cholesky

## Q1

In [11]:
# Load the insurance data. Only its shape (rows, columns) is used below, to
# size the simulated data set.
data = pd.read_csv('insurance.csv')

# Seeded generator so the simulated covariance and observations are the same
# after every Restart & Run All.
rng = np.random.default_rng(42)

# Dimensionality of the multivariate normal (one dimension per CSV column)
p = data.shape[1]

# Scale matrix for the inverse-Wishart distribution
Psi = np.eye(p)

# Draw the "true" covariance matrix from an inverse-Wishart distribution
a_cov = stats.invwishart(df=p+2, scale=Psi).rvs(1, random_state=rng)

# Number of simulated data points (one per CSV row)
n = len(data)

# Simulate n zero-mean observations with covariance a_cov
y = stats.multivariate_normal(mean=np.zeros(p), cov=a_cov).rvs(size=n, random_state=rng)

with pm.Model() as MNV_LKJ:
    # LKJ prior on the Cholesky factor of the covariance matrix. sd_dist must
    # describe one standard deviation per dimension, hence shape=p (not a
    # hard-coded 2).
    packed_L = pm.LKJCholeskyCov("packed_L", n=p, eta=2.0,
                                 sd_dist=pm.Exponential.dist(1.0, shape=p), compute_corr=False)
    L = pm.expand_packed_triangular(p, packed_L)

    # Full covariance matrix rebuilt from its Cholesky factor.
    Sigma = pm.Deterministic('Sigma', L.dot(L.T))

    # Standard-normal prior on the mean vector, with an explicit length-p mean.
    mu = pm.MvNormal('mu', mu=np.zeros(p), cov=np.eye(p), shape=p)

    # Likelihood, covariance-parameterized. The Python variable is named
    # y_obs so the simulated array `y` is not overwritten and this block can be
    # re-run on its own; the shape is inferred from the observed data.
    # TODO: don't use a covariance matrix parameterization here -- figure out
    # how to parameterize this with a Cholesky factor (chol=L) to improve
    # computational efficiency.
    y_obs = pm.MvNormal('y', mu=mu, cov=Sigma, observed=y)

with MNV_LKJ:
    # Fixed seed so the posterior draws are reproducible.
    idata = pm.sample(random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [packed_L, mu]


  self.vm()
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 289 seconds.


## Q2 & Q3

In [13]:
# Seeded generator so the simulated covariance and observations are the same
# after every Restart & Run All.
rng = np.random.default_rng(42)

p = data.shape[1]  # Dimensionality of the multivariate normal
Psi = np.eye(p)  # Scale matrix for the inverse-Wishart distribution
# Inverse-Wishart sample used as the "true" covariance matrix
a_cov = stats.invwishart(df=p+2, scale=Psi).rvs(1, random_state=rng)

n = len(data)  # Number of data points
# Simulated zero-mean data points with covariance a_cov
y = stats.multivariate_normal(mean=np.zeros(p), cov=a_cov).rvs(size=n, random_state=rng)

# Define the PyMC model using the LKJCholeskyCov distribution
with pm.Model() as MNV_LKJ:
    # LKJ prior on the (packed) Cholesky factor of the covariance matrix,
    # with one Exponential(1) standard deviation per dimension
    packed_L = pm.LKJCholeskyCov("packed_L", n=p, eta=2.0,
                                 sd_dist=pm.Exponential.dist(1.0, shape=p), compute_corr=False)
    L = pm.expand_packed_triangular(p, packed_L)

    # Covariance matrix rebuilt from L; tracked for inspection only, the
    # likelihood below uses L directly
    Sigma = pm.Deterministic('Sigma', L.dot(L.T))

    # Prior for the mean vector.
    # NOTE(review): a prior covariance of 1e-6 * I is not a numerical-stability
    # jitter -- it is a very informative prior (standard deviation 1e-3) that
    # pins mu close to zero. Widen it (e.g. np.eye(p)) if a weakly informative
    # prior is intended.
    mu = pm.MvNormal('mu', mu=np.zeros(p), cov=np.eye(p)*1e-6, shape=p)

    # Likelihood of the observed data, parameterized by the Cholesky factor
    y_obs = pm.MvNormal('y_obs', mu=mu, chol=L, observed=y)

# Sampling from the model (fixed seed so the draws are reproducible)
with MNV_LKJ:
    trace = pm.sample(1000, random_seed=42)

# The trace object now contains the samples for the posterior distribution

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [packed_L, mu]


  self.vm()
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 331 seconds.
