Demonstration of GPR for PIRI data with unit trends

In [100]:
# load packages
import torch
import numpy as np
import pandas as pd
import gpytorch
from scipy.stats import norm
from typing import Optional, Tuple
from matplotlib import pyplot as plt
from gpytorch.means import LinearMean
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.kernels import ScaleKernel, RBFKernel
from statsmodels.stats.stattools import durbin_watson

Implement constant mean module and mask mean module

In [101]:
class ConstantVectorMean(gpytorch.means.mean.Mean):
    def __init__(self, d=1, prior=None, batch_shape=torch.Size(), **kwargs):
        super().__init__()
        self.batch_shape = batch_shape
        self.register_parameter(name="constantvector",\
                 parameter=torch.nn.Parameter(torch.zeros(*batch_shape, d)))
        if prior is not None:
            self.register_prior("mean_prior", prior, "constantvector")

    def forward(self, input):
        return self.constantvector[input.int().reshape((-1,)).tolist()]
    
class MaskMean(gpytorch.means.mean.Mean):
    def __init__(
        self,
        base_mean: gpytorch.means.mean.Mean,
        active_dims: Optional[Tuple[int, ...]] = None,
        **kwargs,
    ):
        super().__init__()
        if active_dims is not None and not torch.is_tensor(active_dims):
            active_dims = torch.tensor(active_dims, dtype=torch.long)
        self.active_dims = active_dims
        self.base_mean = base_mean
    
    def forward(self, x, **params):
        return self.base_mean.forward(x.index_select(-1, self.active_dims), **params)

load data

In [169]:
def load_PIRI_data():
    # read data
    data = pd.read_csv("hb_data_complete.csv", index_col=[0])

    # all zero PIRI for new zealand and netherland
    data = data.loc[~data['country'].isin(['N-ZEAL','NETHERL'])]

    countries = sorted(data.country.unique())
    years = data.year.unique()
    n = len(countries)
    m = len(years)

    # build data
    country_dict = dict(zip(countries, range(n)))
    year_dict = dict(zip(years, range(m)))

    # x is:
    # 1: year number
    # 2: country id
    # 3: AIShame (treatment indicator)
    # 4: cat_rat
    # 5: ccpr_rat
    # 6: democratic
    # 7: log(gdppc)
    # 8: log(pop)
    # 9: Civilwar2
    # 10: War
    x = torch.zeros(data.shape[0], 10)
    x[:,0] = torch.as_tensor(list(map(year_dict.get, data.year)))
    x[:,1] = torch.as_tensor(list(map(country_dict.get, data.country)))
    x[:,2] = torch.as_tensor(data.AIShame.to_numpy())
    x[:,3] = torch.as_tensor(data.cat_rat.to_numpy())
    x[:,4] = torch.as_tensor(data.ccpr_rat.to_numpy())
    x[:,5] = torch.as_tensor(data.democratic.to_numpy())
    x[:,6] = torch.as_tensor(data.log_gdppc.to_numpy())
    x[:,7] = torch.as_tensor(data.log_pop.to_numpy())
    x[:,8] = torch.as_tensor(data.Civilwar2.to_numpy())
    x[:,9] = torch.as_tensor(data.War.to_numpy())
    # x[:,10] = torch.as_tensor(data.PIRI.to_numpy())
    y = torch.as_tensor(data.PIRILead1.to_numpy()).double()

    unit_means = torch.zeros(n,)
    for i in range(n):
        unit_means[i] = y[x[:,1]==i].mean()

    return x.double(), y.double(), unit_means.double(), data, countries, years

train_x, train_y, unit_means, data, countries, years = load_PIRI_data()

Build GPR model with unit trends

In [160]:
# model specification: PIRI gp model with unit trends
# PIRI ~ AIShame + u_i(t) + cat_rat + ccpr_rat 
#            + democratic + log(gdppc) + log(pop) 
#            + Civilwar2 + War
# u_i(t) ~ GP(b_i, K_t)

class GPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = MaskMean(active_dims=1, \
                base_mean=ConstantVectorMean(d=train_x[:,1].unique().size()[0]))
        # linear mean for continuous covariates
        self.x_mean_module = MaskMean(active_dims=[2,3,4,5,6,7,8,9], base_mean=LinearMean(input_size=8, bias=False))
        # year kernel * country kernel
        self.unit_covar_module = ScaleKernel(RBFKernel(active_dims=0)*RBFKernel(active_dims=1))
        self.x_covar_module = torch.nn.ModuleList([ScaleKernel(RBFKernel(\
            active_dims=(i))) for i in [6,7]])
        self.binary_covar_module = torch.nn.ModuleList([ScaleKernel(RBFKernel(\
            active_dims=(i))) for i in [3,4,5,8,9]])
        self.effect_covar_module = ScaleKernel(RBFKernel(active_dims=2))

    def forward(self, x):
        mean_x = self.x_mean_module(x) + self.mean_module(x)
        unit_covar_x = self.unit_covar_module(x)
        effect_covar_x = self.effect_covar_module(x)
        covar_x = unit_covar_x + effect_covar_x
        for i, _ in enumerate(self.x_covar_module):
            covar_x += self.x_covar_module[i](x)
        for i, _ in enumerate(self.binary_covar_module):
            covar_x += self.binary_covar_module[i](x)
        
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

In [196]:
import statsmodels.formula.api as sm

lm = sm.ols('PIRILead1 ~ AIShame  + cat_rat + ccpr_rat \
            + democratic + log_gdppc + log_pop \
            + Civilwar2 + War', data).fit()
print(lm.summary())

coefs = lm.params.to_dict()
covariate_names = ["AIShame" ,"cat_rat" , "ccpr_rat",
           "democratic",  "log_gdppc", "log_pop",
            "Civilwar2", "War"]
x_weights = list(map(coefs.get, covariate_names))

                            OLS Regression Results                            
Dep. Variable:              PIRILead1   R-squared:                       0.465
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     231.0
Date:                Mon, 21 Aug 2023   Prob (F-statistic):          2.60e-282
Time:                        17:51:06   Log-Likelihood:                -4118.3
No. Observations:                2137   AIC:                             8255.
Df Residuals:                    2128   BIC:                             8306.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1340      0.453     -0.296      0.7

initialize model

In [203]:
likelihood = GaussianLikelihood()
model = GPModel(train_x, train_y, likelihood).double()

# initialize model parameters
hypers = {
    'mean_module.base_mean.constantvector': unit_means,
    'x_mean_module.base_mean.weights': torch.tensor(x_weights),
    'likelihood.noise_covar.noise': torch.tensor(0.25),
    'unit_covar_module.base_kernel.kernels.0.lengthscale': torch.tensor(4),
    'unit_covar_module.base_kernel.kernels.1.lengthscale': torch.tensor(0.01),
    'unit_covar_module.outputscale': torch.tensor(1),
    'x_covar_module.0.outputscale': torch.tensor(0.25),
    'x_covar_module.1.outputscale': torch.tensor(0.25),
    'binary_covar_module.0.base_kernel.lengthscale': torch.tensor(0.01),
    'binary_covar_module.1.base_kernel.lengthscale': torch.tensor(0.01),
    'binary_covar_module.2.base_kernel.lengthscale': torch.tensor(0.01),
    'binary_covar_module.3.base_kernel.lengthscale': torch.tensor(0.01),
    'binary_covar_module.4.base_kernel.lengthscale': torch.tensor(0.01),
    'binary_covar_module.0.outputscale': torch.tensor(0.25),
    'binary_covar_module.1.outputscale': torch.tensor(0.25),
    'binary_covar_module.2.outputscale': torch.tensor(0.25),
    'binary_covar_module.3.outputscale': torch.tensor(0.25),
    'binary_covar_module.4.outputscale': torch.tensor(0.25),
    'effect_covar_module.base_kernel.lengthscale': torch.tensor(0.01),
    'effect_covar_module.outputscale': torch.tensor(0.25)
}    

model.initialize(**hypers)

GPModel(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (raw_noise_constraint): GreaterThan(1.000E-04)
    )
  )
  (mean_module): MaskMean(
    (base_mean): ConstantVectorMean()
  )
  (x_mean_module): MaskMean(
    (base_mean): LinearMean()
  )
  (unit_covar_module): ScaleKernel(
    (base_kernel): ProductKernel(
      (kernels): ModuleList(
        (0): RBFKernel(
          (raw_lengthscale_constraint): Positive()
        )
        (1): RBFKernel(
          (raw_lengthscale_constraint): Positive()
        )
      )
    )
    (raw_outputscale_constraint): Positive()
  )
  (x_covar_module): ModuleList(
    (0): ScaleKernel(
      (base_kernel): RBFKernel(
        (raw_lengthscale_constraint): Positive()
      )
      (raw_outputscale_constraint): Positive()
    )
    (1): ScaleKernel(
      (base_kernel): RBFKernel(
        (raw_lengthscale_constraint): Positive()
      )
      (raw_outputscale_constraint): Positive()
    )
  )
  (binary_covar_module): 

train model by optimizing hypers

In [204]:
# train model
model.train()
likelihood.train()

torch.manual_seed(12345)

# freeze length scale in the country component in unit covar
# freeze constant unit means
all_params = set(model.parameters())
final_params = list(all_params - \
            {model.unit_covar_module.base_kernel.kernels[1].raw_lengthscale, \
        #    model.unit_covar_module.raw_outputscale, \
        #    model.mean_module.base_mean.constantvector, \
        #   model.x_covar_module[0].raw_outputscale,
        #   model.x_covar_module[1].raw_outputscale,
            model.binary_covar_module[0].base_kernel.raw_lengthscale,
            model.binary_covar_module[1].base_kernel.raw_lengthscale,
            model.binary_covar_module[2].base_kernel.raw_lengthscale,
            model.binary_covar_module[3].base_kernel.raw_lengthscale,
            model.binary_covar_module[4].base_kernel.raw_lengthscale,
            model.effect_covar_module.base_kernel.raw_lengthscale})
        #    model.effect_covar_module.raw_outputscale})
optimizer = torch.optim.Adam(final_params, lr=0.1)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

training_iter = 5
for i in range(training_iter):
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(train_x)
    # Calc loss and backprop gradients
    loss = -mll(output, train_y)
    loss.backward()
    print('Iter %d/%d - Loss: %.3f '  % (
        i + 1, training_iter, loss.item()
    ))
    optimizer.step()

torch.save(model.state_dict(), "PIRI_GPR_model.pth")

Iter 1/5 - Loss: 2.175 
Iter 2/5 - Loss: 2.073 
Iter 3/5 - Loss: 1.987 
Iter 4/5 - Loss: 1.919 
Iter 5/5 - Loss: 1.842 


generate posterior of PIRI effects

In [209]:
model.load_state_dict(torch.load('PIRI_GPR_model.pth'))

model.train()
likelihood.train()

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    out = likelihood(model(train_x))
    mu_f = out.mean
    V = out.covariance_matrix
    L = torch.linalg.cholesky(V, upper=False)

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    model.unit_covar_module.outputscale = 0
    for i,_ in enumerate(model.x_covar_module):
        model.x_covar_module[i].outputscale = 0
    for i,_ in enumerate(model.binary_covar_module):
        model.binary_covar_module[i].outputscale = 0
    effect_covar = model(train_x).covariance_matrix

# get posterior effect mean
alpha = torch.linalg.solve(L.t(),torch.linalg.solve(L,train_y-mu_f))
tmp = torch.linalg.solve(L, effect_covar)
post_effect_mean =   mu_f + effect_covar @ alpha
# get posterior effect covariance
post_effect_covar = effect_covar - tmp.t() @ tmp

effect = post_effect_mean[train_x[:,2]==1].mean() - post_effect_mean[train_x[:,2]==0].mean()
effect_std = post_effect_covar.diag().mean().sqrt()
BIC = (2+4+6+1)*torch.log(torch.tensor(train_x.size()[0])) + 2*loss*train_x.size()[0]
# print("effect: {:0.3f} +- {:0.3f}\n".format(effect, effect_std))
print("model evidence: {:0.3f} \n".format(-loss*train_x.size()[0]))
print("BIC: {:0.3f} \n".format(BIC))

model evidence: 3217.434 

BIC: -6335.196 



Perform Durbin Watson tests for autocorrelation

In [206]:
# get unit trend wo AIShame
model.load_state_dict(torch.load('PIRI_GPR_model.pth'))
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    model.effect_covar_module.outputscale = 0
    unit_covar = likelihood(model(train_x)).covariance_matrix

# get posterior unit trend mean
alpha = torch.linalg.solve(L.t(),torch.linalg.solve(L,train_y-mu_f))
tmp = torch.linalg.solve(L, unit_covar)
post_unit_mean = mu_f + unit_covar @ alpha + post_effect_mean

# DW-test for sample size = 18 and 9 regressors.
dL = 0.32
dU = 2.87
n = len(countries)
DW_results = np.zeros((n,))
for i in range(n):
    mask = data.country==countries[i]
    mask = mask.to_list()
    res = train_y[mask] - post_unit_mean[mask]
    DW_results[i] = durbin_watson(res.detach().numpy())

print("{} out of {} residuals are positively correlated.\n".format(np.sum(DW_results<=dL),n))
print("{} out of {} residuals are negatively correlated.\n".format(np.sum(DW_results>=dU),n))
print("{} out of {} residuals are not correlated.\n".format(np.sum((DW_results>dL) & (DW_results<dU)),n))
    

130 out of 138 residuals are positively correlated.

0 out of 138 residuals are negatively correlated.

8 out of 138 residuals are not correlated.



plot fitted mean trend and CI

In [33]:
model.load_state_dict(torch.load('PIRI_GPR_model.pth'))

model.eval()
likelihood.eval()

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    out = likelihood(model(train_x))
    mu_f = out.mean.numpy()
    lower, upper = out.confidence_region()

results = pd.DataFrame({"gpr_mean":mu_f})
results['true_y'] = train_y
results['gpr_lwr'] = lower
results['gpr_upr'] = upper
results['year'] = years[train_x[:,0].numpy().astype(int)]
results['country'] = [countries[i] for i in train_x[:,1].numpy().astype(int)]
print(results.head())
results.to_csv("./results/PIRI_fitted_gpr.csv",index=False) #save to file



   gpr_mean  true_y   gpr_lwr   gpr_upr  year country
0  0.251448     0.0 -2.366495  2.869390  1983     USA
1  0.313593     0.0 -2.286459  2.913645  1984     USA
2  0.768128     1.0 -1.815695  3.351952  1985     USA
3  0.465908     1.0 -2.100868  3.032684  1986     USA
4  0.868379     0.0 -1.686243  3.423002  1987     USA


In [34]:
results[results.country=="CHINA"]

Unnamed: 0,gpr_mean,true_y,gpr_lwr,gpr_upr,year,country
1799,5.700728,6.0,3.082248,8.319207,1983,CHINA
1800,5.467938,6.0,2.881896,8.05398,1984,CHINA
1801,4.910411,4.0,2.352177,7.468645,1985,CHINA
1802,4.910006,5.0,2.37448,7.445531,1986,CHINA
1803,5.491105,4.0,2.970757,8.011453,1987,CHINA
1804,5.516754,6.0,3.010289,8.023218,1988,CHINA
1805,5.958619,8.0,3.464083,8.453156,1989,CHINA
1806,5.796347,6.0,3.313799,8.278894,1990,CHINA
1807,5.606905,4.0,3.134507,8.079302,1991,CHINA
1808,5.648193,5.0,3.178968,8.117417,1992,CHINA


use autogradient to generate posterior variance of marginal effects in PIRI by small batches

In [207]:
model.load_state_dict(torch.load('PIRI_GPR_model.pth'))

model.eval()
likelihood.eval()

df_std = np.zeros((train_x.size(0),train_x.size(1)))
x_grad = np.zeros((train_x.size(0),train_x.size(1)))

# small batches of size 100
for i in range(train_x.size(0)//100):
    with gpytorch.settings.fast_pred_var():
        test_x = train_x[(i*100):(i*100+100)].clone().detach().requires_grad_(True)
        observed_pred = likelihood(model(test_x))
        dydtest_x = torch.autograd.grad(observed_pred.mean.sum(), test_x, retain_graph=True)[0]
        x_grad[(i*100):(i*100+100)] = dydtest_x
        loss = mll(observed_pred, train_y[(i*100):(i*100+100)])

        n_samples = 25
        sampled_pred = observed_pred.rsample(torch.Size([n_samples]))
        sampled_dydtest_x = torch.stack([torch.autograd.grad(pred.sum(), test_x, retain_graph=True)[0] for pred in sampled_pred])
        df_std[(i*100):(i*100+100)] = sampled_dydtest_x.std(0)
        
# last 100 rows
with gpytorch.settings.fast_pred_var():
    test_x = train_x[(100*i+100):].clone().detach().requires_grad_(True)
    observed_pred = likelihood(model(test_x))
    dydtest_x = torch.autograd.grad(observed_pred.mean.sum(), test_x, retain_graph=True)[0]
    x_grad[(100*i+100):] = dydtest_x
    loss = mll(observed_pred, train_y[(100*i+100):])

    sampled_pred = observed_pred.rsample(torch.Size([n_samples]))
    sampled_dydtest_x = torch.stack([torch.autograd.grad(pred.sum(), test_x, retain_graph=True)[0] for pred in sampled_pred])
    df_std[(100*i+100):] = sampled_dydtest_x.std(0)
    

accesse marginal effects of regressors

In [208]:
est_std = (df_std).mean(axis=0).round(decimals=6)
results = pd.DataFrame({"x": covariate_names, \
                        'est_mean': x_grad.mean(axis=0)[2:10],
                        'est_std': est_std[2:10]})
results["t"] = results['est_mean'].values/results['est_std'].values
results["pvalue"] = 1 - norm.cdf(np.abs(results["t"].values))
print(results)

            x  est_mean   est_std         t    pvalue
0     AIShame  0.455517  0.000000       inf  0.000000
1     cat_rat -0.177745  0.000000      -inf  0.000000
2    ccpr_rat -0.125010  0.000000      -inf  0.000000
3  democratic -1.027804  0.000000      -inf  0.000000
4   log_gdppc -0.016395  0.064941 -0.252463  0.400342
5     log_pop  0.042709  0.043206  0.988489  0.161457
6   Civilwar2  1.439676  0.000000       inf  0.000000
7         War  0.584848  0.000000       inf  0.000000


  """
