# GP Regression on Application Project Data

---
Cell for importing packages:

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
# !{sys.executable} -m pip install pyro-ppl

Collecting pyro-ppl
  Downloading pyro_ppl-1.3.1-py3-none-any.whl (520 kB)
[K     |████████████████████████████████| 520 kB 704 kB/s eta 0:00:01
Collecting tqdm>=4.36
  Downloading tqdm-4.45.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 17.2 MB/s eta 0:00:01
[?25hCollecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.2.0-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 6.1 MB/s  eta 0:00:01
[?25hCollecting pyro-api>=0.1.1
  Downloading pyro_api-0.1.1-py3-none-any.whl (8.2 kB)
Installing collected packages: tqdm, opt-einsum, pyro-api, pyro-ppl
Successfully installed opt-einsum-3.2.0 pyro-api-0.1.1 pyro-ppl-1.3.1 tqdm-4.45.0


---
Imports cell:

In [1]:
# 441975, l.teixeira@wustl.edu, Teixeira, Lucas
# 443896, rickynoll@wustl.edu, Noll, Ricky
# XXXXXX, XXXXX@wustl.edu, Kowsari, Daria

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

---
## Read Training Input File:

In [2]:
# Read training input file
train = pd.read_csv(os.path.join(os.getcwd(),'train.csv'))
# Clip off labels from features and reset index
train_x = train.loc[:, 'ID':'Soil_Type'].set_index('ID')
# Extract labels into their own series
train_y = train.loc[:, ['ID','Horizontal_Distance_To_Fire_Points']].set_index('ID').squeeze()

# Do the same for the test file
test = pd.read_csv(os.path.join(os.getcwd(),'test.csv'))
test_x = test.set_index('ID')

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.10, random_state=651)

print("X_train: ", X_train.shape)
print("X_val  : ", X_val.shape)
print("y_train: ", y_train.shape)
print("y_val  : ", y_val.shape)
print("test_x : ", test_x.shape)

X_train:  (6694, 10)
X_val  :  (744, 10)
y_train:  (6694,)
y_val  :  (744,)
test_x :  (11157, 10)


Now we have a training set, a validation set that we can compute metrics and stuff on, and our real test set for which we don't have labels but that we'll make our actual predictions on.

In [3]:
# See the first seven training examples
X_train[:7]

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Soil_Type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5291270563394,3336,338,15,67,14,1370,187,218,170,8772
532143351577,3173,111,18,60,15,960,248,218,92,7202
1293730187410,3266,86,22,255,62,1585,246,196,68,7757
4462240315010,3342,101,8,277,39,1682,233,230,129,7201
1363789063680,3214,208,16,30,8,866,207,253,177,7202
2189802674010,3260,277,11,366,42,751,190,242,191,7202
8631733444322,3228,254,16,30,10,1055,181,249,204,7202


In [4]:
# See labels of training samples
y_train.squeeze()

ID
5291270563394    1620
532143351577     2207
1293730187410    1415
4462240315010    3196
1363789063680    2072
                 ... 
4599783414358    1415
1593149571047    1480
811981437170      437
8070934301674    4261
3024839830307    1298
Name: Horizontal_Distance_To_Fire_Points, Length: 6694, dtype: int64

---
## Let's Try a Bayesian Model in GPyTorch:

In [5]:
import math
import torch
import gpytorch
import pyro
from pyro.infer.mcmc import NUTS, MCMC
from matplotlib import pyplot as plt
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

%matplotlib inline
%load_ext autoreload
%autoreload 2

### Construct our first, basic GP model:

First we need to turn our training data into `torch.Tensor`s

In [6]:
train_x_tensor = torch.tensor(X_train.to_numpy())
train_x_tensor

tensor([[3336,  338,   15,  ...,  218,  170, 8772],
        [3173,  111,   18,  ...,  218,   92, 7202],
        [3266,   86,   22,  ...,  196,   68, 7757],
        ...,
        [3312,  301,   12,  ...,  235,  189, 7757],
        [3365,  164,   10,  ...,  243,  145, 7756],
        [3259,  139,   20,  ...,  229,  103, 7757]])

In [7]:
train_y_tensor = torch.tensor(y_train.to_numpy())
train_y_tensor

tensor([1620, 2207, 1415,  ...,  437, 4261, 1298])

Now we define a boilerplate class for Exact GP Inference with standard constant mean function and RBF kernel.

In [8]:
# We will use the simplest form of GP model, exact inference
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# initialize likelihood and model
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x_tensor, train_y_tensor, likelihood)

In [9]:
dir(gpytorch.priors)

['GammaPrior',
 'HorseshoePrior',
 'LKJCholeskyFactorPrior',
 'LKJCovariancePrior',
 'LKJPrior',
 'LogNormalPrior',
 'MultivariateNormalPrior',
 'NormalPrior',
 'Prior',
 'SmoothedBoxPrior',
 'UniformPrior',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'horseshoe_prior',
 'lkj_prior',
 'prior',
 'smoothed_box_prior',
 'torch_priors',
 'utils']

With the model now defined we train by writing our own training loop.

In [10]:
from gpytorch.priors import GammaPrior
GammaPrior(50.,0.7710).mean

tensor(64.8508)

In [11]:
1./1.297

0.7710100231303008

In [13]:
smoke_test = ('CI' in os.environ)
num_samples = 2 if smoke_test else 100
warmup_steps = 2 if smoke_test else 200


from gpytorch.priors import LogNormalPrior, NormalPrior, UniformPrior, GammaPrior
# Use a positive constraint instead of usual GreaterThan(1e-4) so that LogNormal has support over full range.
likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.Positive())
model = ExactGPModel(train_x_tensor, train_y_tensor, likelihood)

likelihood.register_prior("noise_prior", GammaPrior(7.5, 1.), "noise")
model.mean_module.register_prior("mean_prior", GammaPrior(24., 1.), "constant")
model.covar_module.base_kernel.register_prior("lengthscale_prior", GammaPrior(64., 1.), "lengthscale")
# model.covar_module.base_kernel.register_prior("period_length_prior", UniformPrior(0.05, 2.5), "period_length")
model.covar_module.register_prior("outputscale_prior", GammaPrior(11.5, 1.), "outputscale")


mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

def pyro_model(x, y):
    model.pyro_sample_from_prior()
    output = model(x)
    loss = mll.pyro_factor(output, y)
    return y

nuts_kernel = NUTS(pyro_model, adapt_step_size=True)
mcmc_run = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps, disable_progbar=smoke_test)
mcmc_run.run(train_x_tensor, train_y_tensor)

Warmup:   0%|          | 0/300 [00:00, ?it/s]

ValueError: Invalid input value for prior lengthscale_prior. Error:
The value argument must be within the support
                                  Trace Shapes:      
                                   Param Sites:      
                                  Sample Sites:      
                    likelihood.noise_prior dist   1 |
                                          value   1 |
                    mean_module.mean_prior dist   1 |
                                          value   1 |
            covar_module.outputscale_prior dist     |
                                          value     |
covar_module.base_kernel.lengthscale_prior dist 1 1 |
                                          value 1 1 |

Now that we have a trained model let's take a look at our hyperparameter values

In [10]:
for param_name, param in model.named_parameters():
    print(f'Parameter name: {param_name:42} value = {param.item()}')

Parameter name: likelihood.noise_covar.raw_noise           value = 7.398657321929932
Parameter name: mean_module.constant                       value = 23.66961097717285
Parameter name: covar_module.raw_outputscale               value = 11.340603828430176
Parameter name: covar_module.base_kernel.raw_lengthscale   value = 64.85105895996094


Now we'll use our validation set to compute regression metrics to see how well we did.

First we make our validation points tensor:

In [11]:
val_x_tensor = torch.tensor(X_val.to_numpy())
val_x_tensor

tensor([[3209,   61,   23,  ...,  186,   75, 7756],
        [3352,  315,   14,  ...,  228,  186, 7202],
        [3096,  348,    7,  ...,  229,  160, 7202],
        ...,
        [3245,  100,   19,  ...,  211,   84, 7757],
        [3257,  311,   11,  ...,  232,  181, 7202],
        [3329,  318,   24,  ...,  210,  197, 8772]])

Now we set our model and likelihood to prediction mode and do constant time predictions with `gpytorch.settings.fast_pred_var()`

In [12]:
# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

# Test points are regularly spaced along [0,1]
# Make predictions by feeding model through likelihood
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    validation_preds = likelihood(model(val_x_tensor))

In [13]:
validation_preds

MultivariateNormal(loc: torch.Size([744]))

In [18]:
validation_preds.mean

tensor([1465.9532,  860.9825, 1438.9178, 1154.0012, 3692.2129, 1299.8569,
        2198.1379,  994.2431, 1612.0750, 3860.8389, 1165.0715,  732.1728,
        1537.8116, 3333.6086, 2121.5017, 2492.5940,  696.3145, 1560.0292,
        3598.2771,  810.9978,  766.2330, 1004.0098,  946.8558, 1552.0851,
        4883.8022, 1145.4208, 3529.1150,  728.3445, 1158.7191, 4239.2251,
        1743.7670, 2315.5464,  222.9875, 1242.2836,  960.6544,  917.1423,
         620.7706, 1109.5702, 1600.3633, 1706.0675, 1771.5663,  236.9968,
        2712.0554,  129.3223,  932.2488, 1649.0775, 3233.6990,  680.8455,
        1271.6184, 1165.8961, 2823.7009,  524.1947,  903.3549,  783.5823,
        2015.2096, 2009.4250, 1390.5049,  574.4168, 2493.7261, 1321.2152,
        1514.9766, 1892.7499, 2101.7476, 1220.7474, 1627.4753,  879.2745,
        2279.4578, 2039.5258,  673.0369,  944.4106,  885.3189,  826.8957,
        1340.9524, 3128.1243, 3951.5920, 1440.6068, 1003.4258, 3380.4290,
        1250.3257,  885.2352, 1156.590

In [19]:
# Use the posterior mean as our prediction
print("--- Regression Metrics ---")
mse = mean_squared_error(y_val, validation_preds.mean)
mae = mean_absolute_error(y_val, validation_preds.mean)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, validation_preds.mean)

print(f"\nMSE : {mse}")
print(f"\nRMSE: {rmse}")
print(f"\nMAE : {mae}")
print(f"\nR^2  : {r2}")

--- Regression Metrics ---

MSE : 300955.77256969217

RMSE: 548.5943606798124

MAE : 315.6027186198901

R^2  : 0.8450189239802666


### Make real predictions and pipe output to file

In [32]:
test_x_tensor = torch.tensor(test_x.to_numpy())
test_x_tensor

tensor([[3229,   98,   22,  ...,  204,   72, 7757],
        [3324,   92,   18,  ...,  209,   86, 8776],
        [3433,  162,   11,  ...,  243,  143, 8771],
        ...,
        [3446,  274,    4,  ...,  240,  169, 8703],
        [3372,  281,   11,  ...,  241,  192, 7755],
        [3132,  129,    9,  ...,  235,  130, 7756]])

In [34]:
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    validation_preds = likelihood(model(test_x_tensor))
    submission_output = pd.DataFrame(data={'ID': test_x.index, 'Horizontal_Distance_To_Fire_Points': validation_preds.mean})
    submission_output.to_csv(os.path.join(os.getcwd(),'simple_gp_predictions.csv'), index=False)
submission_output