# Application of Variational Gaussian Process on Housing Price Data

## Preparation and a quick look at the data
### Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import sys
sys.path.append("..")
from vgp.vgp_sqExp import vgp_sqExp
from datetime import datetime
import math

### Load data
Since only the longitude, latitude, and house price are considered, the other variables can be excluded.

In [3]:
# Only the coordinates and the sale price are used in this analysis, so parse
# just those columns instead of loading the whole file and discarding the rest.
# This keeps memory down and skips type inference on columns that are dropped anyway.
coord_price_cols = ['Lng', 'Lat', 'price']
housingDat = pd.read_csv("data/new.csv", encoding="latin-1", usecols=coord_price_cols)
# `usecols` preserves the file's column order, so re-select to pin the order used below.
housingDat = housingDat[coord_price_cols]

  interactivity=interactivity, compiler=compiler, result=result)


### Shape of the data

In [4]:
# Display the (rows, columns) shape of the trimmed frame.
housingDat.shape

(318851, 3)

### First five rows of the data

In [5]:
# Preview the first five rows of the trimmed frame.
housingDat.head()

Unnamed: 0,Lng,Lat,price
0,116.475489,40.01952,31680
1,116.453917,39.881534,43436
2,116.561978,39.877145,52021
3,116.43801,40.076114,22202
4,116.428392,39.886229,48396


### Check for missingness

In [6]:
# Count missing values per column with the vectorised pandas idiom rather than
# a Python-level `sum` over every element of every column.
housingDat.isna().sum()

Lng      0
Lat      0
price    0
dtype: int64

It seems there are no missing observations for the variables considered.

## Modelling

### Split the data by training and test set

In [7]:
from sklearn.model_selection import train_test_split
# Features are the two coordinates; the target is the price.
feature_cols = ['Lng', 'Lat']
X = housingDat.loc[:, feature_cols].to_numpy()
y = housingDat.loc[:, 'price'].to_numpy()

# Hold out 10% of the rows as the test set, with a fixed seed for reproducibility.
split = train_test_split(X, y, test_size=0.10, random_state=1)
X_train, X_test, y_train, y_test = split

### Get RMSE based on the mean response (For benchmark)

In [8]:
# Benchmark predictor: the training-set mean price for every test observation.
m = y_train.mean()
residuals = y_test - m
# RMSE = ||residuals||_2 / sqrt(number of test observations)
np.linalg.norm(residuals) / math.sqrt(len(y_test))

21668.931677314606

### Variational GP 
The inducing points are the 529 (= 23 × 23) grid points obtained by crossing 23 quantile values of longitude with 23 quantile values of latitude.

### Take 23 quantile values for longitude and latitude

In [11]:
# Number of quantile levels per coordinate; the grid has NumInduce**2 points.
NumInduce = 23
# Equally spaced interior probability levels: the 0 and 1 endpoints are dropped,
# leaving exactly NumInduce levels.
p = np.linspace(0, 1, NumInduce + 2)[1:-1]
# Row k holds the p[k]-quantile of each column of X_train (column 0, column 1).
quantiles = np.quantile(X_train, p, axis=0)
# Cross every column-0 quantile with every column-1 quantile.
# Row NumInduce*i + j is (quantiles[i, 0], quantiles[j, 1]): the first
# coordinate varies slowest, the second fastest.
X_inducing = np.column_stack((
    np.repeat(quantiles[:, 0], NumInduce),
    np.tile(quantiles[:, 1], NumInduce),
))

In [12]:
# Initial hyperparameter values (all start at 1).
sigmaSqf = lscale = sigmaSqNoise = varErr = 1

# NOTE(review): sigmaSqNoise is assigned but never passed to vgp_sqExp below;
# varErr is what the constructor receives — confirm which one is intended.
gpHousePrice = vgp_sqExp(
    X_train, y_train, X_inducing,
    sigmaSqf, lscale, varErr,
)

# Fit the model and have it print the elapsed training time.
gpHousePrice.train(printElapsedTime=True)

Elapsed Time (in Seconds): 237


In [13]:
# Predict at the held-out locations.
# NOTE(review): the third positional argument presumably makes predictMean
# print the test RMSE against y_test — confirm against the vgp_sqExp source.
pred = gpHousePrice.predictMean(X_test, y_test, True)

Test RMSE=17174.851796367933.
