In [9]:
%matplotlib inline
import numpy as np
import pandas as pd
import GPy

from IPython.display import display
# Select matplotlib as the plotting library GPy uses for the plots below.
GPy.plotting.change_plotting_library('matplotlib')

# Helpers

In [10]:
def filter_by_coordinates(data, coordinate_limits):
    """
    Keep only the rows of the dataset that lie strictly inside a bounding box.

    Column 0 of `data` is compared against the latitude limits and column 1
    against the longitude limits. All four comparisons are exclusive, so rows
    sitting exactly on a limit are dropped.

    Params:
        - data (numpy array): 2-D dataset, one row per observation
        - coordinate_limits (tuple): in the format (MIN_LAT, MIN_LNG, MAX_LAT, MAX_LNG)

    Returns:
        - numpy array that consists only of the rows inside the limits
    """
    min_lat, min_lng, max_lat, max_lng = coordinate_limits

    lat = data[:, 0]
    lng = data[:, 1]

    # Combine the four bound checks into one row mask instead of filtering in stages.
    inside = (lat > min_lat) & (lat < max_lat) & (lng > min_lng) & (lng < max_lng)

    return data[inside]

In [11]:
def filter_by_index(data, coordinate_indexes):
    """
    Keep only the rows of the dataset whose grid indexes fall inside an index window.

    Column 4 of `data` is compared against the row bounds and column 5 against
    the column bounds. The window is half-open: the minimum bounds are
    inclusive and the maximum bounds are exclusive.

    Params:
        - data (numpy array): 2-D dataset, one row per observation
        - coordinate_indexes (tuple): in the format: (ROW_MIN, COL_MIN, ROW_MAX, COL_MAX)

    Returns:
        - numpy array that consists only of the rows inside the index window
    """
    row_min, col_min, row_max, col_max = coordinate_indexes

    rows = data[:, 4]
    cols = data[:, 5]

    # One combined mask: ROW_MIN <= row < ROW_MAX and COL_MIN <= col < COL_MAX.
    inside = (rows >= row_min) & (rows < row_max) & (cols >= col_min) & (cols < col_max)

    return data[inside]

In [12]:
def ith_coords(dataset, i):
    """
    Return the first two entries of the i-th row of the dataset as a tuple.

    Params:
        - dataset: indexable collection of rows
        - i (int): index of the row to read

    Returns:
        - tuple made of entries 0 and 1 of that row
    """
    row = dataset[i]
    return row[0], row[1]

In [13]:
def plot_slices_per_location(model, *coords):
    """
    Plot the number of bikes at coordinates over time given a GP.

    One figure is created per coordinate (numbered from 1). In each figure the
    model's input dimensions 0 and 1 are fixed to the given latitude and
    longitude. Calling the function without any coordinates does nothing.

    Params:
        - model (GPy model): model to sample from
        - *coords (list(tuples)): list of tuples, with each tuple in the format (LAT, LNG)
    """
    # Without this guard the loop below never assigns `canvas`, and the final
    # show() call would fail with UnboundLocalError.
    if not coords:
        return

    canvas = None
    for idx, (lat, lng) in enumerate(coords):
        figure = GPy.plotting.plotting_library().figure(idx+1)
        canvas = model.plot(figure=figure, fixed_inputs=[(0, lat), (1, lng)])

    # As before, only the canvas of the last figure is handed to show().
    GPy.plotting.show(canvas)

In [14]:
def plot_slices_per_timeslice(model, *timeslices):
    """
    Plot a density graph of bikes at coordinates for a given time.

    One figure is created per timeslice (numbered from 1). In each figure the
    model's input dimension 2 is fixed to the given timestamp. Calling the
    function without any timeslices does nothing.

    Params:
        - model (GPy model): model to sample from
        - *timeslices (list(int)): list of timeslices in UNIX timestamp
    """
    # Without this guard the loop below never assigns `canvas`, and the final
    # show() call would fail with UnboundLocalError.
    if not timeslices:
        return

    canvas = None
    for idx, timestamp in enumerate(timeslices):
        figure = GPy.plotting.plotting_library().figure(idx+1)
        canvas = model.plot(figure=figure, fixed_inputs=[(2, timestamp)])

    # As before, only the canvas of the last figure is handed to show().
    GPy.plotting.show(canvas)

In [31]:
def load_data(filename, top_left=None, bottom_right=None):
    """
    Load a `.npy` dataset and split it into GP inputs and targets, optionally
    keeping only the rows inside a latitude/longitude bounding box.

    The box is applied only when both corners are given.

    Params:
        - filename (str): path of the `.npy` file handed to `np.load`
        - top_left (tuple): (LAT, LNG) of the top left corner of the area to keep
        - bottom_right (tuple): (LAT, LNG) of the bottom right corner of the area to keep

    Returns:
        - tuple (X, y): X holds columns 0-2 of the data, y holds column 3
    """
    bounds = None
    if top_left and bottom_right:
        # Re-order the two corners into (MIN_LAT, MIN_LNG, MAX_LAT, MAX_LNG),
        # the format filter_by_coordinates expects.
        bounds = (bottom_right[0], top_left[1], top_left[0], bottom_right[1])

    data = np.load(filename)
    if bounds is not None:
        data = filter_by_coordinates(data, bounds)

    return data[:, 0:3], data[:, 3]

In [32]:
def load_data_by_index(filename, top_left=None, bottom_right=None):
    """
    Load a `.npy` dataset and split it into GP inputs and targets, optionally
    keeping only the rows inside a window of grid indexes.

    The window is applied only when both corners are given.

    Params:
        - filename (str): path of the `.npy` file handed to `np.load`
        - top_left (tuple): (ROW_MIN, COL_MIN) corner of the index window
        - bottom_right (tuple): (ROW_MAX, COL_MAX) corner of the index window

    Returns:
        - tuple (X, y): X holds columns 0-2 of the data, y holds column 3
    """
    window = None
    if top_left and bottom_right:
        # Flatten the two corners into (ROW_MIN, COL_MIN, ROW_MAX, COL_MAX),
        # the format filter_by_index expects.
        window = (top_left[0], top_left[1], bottom_right[0], bottom_right[1])

    data = np.load(filename)
    if window is not None:
        data = filter_by_index(data, window)

    return data[:, 0:3], data[:, 3]

In [33]:
def run_gp(dataset, kernel, top_left=None, bottom_right=None, index=False):
    """
    Load a dataset, fit a GP regression model to it and return the trained model.

    The data shapes, the optimised model and its log likelihood are printed
    along the way. The returned model can be optimised further by the caller.

    Params:
        - dataset (str): name of dataset in `.npy` format to train the GP
        - kernel: GPy kernel object that defines the covariance function of our GP
        - top_left (tuple): top left corner of the area concerned; (LAT, LNG) by
          default, or (ROW, COL) grid indexes when `index` is True
        - bottom_right (tuple): bottom right corner of the area concerned, in the
          same format as `top_left`
        - index (bool): when True the corners are applied through
          `load_data_by_index`, otherwise through `load_data`

    Returns:
        - a trained GPy model
    """
    loader = load_data_by_index if index else load_data
    X, y = loader(dataset, top_left=top_left, bottom_right=bottom_right)

    print('Training GP...')
    print(f'X shape: {X.shape}')
    print(f'y shape: {y.shape}')

    # The targets are handed over as a single column: y[:, None] adds a trailing axis.
    targets = y[:, None]
    model = GPy.models.GPRegression(X, targets, kernel)
    model.optimize(messages=True)

    display(model)
    print(model.log_likelihood())

    return model

# Kernels

We are going to experiment with combinations of the `Matern32`, periodic and RBF kernels in this notebook.

## Matern32 Kernel

In [17]:
# Matern 3/2 kernel over all three input columns of X.
matern32 = GPy.kern.Matern32(input_dim=3)

## Periodic Kernel

In [18]:
# Standard periodic kernel over all three input columns of X.
periodic = GPy.kern.StdPeriodic(input_dim=3)

## RBF Kernel

In [19]:
# RBF kernel over all three input columns of X.
# NOTE(review): rbf_lat and rbf_lng are constructed identically; if they are
# meant to act on latitude and longitude separately they would need
# `active_dims` - confirm the intent.
rbf_lat = GPy.kern.RBF(input_dim=3)

In [20]:
# Second RBF kernel, also over all three input columns of X.
rbf_lng = GPy.kern.RBF(input_dim=3)

# Run Gaussian Process

In [43]:
# Candidate covariance functions; each one keeps the Matern32 term and
# combines the two RBF kernels with the periodic kernel differently.
# k1: Matern32 + (RBF * RBF) + periodic
k1 = matern32 + rbf_lat * rbf_lng + periodic
# k2: Matern32 + (RBF * RBF * periodic)
k2 = matern32 + rbf_lat * rbf_lng * periodic
# k3: Matern32 + (RBF + RBF) * periodic
k3 = matern32 + (rbf_lat + rbf_lng) * periodic
# k4: purely additive - Matern32 + RBF + RBF + periodic
k4 = matern32 + rbf_lat + rbf_lng + periodic

In [45]:
# All candidate kernels, in the order they will be trained.
kernels = [k1, k2, k3, k4]

I am going to try out the four kernel combinations that would make some sense for our GP on the Sengkang-400 dataset, restricted to the bottom-right corner, since that is the area where the hotspots are.

In [46]:
DATASET = '../data/filtered/sk-data-400.npy'
models = []

# index=True routes the corners through load_data_by_index, so (1, 12) and
# (6, 16) are (ROW, COL) grid indexes rather than (LAT, LNG) coordinates.
for k in kernels:
    model = run_gp(DATASET, k, top_left=(1, 12), bottom_right=(6, 16), index=True)
    models.append(model)

Training GP...
X shape: (1410, 3)
y shape: (1410,)
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i      f              |g|        
    02s18  0002   7.237057e+03   4.744083e+06 
    07s64  0007   4.227459e+03   1.016385e+04 
    09s83  0009   4.043465e+03   6.705725e+02 
    18s70  0017   3.838897e+03   1.718947e+02 
    32s64  0029   3.813708e+03   3.016319e+01 
 01m08s86  0061   3.811549e+03   1.001210e+01 
 01m12s41  0064   3.811549e+03   5.664702e-03 
Runtime:  01m12s41
Optimization status: Converged



GP_regression.,value,constraints,priors
sum.Mat32.variance,6.85002807956e-42,+ve,
sum.Mat32.lengthscale,1.0,+ve,
sum.mul.rbf.variance,1.11091592382e-41,+ve,
sum.mul.rbf.lengthscale,1.0,+ve,
sum.mul.rbf_1.variance,1.11091592382e-41,+ve,
sum.mul.rbf_1.lengthscale,1.0,+ve,
sum.std_periodic.variance,40.3125461181,+ve,
sum.std_periodic.period,15.0334534378,+ve,
sum.std_periodic.lengthscale,17.601442429,+ve,
Gaussian_noise.variance,12.9564894666,+ve,


-3811.54854608
Training GP...
X shape: (1410, 3)
y shape: (1410,)
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i      f              |g|        
    01s45  0001   1.082313e+04   2.851182e+07 
    14s61  0010   4.090355e+03   1.769864e+02 
    29s29  0020   4.055364e+03   8.810445e-09 
Runtime:     29s29
Optimization status: Converged



GP_regression.,value,constraints,priors
sum.Mat32.variance,0.962091631713,+ve,
sum.Mat32.lengthscale,1.0,+ve,
sum.mul.rbf.variance,5.06668324991,+ve,
sum.mul.rbf.lengthscale,1.0,+ve,
sum.mul.rbf_1.variance,5.06668324991,+ve,
sum.mul.rbf_1.lengthscale,1.0,+ve,
sum.mul.std_periodic.variance,2.58896128556,+ve,
sum.mul.std_periodic.period,14.3728515644,+ve,
sum.mul.std_periodic.lengthscale,14.3753397448,+ve,
Gaussian_noise.variance,13.8376634237,+ve,


-4055.36405886
Training GP...
X shape: (1410, 3)
y shape: (1410,)
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i      f              |g|        
    09s05  0007   4.275031e+03   1.013781e+04 
    14s25  0011   4.058552e+03   1.860968e+01 
    22s08  0017   4.055364e+03   1.067787e-07 
Runtime:     22s08
Optimization status: Converged



GP_regression.,value,constraints,priors
sum.Mat32.variance,0.943826696791,+ve,
sum.Mat32.lengthscale,1.0,+ve,
sum.mul.sum.rbf.variance,7.36174434517,+ve,
sum.mul.sum.rbf.lengthscale,1.0,+ve,
sum.mul.sum.rbf_1.variance,7.36174434517,+ve,
sum.mul.sum.rbf_1.lengthscale,1.0,+ve,
sum.mul.std_periodic.variance,4.51511729739,+ve,
sum.mul.std_periodic.period,14.3728447814,+ve,
sum.mul.std_periodic.lengthscale,14.3753329629,+ve,
Gaussian_noise.variance,13.8377284822,+ve,


-4055.36405888
Training GP...
X shape: (1410, 3)
y shape: (1410,)
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i      f              |g|        
    05s68  0005   4.679074e+03   1.261295e+05 
    12s49  0011   3.946142e+03   3.048965e+01 
    37s57  0033   3.813320e+03   3.855507e+04 
    50s10  0044   3.811737e+03   1.754442e+01 
 01m23s30  0072   3.810452e+03   1.048308e+01 
 01m26s72  0075   3.810452e+03   9.523457e-03 
Runtime:  01m26s72
Optimization status: Converged



GP_regression.,value,constraints,priors
sum.Mat32.variance,5.18555499023e-55,+ve,
sum.Mat32.lengthscale,1.0,+ve,
sum.rbf.variance,8.43134046816e-174,+ve,
sum.rbf.lengthscale,1.0,+ve,
sum.rbf_1.variance,8.43134046816e-174,+ve,
sum.rbf_1.lengthscale,1.0,+ve,
sum.std_periodic.variance,68.3118393101,+ve,
sum.std_periodic.period,14.693877559,+ve,
sum.std_periodic.lengthscale,26.7385435669,+ve,
Gaussian_noise.variance,12.9448289892,+ve,


-3810.45175275


# Prediction

We will now use our trained models to predict the bike movements around the areas that we scraped.