In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import GPy

from IPython.display import display
GPy.plotting.change_plotting_library('matplotlib')

# Helpers

In [33]:
def filter_by_coordinates(data, coordinate_limits):
    """
    Returns the filtered coordinates from the dataset.
    
    Params:
        - coordinate_limits (tuple): in the format (MIN_LAT, MIN_LNG, MAX_LAT, MAX_LNG)
    
    Returns:
        - numpy matrix that consists only of the filtered data
    """ 
    min_lat, min_lng, max_lat, max_lng = coordinate_limits
    
    f1 = data[data[:, 0] > min_lat]
    f2 = f1[f1[:, 0] < max_lat]
    f3 = f2[f2[:, 1] > min_lng]
    f4 = f3[f3[:, 1] < max_lng]

    return f4

def load_data(filename, coordinates):
    data = filter_by_coordinates(np.load(filename), coordinates)    
    X, y = data[:,0:3], data[:,3]
    
    return X, y

def run_gp(dataset, kernel, coordinates):
    """
    Runs the entire GP as a single method and returns the trained model. It is possible to continue
    training it with restarts after the initial training.
    
    Params:
        - dataset (str): name of dataset in `.npy` format to train the GP
        - kernel: GPy kernel object that defines the covariance function of our GP
        - top_left (tuple): (LAT, LNG) of the top left corner of our area concerned
        - bottom_right (tuple): (LAT, LNG) of the bottom right corner of our area concerned

    Returns:
        - a trained GPy model
    """
    X, y = load_data(dataset, coordinates)
    
    print('Training GP...')
    print(f'X shape: {X.shape}')
    print(f'y shape: {y.shape}')
    
    model = GPy.models.GPRegression(X, y[:,None], kernel)
    model.optimize(messages=True)
    
    display(model)
    print('Log Likelihood: ' + model.log_likelihood())
    
    return model

# Kernel

Define your kernel here.

In [8]:
kernel = None

# Datasets

Store your datasets as a list of strings.

In [35]:
sizes = [200, 400]
datasets = [f'../data/sk-filtered_data-{size}.npy' for size in sizes] + [f'../data/tp-filtered_data-{size}.npy' for size in sizes]

# Tampines

In [36]:
MAX_AREA = (1.351800, 103.943283, 1.360377, 103.957675)

tp_models = []
for dataset in datasets:
    model = run_gp(dataset, kernel, MAX_AREA)
    model.optimize(messages=True)
    tp_models += [model]

Training GP...
X shape: (0, 3)
y shape: (0,)


error: failed in converting 2nd keyword `c' of _fblas.dsyrk to C/Fortran array

# Sengkang

In [37]:
MAX_AREA = (1.382030, 103.888635, 1.402076, 103.909292)

sk_models = []
for dataset in datasets:
    model = run_gp(dataset, kernel, MAX_AREA)
    model.optimize(messages=True)
    sk_models += [model]

Training GP...
X shape: (2904, 3)
y shape: (2904,)


KeyboardInterrupt: 

# Analysis of models

In [24]:
print('Tampines')
for dataset, model in zip(datasets, sk_models):
    print(f'{dataset}: {model.log_likelihood()}')
    
print('Sengkang')
for dataset, model in zip(datasets, tp_models):
    print(f'{dataset}: {model.log_likelihood()}')

Tampines
Sengkang
