In [None]:
# Reshaping methods
import numpy as np


def matrix_to_vector(matrix):
    """Flatten a NumPy array into a vector holding all of its elements.

    Elements keep NumPy's default (row-major) ordering, so the result can be
    turned back into the original grid with ``vector_to_matrix``.
    """
    # -1 lets NumPy infer the single remaining dimension, i.e. matrix.size.
    return matrix.reshape(-1)


def vector_to_matrix(vector, nrows, ncols):
    """Reshape ``vector`` into an array with ``nrows`` rows and ``ncols`` columns.

    The number of elements in ``vector`` must equal ``nrows * ncols``;
    otherwise NumPy raises ``ValueError``.
    """
    target_shape = (nrows, ncols)
    return vector.reshape(target_shape)


# Inverse tests: flattening a matrix and reshaping it back with its own
# dimensions must reproduce the matrix exactly (square, wide, tall, single row).
for x in (
    np.random.rand(4, 4),
    np.random.rand(2, 4),
    np.random.rand(4, 3),
    np.random.rand(1, 3),
):
    assert (vector_to_matrix(matrix_to_vector(x), *x.shape) == x).all()

In [None]:
# Grid generating
import numpy as np


def generate_latlon_vector(request):
    """Build the flattened latitude/longitude grid described by ``request``.

    ``request['predict_area']`` is unpacked as
    ``(bot_lat, top_lat, left_lon, right_lon)`` and
    ``request['predict_resolution']`` is the desired spacing between grid
    points along both axes.

    Returns a tuple ``(lat_vector, lon_vector, n_lats, n_lons)``. The two
    vectors are the row-major flattening of an ``n_lats`` x ``n_lons`` grid
    whose rows run from ``top_lat`` down to ``bot_lat`` and whose columns run
    from ``left_lon`` to ``right_lon``.
    """
    south, north, west, east = request['predict_area']
    step = request['predict_resolution']

    def axis_size(start, stop):
        # Points needed to span [start, stop] at `step`, both endpoints included.
        return int(np.ceil(abs(stop - start) / step)) + 1

    # TODO Simplify
    n_lons = axis_size(west, east)
    n_lats = axis_size(south, north)

    lon_axis = np.linspace(west, east, n_lons)
    # Latitudes are listed north to south so that row 0 is the top of the map.
    lat_axis = np.linspace(north, south, n_lats)
    lon_grid, lat_grid = np.meshgrid(lon_axis, lat_axis)

    return (
        matrix_to_vector(lat_grid),
        matrix_to_vector(lon_grid),
        n_lats,
        n_lons,
    )


# Grid test: a resolution-1 grid over latitudes 1..5 and longitudes 1..7 must
# have 5 rows and 7 columns, with latitudes decreasing down the rows and
# longitudes increasing along the columns.
lat_vector, lon_vector, n_lats, n_lons = generate_latlon_vector(
    dict(predict_area=[1, 5, 1, 7], predict_resolution=1.0)
)

assert (n_lats, n_lons) == (5, 7)

# Each row holds a single latitude: 5, 4, 3, 2, 1 from top to bottom.
correct_lats = np.repeat([[5.], [4.], [3.], [2.], [1.]], 7, axis=1)

# Every row holds the same longitudes: 1 .. 7 from left to right.
correct_lons = np.tile([1., 2., 3., 4., 5., 6., 7.], (5, 1))

assert np.array_equal(vector_to_matrix(lat_vector, n_lats, n_lons), correct_lats)
assert np.array_equal(vector_to_matrix(lon_vector, n_lats, n_lons), correct_lons)

In [None]:
# GPML main
import logging
from time import time

import GPy
import GPy.kern as K
import sklearn
import sklearn.cross_validation
from joblib import Parallel, delayed
import numpy as np

from gpml import fs_loader, plotting
%matplotlib inline


# getLogger() without a name returns the root logger; set its level to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def calculate_error(df, request):
    """Return observed minus forecast values for the requested predictant.

    The observation column is ``<predictant>_observed`` and the forecast
    column is ``<predictant>_<model_name>``, where both names are read from
    ``request``. The result is the element-wise difference of those two
    columns of ``df``.
    """
    target = request['predictant']
    forecast_column = target + '_' + request['model_name']
    observed_column = target + '_observed'
    return df[observed_column] - df[forecast_column]


def load_station_dataset(request):
    """Load, filter and clean the station dataset described by ``request``.

    Steps, in order:
      1. load the raw dataset through ``fs_loader.load_dataset``;
      2. add a ``<predictant>_error`` column (observed minus forecast);
      3. keep only the valid dates and forecast hours selected by ``request``;
      4. drop every row containing a missing value and renumber the rows;
      5. add a ``noise`` column of independent standard-normal samples.

    Returns the resulting DataFrame. Progress is reported with ``print``.
    """
    print("Loading dataset..")
    dataset = fs_loader.load_dataset(request)
    dataset[request['predictant'] + '_error'] = calculate_error(dataset, request)
    dataset = fs_loader.filter_valid_dates(dataset, request)
    dataset = fs_loader.filter_forecast_hours(dataset, request)
    # Work on a fresh frame instead of mutating whatever the filters returned.
    dataset = dataset.dropna(axis=0, how='any').reset_index(drop=True)
    # The first positional argument of np.random.normal is the mean, so the
    # row count must be passed as `size` to get one random sample per row
    # (passing it positionally yields a single scalar broadcast to all rows).
    dataset['noise'] = np.random.normal(size=len(dataset))
    print("Done loading dataset.")
    print("Stations in set: %d" % (len(dataset.station_id.unique())))
    return dataset


def fit_model(X, y, kernel, verbose=True):
    """Fit a GPy ``GPRegression`` model to ``(X, y)`` using ``kernel``.

    The model is created without a normalizer and then optimized. When
    ``verbose`` is true, start/finish messages (with the elapsed whole
    seconds) are printed and the optimizer is asked to show its own messages.

    Returns the optimized model.
    """
    if verbose:
        print("Training GP..")
    started_at = time() if verbose else None

    model = GPy.models.GPRegression(X, y, kernel=kernel, normalizer=None)
    model.optimize(messages=verbose)

    if not verbose:
        return model

    elapsed = time() - started_at
    print("Finished GP training (%ds)." % elapsed)
    return model


def parallel_predict(gp, input_vector, n_jobs=4, chunk_size=20000):
    """Predict ``input_vector`` with ``gp`` in parallel, chunk by chunk.

    ``input_vector`` is split into consecutive slices of at most
    ``chunk_size`` rows, and ``gp.predict`` is run on each slice using a
    joblib thread pool with ``n_jobs`` workers. The defaults reproduce the
    previously hard-coded values (4 workers, 20000 rows per chunk).

    ``gp.predict`` must return an indexable pair per chunk. The first elements
    of all chunks are concatenated, and likewise the second elements.

    Returns a tuple of the two concatenated arrays, in input order.
    Raises ``ValueError`` (from ``np.concatenate``) if ``input_vector`` is empty.
    """
    parts = Parallel(n_jobs=n_jobs, backend='threading', verbose=5)(
        delayed(gp.predict)(chunk) for chunk in chunks(input_vector, chunk_size))
    first_outputs = np.concatenate([part[0] for part in parts])
    second_outputs = np.concatenate([part[1] for part in parts])
    return first_outputs, second_outputs


def chunk_predict(gp, input_vector, chunk_size=5000, verbose=True):
    """Predict ``input_vector`` with ``gp`` sequentially, one chunk at a time.

    ``input_vector`` is processed in consecutive slices of at most
    ``chunk_size`` rows. ``gp.predict`` is called on each slice and must
    return a pair; the first and second elements are collected separately.
    When ``verbose`` is true, a progress line is printed after every chunk.

    Returns a tuple with the concatenated first elements and the concatenated
    second elements, in input order.
    """
    collected_preds, collected_var = [], []
    n_done = 0
    for batch in chunks(input_vector, chunk_size):
        batch_preds, batch_var = gp.predict(batch)
        collected_preds.append(batch_preds)
        collected_var.append(batch_var)
        n_done = n_done + len(batch)
        if not verbose:
            continue
        print("Finished %d / %d predictions.." % (n_done, len(input_vector)))
    return np.concatenate(collected_preds), np.concatenate(collected_var)


def chunks(l, n):
    """Lazily yield consecutive slices of ``l``, each at most ``n`` items long.

    Only the final slice may be shorter than ``n``; an empty ``l`` yields nothing.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

In [None]:
# Load spatial feature dataset
from collections import OrderedDict

import pandas as pd

from gpml import terrain_features as tf
from gpml import plotting


class DatasetBuilder:
    """Builds the per-grid-point feature table used for spatial prediction."""

    @classmethod
    def build_spatial_dataset(cls, request):
        """Assemble terrain features for every point of the requested grid.

        ``request`` must be a dict. It is passed to ``generate_latlon_vector``
        and to the ``terrain_features`` helpers, and ``request['features']``
        decides whether the optional ``shade`` and ``shadow`` columns are
        computed (each of those is also plotted when computed).

        Returns ``(dataframe, n_lats, n_lons)`` where ``dataframe`` has one
        row per grid point and, in order, the columns: latitude, longitude,
        noise, elevation, longitude-gradient, latitude-gradient,
        elevation-gradient, aspect, slope, followed by shade and/or shadow
        when requested.
        """
        assert isinstance(request, dict)

        features = OrderedDict()
        features['latitude'], features['longitude'], n_lats, n_lons = \
            generate_latlon_vector(request)
        # The first positional argument of np.random.normal is the mean, so
        # the point count must be passed as `size` to draw one sample per
        # grid point (passing it positionally returns one scalar, which
        # becomes a constant column).
        features['noise'] = np.random.normal(size=len(features['latitude']))

        # Elevations, smoothed with the same blur radius as all other
        # blurred features below.
        blur_radius = 1.
        elevation = tf.blur(
            tf.get_elevations(
                zip(features['latitude'], features['longitude'])
            ),
            blur_radius
        )
        features['elevation'] = elevation

        # Gradients, aspect and slope are computed on the 2-D elevation grid
        # and flattened back to one value per grid point.
        elevation_grid = vector_to_matrix(elevation, n_lats, n_lons)
        gradients, aspect, slope = \
            tf.get_gradient_features(request, elevation_grid)
        # The last axis of `gradients` is read as
        # (longitude, latitude, elevation) components.
        features['longitude-gradient'] = \
            tf.blur(matrix_to_vector(gradients[:, :, 0]), blur_radius)
        features['latitude-gradient'] = \
            tf.blur(matrix_to_vector(gradients[:, :, 1]), blur_radius)
        features['elevation-gradient'] = \
            tf.blur(matrix_to_vector(gradients[:, :, 2]), blur_radius)
        features['aspect'] = tf.blur(matrix_to_vector(aspect), blur_radius)
        features['slope'] = tf.blur(matrix_to_vector(slope), blur_radius)

        # Hill shade (optional)
        if 'shade' in request['features']:
            shade = tf.get_shade_features(request, gradients)
            features['shade'] = matrix_to_vector(shade)
            plotting.plt_matrix(shade, 'Hill shade')

        # Shadows (optional)
        if 'shadow' in request['features']:
            shadow = tf.get_shadow_features(request, elevation_grid)
            features['shadow'] = matrix_to_vector(shadow)
            plotting.plt_matrix(shadow, 'Shadow')

        return pd.DataFrame.from_dict(features), n_lats, n_lons


In [None]:
# Merge datasets
def merge_station_with_spatial_dataset(station_dataset, spatial_dataset):
    """Attach the spatial features of the nearest grid point to each station row.

    For every row of ``station_dataset`` the closest row of
    ``spatial_dataset`` is looked up with ``locations_to_index``. That row's
    columns, minus ``latitude``, ``longitude`` and ``elevation``, are appended
    to the station row. Spatial columns whose names already exist in
    ``station_dataset`` get the suffix ``_spatial``.

    Returns a new DataFrame with the same rows, row order and index as
    ``station_dataset``.
    """
    locations = station_dataset[['latitude', 'longitude']].values
    join_index = locations_to_index(locations, spatial_dataset)
    # Row i of right_df holds the spatial features for row i of station_dataset.
    right_df = spatial_dataset.loc[join_index].reset_index(drop=True).drop(
        ['latitude', 'longitude', 'elevation'],
        axis=1
    )
    # DataFrame.join matches rows by index label, while right_df is ordered by
    # position. Join on a positional index so the pairing is correct even when
    # station_dataset does not carry a default 0..n-1 index, then restore the
    # caller's index.
    merged = station_dataset.reset_index(drop=True).join(
        right_df,
        rsuffix='_spatial',
        how='left'
    )
    merged.index = station_dataset.index
    return merged

def locations_to_index(locations, spatial_dataset):
    """Return, for each location, the index label of the nearest grid row.

    Each entry of ``locations`` is read as ``(latitude, longitude)``.
    Nearness is the squared difference in latitude plus the squared
    difference in longitude against the ``latitude`` and ``longitude`` columns
    of ``spatial_dataset``. The result is a list of index labels of
    ``spatial_dataset``, one per location and in the same order.
    """
    def nearest_label(lat, lon):
        squared_distance = (spatial_dataset['latitude'] - lat) ** 2 \
            + (spatial_dataset['longitude'] - lon) ** 2
        return squared_distance.idxmin()

    return [nearest_label(point[0], point[1]) for point in locations]

In [None]:
from gpml import area

def log_request(request):
    """Print the x and y values returned by ``area.calculate_request_lengths``.

    The two values are reported as the mean cell size of ``request``, each
    formatted with three decimals.
    """
    cell_lengths = area.calculate_request_lengths(request)
    print("Mean cell size: x=%.3fm, y=%.3fm" % cell_lengths)

In [None]:
# Example use case
import datetime as dt
import matplotlib.pyplot as plt


# Request dictionary shared by the dataset loaders, the grid builder and the
# plotting helpers. 'predict_area' is unpacked by generate_latlon_vector as
# (bot_lat, top_lat, left_lon, right_lon) and 'predict_resolution' is the grid
# spacing. The commented-out entries are alternative configurations.
request = {
    'predictant': 'TT2m',
    'model_elements': ['TT2m', 'FF10m'],
#     'features': ['latitude', 'longitude', 'elevation'],
#     'features': ['latitude-gradient', 'longitude-gradient', 'elevation-gradient'],
    'features': ['latitude', 'longitude', 'elevation', 'aspect', 'slope', 'shadow'],
#     'features': ['shade'],
#     'features': ['shadow'],
    'forecast_hours': [42],
    'model_name': 'ModelMix',
    'start': dt.datetime(2015, 11, 2),
    'end': dt.datetime(2015, 11, 3),
#     'predict_area': (45, 48, 6, 9),
    'predict_area': (45, 50, 4, 11),  # Swiss Alps
#     'predict_area': (33, 60, -12, 20),  # Europe
    'predict_resolution': 0.01,
}
log_request(request)
# Seed handed to fs_loader.split_dataset below.
random_state = 1337
input_dim = len(request['features'])

# Kernel definition
# The active_dims below are hard-coded column positions, so they must be
# updated whenever request['features'] changes.
# NOTE(review): presumably the columns follow the order of request['features']
# (0-2 = latitude/longitude/elevation, 3 = aspect, 4 = slope, 5 = shadow) --
# confirm against fs_loader.split_dataset.
# kernel = K.Matern52(input_dim, ARD=True) + K.White(input_dim)
# kernel1: one periodic Matern-5/2 per dimension 3 and 4, plus white noise on both.
kernel1 = K.PeriodicMatern52(1, active_dims=[3]) + K.PeriodicMatern52(1, active_dims=[4]) + K.White(2, active_dims=[3,4])
# kernel2: ARD Matern-5/2 over dimensions 0-2, plus a 3-dimensional white-noise term.
kernel2 = K.Matern52(3, ARD=True, active_dims=[0,1,2]) + K.White(3)
# kernel3: linear term on dimension 5.
kernel3 = K.Linear(1, active_dims=[5])
kernel = kernel1 + kernel2 + kernel3
# Load datasets
station_dataset = load_station_dataset(request)
# TODO Cache call to loading spatial dataset
spatial_dataset, n_lats, n_lons = DatasetBuilder.build_spatial_dataset(request)
# Attach the nearest grid point's spatial features to every station row.
dataset = merge_station_with_spatial_dataset(station_dataset, spatial_dataset)

# Create train and validate dataset splits
# The regression target is the '<predictant>_error' column.
# NOTE(review): 0.2 is presumably the held-out fraction -- confirm in fs_loader.split_dataset.
observation_column = request['predictant'] + '_error'
X_train, X_test, y_train, y_test = \
    fs_loader.split_dataset(dataset, request['features'], observation_column, 0.2, random_state)

print("Dataset: %d rows, %d features" % X_train.shape)
gp = fit_model(X_train, y_train, kernel)

# Validate model
# Only the first array returned by parallel_predict is used; the second is discarded.
# NOTE(review): sklearn.metrics is reached as an attribute, but the visible
# imports are only `sklearn` and `sklearn.cross_validation` -- confirm the
# submodule is loaded.
predictions, _ = parallel_predict(gp, X_test)
print("MAE: %.3f." % (sklearn.metrics.mean_absolute_error(y_true=y_test, y_pred=predictions)))
print("RMSE: %.3f." % np.sqrt((sklearn.metrics.mean_squared_error(y_true=y_test, y_pred=predictions))))

# Feature importance plot
# NOTE(review): this plots the `kernel` object built above rather than gp.kern
# -- confirm it carries the optimized parameters.
kernel.plot_ARD(legend=True)
plt.xticks(np.arange(input_dim), request['features'])
plt.title("Dimension relevance")
plt.show()

# Create prediction dataset
# One row per grid point, restricted to the requested feature columns.
dataset2 = spatial_dataset[request['features']]
print("Predicting for %d points and %d features.." % dataset2.shape)
spatial_predictions, _ = chunk_predict(gp, dataset2.values)

# Post-processing
# Reshape the per-point predictions back onto the n_lats x n_lons grid.
spatial_predictions = vector_to_matrix(spatial_predictions, n_lats, n_lons)

# Prediction plots
plotting.plot_area(dataset, request, spatial_predictions)
plotting.plt_matrix(spatial_predictions)
plotting.plot_prediction_distribution(spatial_predictions)
