In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.cross_validation
import joblib
import GPy

import fs_loader
import model
import plotting

%matplotlib inline


def select_single_valid_date(df, request):
    """Return a copy of the rows of ``df`` that belong to the requested date.

    A row is kept when the string form of its ``valid_date`` value starts
    with ``request['valid_date']``, so a shorter prefix (e.g. ``'20151201'``)
    keeps every row whose date string begins with it.

    Args:
        df: DataFrame with ``longitude``, ``latitude`` and ``valid_date``
            columns.
        request: mapping providing the ``'valid_date'`` prefix string.

    Returns:
        An independent copy of the matching rows; the original index labels
        are kept.

    Raises:
        AssertionError: if a coordinate column is missing or no row matches.
    """
    assert 'longitude' in df.columns and 'latitude' in df.columns

    def _has_requested_prefix(value):
        # Compare on the string form so numeric and string dates both work.
        return str(value).startswith(request['valid_date'])

    matches = df.valid_date.apply(_has_requested_prefix)
    spatial_df = df[matches].copy()
    assert len(spatial_df) > 0, "date '%s' not contained in dataset." % request['valid_date']
    return spatial_df


def add_time_based_features(df):
    """Add ``valid_hour`` and ``valid_month`` columns to ``df`` in place.

    Both values are sliced out of the string form of ``valid_date``: the hour
    is the last two characters and the month is characters 4-5 (zero-based).
    # NOTE(review): this presumes a YYYYMMDDHH-like layout - confirm against
    # the loader.

    Args:
        df: DataFrame with a ``valid_date`` column; it is modified in place.

    Returns:
        None.
    """
    def _hour_of(stamp):
        return int(str(stamp)[-2:])

    def _month_of(stamp):
        return int(str(stamp)[4:6])

    valid_dates = df['valid_date']
    df['valid_hour'] = valid_dates.apply(_hour_of)
    df['valid_month'] = valid_dates.apply(_month_of)
    

def calculate_error(df, request):
    """Return observation minus forecast for the requested predictant.

    The column names are derived from the request: ``<predictant>_observed``
    holds the observation and ``<predictant>_<model_name>`` holds the
    forecast.

    Args:
        df: DataFrame containing both columns.
        request: mapping with ``'predictant'`` and ``'model_name'`` entries.

    Returns:
        The element-wise difference ``observed - forecast``.
    """
    predictant, model_name = request['predictant'], request['model_name']
    observed_column = predictant + '_observed'
    forecast_column = predictant + '_' + model_name
    return df[observed_column] - df[forecast_column]


def split_dataset(dataset, feature_cols, obs_col, test_size=0.10, random_state=None):
    """Split ``dataset`` into train and test arrays, grouped by station.

    The unique ``station_id`` values are split, not the rows, so every row of
    a given station ends up on the same side of the split.

    Args:
        dataset: DataFrame with a ``station_id`` column plus the requested
            feature and observation columns.
        feature_cols: list of column names used as model inputs.
        obs_col: name of the target column.
        test_size: share of stations held out for testing, forwarded to
            ``train_test_split``. Defaults to the previously hard-coded 0.10.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for a
            reproducible split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of numpy arrays. The
        ``y`` arrays are two-dimensional with a single column because the
        target is selected with a one-element column list.
    """
    # ``sklearn.cross_validation`` is the legacy home of ``train_test_split``;
    # prefer ``sklearn.model_selection`` and only fall back when it is missing.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    station_ids = dataset.station_id.unique()
    train_stations, test_stations = train_test_split(
        station_ids, test_size=test_size, random_state=random_state
    )

    # Build each row mask once and reuse it for features and targets.
    in_train = dataset.station_id.isin(train_stations)
    in_test = dataset.station_id.isin(test_stations)

    X_train = dataset.loc[in_train, feature_cols].values
    X_test = dataset.loc[in_test, feature_cols].values
    y_train = dataset.loc[in_train, [obs_col]].values
    y_test = dataset.loc[in_test, [obs_col]].values
    return X_train, X_test, y_train, y_test


def fit_model_and_return_dataset(kernel, request):
    """Fit a GP on the forecast error of one valid date and report test scores.

    Steps:
      1. Load the dataset through ``fs_loader.load_dataset(request)``.
      2. Keep only the rows matching ``request['valid_date']`` and add the
         time-based feature columns.
      3. Add a ``<predictant>_error`` column (observation minus forecast).
      4. Split by station, fit a ``GPy.models.GPRegression`` on
         latitude/longitude and optimise it.
      5. Print MAE and RMSE on the held-out stations.

    Args:
        kernel: GPy kernel passed to ``GPRegression``.
        request: mapping with at least ``'predictant'``, ``'model_name'`` and
            ``'valid_date'``. Other keys are presumably consumed by
            ``fs_loader.load_dataset`` - verify against that module.

    Returns:
        Tuple ``(gp, dataset)``: the fitted model and the filtered DataFrame
        including the added feature and error columns.
    """
    # Imported explicitly: none of the visible file-level imports loads the
    # ``sklearn.metrics`` submodule, so do not rely on it being pulled in as
    # a side effect of another import.
    import sklearn.metrics

    print("Loading dataset..")
    dataset = fs_loader.load_dataset(request)
    print("Done loading dataset.")
    # Modify dataset to fit geo-spatial problem
    dataset = select_single_valid_date(dataset, request)
    add_time_based_features(dataset)

    # Transform observation to error; the column name is built once and
    # reused as the regression target below.
    observation_column = request['predictant'] + '_error'
    dataset[observation_column] = calculate_error(dataset, request)

    # TODO TdR 17/07/16: Randomly distort latitude and longitude by a small offset (100m).
    # TODO TdR 17/07/16: preserve original row-mapping for later reference
    # TODO TdR 19/07/16: Try feature embeddings
    print("Available features:")
    print(dataset.columns.values)
    # Example output from a previous run:
    #     ['station_id' 'latitude' 'longitude' 'elevation' 'forecast_hour'
    #  'valid_date' 'TT2m_observed' 'TT2m_ModelMix' 'FF10m_ModelMix' 'valid_hour'
    #  'valid_month' 'TT2m_error']
    feature_columns = ['latitude', 'longitude']
    X_train, X_test, y_train, y_test = \
        split_dataset(dataset, feature_columns, observation_column)

    print("Training GP..")
    gp = GPy.models.GPRegression(X_train, y_train, kernel)
    gp.optimize(messages=True, max_f_eval=1000)
    print("Done training GP.")

    # Validate model on the held-out stations; the second value returned by
    # chunk_predict is not needed for the scores.
    predictions, _ = chunk_predict(gp, X_test)
    print("MAE: %.3f." % (sklearn.metrics.mean_absolute_error(y_true=y_test, y_pred=predictions)))
    print("RMSE: %.3f." % np.sqrt((sklearn.metrics.mean_squared_error(y_true=y_test, y_pred=predictions))))
    return gp, dataset


def predict_area(gp, request):
    """Predict on a regular latitude/longitude grid covering the request area.

    Args:
        gp: fitted model, passed on to ``chunk_predict``. It is fed
            ``(latitude, longitude)`` rows, in that column order.
        request: mapping with
            ``'predict_area'``: ``(top_lat, bot_lat, left_lon, right_lon)``;
            ``'predict_resolution'``: step used to derive the number of grid
            points per axis (``ceil(extent / resolution)``).

    Returns:
        Tuple ``(y_pred, var)`` of 2-D arrays shaped ``(n_lats, n_lons)``.
        Row index follows latitude from ``bot_lat`` to ``top_lat``, column
        index follows longitude from ``left_lon`` to ``right_lon``.
        # NOTE(review): the reshape assumes the model returns exactly one
        # value per input point - confirm for multi-output models.
    """
    top_lat, bot_lat, left_lon, right_lon = request['predict_area']
    res = request['predict_resolution']
    n_lons = int(np.ceil(abs(right_lon - left_lon) / res))
    n_lats = int(np.ceil(abs(top_lat - bot_lat) / res))

    # With the default 'xy' indexing both grids are shaped (n_lons, n_lats):
    # latitude varies along the second axis, longitude along the first.
    lat_grid, lon_grid = np.meshgrid(np.linspace(bot_lat, top_lat, n_lats),
                                     np.linspace(left_lon, right_lon, n_lons))

    # One (lat, lon) row per grid point, in row-major order of the grids.
    latlon_vector = np.column_stack([lat_grid.ravel(), lon_grid.ravel()])
    print("Predicting for %d points.." % latlon_vector.shape[0])
    y_pred, var = chunk_predict(gp, latlon_vector)

    # Undo the flattening, then transpose so that rows correspond to latitude.
    y_pred = y_pred.reshape((n_lons, n_lats)).T
    var = var.reshape((n_lons, n_lats)).T
    return y_pred, var


def chunk_predict(gp, input_vector, chunk_size=5000):
    """Run ``gp.predict`` over ``input_vector`` in batches and join the results.

    Progress is printed after every batch.

    Args:
        gp: object whose ``predict(batch)`` returns a pair of arrays.
        input_vector: sliceable sequence of input rows.
        chunk_size: maximum number of rows passed to one ``predict`` call.

    Returns:
        Tuple of two arrays: the concatenated first and second outputs of
        ``gp.predict``.
        # NOTE(review): presumably mean and variance - confirm against GPy.
    """
    total_points = len(input_vector)
    first_outputs, second_outputs = [], []
    completed = 0
    for batch in chunks(input_vector, chunk_size):
        batch_first, batch_second = gp.predict(batch)
        first_outputs.append(batch_first)
        second_outputs.append(batch_second)
        completed = completed + len(batch)
        print("Finished %d / %d predictions.." % (completed, total_points))
    return np.concatenate(first_outputs), np.concatenate(second_outputs)


def chunks(l, n):
    """Generate consecutive slices of ``l`` holding at most ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` produces no slices.
    """
    length = len(l)
    for start in range(0, length, n):
        stop = start + n
        yield l[start:stop]


def plot_area(dataset, request, area_prediction):
    """Plot station errors as coloured points with prediction contours on a map.

    Args:
        dataset: DataFrame with ``longitude``, ``latitude`` and a
            ``<predictant>_error`` column.
        request: mapping with ``'predict_area'``
            (``(top_lat, bot_lat, left_lon, right_lon)``) and ``'predictant'``.
        area_prediction: 2-D array of predictions covering the request area,
            laid out as returned by ``predict_area`` (rows = latitude from
            ``bot_lat`` upwards, columns = longitude from ``left_lon``).

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    top_lat, bot_lat, left_lon, right_lon = request['predict_area']
    # Derive the colour column from the request instead of hard-coding
    # 'TT2m_error', matching how the error column is named when it is created.
    error_column = request['predictant'] + '_error'

    res = '50m'
    alpha = 1.0
    plt.figure(figsize=(20,15))
    ax = plt.axes(projection=ccrs.PlateCarree())
    plt.axis('equal')
    # NOTE(review): presumably draws the base map at the given resolution and
    # opacity - confirm in the plotting module.
    plotting.add_background_map(ax, res, alpha)

    # Plot stations, coloured by their error on a fixed -5..5 scale.
    sp = plt.scatter(
        dataset.longitude, dataset.latitude,
        s=10, c=dataset[error_column],
        edgecolor='face',
        vmin=-5, vmax=5,
    )

    # Plot prediction contours (10 levels) stretched over the request area
    # and drawn above everything else.
    cs = plt.contour(
        area_prediction,
        10,
        extent=(left_lon, right_lon, bot_lat, top_lat),
        antialiased=True,
        zorder=999
    )
    plt.clabel(cs, fontsize=11)

    cb = plt.colorbar(sp)
    cb.set_label('Temperature error')
    plt.xlim([left_lon, right_lon])
    plt.ylim([bot_lat, top_lat])
    plt.show()
    

def plot_prediction_distribution(prediction):
    """Show a histogram of all values in ``prediction``.

    The array is flattened to a single column and binned on fixed edges from
    -5 up to (but excluding) 5 in steps of 0.1.

    Args:
        prediction: numpy array of predicted values, any shape.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    plt.figure()
    bin_edges = np.arange(-5, 5, 0.1)
    flattened = prediction.reshape(prediction.size, 1)
    plt.hist(flattened, bin_edges)
    plt.title("Prediction distribution")
    plt.xlabel("Temperature error")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
from IPython.display import display

# Request driving the whole run: which data to use and which area to predict.
test_request = {
    # Variable whose forecast error is modelled; used to build the
    # '<predictant>_observed', '<predictant>_<model_name>' and
    # '<predictant>_error' column names.
    'predictant': 'TT2m',
    # Not read by the code in this notebook; presumably consumed by
    # fs_loader.load_dataset - verify against that module.
    'model_elements': ['TT2m', 'FF10m'],
    'forecast_hours': [18],
    'model_name': 'ModelMix',
    # Prefix matched against the string form of each row's valid_date.
    'valid_date': '20151201',
    # (top_lat, bot_lat, left_lon, right_lon), as unpacked by predict_area.
    'predict_area': (60, 33, -12, 20),
    # Step used by predict_area to derive the number of grid points per axis.
    'predict_resolution': 0.1
}

# Kernel over the two inputs used for fitting (latitude, longitude), plus a
# white-noise term.
kernel = GPy.kern.Matern52(2, ARD=True) + GPy.kern.White(2)
gp, dataset = fit_model_and_return_dataset(kernel, test_request)
# NOTE(review): despite its name, area_std receives the second value returned
# by predict_area, which that function calls `var`, and it is not used below.
area_prediction, area_std = predict_area(gp, test_request)
plot_prediction_distribution(area_prediction)
plot_area(dataset, test_request, area_prediction)