In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import seaborn as sns
# Remap matplotlib's one-letter color codes ("b", "g", "r", ...) to seaborn palette colors.
sns.set_color_codes()

In [2]:
import pandas as pd
import numpy as np

# Toy fixture: wind_speed doubles on every row (1, 2, 4, ..., 512) so that
# differences are easy to verify by eye; `other` is an unrelated odd ramp
# (101, 103, ..., 119) used to check that untouched columns survive.
data = pd.DataFrame(
    np.column_stack((2 ** np.arange(10), np.arange(101, 121, 2))),
    columns=['wind_speed', 'other'],
)

data

Unnamed: 0,wind_speed,other
0,1,101
1,2,103
2,4,105
3,8,107
4,16,109
5,32,111
6,64,113
7,128,115
8,256,117
9,512,119


In [3]:
def first_order_difference(data, columns):
    """Append first-order differences of the given columns.

    For every name in ``columns`` a new column ``<name>_d`` is added holding
    ``data[name].diff(periods=1)``. Rows containing any NaN are then dropped,
    which removes (at least) the first row, whose difference is undefined.

    The input frame is left untouched: all work happens on a copy, so the
    caller's ``data`` does not silently gain the ``_d`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    columns : iterable of str
        Names of the columns to difference.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra ``_d`` columns and NaN rows removed.
    """
    differenced = data.copy()
    for column in columns:
        differenced[column + '_d'] = differenced[column].diff(periods=1)

    # dropna() drops every row with a NaN in *any* column, not only the new ones.
    return differenced.dropna()

In [4]:
import pytest

diff = first_order_difference(data, ['wind_speed'])

# Differencing costs exactly one row (the first), so `diff` starts at the
# second row of `data`.
assert len(diff) == len(data) - 1
assert diff['wind_speed_d'].iloc[0] == 1
assert diff['wind_speed_d'].iloc[-1] == 256
assert diff['other'].iloc[0] == 103  # first row was dropped because of the diff

diff

Unnamed: 0,wind_speed,other,wind_speed_d
1,2,103,1.0
2,4,105,2.0
3,8,107,4.0
4,16,109,8.0
5,32,111,16.0
6,64,113,32.0
7,128,115,64.0
8,256,117,128.0
9,512,119,256.0


In [5]:
# Session-wide setting: turns off pandas' chained-assignment warning for every cell that follows.
pd.options.mode.chained_assignment = None  # default='warn'
def derive_prediction_columns(data, column, horizons):
    """Add one target column per forecast horizon.

    For each ``look_ahead`` in ``horizons`` a column
    ``prediction_<look_ahead>`` is added. Its value on a row is the change in
    ``column`` between that row and the row ``look_ahead`` steps later
    (``diff(periods=look_ahead)`` shifted back by ``look_ahead``). Rows
    containing any NaN are dropped, which removes the trailing rows that have
    no future value to compare against.

    The input frame is not modified; the new columns are written to a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    column : str
        Name of the column the targets are derived from.
    horizons : iterable of int
        Look-ahead distances, in rows.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the ``prediction_*`` columns and NaN rows removed.
    """
    targets = data.copy()

    for look_ahead in horizons:
        future_change = targets[column].diff(periods=look_ahead)
        # Shift back so the change is stored on the row it is predicted *from*.
        targets['prediction_' + str(look_ahead)] = future_change.shift(-look_ahead)

    return targets.dropna()

In [6]:
look_ahead = 3
horizons = [1, look_ahead]
features = derive_prediction_columns(diff, 'wind_speed', horizons)

# The last `look_ahead` rows have no future value to compare with and are dropped.
assert len(features) == len(diff) - look_ahead
assert features['wind_speed_d'].iloc[0] == 1
assert features['prediction_3'].iloc[0] == 14  # next three steps: 2 + 4 + 8 = 14
assert features['prediction_1'].iloc[0] == 2

features

Unnamed: 0,wind_speed,other,wind_speed_d,prediction_1,prediction_3
1,2,103,1.0,2.0,14.0
2,4,105,2.0,4.0,28.0
3,8,107,4.0,8.0,56.0
4,16,109,8.0,16.0,112.0
5,32,111,16.0,32.0,224.0
6,64,113,32.0,64.0,448.0


In [7]:
from sklearn import preprocessing

def scale_features(scaler, features):
    """Fit ``scaler`` on ``features`` and return the transformed frame.

    ``scaler`` is fitted in place (``fit`` then ``transform``), so it can be
    reused afterwards to invert the scaling. The result keeps the column
    names of ``features`` but gets a fresh default index.
    """
    scaler.fit(features)
    return pd.DataFrame(scaler.transform(features), columns=features.columns)

def inverse_prediction_scale(scaler, predictions, original_columns, column):
    """Undo the scaling of a single predicted column.

    ``scaler.inverse_transform`` is given a full-width matrix, so the
    predictions are placed in the slot that ``column`` occupies in
    ``original_columns`` (every other slot is zero), the matrix is inverted,
    and only that slot is returned.

    ``predictions`` must be an array with a ``shape``; it is flattened to
    length ``predictions.shape[0]``.
    """
    target = original_columns.get_loc(column)

    padded = np.zeros((len(predictions), len(original_columns)))
    padded[:, target] = np.reshape(predictions, (predictions.shape[0],))

    restored = scaler.inverse_transform(padded)
    return restored[:, target]

def invert_all_prediction_scaled(scaler, predictions, original_columns, horizons):
    """Undo the scaling of every horizon column of ``predictions``.

    Column ``i`` of ``predictions`` is treated as the scaled values of
    ``"prediction_<horizons[i]>"`` and is inverted with
    ``inverse_prediction_scale``. Returns a new array with the same shape as
    ``predictions``.
    """
    restored = np.zeros(predictions.shape)
    target_names = ["prediction_" + str(horizon) for horizon in horizons]

    for position, target_name in enumerate(target_names):
        restored[:, position] = inverse_prediction_scale(
            scaler, predictions[:, position], original_columns, target_name)

    return restored

In [8]:
# StandardScaler is used here; MinMaxScaler(feature_range=(-1, 1)) was the alternative tried.
scaler = preprocessing.StandardScaler()
scaled = scale_features(scaler, features)

# Scaling must not add or remove rows.
assert len(scaled) == len(features)

scaled.describe()

Unnamed: 0,wind_speed,other,wind_speed_d,prediction_1,prediction_3
count,6.0,6.0,6.0,6.0,6.0
mean,-3.700743e-17,-3.700743e-17,-3.700743e-17,-3.700743e-17,-3.700743e-17
std,1.095445,1.095445,1.095445,1.095445,1.095445
min,-0.8773385,-1.46385,-0.8773385,-0.8773385,-0.8773385
25%,-0.7388114,-0.7319251,-0.7388114,-0.7388114,-0.7388114
50%,-0.4155814,0.0,-0.4155814,-0.4155814,-0.4155814
75%,0.32323,0.7319251,0.32323,0.32323,0.32323
max,1.985556,1.46385,1.985556,1.985556,1.985556


In [9]:
# Round trip: scaling and then inverting a column should give back the
# original values (up to floating-point noise, hence the rint).
unscaled_predictions = inverse_prediction_scale(
    scaler,
    scaled['prediction_3'].values,
    scaled.columns,
    'prediction_3',
)

assert len(unscaled_predictions) == len(scaled)
assert (np.rint(unscaled_predictions) == features['prediction_3'].values).all()

unscaled_predictions

array([  14.,   28.,   56.,  112.,  224.,  448.])

In [10]:
def inverse_prediction_difference(predictions, original):
    """Turn predicted changes back into absolute values.

    Returns ``predictions + original``, i.e. the value each prediction was
    made from plus the predicted change.
    """
    return predictions + original


def invert_all_prediction_differences(predictions, original):
    """Apply ``inverse_prediction_difference`` to every column of ``predictions``.

    Parameters
    ----------
    predictions : 2-D array-like
        Predicted changes, one column per horizon.
    original : 1-D array-like
        Base values, one per row of ``predictions``; added to every column.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as ``predictions``. The input is
        not modified.
    """
    # Work on a float copy: the result must not overwrite the caller's array.
    inverted = np.array(predictions, dtype=float)

    # Iterate over the array's own columns instead of a notebook-level
    # `horizons` variable, so the function does not depend on global state.
    for col_idx in range(inverted.shape[1]):
        inverted[:, col_idx] = inverse_prediction_difference(inverted[:, col_idx], original)

    return inverted

In [11]:
# Adding the predicted 3-step change to the current wind speed should give
# the wind speed three rows ahead.
undiff_prediction = inverse_prediction_difference(unscaled_predictions, features['wind_speed'])

# One row is lost to differencing and `look_ahead` rows to the prediction shift.
assert len(undiff_prediction) == len(data) - look_ahead - 1
assert (np.rint(undiff_prediction) == [16, 32, 64, 128, 256, 512]).all()

undiff_prediction

1     16.0
2     32.0
3     64.0
4    128.0
5    256.0
6    512.0
Name: wind_speed, dtype: float64

In [12]:
def prepare_test_train(data, features, predictions, sequence_length, split_percent=0.9):
    """Build sliding-window train/test arrays from a feature frame.

    Every run of ``sequence_length`` consecutive rows becomes one sample.
    The input of a sample is the ``features`` columns of all rows in the
    window; its target is the ``predictions`` columns of the window's last
    row. The first ``round(split_percent * n_samples)`` samples form the
    training set, the rest the test set (no shuffling).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature and the prediction columns.
    features : list of str
        Names of the input columns.
    predictions : list of str
        Names of the target columns.
    sequence_length : int
        Number of consecutive rows per sample.
    split_percent : float, default 0.9
        Fraction of samples assigned to the training set.

    Returns
    -------
    X_train, y_train, X_test, y_test, row
        ``X_*`` have shape (n_samples, sequence_length, len(features)),
        ``y_*`` have shape (n_samples, len(predictions)), and ``row`` is the
        index of the first test sample.
    """
    num_features = len(features)
    num_predictions = len(predictions)

    # Targets go last so they can be sliced off the trailing axis below.
    ordered_columns = list(features) + list(predictions)
    values = data[ordered_columns].values

    print("Using {} predictor features ".format(num_features))

    n_windows = len(values) - sequence_length + 1
    windows = np.array(
        [values[start:start + sequence_length] for start in range(n_windows)])
    # windows: (n_samples, sequence_length, num_features + num_predictions)
    print("Shape of data: {}".format(np.shape(windows)))

    row = round(split_percent * windows.shape[0])
    train_part = windows[:row, :]
    test_part = windows[row:, :]

    # Inputs: every row of the window, feature columns only.
    # Targets: last row of the window, prediction columns only.
    X_train = train_part[:, :, :-num_predictions]
    y_train = train_part[:, -1, -num_predictions:]
    X_test = test_part[:, :, :-num_predictions]
    y_test = test_part[:, -1, -num_predictions:]

    print("Shape of X train: {}".format(np.shape(X_train)))
    print("Shape of y train: {}".format(np.shape(y_train)))
    print("Shape of X test: {}".format(np.shape(X_test)))

    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], num_features))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], num_features))

    y_train = y_train.reshape((y_train.shape[0], num_predictions))
    y_test = y_test.reshape((y_test.shape[0], num_predictions))

    return X_train, y_train, X_test, y_test, row

In [13]:
sequence_length = 2

X_train, y_train, X_test, y_test, row_split = prepare_test_train(
    scaled,
    ['wind_speed_d'],
    ['prediction_1', 'prediction_3'],
    sequence_length,
    split_percent=0.6,
)

# 6 rows with windows of 2 give 5 samples, split 3 / 2 at 60 %.
assert len(X_train) == len(y_train) == 3
assert len(X_test) == len(y_test) == 2

assert X_train.shape == (3, 2, 1)
assert y_train.shape == (3, 2)

Using 1 predictor features 
Shape of data: (5, 2, 3)
Shape of X train: (3, 2, 1)
Shape of y train: (3, 2)
Shape of X test: (2, 2, 1)


In [15]:

# Undo the scaling of the held-out targets (one column per horizon).
unscaled_predictions = invert_all_prediction_scaled(scaler, y_test, scaled.columns, horizons)

# Each sample's target belongs to the LAST row of its window, so test sample
# i corresponds to row (row_split + i + sequence_length - 1) of `features`.
sequence_offset = sequence_length - 1

# `.iloc` makes the positional slice explicit: `features` carries integer
# labels that do not start at 0, so a plain `[...]` slice is easy to misread.
undiff_prediction = invert_all_prediction_differences(
    unscaled_predictions,
    features['wind_speed'].iloc[sequence_offset + row_split:])

# Values went through a scale / inverse-scale round trip, so compare after
# rounding (as the earlier checks do) instead of with exact float equality.
assert all(np.rint(undiff_prediction[:, 0]) == [64, 128])
assert all(np.rint(undiff_prediction[:, 1]) == [256, 512])

undiff_prediction

array([[  64.,  256.],
       [ 128.,  512.]])