Imports

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import numpy as np
import pandas as pd
import time

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

## Baseline LSTM + room to change network architecture

In [109]:
def handle_wind_dir(data):
    '''
    Different ways to handle the string format:
     1. Drop it
     2. LabelEncode it
     3. One-hot encode it
     
    The reason I need to handle it is because Keras' pad_sequences function takes int() of
    all the columns.
    '''
    # I'll drop it first
    data = data.drop('wnd_dir', axis=1, inplace=False)
    return data

In [110]:
import warnings; warnings.filterwarnings('ignore')

In [111]:
df = pd.read_csv('../data/pollution.csv', header=0, index_col=0)

In [112]:
# Optionally drop wind dir
df = handle_wind_dir(df)

In [113]:
train, test = train_test_split(df, test_size=.90, random_state=789)

In [114]:
len(train)

4380

In [115]:
len(test)

39420

In [116]:
def prepare_sequences(df):
    ''' 
    Let df = train and prepare sequences.
    '''
    input_cols = list(df.columns)
    df['single_input_vector'] = df[input_cols].apply(tuple, axis=1).apply(list)\
        .apply(lambda x: [list(x)])
    df['single_output_vector'] = df['pollution'].apply(lambda x: [[x]])
    return df.reset_index(drop=True)

In [117]:
train = prepare_sequences(train)

In [118]:
train.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain,single_input_vector,single_output_vector
0,234.0,-21,-10.0,1032.0,4.47,0,0,"[[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]]",[[234.0]]
1,19.0,13,14.0,1024.0,38.0,0,11,"[[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0]]",[[19.0]]
2,313.0,-10,1.0,1033.0,3.56,0,0,"[[313.0, -10.0, 1.0, 1033.0, 3.56, 0.0, 0.0]]",[[313.0]]
3,258.0,-17,-5.0,1022.0,1.79,0,0,"[[258.0, -17.0, -5.0, 1022.0, 1.79, 0.0, 0.0]]",[[258.0]]
4,45.0,-5,14.0,1011.0,4.02,0,0,"[[45.0, -5.0, 14.0, 1011.0, 4.02, 0.0, 0.0]]",[[45.0]]


In [142]:
def func(x, maxlen=None):
    '''
    Pad sequences with lists of 0s. Functional lambda programming.
    '''
    zeros_to_add = maxlen - len(x)
    prepended = [np.zeros(7).tolist()]
    y = prepended*zeros_to_add + x
    return y

def func_output(x, maxlen=None):
    '''
    Pad sequences with lists of 0s. Functional lambda programming.
    '''
    zeros_to_add = maxlen - len(x)
    prepended = [np.zeros(1).tolist()]
    y = prepended*zeros_to_add + x
    return y

def balanced_sliding_windows(df, stride=3):
    '''
    Create sliding windows of size [n - 3, n + 3]. Let df=train.
    '''
    seqs = []
    for i, value in df.iterrows():
        if i >= stride and i < len(df) - stride:
            sequences = df.iloc[i-stride:i+stride, -1].sum()
        elif i < stride:
            sequences = df.iloc[i:i+stride, -1].sum()
        else:
            sequences = df.iloc[i-stride:i, -1].sum()
        seqs.append(sequences)
    return pd.Series(seqs)
        
def past_windows(df, stride=3):
    '''
    Create windows of size [n - 3: n]. Let df=train.
    '''
    seqs = []
    for i, value in df.iterrows():
        if i >= stride and i < len(df) - stride:
            sequences = df.iloc[i-stride:i, -1].sum()
        elif i < stride:
            sequences = df.iloc[0:i, -1].sum()
        else:
            sequences = df.iloc[i-stride:i, -1].sum()
        if sequences == 0:
            # Pad empty array with arrays of 0s
            sequences = func([], stride)
        if len(sequences) < stride:
            # Pad with arrays of 0s
            sequences = func(sequences, stride)
        seqs.append(sequences)
    return pd.Series(seqs)

def future_windows(df, stride=3):
    '''
    Create windows of size [n : n+3]. Let df=train.
    '''
    seqs = []
    for i, value in df.iterrows():
        if i >= stride and i < len(df) - stride:
            sequences = df.iloc[i:i+stride, -1].sum()
        elif i < stride:
            sequences = df.iloc[i:i+stride, -1].sum()
        else:
            sequences = df.iloc[i:, -1].sum()
        if type(sequences) == 'int' and sequences == 0:
            sequences = func_output([], stride)
        if len(sequences) < stride:
            # Pad with arrays of 0s
            sequences = func_output(sequences, stride)
        seqs.append(sequences)
    return pd.Series(seqs)

In [143]:
input_vec = train['single_input_vector']
input_vec = pd.DataFrame(input_vec)

In [144]:
output_vec = train['single_output_vector']
output_vec = pd.DataFrame(output_vec)

In [145]:
# Good so far.

In [146]:
balanced_sliding_windows(input_vec).head()

0    [[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]...
1    [[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0], ...
2    [[313.0, -10.0, 1.0, 1033.0, 3.56, 0.0, 0.0], ...
3    [[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]...
4    [[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0], ...
dtype: object

In [147]:
past_windows(input_vec).head()

0    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....
1    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....
2    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [234.0, ...
3    [[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]...
4    [[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0], ...
dtype: object

In [148]:
future_windows(output_vec).head()

0    [[234.0], [19.0], [313.0]]
1    [[19.0], [313.0], [258.0]]
2    [[313.0], [258.0], [45.0]]
3     [[258.0], [45.0], [25.0]]
4      [[45.0], [25.0], [76.0]]
dtype: object

In [149]:
df.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain
24,129.0,-16,-4.0,1020.0,1.79,0,0
25,148.0,-15,-4.0,1020.0,2.68,0,0
26,159.0,-11,-5.0,1021.0,3.57,0,0
27,181.0,-7,-5.0,1022.0,5.36,1,0
28,138.0,-7,-5.0,1022.0,6.25,2,0


In [150]:
# Good so far

In [151]:
train['past_sequences'] = past_windows(input_vec)
train['future_sequences'] = future_windows(output_vec)

In [152]:
train.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain,single_input_vector,single_output_vector,past_sequences,future_sequences,output_vector
0,234.0,-21,-10.0,1032.0,4.47,0,0,"[[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]]",[[234.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[234.0], [19.0], [313.0]]","[[234.0], [19.0], [313.0]]"
1,19.0,13,14.0,1024.0,38.0,0,11,"[[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0]]",[[19.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[19.0], [313.0], [258.0]]","[[19.0], [313.0], [258.0]]"
2,313.0,-10,1.0,1033.0,3.56,0,0,"[[313.0, -10.0, 1.0, 1033.0, 3.56, 0.0, 0.0]]",[[313.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [234.0, ...","[[313.0], [258.0], [45.0]]","[[313.0], [258.0], [45.0]]"
3,258.0,-17,-5.0,1022.0,1.79,0,0,"[[258.0, -17.0, -5.0, 1022.0, 1.79, 0.0, 0.0]]",[[258.0]],"[[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]...","[[258.0], [45.0], [25.0]]","[[258.0], [45.0], [25.0]]"
4,45.0,-5,14.0,1011.0,4.02,0,0,"[[45.0, -5.0, 14.0, 1011.0, 4.02, 0.0, 0.0]]",[[45.0]],"[[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0], ...","[[45.0], [25.0], [76.0]]","[[45.0], [25.0], [76.0]]"


In [153]:
# predicting G for now, just a test example
# If your output is multi-dimensional, you need to capture those 
# dimensions in one object
# If your output is a single dimension, this step may be unnecessary
def set_output_featureset(df):
    '''
    Let df=train and prepare output vector
    '''
    df['output_vector'] = df['future_sequences']
    return df

In [154]:
train = set_output_featureset(train)

In [155]:
train.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain,single_input_vector,single_output_vector,past_sequences,future_sequences,output_vector
0,234.0,-21,-10.0,1032.0,4.47,0,0,"[[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]]",[[234.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[234.0], [19.0], [313.0]]","[[234.0], [19.0], [313.0]]"
1,19.0,13,14.0,1024.0,38.0,0,11,"[[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0]]",[[19.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[19.0], [313.0], [258.0]]","[[19.0], [313.0], [258.0]]"
2,313.0,-10,1.0,1033.0,3.56,0,0,"[[313.0, -10.0, 1.0, 1033.0, 3.56, 0.0, 0.0]]",[[313.0]],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [234.0, ...","[[313.0], [258.0], [45.0]]","[[313.0], [258.0], [45.0]]"
3,258.0,-17,-5.0,1022.0,1.79,0,0,"[[258.0, -17.0, -5.0, 1022.0, 1.79, 0.0, 0.0]]",[[258.0]],"[[234.0, -21.0, -10.0, 1032.0, 4.47, 0.0, 0.0]...","[[258.0], [45.0], [25.0]]","[[258.0], [45.0], [25.0]]"
4,45.0,-5,14.0,1011.0,4.02,0,0,"[[45.0, -5.0, 14.0, 1011.0, 4.02, 0.0, 0.0]]",[[45.0]],"[[19.0, 13.0, 14.0, 1024.0, 38.0, 0.0, 11.0], ...","[[45.0], [25.0], [76.0]]","[[45.0], [25.0], [76.0]]"


In [156]:
# Extract your training data
X_train_init = np.asarray(train.past_sequences)
stride_length = 3
# Training data for LSTM should be in the form of a 3D tuple:
#   (# of samples, timesteps, input_dim)
# Note that the input data that comes out of the dataframe 
# will not make a 3D array. It makes an array of arrays, 
# which is not the same thing.
# So far X_train_init is an array of arrays.

# Convert to 3D vector usng hstack and reshape
# horizontal stack = hstack, essentially removes the outer array 
#  encapsulation.

# reshape into (# of records, total_timesteps, input_dim)
# The reshape essentially reshaped the inner list into an 11 by 6 matrix,
#  or a max_sequence_length (rows) by input_dim (col) matrix
X_train = np.hstack(X_train_init).reshape(len(train), 
                                          stride_length,
                                         len(df.columns))
y_train_init = np.asarray(train.output_vector)
y_train = np.hstack(y_train_init).reshape(len(train), 
                                          stride_length,
                                          len(output_vec.columns))

In [158]:
#np.hstack(y_train_init)

In [159]:
len(train), len(df.columns)

(4380, 7)

In [161]:
# Debugging
#train[train.output_vector.apply(lambda x: True if len(x) != 3 else False)]

In [162]:
print(X_train.shape)
print(y_train.shape)

(4380, 3, 7)
(4380, 3, 1)


In [163]:
# Get your input dimensions
# Input length is the length for one input sequence 
#  (i.e. the number of rows for your sample, which is
#     the max_sequence_length by construction)
input_length = X_train.shape[1]
# Input dim is the number of dimensions in one input vector 
#  (i.e. number of input columns)
input_dim = X_train.shape[2]
output_dim = len(y_train[0])

In [164]:
output_dim

3

In [199]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Dropout

# Initialize the model
model = Sequential()

# arbitrarily picked the output dim to be 100
model.add(LSTM(100, input_shape=(input_length, input_dim), return_sequences=True))
# The max output value is > 1 so used relu as final activation
# 50 is the output dimension in the layer
# Input to this layer will have a shape (None, 3, 50) where 3 = stride_length, None = batch_size,
#   and 50 = output layer size

# Add dropout probability
model.add(Dropout(0.2))
# Another LSTM layer
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.15))
# Output dimension of 7, since I have a feature length of 7 (pollution + weather stuff)
model.add(Dense(7, activation='softmax'))
# Output dimension of 1, since I have a feature length of 1 (pollution is only output feature)
model.add(TimeDistributed(Dense(1, activation='linear')))
#model.add(Dense(output_dim, activation='relu'))

# model.compile(loss='mean_squared_error',
#              optimizer='rmsprop',
#              metrics=['accuracy'])

In [200]:
model.compile(loss='mse',
             optimizer='adam',
             metrics=['accuracy'])

In [204]:
# Set batch_size to 30 to show that it doesn't have to be a factor 
# or multiple of your sample size
history = model.fit(X_train, y_train,
                   batch_size = 20, epochs=30,
                   verbose = 1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [205]:
model.predict(X_train)

array([[[ 14.11614132],
        [ 14.1161499 ],
        [ 14.1161499 ]],

       [[ 14.11614799],
        [ 14.1161499 ],
        [ 14.1161499 ]],

       [[ 14.11614799],
        [ 14.1161499 ],
        [ 14.1161499 ]],

       ..., 
       [[ 14.11614799],
        [ 14.1161499 ],
        [ 14.1161499 ]],

       [[ 14.11614799],
        [ 14.1161499 ],
        [ 14.1161499 ]],

       [[ 14.11614799],
        [ 14.1161499 ],
        [ 14.1161499 ]]], dtype=float32)

In [206]:
y_train

array([[[ 234.],
        [  19.],
        [ 313.]],

       [[ 258.],
        [  45.],
        [  25.]],

       [[  76.],
        [ 403.],
        [ 150.]],

       ..., 
       [[  53.],
        [ 259.],
        [  93.]],

       [[  85.],
        [ 118.],
        [  11.]],

       [[ 126.],
        [ 126.],
        [ 126.]]])