In [2]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

In [3]:
def min_max(x, axis=None):
    """Rescale ``x`` linearly into the [0, 1] range (min-max normalization).

    Parameters
    ----------
    x : array_like
        Values to normalize.
    axis : int or tuple of int, optional
        Axis or axes along which the minimum and maximum are taken.
        ``None`` (the default) uses the minimum and maximum of the whole array.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``. Slices whose values are all equal
        (max == min) are mapped to 0 instead of NaN.
    """
    x = np.asarray(x)
    lower = x.min(axis=axis, keepdims=True)
    upper = x.max(axis=axis, keepdims=True)
    span = upper - lower
    # A constant slice has zero span; dividing by 1 there yields 0 for every
    # element and avoids the 0/0 -> NaN that would otherwise leak downstream.
    safe_span = np.where(span == 0, 1, span)
    return (x - lower) / safe_span

def create_dataset(dataset, steps_of_history, steps_in_future):
    """Slice a 1-D series into (history window, future target) training pairs.

    Sample ``k`` uses ``dataset[k : k + steps_of_history]`` as input and the
    value ``steps_in_future`` positions after the last element of that window
    as target, so the final sample's target is the last element of ``dataset``.

    Parameters
    ----------
    dataset : sequence
        Ordered observations (indexable and sliceable).
    steps_of_history : int
        Number of consecutive observations in each input window.
    steps_in_future : int
        Distance from the end of the window to the target observation.

    Returns
    -------
    X : numpy.ndarray
        Input windows, shape ``(n_samples, steps_of_history, 1)``.
    Y : numpy.ndarray
        Targets, shape ``(n_samples, 1)``.
    """
    n_samples = (len(dataset) + 1) - steps_of_history - steps_in_future
    # Offset from a window's start index to its target index.
    target_offset = steps_of_history + steps_in_future - 1

    starts = range(0, n_samples)
    windows = [dataset[start:start + steps_of_history] for start in starts]
    targets = [dataset[start + target_offset] for start in starts]

    X = np.reshape(np.array(windows), (-1, steps_of_history, 1))
    Y = np.reshape(np.array(targets), (-1, 1))
    return X, Y

def MAPE(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Entries whose true value is zero are left out, since the relative error is
    undefined there.

    Parameters
    ----------
    y_true : array_like
        Ground-truth values.
    y_pred : array_like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` over the non-zero targets, or
        NaN when every target is zero.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # An element-wise mask (rather than first-axis indices) keeps zero targets
    # out of the division for inputs of any dimensionality.
    non_zero = y_true != 0
    if not non_zero.any():
        return np.nan
    y_true = y_true[non_zero]
    y_pred = y_pred[non_zero]
    return np.mean(np.abs((y_true - y_pred) / y_true))

def direct_accuracy(y_true, y_prev_true, y_pred):
    """Fraction of samples whose predicted direction of change matches the actual one.

    Both the actual and the predicted change are measured relative to
    ``y_prev_true``. A sample counts as a hit when the two changes have the
    same sign or when either of them is zero (their product is >= 0).

    Parameters
    ----------
    y_true : array_like
        Observed values at the forecast point.
    y_prev_true : array_like
        Reference values the changes are measured from.
    y_pred : array_like
        Predicted values at the forecast point.

    Returns
    -------
    float
        Share of hits, between 0 and 1.
    """
    actual_move = y_true - y_prev_true
    predicted_move = y_pred - y_prev_true
    same_direction = (actual_move * predicted_move) >= 0
    return np.mean(same_direction)

In [4]:
# Read the per-label application counts, rename the "count" column to
# "submit" (presumably so attribute access does not collide with
# DataFrame.count -- TODO confirm), and parse the date strings into timestamps.
data = (
    pd.read_csv(
        "../data/application_count_from_1960_to_2015_for_LSTM.csv",
        dtype={"label": "int", "date": "str", "count": "int"},
    )
    .rename(columns={"count": "submit"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)
data.head()

Unnamed: 0,label,date,submit
0,1,1960-01-31,0
1,1,1960-02-29,0
2,1,1960-03-31,0
3,1,1960-04-30,0
4,1,1960-05-31,0


In [None]:
# set parameters
in_out_neurons = 1  # features per time step and number of model outputs
steps_of_history = 12  # window size: past observations in each input sample
steps_in_future = 12  # forecast point (month)
# The model's input length must equal the window length used to build the
# samples, so it is derived here instead of repeating the literal.
length_of_sequence = steps_of_history
batch_size = 10

# Scale the learning rate by 0.1 after 3 epochs without val_loss improvement,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.00001)
# Stop training after 10 epochs without val_loss improvement.
early_stopping = EarlyStopping(monitor='val_loss', mode='auto', patience=10)

In [None]:
# Train one LSTM per label and collect, for each label, the forecast for the
# last test window, the matching observed value, and the last training target
# (used as the reference point for the direction-of-change metric).
predict_list = []
real_list = []
prev_real_list = []
for label, cluster_data in data.groupby("label"):
    # Keep 1985-01 .. 2006-12 and rescale the counts with min_max.
    # .assign() builds a new frame, so no column is written onto a filtered
    # slice of the groupby result (which raises SettingWithCopyWarning).
    # NOTE(review): the scaling range is computed over the train AND test
    # months, so the held-out period influences normalization -- confirm
    # this is intended.
    in_period = (cluster_data.date >= '1985-01-01') & (cluster_data.date < '2007-01-01')
    cluster_data = cluster_data.loc[in_period].assign(
        submit=lambda frame: min_max(frame.submit.values)
    )
    # cluster_data already starts at 1985-01, so only the upper bound is needed.
    train = cluster_data.loc[cluster_data.date < '2006-01-01']
    # The test frame overlaps the end of the training period so that history
    # windows are available for its targets.
    # NOTE(review): assuming one row per month, the targets fall in 2006 -- confirm.
    test = cluster_data.loc[(cluster_data.date >= '2004-02-01') & (cluster_data.date < '2007-01-01')]

    train_X, train_y = create_dataset(train.submit.values, steps_of_history, steps_in_future)
    test_X, test_y = create_dataset(test.submit.values, steps_of_history, steps_in_future)

    # build LSTM
    # The input length is tied to steps_of_history because that is the window
    # length create_dataset used to build train_X / test_X.
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=(steps_of_history, in_out_neurons)))
    model.add(Dropout(0.2))
    model.add(Dense(in_out_neurons, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(
        train_X, train_y,
        batch_size=batch_size,
        epochs=100,
        validation_split=0.3,
        callbacks=[reduce_lr, early_stopping],
        verbose=0,
    )

    # Only the last test window is evaluated.
    predict = model.predict(test_X)[-1]
    real = test_y[-1]
    # Last training target, i.e. the final observation in the training frame.
    prev_real = train_y[-1]

    predict_list.append(predict)
    real_list.append(real)
    prev_real_list.append(prev_real)

# Stack the per-label results and flatten each to a 1-D array with one entry
# per label.
real_list = np.array(real_list).reshape(len(real_list))
prev_real_list = np.array(prev_real_list).reshape(len(prev_real_list))
predict_list = np.array(predict_list).reshape(len(predict_list))

In [None]:
# Report both evaluation metrics over the per-label forecasts.
mape_score = MAPE(real_list, predict_list)
direction_score = direct_accuracy(real_list, prev_real_list, predict_list)
print("MAPE: ", mape_score)
print("Direct accuracy: ", direction_score)