# LSTM Training

The purposes of this is to explore optimization for the model

In [9]:
import sys
import numpy as np
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import keras
from datetime import datetime
import warnings

This is a modified version of the model used to more easily use different combonations 

In [10]:
def get_index_of_date(df, date):
    # print(date)
    x = df.index[df['DATE'] == str(date).split(' ')[0]].tolist()
    if len(x) == 0:
        print("Date: " + str(date) + " not found in dataset")
        exit(1)

    return x[0]


def train_model(file_location,
                train_start_date=datetime(year=2018, month=8, day=1),
                train_end_date=datetime(year=2019, month=7, day=30),
                test_start_date=datetime(year=2019, month=8, day=1),
                test_end_date=datetime(year=2019, month=12, day=31),
                cols_to_use=None,
                verbose=1
                ):
    # load dataset
    if cols_to_use is None:
        cols_to_use = ['int_time', 'int_date', 'int_day']

    cols_to_use.insert(0, 'AVAILABLE BIKES')
    cols_to_use.insert(0, 'TIME')
    dataset = read_csv(file_location, usecols=cols_to_use)
    dataset['DATE'] = dataset['TIME'].apply(lambda x: x.split(' ')[0])

    if 'rain' in cols_to_use:
        dataset = dataset[dataset['rain'].str.strip().astype(bool)]

    train_start_index = (get_index_of_date(dataset, train_start_date))
    train_end_index = (get_index_of_date(dataset, train_end_date))
    # print( train_end_index - train_start_index)

    test_start_index = (get_index_of_date(dataset, test_start_date))
    test_end_index = (get_index_of_date(dataset, test_end_date))
    # print(test_end_index - test_start_index)

    dataset = dataset.drop(['TIME', 'DATE'], axis=1)
    # print(dataset.head())
    # exit(1)
    # print(dataset)
    values = dataset.values
    # print(values.shape)

    # ensure all data is float
    values = values.astype('float32')
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    # print(values.shape)
    scaled = scaler.fit_transform(values)
    # frame as supervised learning
    reframed = scaled

    # print(scaled)

    # split into train and test sets
    # values = reframed.values

    train = scaled[train_start_index:train_end_index, :]
    test = scaled[test_start_index:test_end_index, :]
    # train = values[train_start:train_end, :]
    # test = values[test_start:test_end, :]

    # split into input and outputs
    train_X, train_y = train[:, 1:], train[:, 0]
    test_X, test_y = test[:, 1:], test[:, 0]
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    # print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    # design network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # fit network
    history = model.fit(train_X, train_y,
                        epochs=150,
                        batch_size=72,
                        validation_data=(test_X, test_y),
                        verbose=verbose,
                        shuffle=False)
    # plot history
    # pyplot.plot(history.history['loss'], label='train')
    # pyplot.plot(history.history['val_loss'], label='test')
    # pyplot.legend()
    # pyplot.show()

    # make a prediction
    yhat = model.predict(test_X)
    # print(test_X)
    # print(yhat)
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    # invert scaling for forecast

    inv_yhat = concatenate((yhat, test_X), axis=1)
    # print(yhat.shape)
    # print(test_X.shape)
    # print(inv_yhat.shape)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:, 0]
    # invert scaling for actual
    test_y = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((test_y, test_X), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:, 0]
    # calculate RMSE

    # np.set_printoptions(threshold=sys.maxsize)
    # temp = concatenate((inv_y, inv_yhat))
    # print(temp)
    # print(inv_y)
    # print(inv_yhat)

    # print()
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    mae = mean_absolute_error(inv_y, inv_yhat)
    mse = mean_squared_error(inv_y, inv_yhat)
    r2 = r2_score(inv_y, inv_yhat)
    print('Test MAE: %.3f' % mae)
    print('Test MSE: %.3f' % mse)
    print('Test RMSE: %.3f' % rmse)
    print('Test R2: %.30f' % r2)

In [35]:
attr_list = [
    'int_time',
    'int_date',
    'int_day',
    'rain',
    'temp',
    'rhum'
]

def powerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]
    

y = list(powerset(attr_list))
y.sort
print(len(y))
y = sorted(y, key=len)
y.pop(0)
for x in y:
    print(x)
    warnings.filterwarnings("ignore")
    train_model('./datasets/bss/dublin/reorg_plus_weather/station_2.csv',
            cols_to_use=['int_time', 'int_date', 'int_day', 'rain', 'temp', 'rhum'],
            verbose=0)
    print()

64
['int_time']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.956
Test MSE: 16.119
Test RMSE: 4.015
Test R2: 0.461436363330583465547363175574

['int_date']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.977
Test MSE: 16.747
Test RMSE: 4.092
Test R2: 0.440452666545042337276072430541

['int_day']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.888
Test MSE: 15.754
Test RMSE: 3.969
Test R2: 0.473625874222083353970447205938

['rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.979
Test MSE: 16.618
Test RMSE: 4.077
Test R2: 0.444772538901628267105081704358

['temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.936
Test MSE: 16.151
Test RMSE: 4.019
Test R2: 0.460364941961777462964278129220

['rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.960
Test MSE: 16.439
Test RMSE: 4.054
Test R2: 0.450770125030653612263620289013

['int_time', 'int_date']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.954
Test MSE: 16.433
Test RMSE: 4.054
Test R2: 0.450969800989587055184415476106

['int_time', 'int_day']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.937
Test MSE: 16.292
Test RMSE: 4.036
Test R2: 0.455659787760449908056159529224

['int_date', 'int_day']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.961
Test MSE: 16.540
Test RMSE: 4.067
Test R2: 0.447377839830072843163577545056

['int_time', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.940
Test MSE: 16.298
Test RMSE: 4.037
Test R2: 0.455481797334631122176062945073

['int_date', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.949
Test MSE: 16.306
Test RMSE: 4.038
Test R2: 0.455189342059414747687640101503

['int_day', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.943
Test MSE: 16.358
Test RMSE: 4.044
Test R2: 0.453476666730420752138286388799

['int_time', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.925
Test MSE: 15.968
Test RMSE: 3.996
Test R2: 0.466483028562342694200992809783

['int_date', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.972
Test MSE: 16.687
Test RMSE: 4.085
Test R2: 0.442480704997951890256047136063

['int_day', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.990
Test MSE: 16.447
Test RMSE: 4.055
Test R2: 0.450498672169117764241264012526

['rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.953
Test MSE: 16.254
Test RMSE: 4.032
Test R2: 0.456941029108661300739413491101

['int_time', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.959
Test MSE: 16.431
Test RMSE: 4.053
Test R2: 0.451027937956749225278940684802

['int_date', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.949
Test MSE: 16.336
Test RMSE: 4.042
Test R2: 0.454201089987863793062672357337

['int_day', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.933
Test MSE: 16.106
Test RMSE: 4.013
Test R2: 0.461888436157849691099386291171

['rain', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.900
Test MSE: 15.894
Test RMSE: 3.987
Test R2: 0.468970101456314791832369337499

['temp', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.897
Test MSE: 15.817
Test RMSE: 3.977
Test R2: 0.471531407875899533088670523284

['int_time', 'int_date', 'int_day']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.945
Test MSE: 16.574
Test RMSE: 4.071
Test R2: 0.446254560702920688441963648074

['int_time', 'int_date', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.895
Test MSE: 15.959
Test RMSE: 3.995
Test R2: 0.466783072923257047470713132498

['int_time', 'int_day', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.942
Test MSE: 16.187
Test RMSE: 4.023
Test R2: 0.459186487802842546734893858229

['int_date', 'int_day', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.965
Test MSE: 16.367
Test RMSE: 4.046
Test R2: 0.453152070305160581398240537965

['int_time', 'int_date', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.982
Test MSE: 16.639
Test RMSE: 4.079
Test R2: 0.444080146404725484288178449788

['int_time', 'int_day', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.975
Test MSE: 16.471
Test RMSE: 4.058
Test R2: 0.449693054908651079770720571105

['int_date', 'int_day', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.953
Test MSE: 16.360
Test RMSE: 4.045
Test R2: 0.453395219551487760334396170947

['int_time', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.962
Test MSE: 16.338
Test RMSE: 4.042
Test R2: 0.454119741996882875234575749346

['int_date', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.970
Test MSE: 16.590
Test RMSE: 4.073
Test R2: 0.445714532216439640599503491103

['int_day', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.996
Test MSE: 16.954
Test RMSE: 4.117
Test R2: 0.433560717611727430842449848569

['int_time', 'int_date', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.968
Test MSE: 16.486
Test RMSE: 4.060
Test R2: 0.449188525670662142097455671319

['int_time', 'int_day', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 3.042
Test MSE: 16.775
Test RMSE: 4.096
Test R2: 0.439512168639376588075151630619

['int_date', 'int_day', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.944
Test MSE: 16.291
Test RMSE: 4.036
Test R2: 0.455692862151859667818598609301

['int_time', 'rain', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.910
Test MSE: 15.941
Test RMSE: 3.993
Test R2: 0.467390564028015154107720263710

['int_date', 'rain', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.958
Test MSE: 16.180
Test RMSE: 4.022
Test R2: 0.459407223196636849138485558797

['int_day', 'rain', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 3.002
Test MSE: 16.810
Test RMSE: 4.100
Test R2: 0.438360935673108298438194196933

['int_time', 'temp', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.938
Test MSE: 16.238
Test RMSE: 4.030
Test R2: 0.457476212182770902714423755242

['int_date', 'temp', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.951
Test MSE: 16.259
Test RMSE: 4.032
Test R2: 0.456759937629649059864789251151

['int_day', 'temp', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.926
Test MSE: 16.157
Test RMSE: 4.020
Test R2: 0.460175326135502715807490403677

['rain', 'temp', 'rhum']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.931
Test MSE: 15.963
Test RMSE: 3.995
Test R2: 0.466657671173964505584308426478

['int_time', 'int_date', 'int_day', 'rain']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.964
Test MSE: 16.462
Test RMSE: 4.057
Test R2: 0.449985773618533202444780272344

['int_time', 'int_date', 'int_day', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.939
Test MSE: 16.277
Test RMSE: 4.034
Test R2: 0.456163047934097121149932263506

['int_time', 'int_date', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.980
Test MSE: 16.300
Test RMSE: 4.037
Test R2: 0.455414431415739118236274407536

['int_time', 'int_day', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


Test MAE: 2.940
Test MSE: 16.431
Test RMSE: 4.053
Test R2: 0.451038070785488631742055076757

['int_date', 'int_day', 'rain', 'temp']


  if (await self.run_code(code, result,  async_=asy)):


KeyboardInterrupt: 

In [15]:
attr_list = [
    'int_time',
    'int_date',
    'int_day',
    'rain',
    'temp',
    'rhum'
]

def powerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]
    

y = list(powerset(attr_list))
y.sort
print(len(y))
y = sorted(y, key=len)
y.pop(0)
pos = 44

64


In [16]:
for x in y[pos:]:
    print(str(pos) + "/" + str(len(y)))
    pos = pos+1
    print(x)
    warnings.filterwarnings("ignore")
    train_model('./datasets/bss/dublin/reorg_plus_weather/station_2.csv',
            cols_to_use=['int_time', 'int_date', 'int_day', 'rain', 'temp', 'rhum'],
            verbose=0)
    print()
    keras.backend.clear_session()

44/63
['int_time', 'int_day', 'rain', 'temp']
Test MAE: 2.977
Test MSE: 16.276
Test RMSE: 4.034
Test R2: 0.456193071111572923115318189957

45/63
['int_date', 'int_day', 'rain', 'temp']
Test MAE: 2.929
Test MSE: 16.170
Test RMSE: 4.021
Test R2: 0.459755027314544539862595229351

46/63
['int_time', 'int_date', 'int_day', 'rhum']
Test MAE: 2.928
Test MSE: 15.988
Test RMSE: 3.999
Test R2: 0.465811652748963633818846119539

47/63
['int_time', 'int_date', 'rain', 'rhum']
Test MAE: 2.946
Test MSE: 16.417
Test RMSE: 4.052
Test R2: 0.451489542252458186943897544552

48/63
['int_time', 'int_day', 'rain', 'rhum']


KeyboardInterrupt: 

In [11]:
def test_powersets(start_position=0, file='./datasets/bss/dublin/reorg_plus_weather/station_2.csv'):
    attr_list = [
        'int_time',
        'int_date',
        'int_day',
        'rain',
        'temp',
        'rhum'
    ]

    def powerset(s):
        x = len(s)
        masks = [1 << i for i in range(x)]
        for i in range(1 << x):
            yield [ss for mask, ss in zip(masks, s) if i & mask]


    y = list(powerset(attr_list))
    y.sort
    print(len(y))
    y = sorted(y, key=len)
    y.pop(0)
    
    for x in y[start_position:]:
        print(str(start_position) + "/" + str(len(y)-1))
        start_position = start_position+1
        print(x)
        warnings.filterwarnings("ignore")
        train_model(file,
                cols_to_use=['int_time', 'int_date', 'int_day', 'rain', 'temp', 'rhum'],
                verbose=0)
        print()
        keras.backend.clear_session()

In [4]:
test_powersets(48)

64
48/63
['int_time', 'int_day', 'rain', 'rhum']
Test MAE: 2.925
Test MSE: 16.265
Test RMSE: 4.033
Test R2: 0.456575120019261482262606932636

49/63
['int_date', 'int_day', 'rain', 'rhum']
Test MAE: 2.907
Test MSE: 15.936
Test RMSE: 3.992
Test R2: 0.467552093524261080936810230924

50/63
['int_time', 'int_date', 'temp', 'rhum']
Test MAE: 2.958
Test MSE: 16.468
Test RMSE: 4.058
Test R2: 0.449787258803440725962730084575

51/63
['int_time', 'int_day', 'temp', 'rhum']
Test MAE: 2.934
Test MSE: 16.257
Test RMSE: 4.032
Test R2: 0.456850463570347642416891176254

52/63
['int_date', 'int_day', 'temp', 'rhum']
Test MAE: 2.945
Test MSE: 16.495
Test RMSE: 4.061
Test R2: 0.448869306189549122088067178993

53/63
['int_time', 'rain', 'temp', 'rhum']
Test MAE: 2.949
Test MSE: 16.087
Test RMSE: 4.011
Test R2: 0.462513652657874030360574124643

54/63
['int_date', 'rain', 'temp', 'rhum']
Test MAE: 2.949
Test MSE: 16.166
Test RMSE: 4.021
Test R2: 0.459884351803481861153954923793

55/63
['int_day', 'rain', 'te

In [12]:
test_powersets()

64
0/62
['int_time']
Test MAE: 3.238
Test MSE: 21.247
Test RMSE: 4.609
Test R2: 0.327476160993642739782671924331

1/62
['int_date']
Test MAE: 3.234
Test MSE: 21.833
Test RMSE: 4.673
Test R2: 0.308922137849465539005677783280

2/62
['int_day']
Test MAE: 3.213
Test MSE: 20.596
Test RMSE: 4.538
Test R2: 0.348098627917264979991784912272

3/62
['rain']
Test MAE: 3.297
Test MSE: 22.453
Test RMSE: 4.738
Test R2: 0.289317018164757855380742057605

4/62
['temp']
Test MAE: 3.214
Test MSE: 20.982
Test RMSE: 4.581
Test R2: 0.335860247806347600452170354401

5/62
['rhum']
Test MAE: 3.101
Test MSE: 19.895
Test RMSE: 4.460
Test R2: 0.370284627671228494527611019294

6/62
['int_time', 'int_date']
Test MAE: 3.208
Test MSE: 20.828
Test RMSE: 4.564
Test R2: 0.340743717285184932741515240195

7/62
['int_time', 'int_day']
Test MAE: 3.241
Test MSE: 21.239
Test RMSE: 4.609
Test R2: 0.327749051155038495508620144392

8/62
['int_date', 'int_day']
Test MAE: 3.182
Test MSE: 20.485
Test RMSE: 4.526
Test R2: 0.351601185