# LSTM Model generation

In [24]:
import sys
import numpy as np
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import os
from datetime import datetime
from tqdm import tqdm

In [25]:
def get_index_of_date(df, date):
    """Return the row position of the first record in ``df`` dated ``date``.

    The lookup compares the ``DATE`` column against the ``YYYY-MM-DD`` part
    of ``str(date)``. The result is a *positional* offset (0-based row
    number), not an index label, so it can be used to slice the array
    obtained from ``df.values`` even when rows were filtered out of ``df``
    and its index labels are no longer contiguous.

    Args:
        df: DataFrame with a string ``DATE`` column.
        date: ``datetime`` (or ``date``) to look up.

    Returns:
        int: position of the first matching row. When the date is absent,
        ``0`` is returned for years up to 2018 and ``len(df) - 1`` for years
        from 2020 on (the dataset is assumed to span 2018-2019).

    Raises:
        IndexError: the date is absent and falls in 2019, so neither
            fallback applies. Callers use this to skip incomplete files.
    """
    day = str(date).split(' ')[0]
    matches = (df['DATE'] == day).to_numpy()
    if matches.any():
        # argmax over a boolean array yields the position of the first True.
        return int(matches.argmax())

    print("Date: " + str(date) + " not found in dataset")
    if date.year <= 2018:
        print("Assuming before start of dataset, returning 0")
        return 0
    if date.year >= 2020:
        print("Assuming after end of dataset, returning end")
        return (len(df) - 1)

    # Previously this case surfaced as an opaque "list index out of range";
    # keep the exception type but say what actually went wrong.
    raise IndexError(
        "Date " + day + " not found in dataset and no fallback applies"
    )

def get_data_split(
                file_location,
                train_start_date=datetime(year=2018, month=8, day=1),
                train_end_date=datetime(year=2019, month=7, day=30),
                test_start_date=datetime(year=2019, month=8, day=1),
                test_end_date=datetime(year=2019, month=12, day=31),
                cols_to_use=None
                ):
    """Load one station CSV and return scaled train/test arrays for the LSTM.

    The CSV must contain ``TIME`` and ``AVAILABLE BIKES`` columns plus every
    column named in ``cols_to_use``. ``AVAILABLE BIKES`` is the prediction
    target; all remaining columns are used as input features.

    Args:
        file_location: path of the CSV file to read.
        train_start_date: first day of the training window.
        train_end_date: day at which the training window stops; the slice
            ends at the first row of this day (that row is excluded).
        test_start_date: first day of the test window.
        test_end_date: day at which the test window stops (excluded, as
            for ``train_end_date``).
        cols_to_use: feature column names. Defaults to the time and weather
            columns listed below. The list passed in is not modified.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y, scaler)`` where the
        ``*_x`` arrays have shape ``(samples, 1, features)``, the ``*_y``
        arrays have shape ``(samples,)`` and ``scaler`` is the fitted
        ``MinMaxScaler`` whose first column is the target.

    Raises:
        IndexError: propagated from ``get_index_of_date`` when a boundary
            date cannot be located in the file.
    """
    from pandas.api.types import is_numeric_dtype

    if cols_to_use is None:
        cols_to_use = [
            'int_time',
            'int_date',
            'int_day',
            'rain',
            'temp',
            'rhum',
        ]

    # Build a fresh list instead of insert()-ing into the argument, so a
    # list supplied by the caller is left untouched between calls.
    required = ['TIME', 'AVAILABLE BIKES']
    columns = required + [c for c in cols_to_use if c not in required]

    # load dataset
    dataset = read_csv(file_location, usecols=columns)
    dataset['DATE'] = dataset['TIME'].apply(lambda x: x.split(' ')[0])

    if 'rain' in columns:
        rain = dataset['rain']
        if is_numeric_dtype(rain):
            # The column parsed cleanly as numbers, so it has no .str
            # accessor; only genuinely missing readings need dropping.
            keep = rain.notna()
        else:
            # Text column: drop rows whose rain reading is blank/whitespace.
            keep = rain.str.strip().astype(bool)
        dataset = dataset[keep]

    # Renumber rows after filtering so that index labels coincide with row
    # positions; the boundaries below are used to slice a plain array.
    dataset = dataset.reset_index(drop=True)

    train_start_index = get_index_of_date(dataset, train_start_date)
    train_end_index = get_index_of_date(dataset, train_end_date)

    test_start_index = get_index_of_date(dataset, test_start_date)
    test_end_index = get_index_of_date(dataset, test_end_date)

    dataset = dataset.drop(['TIME', 'DATE'], axis=1)

    # read_csv's usecols does not control column order (file order wins),
    # so put the target in column 0 explicitly; the split below and the
    # inverse scaling done by callers both rely on that position.
    feature_columns = [c for c in dataset.columns if c != 'AVAILABLE BIKES']
    dataset = dataset[['AVAILABLE BIKES'] + feature_columns]

    # ensure all data is float
    values = dataset.values.astype('float32')

    # normalize features
    # NOTE(review): the scaler is fitted on the whole file, test window
    # included, which leaks test-set ranges into training - confirm whether
    # fitting on the training rows only is wanted.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)

    # split into train and test sets
    train = scaled[train_start_index:train_end_index, :]
    test = scaled[test_start_index:test_end_index, :]

    # split into input and outputs
    train_x, train_y = train[:, 1:], train[:, 0]
    test_x, test_y = test[:, 1:], test[:, 0]
    # reshape input to be 3D [samples, timesteps, features]
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))

    return train_x, train_y, test_x, test_y, scaler

def get_trained_model(train_x, train_y, test_x, test_y, verbose=1,
                      epochs=150, batch_size=72, units=50):
    """Build and fit a single-layer LSTM regressor.

    The network is one LSTM layer followed by a one-unit Dense output,
    compiled with mean-absolute-error loss and the Adam optimizer.

    Args:
        train_x: training inputs, 3D ``[samples, timesteps, features]``.
        train_y: training targets, one value per sample.
        test_x: inputs passed to Keras as validation data.
        test_y: targets passed to Keras as validation data.
        verbose: Keras verbosity level forwarded to ``fit``.
        epochs: number of training epochs (default 150).
        batch_size: samples per gradient update (default 72).
        units: number of units in the LSTM layer (default 50).

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # design network
    model = Sequential()
    model.add(LSTM(units, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    # fit network; shuffle=False keeps the samples in their original order
    model.fit(train_x, train_y,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(test_x, test_y),
              verbose=verbose,
              shuffle=False)

    return model

In [26]:
# Train one LSTM per station file and store it as <station>.h5.
# Stations that already have a saved model are skipped, so the cell can be
# re-run after an interruption without retraining everything.
from os import listdir
from os.path import isfile, join
destination_directory = './datasets/bss/dublin/ml_models/'
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(destination_directory, exist_ok=True)

source_directory = './datasets/bss/dublin/reorg_plus_weather/'
files = [f for f in listdir(source_directory) if isfile(join(source_directory, f))]
for file in files:
    # splitext strips only the final extension, so a name containing extra
    # dots is not truncated at its first dot.
    station = os.path.splitext(file)[0]
    model_path = join(destination_directory, station + '.h5')
    if os.path.exists(model_path):
        print(station + " station model already exists")
        continue
    print("\n Working on " + station)
    try:
        train_x, train_y, test_x, test_y, scaler = get_data_split(join(source_directory, file))
    except (IndexError, AttributeError) as e:
        # Files with missing boundary dates or malformed columns are skipped
        # rather than aborting the whole run.
        print("File " + file + " is causing " + type(e).__name__ + " issues lol")
        continue

    model = get_trained_model(train_x, train_y, test_x, test_y, verbose=2)
    model.save(model_path)

  0%|                                                                                          | 0/110 [00:00<?, ?it/s]

110


  if (await self.run_code(code, result,  async_=asy)):
 17%|█████████████▉                                                                   | 19/110 [00:00<00:01, 51.76it/s]

Date: 2018-08-01 00:00:00 not found in dataset
Assuming before start of dataset, returning 0
Date: 2019-07-30 00:00:00 not found in dataset
File station_116.csv is causing IndexError issues lol


 23%|██████████████████▍                                                              | 25/110 [00:00<00:02, 29.62it/s]

Date: 2018-08-01 00:00:00 not found in dataset
Assuming before start of dataset, returning 0
Date: 2019-07-30 00:00:00 not found in dataset
File station_117.csv is causing IndexError issues lol
File station_43.csv is causing AttributeError issues lol


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
 93%|████████████████████████████████████████████████████████████████████████▎     | 102/110 [1:27:16<06:50, 51.34s/it]


KeyboardInterrupt: 

In [None]:
import tensorflow as tf
from tensorflow import keras

# Station whose saved model is evaluated in the following cell.
station_to_evaluate = 'station_2'

model = tf.keras.models.load_model(destination_directory + station_to_evaluate + '.h5')

# Rebuild the data split for the same station. Without this, test_x, test_y
# and scaler are whatever the training loop last assigned (a different
# station), and the metrics would compare this model against foreign data.
train_x, train_y, test_x, test_y, scaler = get_data_split(
    source_directory + station_to_evaluate + '.csv')

In [None]:
# Evaluate the loaded model on the held-out window, in original units.

# Predictions come back in the scaler's normalised space.
yhat = model.predict(test_x)

# Drop the single-timestep axis: [samples, 1, features] -> [samples, features]
test_x_reshaped = test_x.reshape((test_x.shape[0], test_x.shape[2]))
# Targets as a column vector so they can sit next to the feature columns.
test_y = test_y.reshape((len(test_y), 1))


def _unscale_target(target_column):
    """Undo the scaling of a target column and return it as a 1D array.

    The scaler was fitted on [target, features...], so the target has to be
    put back in front of the feature columns before inverting.
    """
    stacked = concatenate((target_column, test_x_reshaped), axis=1)
    return scaler.inverse_transform(stacked)[:, 0]


# invert scaling for forecast and for actual values
inv_yhat = _unscale_target(yhat)
inv_y = _unscale_target(test_y)

# error metrics; RMSE is derived from the MSE computed once
mse = mean_squared_error(inv_y, inv_yhat)
rmse = sqrt(mse)
mae = mean_absolute_error(inv_y, inv_yhat)
r2 = r2_score(inv_y, inv_yhat)

for template, metric in (
        ('Test MAE: %.3f', mae),
        ('Test MSE: %.3f', mse),
        ('Test RMSE: %.3f', rmse),
        ('Test R2: %.30f', r2),
):
    print(template % metric)

