### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

### Load train and test dataframe

In [2]:
train_df = pd.read_csv('train_df_2022_08_28.csv')
test_df = pd.read_csv('test_df_2022_08.csv')

In [3]:
# limit to just one station
train_df_subset = train_df[train_df['station'] == 16]
test_df_subset = test_df[test_df['station'] == 16]

In [4]:
# fix random seed for reproducibility
tf.random.set_seed(7)

## Split train and test datasets

In [5]:
pd.read_csv('train_df_2022_08_28.csv')

Unnamed: 0.1,Unnamed: 0,station,prediction_time,prediction_window,number_of_bikes_left,number_of_bikes_returned,net_difference
0,0,1,2022-08-28 00:15:00,15,0,0,0
1,1,1,2022-08-28 00:30:00,15,0,0,0
2,2,1,2022-08-28 00:45:00,15,0,0,0
3,3,1,2022-08-28 01:00:00,15,0,0,0
4,4,1,2022-08-28 01:15:00,15,0,0,0
...,...,...,...,...,...,...,...
47875,47875,572,2022-08-28 22:45:00,15,0,0,0
47876,47876,572,2022-08-28 23:00:00,15,0,0,0
47877,47877,572,2022-08-28 23:15:00,15,0,0,0
47878,47878,572,2022-08-28 23:30:00,15,0,0,0


In [14]:
train_dataframe = pd.read_csv('train_df_2022_08_28.csv', usecols=[5], engine='python')
test_dataframe = pd.read_csv('test_df_2022_08.csv', usecols=[5], engine='python')
dataframe = pd.concat([train_dataframe, test_dataframe])

In [15]:
# load the dataset
# dataframe = pd.read_csv('train_df_2022_08_28.csv', usecols=[5], engine='python')
dataset = dataframe.values
dataset = dataset.astype('float32')

In [16]:
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

In [17]:
# split into train and test sets
train_size = 47880 # the number of rows in the training dataset 
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))

47880 144648


In [18]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
	dataX, dataY = [], []
	for i in range(len(dataset)-look_back-1):
		a = dataset[i:(i+look_back), 0]
		dataX.append(a)
		dataY.append(dataset[i + look_back, 0])
	return np.array(dataX), np.array(dataY)

In [19]:
# reshape into X=t and Y=t+1
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)


It takes 50s per epoch, for just 1 day of data for training. There are also 504 stations.

Each model takes 50*3 ~= 4 minutes.

(4*504)/60 = 33.6 hours = 8 full days

## Building and evaluating model

The model performance is better than the baseline.

RMSE is 

In [20]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=2, batch_size=1, verbose=2)

Epoch 1/2
47878/47878 - 49s - loss: 0.0020 - 49s/epoch - 1ms/step
Epoch 2/2
47878/47878 - 48s - loss: 0.0019 - 48s/epoch - 1ms/step


<keras.callbacks.History at 0x7f1239d68c50>

In [22]:
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

Train Score: 1.05 RMSE
Test Score: 0.92 RMSE


### Station 16 results
* RMSE LSTM: 0.92
* RMSE Baseline: 1.91