## Libraries and config

In [1]:
!python --version
!python3.7 --version
!python3.7 -m pip install tensorflow==1.15
!python3.7 -m pip install numpy
!python3.7 -m pip install pandas
!python3.7 -m pip install numpy
!python3.7 -m pip install sklearn
!python3.7 -m pip install statsmodels
!python3.7 -m pip install matplotlib
!python3.7 -m pip install keras==2.3.1

Python 3.9.2
Python 3.7.10
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import matplotlib.pyplot as plt

import sklearn
import time
import random
import copy
import json

import pandas as pd # data manipulation library
import numpy as np # math library

import sklearn.metrics as sklm # metrics
import skll.metrics as skllm
import statsmodels as sm # statistical models
from sklearn.model_selection import GridSearchCV

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam, Adagrad
from keras.wrappers.scikit_learn import KerasRegressor

import tensorflow as tf # machine learning library
import os

os.environ['PYTHONHASHSEED'] = '0'
tf.compat.v1.reset_default_graph()
tf.compat.v1.random.set_random_seed(0)
np.random.seed(0)
random.seed(0)

from keras import backend as K

# 5. Configure a new global `tensorflow` session
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)

from keras.layers import SimpleRNN

Using TensorFlow backend.


## Utils

In [4]:
PATH = "./"

In [5]:
def retrieve_data():
    path = "{0}dataset/dataset_time.csv".format(PATH)
    print(PATH)
    data = pd.read_csv(path, ';')
    
    data['Time'].apply(int)
    data['Speed'].apply(float)
    data['Sunday'].apply(int)
    data['Monday'].apply(int)
    data['Tuesday'].apply(int)
    data['Wednesday'].apply(int)
    data['Thursday'].apply(int)
    data['Friday'].apply(int)
    data['Saturday'].apply(int)
      
    return data

In [6]:
def store(obj, path, name):
  with open("{0}{1}/{2}.json".format(PATH, path, name), 'w') as json_file:
    json.dump(obj, json_file, sort_keys=True, indent=4)

In [7]:
def generate_dataset(data, useB, n_steps, n_future):
  """ Generate Dataset
  
  Generate a dataset provided a sequence. Reshape the sequence in rolling intervals from [samples, timesteps] into 
  [samples, timesteps, features] and split the sequence. The split the sequence in rolling intervals with a corresponding value 
  like the example bellow.

  Ex: split_sequence([1, 2, 3, 4, 5], 3) #([[1, 2, 3], [2, 3, 4]], [4, 5])
  
  Arguments:
    raw_seq: the sequence to reshape.
    useB: if the dataset is more complex or not.
    n_steps: size of the rolling interval
    n_future: the distance to the interval the value should be.  
  """

  sequence = np.array(data if useB else data['Time'])

  n = len(sequence)
  X, Y = list(), list()

  for i in range(n):
    j = i + n_steps
    k = j + n_future

    if k >= n:
      break

    seq_x, seq_y = sequence[i:j], sequence[k]
    X.append(seq_x)	
    Y.append(seq_y[0] if useB else seq_y)

  X, Y = np.array(X), np.array(Y)	
  
  if not useB:
    X = X.reshape((X.shape[0], X.shape[1], 1))

  return X, Y

In [8]:
def evaluate (expected, observed, times, name):
  """ Evaluate Sessions
  
  Evaluate models by RMSE, NRMSE, MAE, HR, PRE. It will store the 
  results in a object and return it.
  
  Arguments:
    expected: an array of expected instances of each 
      train&test session.
    observed: an array of observed instances of each 
      train&test session.
    times: an array of the time of each train&test session.
    name: the name of the model
  """
  n = len(expected)
  flatten = lambda l : [i for sl in l for i in sl]
  
  # Make the arrays serializable
  expected = list(map(list, expected))
  observed = list(map(list, observed))
  
  for i in range(n):
    expected[i] = list(map(float, expected[i]))
    observed[i] = list(map(float, observed[i]))
  
  raw = evaluate_raw(expected, observed, times)
  
  #n_buckets = len(raw['PRE'])
  #_pre = [[pre[i] for pre in raw['PRE']] for i in range(n_buckets)]
  
  eva = {
    'TIME': int(sum(times)),
    'RMSE': float(np.mean(raw['RMSE'])),
    # 'NRMSE': float(np.mean(raw['NRMSE'])),
    'MAE': float(np.mean(raw['MAE'])),
    'HR': float(np.mean(raw['HR'])),
    #'PRE': [float(np.mean(p)) for p in _pre],
    'has_negative': (min(flatten(observed)) < 0),
    'raw': raw
  }
  
  print("\n{0} Final Result:".format(name))
  print("\tTotal Time: {0}s".format(eva['TIME']))
  print("\tRMSE: {0}".format(eva['RMSE']))
  # print("\tNRMSE: {0}".format(eva['NRMSE']))
  print("\tMAE: {0}".format(eva['MAE']))
  print("\tHit Ratio: {0}%".format(eva['HR'] * 100))
  #print("\tPrecision: {0}".format(eva['PRE']))
    
  return eva

In [9]:
def evaluate_raw (expected, observed, times):
  """ Evaluate Raw Sessions 
  
  Evaluate each of the train&test sessions by RMSE, NRMSE, MAE, HR, PRE. 
  It will store the results in a object and return it.
  
  Arguments:
    expected: an array of expected instances of each train&test session.
    observed: an array of observed instances of each train&test session.
    times: an array of the time of each train&test session.
  """
  
  n = len(expected)

  for i in range(n):
    observed[i] = [0 if np.isnan(o) else o for o in observed[i]]

  for i in range(n):
    observed[i] = [max(o, 0) for o in observed[i]]
  
  raw = {
    'expected': expected,
    'observed': observed,
    'TIME': times,
    'RMSE': [0] * n,
    # 'NRMSE': [0] * n,
    'MAE': [0] * n,
    'HR': [0] * n,
    #'PRE': [0] * n,
  }
  
  for i in range(n):
    Y = expected[i]
    Y_hat = observed[i]
    time = times[i]

    raw['MAE'][i] = sklm.mean_absolute_error(Y, Y_hat)
    raw['RMSE'][i] = np.sqrt(sklm.mean_squared_error(Y, Y_hat))
    # raw['NRMSE'][i] = raw['RMSE'][i] / np.std(Y)
    raw['HR'][i] = evaluate_precision_hit_ratio(Y, Y_hat)
    #raw['PRE'][i] = evaluate_precision_bucket(Y, Y_hat)
    
    if VERBOSITY:
      print("({0}/{1}) Test Size: {2}, Time: {3}s".format(i+1, n, len(Y), time))
      print("\tRMSE: {0}".format(raw['RMSE'][i]))
      # print("\tNRMSE: {0}".format(raw['NRMSE'][i]))
      print("\tMAE: {0}".format(raw['MAE'][i]))
      print("\tHit Ratio: {0}%".format(raw['HR'][i] * 100))

  return raw

In [10]:
def evaluate_precision_hit_ratio (Y, Y_hat):
  """ Trend Prediction Ratio Calculation
  
  Calculates the ratio of up/down prediction.
  
  Arguments:
    Y: the expected dataset.
    Y_hat: the observed dataset.
  """
  
  cnt = 0
  
  for i in range(len(Y)):
    if i < N_FUTURE:
      continue
      
    exp = Y[i] - Y[i - N_FUTURE]
    obs = Y_hat[i] - Y[i - N_FUTURE]
    
    if exp * obs > 0:
      cnt += 1
    
  return cnt / len(Y)

In [11]:
def plot_prediction (Y, Y_hat, title):
  """ Plot Prediction
  
  Plot the prediction (Flow x Time) of what was expected and what
  was predicted.
  """

  name = f"{title}"
  path = f"plots/prediction/{name}"
  
  plt.plot(Y)
  plt.plot(Y_hat)
  plt.title(f"Predição do Modelo {title}")
  plt.ylabel('Tempo')
  plt.xlabel('N')
  plt.legend(['esperado', 'observado'], loc='upper left')
  plt.rcdefaults()
  plt.savefig(path + ".png", bbox_inches='tight')
  plt.close('all')

## Model

In [12]:
# Model Parameters
SEEABLE_PAST = 480 # in minutes
PREDICT_IN_FUTURE = 60 # in minutes
FLOW_INTERVAL = 150 # the interval size for each flow
N_SPLITS = 8

In [13]:
# Derivated Model Parameters
N_STEPS = SEEABLE_PAST * 60 // FLOW_INTERVAL # the number of flows to see in the past
N_FUTURE = PREDICT_IN_FUTURE * 60 // FLOW_INTERVAL # how much in the future we want to predict (0 = predict the flow on the next FLOW_INTERVAL minutes)
DAY_SIZE = (24 * 60 * 60) // FLOW_INTERVAL  
WEEK_SIZE = (7 * 24 * 60 * 60) // FLOW_INTERVAL
VERBOSITY = True

In [14]:
def split_dataset():
    ## last 15 days
    return 450801

In [15]:
def rnn (data, useB): 
  global result_data
  
  name = "RNN B" if useB else "RNN A"
  
  X, Y = generate_dataset(data, useB, N_STEPS, N_FUTURE)
  
  model = Sequential()		

  model.add(SimpleRNN(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))		
  model.add(Dense(1))		

  model.compile(optimizer='adam', loss='mse', metrics = ["accuracy"])
  
  pointer = split_dataset()
    
  h = model.fit(X[0:pointer], Y[0:pointer], validation_split=0.33, batch_size=512, epochs=10, verbose=2)

  return h, [X[(pointer + 1):], Y[(pointer + 1):]]

In [16]:
global result_data
  
result_data = {
    'results': {},
    'meta': {
      'SEEABLE_PAST': SEEABLE_PAST,
      'PREDICT_IN_FUTURE': PREDICT_IN_FUTURE,
      'FLOW_INTERVAL': FLOW_INTERVAL,
      'N_SPLITS': N_SPLITS,
    }
}

data = retrieve_data()

history, validation = rnn(data, False)

./
Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Train on 302036 samples, validate on 148765 samples
Epoch 1/10
 - 66s - loss: 93811832.0113 - accuracy: 2.2514e-04 - val_loss: 33101515.9484 - val_accuracy: 0.0000e+00
Epoch 2/10
 - 69s - loss: 32984783.1705 - accuracy: 5.0325e-04 - val_loss: 32199772.8307 - val_accuracy: 0.0000e+00
Epoch 3/10
 - 68s - loss: 32606682.5933 - accuracy: 4.2379e-04 - val_loss: 33651657.6023 - val_accuracy: 2.6888e-05
Epoch 4/10
 - 67s - loss: 32414354.9077 - accuracy: 4.2048e-04 - val_loss: 31446235.2868 - val_accuracy: 0.0000e+00
Epoch 5/10
 - 66s - loss: 32315920.9835 - accuracy: 4.1717e-04 - val_loss: 33025389.4704 - val_accuracy: 4.0332e-05
Epoch 6/10
 - 66s - loss: 32444162.0151 - accuracy: 4.4366e-04 - val_loss: 31482125.3519 - val_accuracy: 0.0000e+00
Epoch 7/10
 - 66s - loss: 32032234.6433 - accuracy: 3.4102e-04 - val_loss: 31340096.5816 - val_accuracy: 0.0000e+00
Epoch 8/10
 - 66s - loss: 32017650.7056 - accuracy:

In [17]:
prediction = history.model.predict(validation[0])

In [18]:
plot_prediction(validation[1], prediction, "RNN-time")

In [19]:
ev = evaluate([validation[1]], [prediction], [0], "RNN-time")

(1/1) Test Size: 85861, Time: 0s
	RMSE: 5474.750973778963
	MAE: 711.3057066515483
	Hit Ratio: 69.54496220635679%

RNN-time Final Result:
	Total Time: 0s
	RMSE: 5474.750973778963
	MAE: 711.3057066515483
	Hit Ratio: 69.54496220635679%


In [20]:
store(ev, 'results', 'RNN-time')