## Libraries and config

In [1]:
!pip install numpy
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install skll
!pip install keras

import matplotlib.pyplot as plt
import sklearn.metrics as sklm
import skll.metrics as skllm
import copy
import json

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Utils

In [2]:
PATH = "./"

In [15]:
def retrieve_data(flow_interval):
    path = f"{PATH}dataset/dataset_flow_{flow_interval}.csv"
    print(PATH)
    data = pd.read_csv(path, ';')
    
    data['Flow'].apply(int)
    data['AveSpeed'].apply(float)
    data['Density'].apply(float)
    data['Sunday'].apply(int)
    data['Monday'].apply(int)
    data['Tuesday'].apply(int)
    data['Wednesday'].apply(int)
    data['Thursday'].apply(int)
    data['Friday'].apply(int)
    data['Saturday'].apply(int)
      
    return data

In [4]:
def store(obj, path, name):
  with open("{0}{1}/{2}.json".format(PATH, path, name), 'w') as json_file:
    json.dump(obj, json_file, sort_keys=True, indent=4)

In [5]:
def generate_dataset(data, useB, n_steps, n_future):
  """ Generate Dataset
  
  Generate a dataset provided a sequence. Reshape the sequence in rolling intervals from [samples, timesteps] into 
  [samples, timesteps, features] and split the sequence. The split the sequence in rolling intervals with a corresponding value 
  like the example bellow.

  Ex: split_sequence([1, 2, 3, 4, 5], 3) #([[1, 2, 3], [2, 3, 4]], [4, 5])
  
  Arguments:
    raw_seq: the sequence to reshape.
    useB: if the dataset is more complex or not.
    n_steps: size of the rolling interval
    n_future: the distance to the interval the value should be.  
  """

  sequence = np.array(data if useB else data['Flow'])

  n = len(sequence)
  X, Y = list(), list()

  for i in range(n):
    j = i + n_steps
    k = j + n_future

    if k >= n:
      break

    seq_x, seq_y = sequence[i:j], sequence[k]
    X.append(seq_x)	
    Y.append(seq_y[0] if useB else seq_y)

  X, Y = np.array(X), np.array(Y)	
  
  if not useB:
    X = X.reshape((X.shape[0], X.shape[1], 1))

  return X, Y

In [6]:
def evaluate (expected, observed, times, name):
  """ Evaluate Sessions
  
  Evaluate models by RMSE, NRMSE, MAE, HR, PRE. It will store the 
  results in a object and return it.
  
  Arguments:
    expected: an array of expected instances of each 
      train&test session.
    observed: an array of observed instances of each 
      train&test session.
    times: an array of the time of each train&test session.
    name: the name of the model
  """
  n = len(expected)
  flatten = lambda l : [i for sl in l for i in sl]
  
  # Make the arrays serializable
  expected = list(map(list, expected))
  observed = list(map(list, observed))
  
  for i in range(n):
    expected[i] = list(map(float, expected[i]))
    observed[i] = list(map(float, observed[i]))
  
  raw = evaluate_raw(expected, observed, times)
  
  #n_buckets = len(raw['PRE'])
  #_pre = [[pre[i] for pre in raw['PRE']] for i in range(n_buckets)]
  
  eva = {
    'TIME': int(sum(times)),
    'RMSE': float(np.mean(raw['RMSE'])),
    'MAE': float(np.mean(raw['MAE'])),
    'Kappa': float(np.mean(raw['Kappa'])),
    'HR': float(np.mean(raw['HR'])),
    'has_negative': (min(flatten(observed)) < 0),
    'raw': raw
  }
  
  print(f"\n{name} Final Result:")
  print(f"\tTotal Time: {eva['TIME']}s")
  print(f"\tRMSE: {eva['RMSE']}")
  print(f"\tMAE: {eva['MAE']}")
  print(f"\tKappa: {eva['Kappa']}")
  print(f"\tHit Ratio: {eva['HR'] * 100}%")
    
  return eva

In [7]:
def evaluate_raw (expected, observed, times):
  """ Evaluate Raw Sessions 
  
  Evaluate each of the train&test sessions by RMSE, NRMSE, MAE, HR, PRE. 
  It will store the results in a object and return it.
  
  Arguments:
    expected: an array of expected instances of each train&test session.
    observed: an array of observed instances of each train&test session.
    times: an array of the time of each train&test session.
  """
  
  n = len(expected)

  for i in range(n):
    observed[i] = [0 if np.isnan(o) else o for o in observed[i]]

  for i in range(n):
    observed[i] = [max(o, 0) for o in observed[i]]
  
  raw = {
    'expected': expected,
    'observed': observed,
    'TIME': times,
    'RMSE': [0] * n,
    'MAE': [0] * n,
    'Kappa': [0] * n,
    'HR': [0] * n,
  }
  
  for i in range(n):
    Y = expected[i]
    Y_hat = observed[i]
    time = times[i]

    raw['RMSE'][i] = np.sqrt(sklm.mean_squared_error(Y, Y_hat))
    raw['MAE'][i] = sklm.mean_absolute_error(Y, Y_hat)
    raw['Kappa'][i] = skllm.kappa(Y, Y_hat)
    raw['HR'][i] = evaluate_precision_hit_ratio(Y, Y_hat)
    
    if VERBOSITY:
      print(f"({i+1}/{n}) Test Size: {len(Y)}, Time: {time}s")
      print(f"\tRMSE: {raw['RMSE'][i]}")
      print(f"\tMAE: {raw['MAE'][i]}")
      print(f"\tKappa: {raw['Kappa'][i]}")
      print(f"\tHit Ratio: {raw['HR'][i] * 100}%")

  return raw

In [8]:
def evaluate_precision_hit_ratio (Y, Y_hat):
  """ Trend Prediction Ratio Calculation
  
  Calculates the ratio of up/down prediction.
  
  Arguments:
    Y: the expected dataset.
    Y_hat: the observed dataset.
  """
  
  cnt = 0
  
  for i in range(len(Y)):
    if i < N_FUTURE:
      continue
      
    exp = Y[i] - Y[i - N_FUTURE]
    obs = Y_hat[i] - Y[i - N_FUTURE]
    
    if exp * obs > 0:
      cnt += 1
    
  return cnt / len(Y)

In [9]:
def plot_prediction (Y, Y_hat, title):
  """ Plot Prediction
  
  Plot the prediction (Flow x Time) of what was expected and what
  was predicted.
  """

  name = f"{title}"
  path = f"plots/prediction/{name}"
  
  plt.plot(Y)
  plt.plot(Y_hat)
  plt.title(f"Predição do Modelo {title}")
  plt.ylabel('Fluxo')
  plt.xlabel('Tempo')
  plt.legend(['esperado', 'observado'], loc='upper left')
  plt.rcdefaults()
  plt.savefig(path + ".png", bbox_inches='tight')
  plt.close('all')

## Model

In [10]:
# Model Parameters
SEEABLE_PAST = 480 # in minutes
PREDICT_IN_FUTURE = 60 # in minutes
FLOW_INTERVAL = 150 # the interval size for each flow
N_SPLITS = 8

In [11]:
# Derivated Model Parameters
N_STEPS = SEEABLE_PAST * 60 // FLOW_INTERVAL # the number of flows to see in the past
N_FUTURE = PREDICT_IN_FUTURE * 60 // FLOW_INTERVAL # how much in the future we want to predict (0 = predict the flow on the next FLOW_INTERVAL minutes)
DAY_SIZE = (24 * 60 * 60) // FLOW_INTERVAL  
WEEK_SIZE = (7 * 24 * 60 * 60) // FLOW_INTERVAL
VERBOSITY = True

In [12]:
def split_dataset(lenX):
    return lenX - (WEEK_SIZE*2)

In [13]:
def logistic_regression(data, useB):
    global result_data
  
    name = "LR B" if useB else "LR A"

    expected, observed, times = [], [], []

    X, Y = generate_dataset(data, useB, FLOW_INTERVAL, N_STEPS, N_FUTURE)
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

    model = LogisticRegression()

    pointers = split_dataset(len(X), SET_SPLIT, TEST_SPLIT)
  
    for i, j, k in pointers:
        start = time.time()
            
        model.fit(X[i:j], Y[i:j])
            
        expected.append(Y[j:k])
        observed.append(model.predict(X[j:k]))
        times.append(time.time() - start)
    
    result_data['results'][name] = evaluate(expected, observed, times, name)

In [16]:
global result_data
  
result_data = {
    'results': {},
    'meta': {
      'SEEABLE_PAST': SEEABLE_PAST,
      'PREDICT_IN_FUTURE': PREDICT_IN_FUTURE,
      'FLOW_INTERVAL': FLOW_INTERVAL,
      'N_SPLITS': N_SPLITS,
    }
}

data = retrieve_data(FLOW_INTERVAL)

logistic_regression(data, False)
logistic_regression(data, True)

./


FileNotFoundError: [Errno 2] No such file or directory: './dataset/dataset_flow_150.csv'

In [19]:
ev = evaluate([validation[1]], [prediction], [0], "LR A")

(1/1) Test Size: 8063, Time: 0s
	RMSE: 6.069194007743126
	MAE: 4.6463469482305015
	Hit Ratio: 56.54222993922857%

RNN-150 Final Result:
	Total Time: 0s
	RMSE: 6.069194007743126
	MAE: 4.6463469482305015
	Hit Ratio: 56.54222993922857%


In [20]:
store(ev, 'results', 'LR A')