In [1]:
from datetime import datetime
import importlib
import itertools
import json
import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import torch
from torch.utils.data import DataLoader

from database import data_utils, data_loader, model_utils
from models import basic_rnn
import run_models

# Re-import the project modules so edits made on disk are picked up
# without restarting the kernel.
for _module in (data_utils, basic_rnn, data_loader):
    importlib.reload(_module)


# Where results for this run live, which fold to hold out, and which
# transit network sub-folder to use.
run_folder = "../results/throwaway/"
fold_num = 0
network_folder = "kcm/"

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Task: given a sequence of bus data, predict the average speed at the next step.
# The "next step" can be the segment from the current GPS coordinate to the next one.
# TODO: also benchmark against ARIMA?
# Prediction can be single-step or multi-step.
# Input sequences can be fixed-length or variable-length.

In [30]:

# Construct dataloaders for PyTorch models.
# Reload the project modules so local edits are picked up without a kernel restart.
importlib.reload(data_loader)
importlib.reload(basic_rnn)
importlib.reload(model_utils)

### Set run and hyperparameters
EPOCHS = 50
BATCH_SIZE = 16
LEARN_RATE = 1e-3
HIDDEN_SIZE = 32

### Load train/test data
print("="*30)
data_folder = run_folder + network_folder + "deeptte_formatted/"
print(f"Loading data from '{data_folder}'...")
# Load config
with open(data_folder + "config.json", "r") as f:
    config = json.load(f)
# Load GTFS-RT samples
train_data_chunks, valid_data = data_utils.load_train_test_data(data_folder, config['n_folds'])
# Load GTFS data.
# The configured path is prefixed with "." before loading (presumably because
# it is stored relative to the project root while this notebook runs one level
# below it -- TODO confirm). Log the path that is actually read, not the raw
# config value, so the message matches what is loaded.
gtfs_folder = "." + config['gtfs_folder']
print(f"Loading and merging GTFS files from '{gtfs_folder}'...")
gtfs_data = data_utils.merge_gtfs_files(gtfs_folder)

print("="*30)
print(f"FOLD: {fold_num}")
# Guard the fold index: a negative index would select a test chunk from the end
# while the filter below would never exclude it, leaking test data into training.
assert 0 <= fold_num < len(train_data_chunks), (
    f"fold_num={fold_num} is outside the {len(train_data_chunks)} available folds"
)
# Hold out the chunk for the current fold as test data ...
test_data = train_data_chunks[fold_num]
# ... and flatten every other chunk into a single list of training samples.
train_data = list(itertools.chain.from_iterable(
    chunk for i, chunk in enumerate(train_data_chunks) if i != fold_num
))


train_dataset = data_loader.make_sequence_dataset(train_data, config)
test_dataset = data_loader.make_sequence_dataset(test_data, config)
# Shuffle the training samples each epoch so mini-batches are not always drawn
# in the same on-disk order; the test loader stays ordered so predictions line
# up with their labels.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=4)
print(f"Successfully loaded {len(train_data)} training samples and {len(test_data)} testing samples.")

Loading data from '../results/throwaway/kcm/deeptte_formatted/'...
Loading and merging GTFS files from './data/kcm_gtfs/2020_09_23/'...
FOLD: 0
Successfully loaded 21589 training samples and 5391 testing samples.


In [36]:
# Train the basic RNN on the current fold, save its weights, and collect
# predictions on the held-out fold.
importlib.reload(model_utils)
importlib.reload(basic_rnn)

print("="*30)
print("Training basic rnn model...")
# Embedding specification for each categorical feature: vocabulary size,
# embedding width, and 'col' (presumably the feature's column index in the
# input tensor -- TODO confirm against basic_rnn.BasicRNN).
embed_dict = {
    'timeID': {
        'vocab_size': 1440,  # presumably minutes in a day -- TODO confirm
        'embed_dims': 24,
        'col': 8
    },
    'weekID': {
        'vocab_size': 7,  # presumably days of the week -- TODO confirm
        'embed_dims': 4,
        'col': 9
    },
    'driverID': {
        'vocab_size': config['n_unique_veh'],
        'embed_dims': 12,
        'col': 10
    }
}
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: Expected hidden size (1, 16, 32), got [16, 1, 32]".
# torch.nn.RNN expects h_0 shaped (num_layers, batch, hidden) even when
# batch_first=True, so the initial hidden state is presumably built
# batch-first inside models/basic_rnn.py -- fix it there.
rnn_model = basic_rnn.BasicRNN(
    train_dataloader.dataset.tensors[0][0].shape[2],  # size of dim 2 of the first dataset entry
    1,
    HIDDEN_SIZE,
    BATCH_SIZE,
    embed_dict
).to(device)

# torch.save does not create missing parent directories; make sure the output
# folder exists *before* the long training run rather than failing after it.
model_folder = run_folder + network_folder + "models/"
os.makedirs(model_folder, exist_ok=True)

rnn_train_losses, rnn_test_losses = model_utils.fit_to_data(rnn_model, train_dataloader, test_dataloader, LEARN_RATE, EPOCHS, device)
torch.save(rnn_model.state_dict(), model_folder + f"rnn_model_{fold_num}.pt")
# Switch to evaluation mode before generating predictions.
rnn_model.eval()
rnn_labels, rnn_preds = model_utils.predict(rnn_model, test_dataloader, config, device)

Training basic rnn model...
EPOCH: 0
here
torch.Size([16, 1, 32])


RuntimeError: Expected hidden size (1, 16, 32), got [16, 1, 32]

In [None]:
# Plot training curves for the RNN trained in the previous cell.
# (The loss lists come from model_utils.fit_to_data; this notebook never defines
# any `ff_*` variables, so the RNN results are the ones to plot.)
plot_data = pd.DataFrame(
    {
        "Epoch": list(range(len(rnn_train_losses))),
        "Training Loss": rnn_train_losses,
        "Validation Loss": rnn_test_losses
    }
)
# Melt to long format ('variable' / 'value' columns) so seaborn draws one line per loss.
sns.lineplot(x='Epoch', y='value', hue='variable', data=pd.melt(plot_data, ['Epoch']))

In [None]:
# Look at predictions: error metrics for the RNN on the held-out fold.
# Uses rnn_labels / rnn_preds from model_utils.predict in the training cell;
# this notebook never defines any `ff_*` variables.
print(f"MAPE: {metrics.mean_absolute_percentage_error(rnn_labels, rnn_preds)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(rnn_labels, rnn_preds))}")
print(f"MAE: {metrics.mean_absolute_error(rnn_labels, rnn_preds)}")