In [None]:
import datetime
import os
import json

from database import data_utils

import contextily as cx
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle


# Canonical column name -> dtype used when loading raw location data.
# The insertion order of this dict defines the column order.
feature_lookup = {
    'trip_id': 'object',
    'file': 'object',
    'locationtime': 'int',
    'lat': 'float',
    'lon': 'float',
    'vehicle_id': 'object',
}
feature_names = list(feature_lookup.keys())
data_types = list(feature_lookup.values())


# Reload data_utils so that edits made to the module on disk are picked up
# in this session without restarting the kernel.
import importlib
importlib.reload(data_utils)

In [None]:
# Build train/test traces from the KCM data.
# given_names are the column names as they appear in the raw files; they are
# passed together with feature_lookup (canonical names and dtypes) to the loader.
given_names = ['tripid','file','locationtime','lat','lon','vehicleid']
kcm_data_folder = "../data/kcm_all"
kcm_gtfs_folder = '../data/kcm_gtfs/2022_09_19/'
kcm_timezone = 'America/Los_Angeles'

# Load the raw data for each split; the loader also returns the dates it failed on.
kcm_train_dates = data_utils.get_date_list("2022_11_01", 14)
train_data, train_fail_dates = data_utils.combine_specific_folder_data(
    kcm_data_folder, kcm_train_dates, given_names, feature_lookup
)
kcm_test_dates = data_utils.get_date_list("2022_11_15", 7)
test_data, test_fail_dates = data_utils.combine_specific_folder_data(
    kcm_data_folder, kcm_test_dates, given_names, feature_lookup
)
print(train_fail_dates)
print(test_fail_dates)

# Distance between consecutive points of each trajectory, plus filtering on speed and n_points
train_traces = data_utils.calculate_trace_df(train_data, kcm_timezone)
test_traces = data_utils.calculate_trace_df(test_data, kcm_timezone)

# Match trajectories to timetables; filter on stop distance and availability
train_traces = data_utils.clean_trace_df_w_timetables(train_traces, kcm_gtfs_folder)
test_traces = data_utils.clean_trace_df_w_timetables(test_traces, kcm_gtfs_folder)

# Persist the cleaned traces so the results analysis can reuse them
for pkl_path, split_traces in (
    ("../results/kcm2weeks/data/train_traces.pkl", train_traces),
    ("../results/kcm2weeks/data/test_traces.pkl", test_traces),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(split_traces, f)

In [None]:
# # Get traces from all data (ATB)
# # For now, we can use Norway dates that are post-2022_11_02
# # Need to get mapping of old IDs to new IDs in order to use schedule data from prior to that date
# # These dates are also somewhat low on data compared to previous
# train_dates = [
#     "2022_11_02.pkl",
#     "2022_11_03.pkl",
#     "2022_11_06.pkl",
#     "2022_11_07.pkl",
#     "2022_11_08.pkl",
#     "2022_11_09.pkl",
#     "2022_11_10.pkl",
#     "2022_11_13.pkl",
#     "2022_11_14.pkl",
#     "2022_11_15.pkl",
#     "2022_11_16.pkl",
#     "2022_11_17.pkl",
#     "2022_11_20.pkl",
#     "2022_11_21.pkl"
# ]
# test_dates = [
#     "2022_11_22.pkl",
#     "2022_11_23.pkl",
#     "2022_11_24.pkl",
#     "2022_11_25.pkl",
#     "2022_11_26.pkl",
#     "2022_11_27.pkl",
#     "2022_11_28.pkl"
# ]
# given_names = ['datedvehiclejourney','file','locationtime','lat','lon','vehicle']
# train_data, train_fail_dates = data_utils.combine_specific_folder_data("../data/nwy_all", train_dates, given_names, feature_lookup)
# test_data, test_fail_dates = data_utils.combine_specific_folder_data("../data/nwy_all", test_dates, given_names, feature_lookup)
# print(train_fail_dates)
# print(test_fail_dates)

# # Calculate distance between points in each trajectory, do some filtering on speed
# train_traces = data_utils.calculate_trace_df(train_data, 'Europe/Oslo')
# test_traces = data_utils.calculate_trace_df(test_data, 'Europe/Oslo')

# # Match trajectories to timetables and do filtering on stop distance, availability
# train_traces = data_utils.clean_trace_df_w_timetables(train_traces, '../data/nwy_gtfs/2022_12_01/')
# test_traces = data_utils.clean_trace_df_w_timetables(test_traces, '../data/nwy_gtfs/2022_12_01/')

# # Save trace data for results analysis
# with open("../results/nwy2weeks/data/train_traces.pkl", "wb") as f:
#     pickle.dump(train_traces, f)
# with open("../results/nwy2weeks/data/test_traces.pkl", "wb") as f:
#     pickle.dump(test_traces, f)

In [None]:
# Flat array of every vehicle id seen in either split (train first, then test).
vehicle_id_columns = [train_traces['vehicle_id'], test_traces['vehicle_id']]
all_vehicle_ids = pd.concat(vehicle_id_columns).values.flatten()

In [None]:
# Recode vehicle ids to contiguous integer codes starting from 0.
# pd.unique returns ids in order of first appearance, so the id -> code mapping
# is reproducible for the same input data. Enumerating a set() of string ids is
# not: string hashing is randomised per interpreter start, so every kernel
# restart could assign different codes to the same vehicle.
mapping = {vehicle_id: code for code, vehicle_id in enumerate(pd.unique(all_vehicle_ids))}
train_recode = [mapping[y] for y in train_traces['vehicle_id'].values.flatten()]
test_recode = [mapping[y] for y in test_traces['vehicle_id'].values.flatten()]

# Lists are assigned positionally, one code per row.
train_traces['vehicle_id_recode'] = train_recode
test_traces['vehicle_id_recode'] = test_recode

In [None]:
# Number of distinct vehicle ids across train and test combined.
# This count must be put into models/base/Attr.py.
# It is computed over both splits because not every vehicle id is guaranteed
# to appear in the training data.
pd.unique(all_vehicle_ids).shape[0]

In [None]:
batch_size = 10
num_files = 5

# NOTE(review): len() counts DataFrame rows. If each row is a single GPS point
# rather than a whole trajectory, the trimming below removes points, not
# samples - confirm against data_utils.map_to_deeptte.

# DeepTTE crashes on a minibatch of size 1, so if the test set would end with
# one, drop its last row.
n_test = len(test_traces)
if n_test % batch_size == 1:
    test_traces = test_traces.iloc[:n_test - 1, :]

# The train data is split across num_files files; discard the trailing rows
# that do not fill a whole (batch_size * num_files) group.
n_train = len(train_traces)
extras = n_train % (batch_size * num_files)
train_traces = train_traces.iloc[:n_train - extras, :]

In [None]:
# Convert each split with data_utils.map_to_deeptte (train first, then test).
# The results are used further down as dicts whose values are JSON-serialisable samples.
train_traces_dict, test_traces_dict = (
    data_utils.map_to_deeptte(split_traces)
    for split_traces in (train_traces, test_traces)
)

In [None]:
# config.json
# Summary statistics of the training data, written to config.json further down.

# Per-trip maxima of the cumulative columns, with a trip identified by (file, trip_id).
# Computed once and only on the two columns needed, instead of re-running
# groupby(...).max() over every column for each statistic.
trip_totals = train_traces.groupby(['file','trip_id'])[['dist_cumulative_km','time_cumulative_s']].max()
trip_dist_km = trip_totals['dist_cumulative_km'].to_numpy()
trip_time_s = trip_totals['time_cumulative_s'].to_numpy()

# Every statistic is cast to a builtin float so json.dump cannot fail on
# narrower numpy scalar types such as np.float32.
summary_dict = {
    'dist_gap_mean': float(np.mean(train_traces['dist_calc_km'])),
    'dist_gap_std': float(np.std(train_traces['dist_calc_km'])),
    'time_gap_mean': float(np.mean(train_traces['time_calc_s'])),
    'time_gap_std': float(np.std(train_traces['time_calc_s'])),
    'lngs_std': float(np.std(train_traces['lon'])),
    'lngs_mean': float(np.mean(train_traces['lon'])),
    'lats_mean': float(np.mean(train_traces['lat'])),
    'dist_std': float(np.std(trip_dist_km)),
    "dist_mean": float(np.mean(trip_dist_km)),
    "lats_std": float(np.std(train_traces['lat'])),
    "time_mean": float(np.mean(trip_time_s)),
    "time_std": float(np.std(trip_time_s)),
    # These names are hard-coded for num_files == 5 (train_00 .. train_04);
    # update them if the number of train files changes.
    "train_set": ["train_00", "train_01", "train_02", "train_03"],
    "eval_set": ["train_04"],
    "test_set": ["test"]
}

In [None]:
# Where to save data before copying it to deeptte folder
json_path = "../data/deeptte_formatted/"

# All files are opened in "w" mode, so re-running this cell replaces the
# previous output. With append mode a re-run duplicated every sample and left
# config.json holding several concatenated JSON objects, which is not valid JSON.

# Split train samples round-robin over num_files files, one JSON object per line.
# Sample j goes to file j % num_files; each file is opened only once.
train_keys = list(train_traces_dict.keys())
for i in range(num_files):
    with open(json_path+"train_0"+str(i), mode='w') as out_file:
        for obj in train_keys[i::num_files]:
            json.dump(train_traces_dict[obj], out_file)
            out_file.write("\n")

# Save all test samples to a single separate file, one JSON object per line
with open(json_path+"test", mode='w') as out_file:
    for obj in test_traces_dict.keys():
        json.dump(test_traces_dict[obj], out_file)
        out_file.write("\n")

# Write summary dict to config file
with open(json_path+"config.json", mode="w") as out_file:
    json.dump(summary_dict, out_file)