In [1]:
import datetime
import os
import json

from database import data_utils

import contextily as cx
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from shapely.ops import nearest_points

import importlib
importlib.reload(data_utils)

<module 'database.data_utils' from '/Users/zack/Desktop/valle/src/database/data_utils.py'>

In [2]:
# Build the train / test traces from the raw per-day files.
# Arguments shared by both splits are named once so the two calls cannot drift apart.
raw_folder = "../data/kcm_all"
keep_cols = ('orientation', 'scheduledeviation', 'tripdistance', 'locationtime')

train_dates = data_utils.get_date_list("2022_11_01", 14)
test_dates = data_utils.get_date_list("2022_11_15", 7)

train_data = data_utils.combine_specific_folder_data(raw_folder, train_dates)
test_data = data_utils.combine_specific_folder_data(raw_folder, test_dates)

# Each call receives its own fresh list of columns to keep
train_traces = data_utils.calculate_trace_df(
    train_data, 'file', 'tripid', 'locationtime', 'lat', 'lon',
    list(keep_cols), 'America/Los_Angeles', use_coord_dist=True,
)
test_traces = data_utils.calculate_trace_df(
    test_data, 'file', 'tripid', 'locationtime', 'lat', 'lon',
    list(keep_cols), 'America/Los_Angeles', use_coord_dist=True,
)

# Persist both trace frames so the results analysis can reload them later
trace_outputs = {
    "../results/kcm2weeks/data/train_traces.pkl": train_traces,
    "../results/kcm2weeks/data/test_traces.pkl": test_traces,
}
for out_path, traces in trace_outputs.items():
    with open(out_path, "wb") as f:
        pickle.dump(traces, f)

In [7]:
# # Get traces from all data
# train_dates = data_utils.get_date_list("2022_09_01", 14)
# test_dates = data_utils.get_date_list("2022_09_14", 7)
# train_data = data_utils.combine_specific_folder_data("../data/nwy_all", train_dates)
# test_data = data_utils.combine_specific_folder_data("../data/nwy_all", test_dates)
# train_traces = data_utils.calculate_trace_df(train_data, 'file', 'datedvehiclejourney', 'locationtime', 'lat', 'lon', ['bearing','locationtime'], 'Europe/Oslo', use_coord_dist=True)
# test_traces = data_utils.calculate_trace_df(test_data, 'file', 'datedvehiclejourney', 'locationtime', 'lat', 'lon', ['bearing','locationtime'], 'Europe/Oslo', use_coord_dist=True)

# # Save trace data for results analysis
# with open("../results/nwy2weeks/data/train_traces.pkl", "wb") as f:
#     pickle.dump(train_traces, f)
# with open("../results/nwy2weeks/data/test_traces.pkl", "wb") as f:
#     pickle.dump(test_traces, f)

In [4]:
# Show the date files and row count of each split, separated by a blank line
print(
    "Train:", train_dates, len(train_traces),
    "",
    "Test:", test_dates, len(test_traces),
    sep="\n",
)

Train:
['2022_11_01.pkl', '2022_11_02.pkl', '2022_11_03.pkl', '2022_11_04.pkl', '2022_11_05.pkl', '2022_11_06.pkl', '2022_11_07.pkl', '2022_11_08.pkl', '2022_11_09.pkl', '2022_11_10.pkl', '2022_11_11.pkl', '2022_11_12.pkl', '2022_11_13.pkl', '2022_11_14.pkl']
5601830

Test:
['2022_11_15.pkl', '2022_11_16.pkl', '2022_11_17.pkl', '2022_11_18.pkl', '2022_11_19.pkl', '2022_11_20.pkl', '2022_11_21.pkl']
3987209


In [5]:
# Pool the vehicle ids of both splits (train rows first, then test) into one flat array
vehicle_id_columns = [train_traces['vehicleid'], test_traces['vehicleid']]
all_vehicle_ids = pd.concat(vehicle_id_columns).values.flatten()

In [6]:
# all_vehicle_ids = pd.concat([train_traces['vehicle'], test_traces['vehicle']]).values.flatten()

In [7]:
# Recode vehicle ids to contiguous integer codes starting from 0.
# Codes follow the order of first appearance in all_vehicle_ids, so the same
# input data always yields the same mapping. Iterating a plain set() gives no
# such guarantee (for string ids the order can change between kernel sessions),
# which would make codes disagree with artefacts saved by an earlier run.
mapping = {v: k for k, v in enumerate(pd.unique(all_vehicle_ids))}

# Vectorised dictionary lookup instead of a Python-level loop over every row;
# .to_numpy() makes the column assignment below purely positional.
train_recode = train_traces['vehicleid'].map(mapping).to_numpy()
test_recode = test_traces['vehicleid'].map(mapping).to_numpy()

train_traces['vehicleid_recode'] = train_recode
test_traces['vehicleid_recode'] = test_recode

In [8]:
# # Recode vehicle ids to start from 0
# mapping = {v:k for k,v in enumerate(set(all_vehicle_ids))}
# train_recode = [mapping[y] for y in train_traces['vehicle'].values.flatten()]
# test_recode = [mapping[y] for y in test_traces['vehicle'].values.flatten()]

# train_traces['vehicleid_recode'] = train_recode
# test_traces['vehicleid_recode'] = test_recode

In [9]:
# The value displayed below is the number of distinct vehicle ids across the
# train and test traces combined; it must be put into models/base/Attr.py.
# Both splits are counted because it is possible that not all vehicle ids
# appear in the training data.
len(pd.unique(all_vehicle_ids))

1148

In [10]:
batch_size = 10
num_files = 5

# DeepTTE crashes on a minibatch of size 1, so when exactly one test row would
# be left over after batching, the final row is discarded.
n_test = len(test_traces)
if n_test % batch_size == 1:
    test_traces = test_traces.iloc[:n_test - 1, :]

# Training data is spread over num_files files: cut the tail so the row count
# is an exact multiple of batch_size * num_files.
# NOTE(review): both trims count rows of the trace frames; if a DeepTTE sample
# is a whole trip rather than a row, the count that matters may differ - confirm.
n_train = len(train_traces)
extras = n_train % (batch_size * num_files)
train_traces = train_traces.iloc[:n_train - extras, :]

In [13]:
# Convert both trace frames with data_utils.map_to_deeptte, passing the same
# pair of key columns to each call (presumably one entry per file/trip - TODO confirm)
trip_key_cols = ('file', 'tripid')
train_traces_dict = data_utils.map_to_deeptte(train_traces, *trip_key_cols)
test_traces_dict = data_utils.map_to_deeptte(test_traces, *trip_key_cols)

In [None]:
# train_traces_dict = data_utils.map_to_deeptte(train_traces, 'file', 'datedvehiclejourney')
# test_traces_dict = data_utils.map_to_deeptte(test_traces, 'file', 'datedvehiclejourney')

In [14]:
# config.json: summary statistics of the training traces plus the names of the
# files that make up the train / eval / test sets.

# Per-(file, tripid) maxima of the two cumulative columns. Computed once and only
# for the columns that are needed, instead of aggregating every column of the
# frame separately for each of the four trip-level statistics.
trip_max = train_traces.groupby(['file', 'tripid'])[['dist_cumulative', 'time_cumulative']].max()
trip_dist = trip_max[['dist_cumulative']].values.flatten()
trip_time = trip_max[['time_cumulative']].values.flatten()

# Every statistic is cast to a built-in float so json.dump can serialise it
# whatever numpy scalar type the reduction returns.
summary_dict = {
    'dist_gap_mean': float(np.mean(train_traces['dist_calc_km'])),
    'dist_gap_std': float(np.std(train_traces['dist_calc_km'])),
    'time_gap_mean': float(np.mean(train_traces['locationtime_diff'])),
    'time_gap_std': float(np.std(train_traces['locationtime_diff'])),
    'lngs_std': float(np.std(train_traces['lon'])),
    'lngs_mean': float(np.mean(train_traces['lon'])),
    'lats_mean': float(np.mean(train_traces['lat'])),
    'dist_std': float(np.std(trip_dist)),
    "dist_mean": float(np.mean(trip_dist)),
    "lats_std": float(np.std(train_traces['lat'])),
    "time_mean": float(np.mean(trip_time)),
    "time_std": float(np.std(trip_time)),
    "train_set": ["train_00", "train_01", "train_02", "train_03"],
    "eval_set": ["train_04"],
    "test_set": ["test"]
}

In [None]:
# train_traces['lat'] = train_traces['lat'].astype(float)
# train_traces['lon'] = train_traces['lon'].astype(float)

# # config.json
# summary_dict = {
#     'dist_gap_mean': np.mean(train_traces['dist_calc_km']),
#     'dist_gap_std': np.std(train_traces['dist_calc_km']),
#     'time_gap_mean': np.mean(train_traces['locationtime_diff']),
#     'time_gap_std': np.std(train_traces['locationtime_diff']),
#     'lngs_std': np.std(train_traces['lon']),
#     'lngs_mean': np.mean(train_traces['lon']),
#     'lats_mean': np.mean(train_traces['lat']),
#     'dist_std': np.std(train_traces.groupby(['file','datedvehiclejourney']).max()[['dist_cumulative']].values.flatten()),
#     "dist_mean": np.mean(train_traces.groupby(['file','datedvehiclejourney']).max()[['dist_cumulative']].values.flatten()),
#     "lats_std": np.std(train_traces['lat']),
#     "time_mean": np.mean(train_traces.groupby(['file','datedvehiclejourney']).max()[['time_cumulative']].values.flatten()),
#     "time_std": np.std(train_traces.groupby(['file','datedvehiclejourney']).max()[['time_cumulative']].values.flatten()),
#     "train_set": ["train_00", "train_01", "train_02", "train_03"],
#     "eval_set": ["train_04"],
#     "test_set": ["test"]
# }

In [15]:
# Where to save data before copying it to deeptte folder
json_path = "../data/deeptte_formatted/"
os.makedirs(json_path, exist_ok=True)


def _write_json_lines(path, samples):
    """Write each item of `samples` as one JSON document per line to `path`.

    The file is opened once in write mode, so re-running the cell replaces the
    previous contents instead of appending duplicate samples to them.
    """
    with open(path, mode='w') as out_file:
        for sample in samples:
            json.dump(sample, out_file)
            out_file.write("\n")


# Split the training samples evenly across num_files files in round-robin order:
# sample j goes to file j % num_files. The names (train_00, train_01, ...) are
# the ones listed under train_set / eval_set in summary_dict.
train_samples = [train_traces_dict[key] for key in train_traces_dict.keys()]
for i in range(num_files):
    _write_json_lines(json_path + f"train_{i:02d}", train_samples[i::num_files])

# Save separate data for the test file
_write_json_lines(
    json_path + "test",
    (test_traces_dict[key] for key in test_traces_dict.keys()),
)

# Write summary dict to config file. Write mode keeps config.json a single valid
# JSON object; appending on a re-run would concatenate two objects.
with open(json_path + "config.json", mode="w") as out_file:
    json.dump(summary_dict, out_file)