In [1]:
import os
import json

from database import data_utils
from database import shape_utils

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import importlib
importlib.reload(data_utils)
importlib.reload(shape_utils)

%matplotlib inline

RUN_FOLDER = "../results/end_to_end/"

In [2]:
# Build the KCM train/test trace datasets and export them in DeepTTE format.
#
# Outputs (all under RUN_FOLDER + "kcm/"):
#   train_traces.pkl / test_traces.pkl   - cleaned traces, pickled for results analysis
#   deeptte_formatted/train_00..train_0N - training samples, one JSON object per line
#   deeptte_formatted/test               - test samples, one JSON object per line
#   deeptte_formatted/config.json        - summary config computed from the training traces

# Get traces from all data (KCM)
# NOTE(review): get_date_list(start, n) presumably yields n consecutive daily files
# beginning at `start` - confirm against data_utils.
given_names = ['tripid','file','locationtime','lat','lon','vehicleid']
train_data, train_fail_dates = data_utils.combine_pkl_data("../data/kcm_all", data_utils.get_date_list("2022_11_01", 14), given_names)
test_data, test_fail_dates = data_utils.combine_pkl_data("../data/kcm_all", data_utils.get_date_list("2022_11_15", 7), given_names)
print(f"Lost dates train: {train_fail_dates}")
print(f"Lost dates test: {test_fail_dates}")

# Load the GTFS
kcm_gtfs = data_utils.merge_gtfs_files("../data/kcm_gtfs/2022_09_19/")

# Calculate distance between points in each trajectory, do some filtering on speed, n_points
train_traces = data_utils.calculate_trace_df(train_data, 'America/Los_Angeles')
test_traces = data_utils.calculate_trace_df(test_data, 'America/Los_Angeles')

# Match trajectories to timetables and do filtering on stop distance, availability
train_traces = data_utils.clean_trace_df_w_timetables(train_traces, kcm_gtfs)
test_traces = data_utils.clean_trace_df_w_timetables(test_traces, kcm_gtfs)

# Get unique vehicle ids (remapped jointly over train and test)
(train_traces, test_traces), n_unique_veh = data_utils.remap_vehicle_ids([train_traces, test_traces])
print(f"{n_unique_veh} Unique vehicle IDs (ADD TO ATTR.PY)")

# Where to save data before copying it to deeptte folder.
# Create the folder (and its parent, which holds the pickles) up front so the
# cell also works on a fresh checkout where no results exist yet.
json_path = RUN_FOLDER+"kcm/deeptte_formatted/"
num_files = 5
os.makedirs(json_path, exist_ok=True)

# Save trace data for results analysis
with open(RUN_FOLDER+"kcm/train_traces.pkl", "wb") as f:
    pickle.dump(train_traces, f)
with open(RUN_FOLDER+"kcm/test_traces.pkl", "wb") as f:
    pickle.dump(test_traces, f)

# Get the trace data in format for DeepTTE (no more transformations or filters)
train_traces_dict = data_utils.map_to_deeptte(train_traces)
test_traces_dict = data_utils.map_to_deeptte(test_traces)
summary_config = data_utils.get_summary_config(train_traces)

# Delete existing files from a previous run; real sub-directories are left alone
# because os.remove cannot delete them and would abort the cell.
for file in os.listdir(json_path):
    stale_path = json_path+file
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        continue
    os.remove(stale_path)

# Split the training samples round-robin across num_files files: sample j goes to
# file j % num_files. Each file is opened once; a shard with no samples produces
# no file.
train_records = [train_traces_dict[key] for key in train_traces_dict.keys()]
for i in range(num_files):
    shard = train_records[i::num_files]
    if not shard:
        continue
    with open(json_path+"train_0"+str(i), mode='w') as out_file:
        for record in shard:
            json.dump(record, out_file)
            out_file.write("\n")

# Save the separate test dates to a single test file (one JSON object per line)
test_records = [test_traces_dict[key] for key in test_traces_dict.keys()]
if test_records:
    with open(json_path+"test", mode='w') as out_file:
        for record in test_records:
            json.dump(record, out_file)
            out_file.write("\n")

# Write summary dict to config file; "w" guarantees exactly one JSON document
with open(json_path+"config.json", mode="w") as out_file:
    json.dump(summary_config, out_file)

Lost dates train: []
Lost dates test: []
1147 Unique vehicle IDs (ADD TO ATTR.PY)


In [3]:
# Build the ATB train/test trace datasets and export them in DeepTTE format.
#
# Outputs (all under RUN_FOLDER + "atb/"):
#   train_traces.pkl / test_traces.pkl   - cleaned traces, pickled for results analysis
#   deeptte_formatted/train_00..train_0N - training samples, one JSON object per line
#   deeptte_formatted/test               - test samples, one JSON object per line
#   deeptte_formatted/config.json        - summary config computed from the training traces

# Get traces from all data (ATB)
# For now, only Norway dates from 2022_11_02 onward are used.
# A mapping of old IDs to new IDs is needed before schedule data from prior to that date can be used.
# These dates are also somewhat low on data compared to previous ones.
train_dates = [
    "2022_11_02.pkl",
    "2022_11_03.pkl",
    "2022_11_06.pkl",
    "2022_11_07.pkl",
    "2022_11_08.pkl",
    "2022_11_09.pkl",
    "2022_11_10.pkl",
    "2022_11_13.pkl",
    "2022_11_14.pkl",
    "2022_11_15.pkl",
    "2022_11_16.pkl",
    "2022_11_17.pkl",
    "2022_11_20.pkl",
    "2022_11_21.pkl"
]
test_dates = [
    "2022_11_22.pkl",
    "2022_11_23.pkl",
    "2022_11_24.pkl",
    "2022_11_25.pkl",
    "2022_11_26.pkl",
    "2022_11_27.pkl",
    "2022_11_28.pkl"
]
given_names = ['datedvehiclejourney','file','locationtime','lat','lon','vehicle']
train_data, train_fail_dates = data_utils.combine_pkl_data("../data/atb_all", train_dates, given_names)
test_data, test_fail_dates = data_utils.combine_pkl_data("../data/atb_all", test_dates, given_names)
print(f"Lost dates train: {train_fail_dates}")
print(f"Lost dates test: {test_fail_dates}")

# Load the GTFS
nwy_gtfs = data_utils.merge_gtfs_files('../data/nwy_gtfs/2022_12_01/')

# Calculate distance between points in each trajectory, do some filtering on speed
# (vehicle ids are remapped further down, not in this step)
train_traces = data_utils.calculate_trace_df(train_data, 'Europe/Oslo')
test_traces = data_utils.calculate_trace_df(test_data, 'Europe/Oslo')

# Match trajectories to timetables and do filtering on stop distance, availability
train_traces = data_utils.clean_trace_df_w_timetables(train_traces, nwy_gtfs)
test_traces = data_utils.clean_trace_df_w_timetables(test_traces, nwy_gtfs)

# Get unique vehicle ids (remapped jointly over train and test)
(train_traces, test_traces), n_unique_veh = data_utils.remap_vehicle_ids([train_traces, test_traces])
print(f"{n_unique_veh} Unique vehicle IDs (ADD TO ATTR.PY)")

# Where to save data before copying it to deeptte folder.
# Create the folder (and its parent, which holds the pickles) up front so the
# cell also works on a fresh checkout where no results exist yet.
json_path = RUN_FOLDER+"atb/deeptte_formatted/"
num_files = 5
os.makedirs(json_path, exist_ok=True)

# Save trace data for results analysis
with open(RUN_FOLDER+"atb/train_traces.pkl", "wb") as f:
    pickle.dump(train_traces, f)
with open(RUN_FOLDER+"atb/test_traces.pkl", "wb") as f:
    pickle.dump(test_traces, f)

# Get the trace data in format for DeepTTE (no more transformations or filters)
train_traces_dict = data_utils.map_to_deeptte(train_traces)
test_traces_dict = data_utils.map_to_deeptte(test_traces)
summary_config = data_utils.get_summary_config(train_traces)

# Delete existing files from a previous run; real sub-directories are left alone
# because os.remove cannot delete them and would abort the cell.
for file in os.listdir(json_path):
    stale_path = json_path+file
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        continue
    os.remove(stale_path)

# Split the training samples round-robin across num_files files: sample j goes to
# file j % num_files. Each file is opened once; a shard with no samples produces
# no file.
train_records = [train_traces_dict[key] for key in train_traces_dict.keys()]
for i in range(num_files):
    shard = train_records[i::num_files]
    if not shard:
        continue
    with open(json_path+"train_0"+str(i), mode='w') as out_file:
        for record in shard:
            json.dump(record, out_file)
            out_file.write("\n")

# Save the separate test dates to a single test file (one JSON object per line)
test_records = [test_traces_dict[key] for key in test_traces_dict.keys()]
if test_records:
    with open(json_path+"test", mode='w') as out_file:
        for record in test_records:
            json.dump(record, out_file)
            out_file.write("\n")

# Write summary dict to config file; "w" guarantees exactly one JSON document
with open(json_path+"config.json", mode="w") as out_file:
    json.dump(summary_config, out_file)

Lost dates train: []
Lost dates test: []
207 Unique vehicle IDs (ADD TO ATTR.PY)
