In [1]:
import os
import glob
import ast

In [2]:
import pandas as pd
import osmnx as ox
import networkx as nx
import numpy as np
import datetime
import time
from shapely.geometry import Polygon
import gc
from os import walk
import geopandas as gpd
import math
import random
from random import shuffle
import csv
import pickle

In [3]:
# Destination directory for the generated feature files; created when missing.
output_folder = "data/features/Murphy"
os.makedirs(output_folder, exist_ok=True)

# Processed trip CSVs that match the input glob pattern.
file_pattern = "data/processed/Murphy/*_processed.csv"
matched_files = glob.glob(pathname=file_pattern)

In [4]:
# Load every matched CSV (in sorted filename order) and stack them into a
# single edge-level frame, `df_edge`.
#
# `trip_id` is stored in the CSV as the text of a tuple, so it is parsed back
# with `ast.literal_eval`. The first file keeps its trip ids exactly as
# stored; for every later file the first tuple element is replaced by the
# file's position in the sorted list while the second element is kept.
#
# The frames are collected in a list and concatenated once: calling
# `pd.concat` inside the loop re-copies the accumulated frame for every file.
matched_files.sort()
if not matched_files:
    # Fail here with a clear message instead of a NameError on `df_edge` later.
    raise FileNotFoundError("No input files matched pattern: " + file_pattern)

edge_frames = []
for file_cnt, filename in enumerate(matched_files):
    df = pd.read_csv(filename)
    df['trip_id'] = df['trip_id'].apply(ast.literal_eval)
    if file_cnt > 0:
        # The lambda is applied immediately, so it sees this iteration's file_cnt.
        df['trip_id'] = df['trip_id'].apply(lambda x: (file_cnt, x[1]))
    edge_frames.append(df)

df_edge = pd.concat(edge_frames, ignore_index=True)
file_cnt = len(matched_files)  # same final value the manual counter used to hold

In [6]:
# Show the column labels of the combined frame.
df_edge.columns

Index(['trip_id', 'position', 'mass', 'elevation_change',
       'energy_consumption_total_kwh', 'simulated_energy_consumption_kwh',
       'time', 'sumo_time', 'speed', 'sumo_speed', 'fastsim_speed', 'time_acc',
       'time_stage', 'week_day', 'tags', 'osmid', 'road_type', 'speed_limit',
       'length', 'lanes', 'bridge', 'endpoint_u', 'endpoint_v',
       'direction_angle', 'previous_orientation'],
      dtype='object')

In [12]:
synthetic_data_flg = True  # or False for processing real-world data

# Columns kept for feature generation, named as they appear in real-world data.
columns_to_process = [
    'trip_id', 'position', 'mass', 'elevation_change',
    'energy_consumption_total_kwh', 'time', 'speed', 'time_acc',
    'time_stage', 'week_day', 'tags', 'osmid', 'road_type', 'speed_limit',
    'length', 'lanes', 'bridge', 'endpoint_u', 'endpoint_v',
    'direction_angle', 'previous_orientation',
]

if synthetic_data_flg:
    # Synthetic mode: read energy, time and speed from the simulated columns,
    # then rename them to the real-world column names.
    newNames = {'simulated_energy_consumption_kwh': 'energy_consumption_total_kwh', 'sumo_time': 'time', 'fastsim_speed': 'speed'}
    simulated_source = {target: source for source, target in newNames.items()}
    columns_to_process = [simulated_source.get(col, col) for col in columns_to_process]
    df_edge = df_edge[columns_to_process].rename(columns=newNames)
else:
    df_edge = df_edge[columns_to_process]


In [13]:
# Display the frame after column selection / renaming.
df_edge

Unnamed: 0,trip_id,position,mass,elevation_change,energy_consumption_total_kwh,time,speed,time_acc,time_stage,week_day,...,osmid,road_type,speed_limit,length,lanes,bridge,endpoint_u,endpoint_v,direction_angle,previous_orientation
0,"(0, 0)",0,30000.0,-3.5,0.989861,19,"[81.79718784449044, 82.124822699712, 82.854529...",2020-08-10 12:32:09,4,1,...,"[640843257, 1080549314]",tertiary,48.2802,350.159,2,0,0,0,-90.527956,0.000000
1,"(0, 0)",1,30000.0,-2.9,2.610350,11,"[25.958930233970946, 29.37444979918777, 33.423...",2020-08-10 12:32:09,4,1,...,1201539002,tertiary,48.2802,215.293,0,0,0,0,-85.893518,-1.606653
2,"(0, 0)",2,30000.0,0.1,2.034210,12,"[65.61472430563272, 68.16477343579126, 70.5949...",2020-08-10 12:32:09,4,1,...,"[1201539002, 39179870]",tertiary,48.2802,209.335,0,0,0,0,-89.217843,0.296164
3,"(0, 0)",3,30000.0,0.6,0.139522,3,"[18.091194032702823, 19.644411130682677, 22.60...",2020-08-10 12:32:09,4,1,...,1027628929,motorway_link,48.2802,19.645,2,0,0,0,-0.000000,-89.320385
4,"(0, 0)",4,30000.0,3.7,5.576379,26,"[26.494924413398724, 30.952742943000356, 35.75...",2020-08-10 12:32:09,4,1,...,"[998758729, 39195141]",motorway_link,48.2802,563.586,2,0,0,0,13.260348,-14.954275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99616,"(157, 4)",52,30000.0,-7.8,1.819134,52,"[41.224273887952876, 39.111167825269355, 38.30...",2021-02-03 11:06:22,3,3,...,"[636154193, 6012871]",tertiary,40.2335,534.378,2,0,0,0,143.717016,-0.073855
99617,"(157, 4)",53,30000.0,-0.3,3.183193,92,"[30.633814396078304, 28.5806723152969, 29.2018...",2021-02-03 11:06:22,3,3,...,"[636154193, 5998138]",tertiary,32.1868,787.510,2,0,0,4,-179.277774,0.312897
99618,"(157, 4)",54,30000.0,-2.4,1.788509,49,"[21.33135270801623, 24.068019212049744, 27.737...",2021-02-03 11:06:22,3,3,...,6036088,unclassified,32.1868,415.024,2,0,0,0,-90.564911,91.315328
99619,"(157, 5)",0,8000.0,0.0,0.940321,26,"[0.0, 4.828773783757796, 10.581281767363732, 1...",2021-02-05 09:34:20,3,5,...,6040042,residential,32.1868,171.945,2,0,0,0,88.121576,0.000000


In [38]:
# Spot-check a single row (positional index 72) to inspect the column values.
df_edge.iloc[72]

trip_id                                                                    (0, 1)
position                                                                        0
mass                                                                       8000.0
elevation_change                                                             10.5
energy_consumption_total_kwh                                             1.304077
time                                                                           32
speed                           [0.0, 7.133007841291272, 12.904106082362334, 1...
time_acc                                                      2020-08-10 13:40:07
time_stage                                                                      4
week_day                                                                        1
tags                                                      (34559009, 33323569, 0)
osmid                                                                   202308024
road_type       

In [15]:
# `tags` is text: strip the first and last character, split on ", ",
# convert every field to int, and keep all but the last field as a tuple.
df_edge['osmNodeIdUV'] = df_edge['tags'].apply(
    lambda x: tuple([int(field) for field in x[1:-1].split(", ")][:-1])
)

In [17]:
# Load the pickled dual-graph node collection.
# Unpickling can execute arbitrary code, so only load files you trust.
dual_graph_nodes_file_name = "results/dualGraphNodes.pkl"
with open(dual_graph_nodes_file_name, mode="rb") as pickle_file:
    dualGraphNode = pickle.load(pickle_file)

In [18]:
# Map every (u, v) pair to the position of (u, v, 0) inside `dualGraphNode`.
#
# `dualGraphNode.index(...)` scans the whole sequence for each row, so a
# lookup table is built once instead. `setdefault` keeps the FIRST position of
# a repeated node, which is what `.index` returns.
# Assumes the entries of `dualGraphNode` are hashable (e.g. tuples) - TODO confirm.
# A node missing from the table now raises KeyError (previously ValueError).
dual_node_position = {}
for node_idx, node in enumerate(dualGraphNode):
    dual_node_position.setdefault(node, node_idx)

df_edge['osmNode'] = df_edge.osmNodeIdUV.apply(lambda x: dual_node_position[(x[0], x[1], 0)])

In [19]:
# Forward-fill missing values down each column (a gap takes the value of the
# closest earlier row). `DataFrame.ffill` replaces `fillna(method='ffill')`,
# whose `method` argument is deprecated since pandas 2.1.
df_edge = df_edge.ffill(axis=0)

In [20]:
# Per-trip count of non-null `osmNodeIdUV` values, broadcast to every row of the trip.
df_edge = df_edge.assign(
    segment_count=df_edge.groupby('trip_id')['osmNodeIdUV'].transform('count')
)

In [26]:
# `network_id` starts out as a copy of the `osmid` column.
df_edge = df_edge.assign(network_id=df_edge['osmid'])

In [28]:
# Discard rows whose trip has fewer than 3 segments, then renumber the index.
df_edge = df_edge.loc[lambda frame: ~(frame['segment_count'] < 3)].reset_index(drop=True)

In [35]:
# 1st and 99th percentiles of elevation change, used as outlier cut-offs.
per_ele_001 = df_edge['elevation_change'].quantile(q=0.01)
per_ele_99 = df_edge['elevation_change'].quantile(q=0.99)
print(per_ele_001, per_ele_99)

-27.5 26.600000000000012


In [37]:
# Remove rows whose elevation change lies outside [per_ele_001, per_ele_99],
# then renumber the index.
df_edge = df_edge.loc[
    lambda frame: ~(
        (frame['elevation_change'] > per_ele_99)
        | (frame['elevation_change'] < per_ele_001)
    )
].reset_index(drop=True)

# `df_test` is another name for the same frame object, not a copy.
df_test = df_edge


In [39]:
# Row count before the orientation filter.
len(df_test)

97258

In [40]:
# Drop rows whose |previous_orientation| exceeds 179
# (presumably degrees, i.e. a near-complete reversal - TODO confirm units).
#
# The mask is computed on the whole column at once; the previous row-wise
# `DataFrame.apply(axis=1)` built a Series object per row to compare one scalar.
# The drop stays in-place on purpose: `df_test` shares its object with
# `df_edge`, and in-place mutation keeps both names in sync as before.
counterFunc = df_test['previous_orientation'].abs() > 179
df_test.drop(df_test.index[counterFunc], inplace=True)
df_test.reset_index(drop=True, inplace=True)
len(df_test)

96348

In [41]:
# Number the trips 0, 1, 2, ... in row order: the counter advances whenever
# `trip_id` differs from the previous row's `trip_id`.
#
# The numbers are accumulated in a plain list and assigned once; writing one
# cell at a time through `df_test.loc[i, 'trip']` is very slow on a frame of
# this size and created no `trip` column at all for an empty frame.
trip_ids = df_test['trip_id'].tolist()
trip_numbers = []
cnt = 0
for i, current_trip_id in enumerate(trip_ids):
    if i > 0 and current_trip_id != trip_ids[i - 1]:
        cnt += 1
    trip_numbers.append(cnt)

# float dtype matches the column the per-row `.loc` assignment used to create.
df_test['trip'] = np.asarray(trip_numbers, dtype=float)

In [42]:
# Fixed seed so the trip shuffle is reproducible.
random.seed(1234)

# One list entry per distinct trip_id, then shuffled in place.
trip_num = df_test['trip_id'].unique().size
k_folder_list = [*range(trip_num)]
random.shuffle(k_folder_list)
print('num of trips', len(k_folder_list))

num of trips 1888


In [43]:
# Folder-name prefix; within this notebook it is only referenced by the
# commented-out lookup-table export code.
outputFolderPrefix = 'data/pretrainingData'

In [44]:
# Hold out the last 20% of the shuffled trips as the test set; the first 80%
# stay in k_folder_list. NOTE: re-running this cell shrinks k_folder_list again.
split_at = int(0.8 * len(k_folder_list))
test_list, k_folder_list = k_folder_list[split_at:], k_folder_list[:split_at]

In [45]:
# Show the column labels available before the datasets are exported.
df_test.columns

Index(['trip_id', 'position', 'mass', 'elevation_change',
       'energy_consumption_total_kwh', 'time', 'speed', 'time_acc',
       'time_stage', 'week_day', 'tags', 'osmid', 'road_type', 'speed_limit',
       'length', 'lanes', 'bridge', 'endpoint_u', 'endpoint_v',
       'direction_angle', 'previous_orientation', 'osmNodeIdUV', 'osmNode',
       'segment_count', 'network_id', 'trip'],
      dtype='object')

In [54]:
# ---------------------------------------------------------------------------
# Dataset-independent preparation (runs once).
#
# Nothing in this part depends on the train/validation shuffle, so it lives
# outside the per-dataset loop. Running it once per dataset was also harmful:
# after the first pass `road_type` holds integer codes, so the second pass
# rebuilt the road-type dictionary from those codes and overwrote the saved
# name -> code mapping (CSV and .npy) with a code -> code mapping.
# ---------------------------------------------------------------------------

# Continuous inputs that get z-score normalised, and the two prediction targets.
feature_columns = ['speed_limit', 'mass', 'elevation_change',
                   'previous_orientation', 'length', 'direction_angle']
label_columns = ['energy_consumption_total_kwh', 'time']

# Explicit copy so the column assignments below do not act on a slice.
df_test = df_test[['network_id', 'position',
       'road_type', 'speed_limit', 'mass', 'elevation_change',
       'previous_orientation', 'length', 'energy_consumption_total_kwh',
       'time',  'direction_angle', 'time_stage', 'week_day',
        'lanes', 'bridge', 'endpoint_u', 'endpoint_v', 'segment_count', 'trip','osmNodeIdUV','osmNode' ]].copy()

# Recount segments per trip (rows were filtered after the first count) and
# renumber positions 1..n inside each trip. Every `trip` value labels one
# contiguous run of rows, so a per-group running count equals the former
# row-by-row counter.
df_test['segment_count'] = df_test.groupby('trip')['network_id'].transform('count')
df_test['position'] = df_test.groupby('trip').cumcount() + 1

# Encode road types as integers ordered by their mean speed limit (ascending).
mean_speed_by_road_type = df_test.groupby('road_type')['speed_limit'].mean()
dictionary = {
    road_type: code
    for code, road_type in enumerate(mean_speed_by_road_type.sort_values().index)
}

# newline="" is what the csv module expects for files handed to csv.writer.
with open("results/road_type_dictionary.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["road type", "value"])
    writer.writerows(dictionary.items())
np.save('results/road_type_dictionary.npy', dictionary)

# Re-export the stored endpoints dictionary as CSV.
endpoints_dictionary = np.load('results/endpoints_dictionary.npy', allow_pickle=True).item()
with open("results/endpoints_dictionary.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["endpoint", "value"])
    writer.writerows(endpoints_dictionary.items())

# Unknown road types raise KeyError here rather than silently becoming NaN.
df_test['road_type'] = df_test['road_type'].apply(dictionary.__getitem__)

# Column order used for the normalised frame: continuous features first.
new_columns = feature_columns + [
    'network_id',
    'position',
    'road_type',
    'time_stage',
    'week_day',
    'lanes',
    'bridge',
    'endpoint_u',
    'endpoint_v',
] + label_columns + [
    'segment_count',
    'trip',
    'osmNodeIdUV',
    'osmNode',
]
df02 = df_test.reindex(columns=new_columns)

# Record mean/std of the features and labels; only the features are normalised.
# NOTE(review): the statistics are computed over ALL trips, i.e. including the
# validation and test trips - confirm that this is intended.
with open("results/mean_std.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["attribute", "mean", "std"])
    for val in feature_columns + label_columns:
        x_mean = df02[val].mean()
        x_std = df02[val].std()
        writer.writerow([val, x_mean, x_std])
        if val in feature_columns:
            df02[val] = (df02[val] - x_mean) / x_std

# ---------------------------------------------------------------------------
# Per-dataset export: reshuffle the non-test trips, split them, write the CSVs.
# ---------------------------------------------------------------------------
file_name_list = ["train_data.csv", "val_data.csv", "test_data.csv"]
output_columns = ['data','label','network_id','segment_count',"position_new","road_type","time_stage", "week_day", "lanes", "bridge", "endpoint_u", "endpoint_v","trip",'osmNode']

for datasetenumerate in range(1,3):
    outputpath = os.path.join(output_folder, "datasets/" + str(datasetenumerate))
    # makedirs also creates the intermediate "datasets" directory when missing.
    os.makedirs(outputpath, exist_ok=True)

    # The shuffle is in place, so each dataset's order builds on the previous one.
    random.seed(datasetenumerate)
    shuffle(k_folder_list)
    # 75/25 split of the non-test trips (the "60-20-20" scheme noted originally).
    n_train = int(0.75*len(k_folder_list))
    train_list = k_folder_list[:n_train]
    val_list = k_folder_list[n_train:]

    print(len(train_list), len(val_list), len(test_list))

    df_train = df02[df02['trip'].isin(train_list)]
    df_val = df02[df02['trip'].isin(val_list)]
    df_t = df02[df02['trip'].isin(test_list)]

    for file, split_frame in zip(file_name_list, [df_train, df_val, df_t]):
        df = split_frame.ffill().reset_index(drop=True)
        # Pack the model inputs and the targets into one list-valued cell per row.
        df['data'] = df.apply(lambda x: [x['speed_limit'],x['mass'],x['elevation_change'],x['previous_orientation'],x['length'],x['direction_angle']], axis = 1)
        df['label'] = df.apply(lambda x: [x["energy_consumption_total_kwh"],x["time"]], axis = 1)
        # 1-based position inside each trip; kept as float so the CSV text
        # ("1.0", "2.0", ...) matches what the per-row assignment produced.
        df['position_new'] = (df.groupby('trip').cumcount() + 1).astype(float)
        df['trip'] = df['trip'].astype(int)
        df = df[output_columns]
        df.to_csv(os.path.join(outputpath,file),header=False, index = False)

1132 378 378
1132 378 378
