In [None]:
# Widen the notebook's main container to 85% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
# python 3.8.13
import os
import pandas as pd ## 1.4.2
import numpy as np ## 1.22.4
import time
import csv
import datetime
import pickle
import gc
from glob import glob
from shapely.geometry import Point, LineString, Polygon ## 1.8.2

import warnings
# Silence every warning category for the whole kernel session.
# NOTE(review): this is a blanket filter, so library diagnostics (e.g. pandas
# chained-assignment warnings) are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Parameter Definition

## OSM bounding box, ordered (lat_min, lng_min, lat_max, lng_max) — the order prep_spatial1 unpacks.
osmbb = (37.47426965000426, 126.98132850000002, 37.5241478000146, 127.06978235003825) ## Site1 : Seoul-Core

## Upper speed bound for outlier removal (unit presumably km/h — TODO confirm against the raw data spec).
speed_max_threshold = 120
## Length of one processing window, in minutes (it is passed to datetime.timedelta(minutes=...)).
timestep = 10
## Idle-time criterion; prep_parking multiplies it by 6 to obtain a consecutive-record count
## (presumably minutes at ~6 records per minute — TODO confirm).
parking_time_criteria = 3

## Study area generation
## Corner points of the study polygon. The values appear to be (lat, lng) pairs, which matches
## the Point(Y, X) construction used in prep_spatial2 — verify if the axis order is ever changed.
p1_buffer = (37.521370, 127.052866)
p2_buffer = (37.493310, 127.065862)
p3_buffer = (37.471532, 126.997581)
p4_buffer = (37.501859, 126.985552)
target_area = Polygon([p1_buffer, p2_buffer, p3_buffer, p4_buffer])

In [None]:
# Reading data and configuring the initial dataframe format.
def read_data(filename):
    """Load comma-separated raw files into one trajectory dataframe.

    Parameters
    ----------
    filename : iterable of str
        Paths of the raw files. Every file is expected to hold nine numeric,
        comma-separated columns in the order listed in ``raw_columns`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``TripID``, ``X``, ``Y``, ``Time``, ``Speed``, ``Occupied``
        with a fresh 0..n-1 index. ``Time`` is parsed from the numeric
        ``YYYYMMDDhhmmss`` stamp; ``X`` and ``Y`` are the raw values divided
        by 10**7.

    Raises
    ------
    ValueError
        If no file yields a usable row, or a time stamp cannot be parsed.
    """
    raw_columns = ["TripID", "X", "Y", "Z", "Time", "Head", "Speed", "GPS type", "Occupied"]
    frames = []

    for f in filename:
        try:
            # ndmin=2 keeps a one-row file as a (1, 9) table; without it the row
            # comes back as a flat vector and would be loaded as nine rows.
            data = np.loadtxt(f, delimiter=',', ndmin=2)
        except (OSError, ValueError) as err:
            # Best effort: report and skip unreadable files instead of hiding the failure.
            print(f'Skipping {f}: {err}')
            continue

        if data.size == 0:
            continue
        if data.shape[1] != len(raw_columns):
            print(f'Skipping {f}: expected {len(raw_columns)} columns, found {data.shape[1]}')
            continue

        frames.append(pd.DataFrame(data, columns=raw_columns))

    if not frames:
        raise ValueError('No usable data could be loaded from the given files.')

    # Single concatenation; growing a dataframe inside the loop is quadratic.
    datalist = pd.concat(frames, ignore_index=True)
    datalist = datalist.astype({'TripID': 'int', 'Occupied': 'int'})

    # datetime formatting: the stamp is stored as a number such as 20180402093000.
    stamps = datalist['Time'].astype('int64').astype(str)
    datalist['Time'] = pd.to_datetime(stamps, format='%Y%m%d%H%M%S')

    datalist['X'] = datalist['X']/(10**7)
    datalist['Y'] = datalist['Y']/(10**7)

    datalist = datalist[['TripID', 'X', 'Y', 'Time', 'Speed', 'Occupied']]

    print('All data have been successfully loaded...')
    return datalist

In [None]:
# Remove over- or under-recorded data (±5%)
def prep_measure(dataframe, timestep = 10):
    """Keep only trips whose record count is within ±5% of ``6*timestep``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``TripID`` column.
    timestep : number, default 10
        Window length; the expected record count per trip is ``6*timestep``
        (assumes ~6 records per minute — TODO confirm with the data spec).

    Returns
    -------
    pandas.DataFrame
        Rows of the accepted trips, in their original order, re-indexed 0..n-1.
    """
    expected = 6*timestep
    slack = expected*0.05

    per_trip = dataframe['TripID'].value_counts()
    # Both bounds are inclusive.
    accepted = per_trip[per_trip.between(expected - slack, expected + slack)]

    kept_rows = dataframe['TripID'].isin(list(accepted.index))
    return dataframe[kept_rows].reset_index(drop=True)

In [None]:
# Spatial Preprocessing (alleviate)
def prep_spatial1(dataframe, bounding_box): ## bounding_box = (lat_min, lng_min, lat_max, lng_max)
    """Keep the rows whose coordinates fall inside a rectangular bounding box.

    ``X`` is compared against the longitude bounds and ``Y`` against the
    latitude bounds; all four bounds are inclusive. The original index of the
    surviving rows is preserved.
    """
    lat_min, lng_min, lat_max, lng_max = (bounding_box[k] for k in range(4))

    inside_lng = dataframe['X'].between(lng_min, lng_max)
    inside_lat = dataframe['Y'].between(lat_min, lat_max)

    return dataframe[inside_lng & inside_lat]

In [None]:
# Speed Outlier Eliminate
def prep_speed(dataframe, speed_max = 120):
    """Drop every record whose ``Speed`` exceeds ``speed_max``.

    Records with ``Speed`` equal to ``speed_max`` are kept; records whose speed
    is missing (NaN) fail the comparison and are dropped. The original index
    of the surviving rows is preserved.
    """
    within_limit = dataframe['Speed'].le(speed_max)
    return dataframe.loc[within_limit]

In [None]:
# Remove long-term idle vehicles
def _longest_zero_run(speeds):
    """Return the length of the longest run of consecutive zero values in ``speeds``."""
    is_zero = speeds.to_numpy() == 0
    if not is_zero.any():
        return 0
    # Pad with a non-zero marker on both sides so every run has a start and an end edge.
    padded = np.concatenate(([0], is_zero.astype(np.int8), [0]))
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)
    return int((run_ends - run_starts).max())


def prep_parking(dataframe, parking_time_criteria = 3):
    """Drop every trip that contains a long uninterrupted zero-speed run.

    A trip is removed when its longest run of consecutive ``Speed == 0``
    records reaches ``6*parking_time_criteria`` records. At least one zero
    record is always required, so a criterion of 0 does not flag moving trips.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``TripID`` and ``Speed``; rows of a trip are evaluated in
        their existing order.
    parking_time_criteria : number, default 3
        Idle-time criterion (presumably minutes at ~6 records per minute —
        TODO confirm).

    Returns
    -------
    pandas.DataFrame
        Rows of the remaining trips with their original index.
    """
    # The run length is compared directly with the threshold, so short thresholds
    # (a run of exactly two records) are honoured as well.
    threshold = max(6*parking_time_criteria, 1)

    # One grouped pass instead of re-filtering the whole frame for every TripID.
    parking_ids = [
        trip_id
        for trip_id, speeds in dataframe.groupby('TripID', sort=False)['Speed']
        if _longest_zero_run(speeds) >= threshold
    ]

    datalist = dataframe[~dataframe['TripID'].isin(parking_ids)]

    return datalist

In [None]:
# Spatial Preprocessing (locally)
def prep_spatial2(dataframe, area): # area should have a Polygon style geometry format.
    """Keep the records that lie strictly within ``area``.

    Each record is turned into ``Point(Y, X)`` and tested with
    ``Point.within(area)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``X`` and ``Y``. It is not modified: the helper columns
        are added to an internal copy.
    area : geometry
        Object accepted by ``Point.within`` (e.g. a shapely ``Polygon``) that
        uses the same (Y, X) axis order as the points built here.

    Returns
    -------
    pandas.DataFrame
        Matching rows re-indexed 0..n-1, including the helper columns
        ``Point`` and ``Targeted``.
    """
    # Work on a copy so the caller's frame (often a slice) does not gain columns.
    tagged = dataframe.copy()
    tagged['Point'] = tagged.apply(lambda x: Point(x['Y'], x['X']), axis=1)
    tagged['Targeted'] = tagged.apply(lambda x: x['Point'].within(area), axis=1)

    datalist = tagged[tagged['Targeted'] == True].reset_index(drop=True)

    return datalist

In [None]:
# Final preprocessing process
# TripID_order column and trajectory dictionary generation & Save
def prep_fin(dataframe):
    """Add a per-trip sequence label and build the trajectory dictionary.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``TripID``, ``X`` and ``Y``. It is not modified.

    Returns
    -------
    datalist : pandas.DataFrame
        The input rows regrouped trip by trip (trips in order of first
        appearance, rows in their original order and with their original
        index) plus a ``TripID_order`` column of the form ``'<TripID>_<k>'``,
        ``k`` counting from 1 within each trip. Empty input yields an empty
        ``pd.DataFrame()``.
    traj_dict : dict
        Maps each TripID to its list of ``(Y, X)`` tuples.
    """
    pieces = []
    traj_dict = {}

    # One grouped pass; sort=False keeps trips in order of first appearance.
    for idx, group in dataframe.groupby('TripID', sort=False):
        df_sample = group.copy()  # explicit copy: never assign into a view of the input
        df_sample['TripID_order'] = [str(idx)+'_'+str(k+1) for k in range(len(df_sample))]
        pieces.append(df_sample)
        traj_dict[idx] = list(df_sample[['Y', 'X']].itertuples(index=False, name=None))

    # Concatenate once; repeated concat inside the loop is quadratic.
    datalist = pd.concat(pieces) if pieces else pd.DataFrame()

    return datalist, traj_dict

In [None]:
def filename_generator(day, order):
    """Build the three output paths for one time window of one day.

    Creates the folder ``DTGoutput<dd>`` in the current working directory when
    it does not exist yet, then returns the tuple
    ``(data csv path, data pkl path, trajectory pickle path)``. ``day`` is
    zero-padded to two digits and ``order`` to three digits.
    """
    day_tag = f'{day:02}'
    out_dir = 'DTGoutput' + day_tag
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    prefix = './' + out_dir + '/'
    stem = day_tag + '_' + f'{order:03}'
    layout = (('data_', '.csv'), ('data_', '.pkl'), ('traj_', '.pickle'))

    return tuple(prefix + head + stem + ext for head, ext in layout)

In [None]:
## Enter the target day (day of the month). It is stored as args['day'] and is used to
## build both the raw-data folder name and the output file names.
target_day = 2

In [None]:
args = {'year': 2018,
        'month': 4,
        'day': target_day,
        'half': 'AM'}  ## Set to either 'AM' or 'PM'.

# Raw-data folder '<YYYY>-<MM>-<DD>_<AM|PM>', built from args so the date is defined in one place.
file_path1 = f"{args['year']:04d}-{args['month']:02d}-{args['day']:02d}_{args['half']}"
file_path2 = '../data/'+file_path1+'/*.DAT'

# Number of `timestep`-minute windows in a 12-hour half day (72 for 10-minute windows).
# Window indices run 0..N-1 for 'AM' and N..2N-1 for 'PM'.
windows_per_half = (12*60)//timestep

args['hour'] = 0 if args['half'] == 'AM' else 12
args['start'] = 0 if args['half'] == 'AM' else windows_per_half
args['finish'] = windows_per_half if args['half'] == 'AM' else 2*windows_per_half

pd.options.display.float_format = '{:.6f}'.format
start_time = time.time()

datanames = glob(file_path2)
## Ensure the raw data are pre-placed under '../data/~'.
if not datanames:
    raise FileNotFoundError(f'No raw data files match {file_path2}')

df = read_data(datanames)
print(f'Time taken to load files: {time.time() - start_time}s')


def _preprocess_window(window_df):
    """Run one time window through the filters in order.

    Returns the filtered dataframe, or None as soon as the input or any stage
    result is empty. Thresholds come from the notebook's parameter cell.
    """
    stages = (
        lambda d: prep_measure(d, timestep=timestep),
        lambda d: prep_spatial1(d, bounding_box=osmbb),
        lambda d: prep_speed(d, speed_max=speed_max_threshold),
        lambda d: prep_parking(d, parking_time_criteria=parking_time_criteria),
        lambda d: prep_spatial2(d, area=target_area),
    )
    current = window_df
    for stage in stages:
        if len(current) == 0:
            return None
        current = stage(current)
    return current if len(current) else None


start_time = time.time()
time_standard = datetime.datetime(args['year'], args['month'], args['day'], args['hour'], 0, 0)
time_min = time_standard

for i in range(args['start'], args['finish']):
    data_filename1, data_filename2, traj_filename = filename_generator(args['day'], i)
    time_max = time_min + datetime.timedelta(minutes=timestep)

    # Half-open window [time_min, time_max).
    sample0 = df[(time_min <= df.Time) & (df.Time < time_max)]
    # Advance the window now so that skipping below cannot stall it.
    time_min = time_max

    sample5 = _preprocess_window(sample0)
    if sample5 is None:
        continue

    prep_sample, prep_traj = prep_fin(sample5)
    prep_sample.to_csv(data_filename1,
                       sep=',',
                       na_rep='NaN',
                       encoding='utf-8',
                       index=False)
    prep_sample.to_pickle(data_filename2)
    with open(traj_filename,'wb') as fw:
        pickle.dump(prep_traj, fw)

print('Preprocessing has completed...')
print(f'Time taken for preprocessing1: {time.time() - start_time}s')