**road traffic speed data:** Request from Datamall API: 2.14 ESTIMATED TRAVEL TIMES


In [3]:
import os
import copy
import datetime
import pandas as pd
from shapely.geometry import LineString

# 1. Extract Road Link Location

In [None]:
def read_road_loc(path):
    '''
    Read one traffic speed sample file and keep only the road link location.

    The speed columns ('SpeedBand', 'MinimumSpeed', 'MaximumSpeed') are
    dropped and the 'Location' column is converted to the WKT text of a
    two-point LineString.

    Parameters
        path : str or None
            Path of one sample csv file. When None, an empty table is
            returned (used to initialize an accumulator).

    Returns
        data : pandas.DataFrame or None
            One row per 'LinkID'. An empty table with the columns
            ['LinkID', 'RoadName', 'RoadCategory', 'Location'] when path is
            None. None when the file cannot be read or parsed.

    Example
    -------
    path = '2022-06/2022-06-01/2022-06-01-00-49-27.csv'
    data = read_road_loc(path)

    '''
    # initialization: nothing to read, return the empty table directly
    if path is None:
        return pd.DataFrame(columns=['LinkID', 'RoadName', 'RoadCategory', 'Location'], dtype=str)

    # read data
    try:
        data = pd.read_csv(path, header=0, index_col=None, dtype=str)
    except (OSError, ValueError):
        # OSError: missing/unreadable file.
        # ValueError: empty or malformed csv (pandas EmptyDataError and
        # ParserError derive from it) and decoding errors.
        return None

    data = data.drop(['SpeedBand', 'MinimumSpeed', 'MaximumSpeed'], axis=1)

    def _location_to_wkt(location):
        # 'Location' holds four space-separated numbers; each pair is swapped
        # when building a point (presumably 'lat lon' -> (lon, lat); confirm
        # against the API documentation).
        parts = location.split(' ')
        start = (float(parts[1]), float(parts[0]))
        end = (float(parts[3]), float(parts[2]))
        return LineString([start, end]).wkt

    # link location
    data['Location'] = data['Location'].apply(_location_to_wkt)

    # drop duplicate: keep the first row of every LinkID
    n_unique = data['LinkID'].unique().shape[0]
    if n_unique != data.shape[0]:
        print(n_unique, data.shape[0])
        data.drop_duplicates(subset=['LinkID'], inplace=True)

    return data
# =============================================================================


def extract_road_location(folder_path, save_path=None):
    '''
    Read one month of road traffic speed data and extract the road location.

    Every sub-folder of folder_path is treated as one day, and every file in
    it is read with read_road_loc. The link locations of all files are merged
    into one table with one row per 'LinkID'.

    Parameters:
        folder_path (str): folder that contains one sub-folder per day.
        save_path (str): csv file the merged table is written to after each
            day folder; nothing is written when None.

    Returns:
        data_month_df (pandas.DataFrame): merged road link table; empty (with
            the columns 'LinkID', 'RoadName', 'RoadCategory', 'Location')
            when no file could be read.

    '''
    # initialization
    data_month_df = pd.DataFrame(
        columns=['LinkID', 'RoadName', 'RoadCategory', 'Location'], dtype=str)

    # for one day files (sorted so that the result does not depend on the
    # arbitrary order returned by os.listdir)
    for fdn in sorted(os.listdir(folder_path)):
        fd_day_path = os.path.join(folder_path, fdn)
        if not os.path.isdir(fd_day_path):
            # stray files next to the day folders are not data
            continue

        # for one sample data
        for fn_data in sorted(os.listdir(fd_day_path)):
            data_new = read_road_loc(os.path.join(fd_day_path, fn_data))
            if data_new is None:
                continue

            data_month_df = pd.concat([data_month_df, data_new], axis=0, ignore_index=True)
            data_month_df.drop_duplicates(inplace=True, ignore_index=True)

        # drop duplicate: the same link may appear with differing attributes,
        # keep its first occurrence only
        n_unique = data_month_df['LinkID'].unique().shape[0]
        if n_unique != data_month_df.shape[0]:
            print(n_unique, data_month_df.shape[0])
            data_month_df.drop_duplicates(subset=['LinkID'], inplace=True, ignore_index=True)

        # since the above processing is time-consuming,
        # when one-day data is added, the file will be saved.
        if save_path is not None:
            print('save: ', fdn, data_month_df.shape[0])
            data_month_df.to_csv(save_path, index=False)

    return data_month_df
# =============================================================================

In [None]:
# extract from only one file
path = '2022-06/2022-06-21/2022-06-21-23-16-33.csv'
data = read_road_loc(path)
# read_road_loc returns None when the file cannot be read
if data is None:
    print('cannot read: ', path)
else:
    data.to_csv('road_location_2022-06.csv', index=False)

In [None]:
# extract from one-month file
path = os.path.join(os.getcwd(), '2022-06')
save_path = 'traffic_speed_data/road_location_2022-06.csv'

# the table is written to save_path after every day folder, so make sure the
# target folder exists before the long-running extraction starts
os.makedirs(os.path.dirname(save_path), exist_ok=True)

data = extract_road_location(path, save_path)

# 2. Extract Monthly Average Traffic Speed

## 2.1 Read function of one single file 

In [None]:
def read_traffic_speed(path):
    '''
    Read one traffic speed sample file as a table of link id and speed.

    Rows without 'MinimumSpeed' are dropped. The speed of a link is taken as
    'MinimumSpeed' + 5 (presumably the middle of a 10-unit wide speed band;
    confirm against the API documentation).

    Parameters
        path : str
            Path of one sample csv file with at least the columns 'LinkID'
            and 'MinimumSpeed'.

    Returns
        data : pandas.DataFrame or None
            Columns ['id', 'speed'] with one row per id. None when the file
            cannot be read or parsed.

    '''
    try:
        data = pd.read_csv(path, header=0, index_col=None, dtype={'LinkID': str})
    except (OSError, ValueError):
        # OSError: missing/unreadable file.
        # ValueError: empty or malformed csv (pandas EmptyDataError and
        # ParserError derive from it) and decoding errors.
        return None

    data = data.dropna(axis=0, how='any', subset=['MinimumSpeed'])
    data = data.rename(columns={'LinkID': 'id'})
    data['speed'] = data['MinimumSpeed'] + 5

    # drop exact duplicates first
    data = data[['id', 'speed']].drop_duplicates(ignore_index=True)

    # an id that is still listed more than once has conflicting speeds;
    # report the counts and keep the first row of every id
    n_unique = data['id'].unique().shape[0]
    if n_unique != data.shape[0]:
        print(n_unique, data.shape[0])
        data = data.drop_duplicates(subset=['id'], ignore_index=True)

    return data
# =============================================================================

### 2.2.1 Load one-month data

- **Resample frequency:** 1 hour

- **Resample method:** average


In [4]:
def load_one_month_data(root_folder, data_value_name, save_path=None):
    '''
    Resample one month of sample files to hourly means and save them as csv.

    Every sub-folder of root_folder is treated as one day. Each file in a day
    folder is read with read_traffic_speed; the file name without extension
    is parsed as the sample time with the format '%Y-%m-%d-%H-%M-%S'. The
    samples of one day are averaged per id in 1-hour bins (left closed, left
    labelled) and appended to save_path in long format, grouped by id.

    Parameters:
        root_folder (str): folder that contains one sub-folder per day.
        data_value_name (str): name of the value column returned by
            read_traffic_speed (e.g. 'speed'); also used as the name of the
            value column in the output.
        save_path (str): csv file to write. It is overwritten with a header
            line ('id', data_value_name, 'datetime') and one block of rows is
            appended per day. When None the data is processed but nothing is
            written.

    Returns:
        None

    '''
    columns = ['id', data_value_name, 'datetime']

    # start a fresh output file that holds only the header
    if save_path is not None:
        pd.DataFrame(columns=columns).to_csv(save_path, index=False, header=True)

    # one month data (sorted so that days are written in chronological order)
    for fd_day in sorted(os.listdir(root_folder)):

        fd_day_path = os.path.join(root_folder, fd_day)
        if not os.path.isdir(fd_day_path):
            # stray files next to the day folders are not data
            continue

        # one day data: collect all samples and concatenate them once
        sample_frames = []
        for fn_data in sorted(os.listdir(fd_day_path)):
            data_df = read_traffic_speed(os.path.join(fd_day_path, fn_data))
            if data_df is None or data_df.shape[0] == 0:
                continue

            # the file name (without extension) is the sample time
            data_df['datetime'] = fn_data.split('.')[0]
            sample_frames.append(data_df)

        if not sample_frames:
            continue

        data_day_df = pd.concat(sample_frames, ignore_index=True)

        # wide table: one row per sample time, one column per id
        data_day_df = data_day_df.pivot(index='datetime', columns='id', values=data_value_name)
        data_day_df.dropna(how='all', inplace=True)
        data_day_df.index = pd.to_datetime(data_day_df.index, format="%Y-%m-%d-%H-%M-%S")

        # resample 1 hour mean; a Timedelta is used as the rule because the
        # string alias for hours differs between pandas versions
        data_day_df = data_day_df.resample(rule=pd.Timedelta(hours=1), closed='left', label='left').mean()

        # str(Timestamp) always includes the time part
        data_day_df.index = [str(ts) for ts in data_day_df.index]
        data_day_df.index.name = 'datetime'

        # back to long format; melt stacks the id columns one after another,
        # so the rows are grouped by id and ordered by time within each id
        data_long_df = data_day_df.reset_index().melt(
            id_vars='datetime', var_name='id', value_name=data_value_name)
        data_long_df = data_long_df[columns]

        # since the above processing is time-consuming,
        # the file is extended as soon as one day is finished.
        if save_path is not None:
            data_long_df.to_csv(save_path, index=False, header=False, mode='a')
            print('save: ', fd_day)

    return None

In [None]:
root_folder = os.path.join(os.getcwd(), '2022-06')
data_value_name = 'speed'
save_file = 'traffic_speed_data'

# load_one_month_data returns None and only writes its result to save_path,
# so an output file is required for this call to have any effect.
# NOTE(review): the output file name is chosen here; adjust it if a different
# name is expected downstream.
os.makedirs(save_file, exist_ok=True)

load_one_month_data(root_folder, data_value_name,
                    save_path=os.path.join(save_file, 'traffic_speed_1h_2022-06.csv'))

### 2.2.2 Compute monthly average results

In [None]:
# read one-month data
# NOTE(review): this path points to carpark data although this notebook
# processes traffic speed data; confirm that this is the intended input file.
# parse_dates = [2] parses the third csv column as datetimes (presumably the
# 'datetime' column used below; verify against the file).
data_month_df = pd.read_csv('carpark_data/carpark_1h_2022-06.csv',
                            parse_dates = [2])

# hour of day (0-23) of every record
data_month_df['hour'] = data_month_df['datetime'].dt.hour