
**Carpark Availability data:** requested from the DataMall API (2.12 Carpark Availability).


In [1]:
import copy
import datetime
import os

import numpy as np
import pandas as pd

# 1. Extract Carpark Location

In [3]:
def read_carpark_loc(path):
    '''
    Read one carpark availability snapshot and keep only the location fields.

    Parameters
        path : str or None
            Path of a snapshot CSV file. The file must contain the columns
            'CarParkID', 'LotType', 'Location', 'Area' and 'AvailableLots'.
            Pass None to get an empty table with the output columns.

    Returns
        data : pandas.DataFrame or None
            All-string table. 'Location', 'Area' and 'AvailableLots' are
            removed and three columns are added:
            'id'  = '<CarParkID>_<LotType>',
            'lat' = first space-separated token of 'Location',
            'lng' = second space-separated token of 'Location'.
            None is returned when the file cannot be read or parsed.

    Example
    -------
    path = '2022-06/2022-06-01/2022-06-01-00-49-27.csv'
    data = read_carpark_loc(path)

    '''
    # initialization: an empty, correctly-shaped table (previously this fell
    # through to pd.read_csv(None) and ended up returning None)
    if path is None:
        return pd.DataFrame(columns=['CarParkID', 'Development', 'LotType', 'Agency',
                                     'id', 'lat', 'lng'],
                            dtype=str)

    try:
        data = pd.read_csv(path, header=0, index_col=None, dtype=str)
    except (OSError, ValueError):
        # missing/unreadable file (OSError) or empty/malformed content
        # (pandas parser errors derive from ValueError): caller skips it
        return None

    # new carpark ID
    data['id'] = data['CarParkID'].str.cat(data['LotType'].to_list(), sep='_')

    # split the 'Location' text once and reuse the tokens
    location_parts = data['Location'].str.split(' ')
    data['lat'] = location_parts.str.get(0)
    data['lng'] = location_parts.str.get(1)

    data = data.drop(['Location', 'Area', 'AvailableLots'], axis=1)

    # force every column (including the derived ones) to string
    data = data.astype(dtype=str)

    return data
# =============================================================================


def extract_carpark_location(folder_path, save_path=None):
    '''
    Read one month of carpark availability data and extract the carpark locations.

    The month folder is expected to contain one sub-folder per day, each
    holding the snapshot CSV files. Entries of the month folder that are not
    directories are skipped. Folders and files are visited in sorted name
    order so that the row kept for a duplicated 'id' is reproducible.

    Parameters:
        folder_path (str) : path of the month folder.
        save_path (str) : optional CSV path; when given, the table collected
            so far is rewritten there after every day folder.

    Returns:
        data_month_df (pandas.DataFrame) : one row per carpark 'id' with the
            columns produced by read_carpark_loc; empty when nothing could
            be read.

    '''
    # initialization: explicit empty table, so the checks below also work
    # when the first day folder has no readable file
    data_month_df = pd.DataFrame(columns=['CarParkID', 'Development', 'LotType', 'Agency',
                                          'id', 'lat', 'lng'],
                                 dtype=str)

    for fdn in sorted(os.listdir(folder_path)):
        fd_day_path = os.path.join(folder_path, fdn)
        if not os.path.isdir(fd_day_path):
            # stray file next to the day folders
            continue

        for fn_data in sorted(os.listdir(fd_day_path)):
            data_new = read_carpark_loc(os.path.join(fd_day_path, fn_data))

            if data_new is None:
                continue

            if data_month_df.empty:
                data_month_df = data_new
            else:
                data_month_df = pd.concat([data_month_df, data_new], axis=0, ignore_index=True)
            # dropping exact duplicates after every file keeps the table small
            data_month_df = data_month_df.drop_duplicates(ignore_index=True)

        # the same id may still appear with differing attributes: keep the first row
        if not data_month_df['id'].is_unique:
            print('duplicate :', data_month_df['id'].unique().shape[0], data_month_df.shape[0])
            data_month_df = data_month_df.drop_duplicates(subset=['id'], ignore_index=True)

        # since the above processing is time-consuming,
        # the file is saved each time one day of data has been added.
        if save_path is not None:
            print('save: ', fdn, data_month_df.shape[0])
            data_month_df.to_csv(save_path, index=False)

    return data_month_df
# =============================================================================

In [6]:
# Try read_carpark_loc on a single snapshot file and display the result.
path = '2022-06/2022-06-01/2022-06-01-00-49-27.csv' 
data = read_carpark_loc(path)
data

In [None]:
# Extract the carpark locations from the files of one month.
# folder_path is the '2022-06' folder under the current working directory;
# save_path is the CSV file the extracted table is written to.

folder_path = os.path.join(os.getcwd(), '2022-06')
save_path = 'carpark_data/carpark_location_2022-06.csv'

data = extract_carpark_location(folder_path, save_path)

# 2. Extract Monthly Average Carpark Availability Data

## 2.1 Read function of one single file 

In [7]:
def read_carpark_avali(path):
    '''
    Read one snapshot file and return the available lots per carpark id.

    Parameters:
        path (str) : path of a snapshot CSV file containing at least the
            columns 'CarParkID', 'LotType' and 'AvailableLots'.

    Returns:
        data (pandas.DataFrame) : two columns, 'id' ('<CarParkID>_<LotType>',
            unique) and 'lots' ('AvailableLots' as int32). None is returned
            when the file cannot be read or parsed.

    '''
    try:
        data = pd.read_csv(path, header=0, index_col=None, dtype=str)
    except (OSError, ValueError):
        # missing/unreadable file (OSError) or empty/malformed content
        # (pandas parser errors derive from ValueError): caller skips it
        return None

    data['id'] = data['CarParkID'].str.cat(data['LotType'].to_list(), sep='_')

    # remove rows that are identical in every column
    data = data.drop_duplicates()

    # the same id can remain with differing values: report it and keep the first row
    if not data['id'].is_unique:
        print(data['id'].unique().shape[0], data.shape[0])
        data = data.drop_duplicates(subset='id')

    data = data.rename(columns={'AvailableLots': 'lots'})
    # .copy() so the assignment below writes to an independent frame
    data = data[['id', 'lots']].copy()
    data['lots'] = data['lots'].astype(np.int32)

    return data
# =============================================================================

In [None]:
# Try read_carpark_avali (the reader defined in this section) on a single
# snapshot file and display the result.
path = '2022-06/2022-06-01/2022-06-01-00-49-27.csv'
data = read_carpark_avali(path)
data

## 2.2 Compute Monthly Average Data

### 2.2.1 Load one-month data

- **Resample frequency:** 1 hour

- **Resample method:** average


In [None]:
def load_one_month_data(root_folder, data_value_name, save_path=None):
    '''
    Resample one month of snapshot files to hourly means and save them as a long table.

    The month folder is expected to contain one sub-folder per day, each
    holding snapshot files named '<YYYY-MM-DD-HH-MM-SS>.<ext>'. Entries of the
    month folder that are not directories are skipped. For every day the
    snapshots are averaged per carpark id over 1-hour bins (left-closed,
    labelled by the bin start) and appended to save_path.

    Parameters:
        root_folder (str) : path of the month folder.
        data_value_name (str) : name of the value column returned by
            read_carpark_avali (e.g. 'lots'); also used as the value column
            name in the output.
        save_path (str) : CSV file to write. It is first (re)created with
            the header 'id,<data_value_name>,datetime', then one block of
            rows per day is appended. When None, nothing is written.

    Returns:
        None

    Raises:
        ValueError : if a file name does not match the timestamp pattern or
            a (datetime, id) pair occurs twice within one day.

    '''
    out_columns = ['id', data_value_name, 'datetime']

    # start the output file with the header only; day blocks are appended below
    if save_path is not None:
        pd.DataFrame(columns=out_columns).to_csv(save_path, index=False, header=True)

    # one month data
    for fd_day in sorted(os.listdir(root_folder)):

        fd_day_path = os.path.join(root_folder, fd_day)
        if not os.path.isdir(fd_day_path):
            # stray file next to the day folders
            continue

        # one day data: collect the frames and concatenate once
        day_frames = []
        for fn_data in sorted(os.listdir(fd_day_path)):
            data_df = read_carpark_avali(os.path.join(fd_day_path, fn_data))

            if data_df is None:
                continue

            # the file name (without extension) is the snapshot timestamp
            data_df['datetime'] = fn_data.split('.')[0]
            day_frames.append(data_df)

        if not day_frames:
            continue

        data_day_df = pd.concat(day_frames, ignore_index=True)
        if data_day_df.shape[0] == 0:
            continue

        # wide table: one row per snapshot, one column per carpark id
        data_day_df = data_day_df.pivot(index='datetime', columns='id', values=data_value_name)
        data_day_df = data_day_df.dropna(how='all')
        if data_day_df.empty:
            continue

        # make sure the values are numeric before averaging
        data_day_df = data_day_df.astype(float)
        data_day_df.index = pd.to_datetime(data_day_df.index, format='%Y-%m-%d-%H-%M-%S')

        # resample 1 hour mean
        data_day_df = data_day_df.resample(rule='1h', closed='left', label='left').mean()

        # back to a long table: all hours of the first id, then the second id, ...
        data_day_df['datetime'] = data_day_df.index.strftime('%Y-%m-%d %H:%M:%S').tolist()
        data_long_df = data_day_df.melt(id_vars='datetime', var_name='id',
                                        value_name=data_value_name)
        data_long_df = data_long_df[out_columns]

        # since the above processing is time-consuming,
        # the file is extended each time one day of data has been processed.
        if save_path is not None:
            data_long_df.to_csv(save_path, index=False, header=False, mode='a')
            print('save: ', fd_day)

    return None

In [None]:
# Process the '2022-06' folder under the current working directory:
# the value column is 'lots' and the result is written to save_path.
root_folder = os.path.join(os.getcwd(), '2022-06')
data_value_name = 'lots'
save_path = 'carpark_data/carpark_1h_2022-06.csv'
load_one_month_data(root_folder, data_value_name, save_path)

### 2.2.2 Compute monthly average results

In [None]:
# read one-month data
# parse_dates=[2] converts the third column of the file to datetime while reading;
# the code below accesses that column as 'datetime'.
file_path = 'carpark_data/carpark_1h_2022-06.csv'
data_month_df = pd.read_csv(file_path, parse_dates = [2])

# hour component of each timestamp; used as a grouping key for the monthly average
data_month_df['hour'] = data_month_df['datetime'].dt.hour

Add weekday vs. weekend information (derived from the day of the week only; holidays are not detected).

In [None]:
# create 'is_weekday' column to indicate whether a row falls on Monday-Friday
# (dayofweek 0-4 -> True) or on Saturday/Sunday (dayofweek 5-6 -> False)
data_month_df['is_weekday'] = data_month_df['datetime'].dt.dayofweek <= 4

In [None]:
# Remove the raw timestamp column so it is not part of the aggregation below.
data_month_df.drop('datetime', axis=1, inplace=True)

# x = data_month_df[data_month_df['is_weekday'] == False]
# x['day'] = data_month_df['datetime'].dt.day
# x = x.drop_duplicates(subset=['day', 'is_weekday'])

# Average the remaining columns for every (id, hour, is_weekday) combination
# and save the result as a CSV file.
data_month_df = data_month_df.groupby(['id', 'hour', 'is_weekday'], as_index=False).mean()
data_month_df.to_csv('carpark_data/carpark_1h_mean_2022-06.csv', index=False)

# data_all_df1.plot(lw=1, alpha=0.5, color='skyblue', legend=False)