In [20]:
import pathlib

import pandas as pd
import geopandas as gpd

In [21]:
from codes.utils.load_data.load_data_metro import load_metro_station_list_v1, preprocess_metro_station

excel_path = pathlib.Path('station_line_list/station_list.xlsx')
line_info = pd.read_excel(excel_path, sheet_name='line_info')

# Load metro station data
sta_df = load_metro_station_list_v1(excel_path)
# Preprocess metro station data
sta_df = preprocess_metro_station(sta_df, time_status='current')


# Search string for OneMap API
sta_df['search_val'] = sta_df.apply(
    lambda x : '{0} {1} Station ({2})'.format(x['station_name'], x['line_type'], x['station_code']).upper(), axis=1)

print('\nShape of data:', sta_df.shape)

Loading data...: 14it [00:01,  7.51it/s]


Shape of data: (215, 24)



  data['closed_date'] = data['closed_date'].infer_objects(copy=True).fillna(pd.Timestamp.max)


# 1. Metro station location

#### (1) OneMap API

- Search string format: 'XXX MRT|LRT Station (XXX)'

    - Example: CHOA CHU KANG LRT STATION (BP1)

In [11]:
import time
from codes.utils.request_data.one_map_token import access_token
from codes.utils.request_data.request_data_onemap import address_search, request_address_from_list

# Request location data from OneMap API
sta_search = sta_df['search_val'].unique().tolist()
sta_loc = request_address_from_list(sta_search, sleep_time=0.05)
sta_loc = pd.DataFrame(sta_loc) \
            .drop_duplicates(subset=['SEARCHVAL'])


# "search value" and "building" name are inconsistent
print('\nShape of data:', sta_loc.shape,
      '\nMismatched data:', sta_loc[sta_loc['SEARCHVAL'] != sta_loc['BUILDING']])


# merge location info
sta_loc = sta_df.merge(sta_loc, left_on='search_val', right_on='SEARCHVAL', how='left') \
    .sort_values(by=['sheet_name', 'no'])

# Unavailable location data
print('\nUnavailable location data:', sta_loc[sta_loc['SEARCHVAL'].isnull()])

# save location data
# sta_loc.to_csv('station_line_list/station_location.csv', index=False)

Requesting data: 100%|██████████| 228/228 [00:29<00:00,  7.65it/s]



Shape of data: (228, 10) 
Mismatched data: Empty DataFrame
Columns: [SEARCHVAL, BLK_NO, ROAD_NAME, BUILDING, ADDRESS, POSTAL, X, Y, LATITUDE, LONGITUDE]
Index: []

Unavailable location data:      no station_code        station_name  build_date         opening_date  \
73   29         CC29        HarbourFront         NaN  2011-10-08 00:00:00   
78    2          CE1            Bayfront         NaN  2012-01-14 00:00:00   
151   1           CG         Tanah Merah         NaN  1989-11-04 00:00:00   
83    4          DT4                Hume         NaN                  u/c   
95   16         DT16            Bayfront         NaN  2013-12-22 00:00:00   
117  37         DT37        Sungei Bedok         NaN                  u/c   
130  13         EW13           City Hall         NaN  1987-12-12 00:00:00   
131  14         EW14       Raffles Place         NaN  1987-12-12 00:00:00   
141  24         EW24         Jurong East         NaN  1988-11-05 00:00:00   
154   1          NE1        HarbourFro

In [19]:
# research for unavailable location data
search_val_li = sta_loc[sta_loc['SEARCHVAL'].isnull()]
search_val_li = search_val_li.apply(
    lambda x : '{0} {1} Station'.format(x['station_name'], x['line_type']).upper(), axis=1)
search_val_li = search_val_li.unique().tolist()

# mamually search for unavailable location data
search_val_li.extend([
    'HARBOURFRONT MRT STATION (NE1 / CC29)', 
    'CITY HALL MRT STATION (EW13 / NS25)',
    'RAFFLES PLACE MRT STATION (EW14 / NS26)',
    'JURONG EAST MRT STATION (EW24 / NS1)',
    'PUNGGOL COAST MRT STATION (NE18)',
    'RAFFLES PLACE MRT STATION (EW14 / NS26)',
    'TANJONG RHU MRT STATION (TE23)',
])

sta_loc_2 = request_address_from_list(search_val_li, sleep_time=0.05)
sta_loc_2 = pd.DataFrame(sta_loc_2)
sta_loc_2.to_csv('station_line_list/station_location_2.csv', index=False)

Requesting data: 100%|██████████| 21/21 [00:03<00:00,  6.15it/s]
