In [1]:
import json
import pandas as pd
from typing import Literal
import re

HOUSENUMBER_PATTERN = r'[０-９之臨]+號'
POI_COLUMNS = ['Name', 'Pure', 'Informal', 'Street', 'Lane', 'Alley', 'Housenumber', 'X', 'Y', 'Codes']

In [None]:
# Handle the raw data and create an intermediate JSON file.
df = pd.read_csv('../data/demand/registration.csv',
                 dtype={
                     '行業代號': str,
                     '行業代號1': str,
                     '行業代號2': str,
                     '行業代號3': str
                 },
                 encoding='utf-8',
                 usecols=['營業地址', '營業人名稱', '行業代號', '行業代號1', '行業代號2', '行業代號3'])
df = df.loc[df['營業地址'].str.contains('臺北市')].reset_index(drop=True).rename(columns={
    '營業地址': 'Address',
    '營業人名稱': 'Name',
    '行業代號': 'Code_1',
    '行業代號1': 'Code_2',
    '行業代號2': 'Code_3',
    '行業代號3': 'Code_4',
})
df['Codes'] = df[['Code_1', 'Code_2', 'Code_3', 'Code_4']].apply(
    lambda row: [x for x in row if pd.notna(x)],
    axis=1
)
all_codes = pd.concat([df["Code_1"], df["Code_2"], df["Code_3"], df["Code_4"]])
unique_codes = all_codes.dropna().unique().tolist()
df = df.drop(['Code_1', 'Code_2', 'Code_3', 'Code_4'], axis=1)
display(df.head(10))

if isinstance(unique_codes, list):
    unique_codes.sort()
    unique_codes = pd.Series(unique_codes)
    unique_codes.to_csv('../data/demand/registration_codes.csv', index=False, header=False)

df.to_json('../data/demand/registration.json', force_ascii=False, index=False)
print(unique_codes)

In [2]:
# Read the intermediate JSON file.
reg = pd.read_json('../data/demand/registration.json')

def sanitize_address(address: pd.Series) -> pd.DataFrame:
    df = pd.DataFrame(columns=['Original', 'Pure', 'Informal'])
    df['Original'] = address
    add = (address
                  # Remove the city name
                  .str.replace(r'(?:台|臺)北市', '', regex=True)

                  # Remove the district name
                  .str.replace(r'(?:北投|士林|大同|中山|萬華|中正|松山|大安|信義|內湖|南港|文山|龍山|延平|古亭|建成|城中|雙園|景美|木柵)區', '', regex=True)  

                  # Remove the village name  
                  .str.replace(r'\w\w里', '', regex=True)

                  # Re-format the scetion name
                  .str.replace('１段', '一段')
                  .str.replace('２段', '二段')
                  .str.replace('３段', '三段')
                  .str.replace('４段', '四段')
                  .str.replace('５段', '五段')
                  .str.replace('６段', '六段')
                  .str.replace('７段', '七段')
                  .str.replace('８段', '八段')
                  .str.replace('９段', '九段')  # Yanping North Rd. has the most sections in Taipei City, up to nine sections

                  # Remove the neighborhood name
                  .str.replace('０', '0')
                  .str.replace('１', '1')
                  .str.replace('２', '2')
                  .str.replace('３', '3')
                  .str.replace('４', '4')
                  .str.replace('５', '5')
                  .str.replace('６', '6')
                  .str.replace('７', '7')
                  .str.replace('８', '8')
                  .str.replace('９', '9')
                  .str.replace(r'\d+鄰', '', regex=True)

                  # Revert the conversion of full-width numbers
                  .str.replace('0', '０')
                  .str.replace('1', '１')
                  .str.replace('2', '２')
                  .str.replace('3', '３')
                  .str.replace('4', '４')
                  .str.replace('5', '５')
                  .str.replace('6', '６')
                  .str.replace('7', '７')
                  .str.replace('8', '８')
                  .str.replace('9', '９')
          )
    
    df['Pure'] = add

    # The addresses that need special care
    df['Informal'] = df['Original'].str.contains(r'（|）|\(|\)|\.|﹒|、|,|，|;|；|\-|－|及|(?<!安|中|雲|萬|民|雙)和(?!平|豐|興)|至(?!善|誠)', regex=True) | (df['Pure'] == '')
    return df

reg_address = sanitize_address(reg['Address'])
display(reg_address.head(10))

Unnamed: 0,Original,Pure,Informal
0,臺北市,,True
1,臺北市,,True
2,臺北市,,True
3,臺北市,,True
4,臺北市,,True
5,臺北市中山區,,True
6,臺北市中山區一江街１０號（１樓）,一江街１０號（１樓）,True
7,臺北市中山區一江街３８之１號１樓,一江街３８之１號１樓,False
8,臺北市中山區一江街４３號１樓,一江街４３號１樓,False
9,臺北市中山區下埤里下埤里龍江路４０３號,龍江路４０３號,False


In [None]:
# Create the hierachy structure of the addresses
def build_hierarchy(address_df: pd.DataFrame) -> dict | Literal['']:
    def clean_value(val: str) -> str:
        return '' if pd.isna(val) else val
    
    def collapse(d):
        if isinstance(d, dict):
            if not d:
                return ''
            return {k: collapse(v) for k, v in d.items()}
        return d
    
    def get_housenumber(val: str) -> str:
        match = re.match(HOUSENUMBER_PATTERN, str(val))
        return match[0] if match is not None else str(val)

    nested = {}

    for _, row in address_df.iterrows():
        street = clean_value(row['Street'])
        lane   = clean_value(row['Lane'])
        alley  = clean_value(row['Alley'])
        housenumber = get_housenumber(row['Housenumber'])
        coord  = [float(row['X']), float(row['Y'])]

        if street not in nested:
            nested[street] = {}

        if lane:
            if lane not in nested[street]:
                nested[street][lane] = {}

        if lane and alley:
            if alley not in nested[street][lane]:
                nested[street][lane][alley] = {}

        if lane and alley:
            nested[street][lane][alley].setdefault('numbers', dict())[housenumber] = coord

        elif lane:
            nested[street][lane].setdefault('numbers', dict())[housenumber] = coord
            
        else:
            nested[street].setdefault('numbers', dict())[housenumber] = coord

    return collapse(nested)

address = (pd.read_csv('../data/demand/address.csv', usecols=['街路段', '巷', '弄', '號', '橫座標', '縱座標'])
             .rename(columns={'街路段': 'Street', '巷': 'Lane', '弄': 'Alley', '號': 'Housenumber', '橫座標': 'X', '縱座標': 'Y'}))

hierarchy = build_hierarchy(address)

# Save to the intermediate file
with open('../data/demand/hierachy.json', 'w', encoding='utf-8') as f:
    json.dump(hierarchy, f, ensure_ascii=False)

del address

In [3]:
# Load the hierarchy file
with open('../data/demand/hierachy.json', 'r', encoding='utf-8') as f:
    hierarchy = json.load(f)

In [4]:
# Perform the first address match
def match_address(hierarchy: dict, address_str: str):
    address_str_copy = address_str
    street_str, lane_str, alley_str, housenumber_str = None, None, None, None
    coord: list[float] | None = None
    isValid, hasLane, hasAlley = False, False, False

    for street in hierarchy.keys():
        if re.search(street, address_str_copy):
            street_str = street
            address_str_copy = re.sub(street, '', address_str_copy)
            isValid = True
            break

    if isValid and isinstance(hierarchy.get(street_str), dict):
        for lane in hierarchy[street_str].keys():
            if lane and re.search(lane, address_str_copy):
                lane_str = lane
                address_str_copy = re.sub(lane, '', address_str_copy)
                hasLane = True
                break

    if hasLane and isinstance(hierarchy[street_str].get(lane_str), dict):
        for alley in hierarchy[street_str][lane_str].keys():
            if alley and re.search(alley, address_str_copy):
                alley_str = alley
                address_str_copy = re.sub(alley, '', address_str_copy)
                hasAlley = True
                break

    if isValid:
        housenumber_match = re.match(HOUSENUMBER_PATTERN, address_str_copy)
        if housenumber_match is not None:
            housenumber_str = housenumber_match[0]
        
    if housenumber_str is not None:
        if hasAlley:
            coord_dict: dict = hierarchy[street_str][lane_str][alley_str]

        elif hasLane:
            coord_dict: dict = hierarchy[street_str][lane_str]

        else:
            coord_dict: dict = hierarchy[street_str]

        number_dict = coord_dict.get('numbers', {})
        coord = number_dict.get(housenumber_str, None)

    return {
        'Street': street_str,
        'Lane': lane_str,
        'Alley': alley_str,
        'Housenumber': housenumber_str,
        'X': None if coord is None else coord[0],
        'Y': None if coord is None else coord[1]
    }

first_match = pd.DataFrame(reg_address['Pure'].apply(lambda addr: match_address(hierarchy, addr)).tolist())
reg_first_batch = pd.concat([reg[['Name', 'Codes']], reg_address[['Pure', 'Informal']], first_match], axis=1)
match_count = reg_first_batch['X'].isna().value_counts().iloc[0]
print(f'Records with match: {match_count} ({round(match_count / len(reg) * 100)}%)')
print(f'Records without match: {len(reg) - match_count} ({round((len(reg) - match_count) / len(reg) * 100)}%)')
reg = reg_first_batch[POI_COLUMNS].rename(columns={'Pure': 'Address'})
display(reg.head(20))

# Export match results.
valid_filter = ~reg['Informal'] & ~reg['X'].isna()
reg[valid_filter][['Name', 'Address', 'Codes', 'X', 'Y']].to_json('../data/demand/registration_first_batch.json', force_ascii=False, index=False)
reg[~valid_filter].to_json('../data/demand/registration_first_iteration.json', force_ascii=False, index=False)

del first_match, match_count, reg_address, valid_filter

Records with match: 227941 (90%)
Records without match: 25614 (10%)


Unnamed: 0,Name,Address,Informal,Street,Lane,Alley,Housenumber,X,Y,Codes
0,Ｅｌｓｅｖｉｅｒ ＢＶ,,True,,,,,,,"[581112, 581212, 581900]"
1,ＵｐＴｏＤａｔｅ，　Ｉｎｃ．,,True,,,,,,,"[582099, 581112, 631299, 631100]"
2,Ｒａｋｕｔｅｎ　Ｋｏｂｏ　Ｉｎｃ．,,True,,,,,,,[581312]
3,Ａｉｒｂｎｂ　Ｉｒｅｌａｎｄ　ＵＣ,,True,,,,,,,[631299]
4,Ａｔｌａｓｓｉａｎ　Ｐｔｙ　Ｌｔｄ,,True,,,,,,,[582099]
5,ＴｒａｄｉｎｇＶｉｅｗ，　Ｉｎｃ．,,True,,,,,,,[639099]
6,經典數位印刷有限公司松江分公司,一江街１０號（１樓）,True,一江街,,,１０號,303616.453185,2771559.0,"[820300, 820912, 581900, 631299]"
7,一江鎖印店,一江街３８之１號１樓,False,一江街,,,３８之１號,303619.526062,2771845.0,"[485218, 959917, 969018]"
8,白貝殼廣告影像有限公司,一江街４３號１樓,False,一江街,,,４３號,303641.266417,2771902.0,[760199]
9,龍江女子美容院,龍江路４０３號,False,龍江路,,,４０３號,304585.61648,2773174.0,[962111]
