In [1]:
import json

buildings = None

with open('buildings.json') as f:
    buildings = json.load(f)

In [2]:
# tl;dr go through each of the entries
# and create one with "BUILDINGS"
collection_by_address = dict()

def full_address(entry):
    return (entry['BLK_NO'], entry['ROAD_NAME'], entry['POSTAL'])

for b in buildings:
    same_address = collection_by_address.setdefault(full_address(b), [])
    same_address.append(b)


In [4]:
buildings_by_address = [
    (k, [v['BUILDING'] for v in vs])
    for k, vs in collection_by_address.items()
]
buildings_by_address[0]
address_with_multiple_buildings = list(filter(lambda x: len(x[1]) > 1, buildings_by_address))
address_with_multiple_buildings[0:1], len(address_with_multiple_buildings)

([(('21', 'PARK STREET', '018925'),
   ['DBS MARINA BAY MRT STATION',
    'MARINA BAY MRT STATION',
    'MARINA BAY MRT STATION (NS27)',
    'OCBC MARINA BAY MRT',
    'UOB MARINA BAY MRT STATION'])],
 5985)

In [7]:
from collections import Counter
import re

def transform(name):
    name = re.sub('\\bMRT STATION', 'MRT', name)
    name = re.sub('^THE\\b', '', name)
    return name

NAUGHTY_NAMES = [
    '\\bKINDERGARTEN\\b',
    '\\bPRESCHOOL\\b',
    '^DBS\\b',
    '^UOB\\b',
    '^CITIBANK\\b',
    '^OCBC\\b',
    '^MAYBANK\\b',
    '^STANDARD CHARTERED BANK\\b',
    '\\bMONTESSORI\\b',
    '\\bPTE\\b',
    '\\bLTD\\b',
    '\\bSKOOL\\b',
    '\\bSTUDENT CARE\\b',
    '\\bNASCANS\\b',
    '\\bCHILD CARE CENTRE\\b',
]

def estimate_weight(name):
    # If a building name ends with "XX BUILDING" it is probably the building name?
    if re.search('\\bBUILDING$', name):
        return 50
    # Annoyingly, Onemap includes all the kindergartens
    # and ATMs, *past and present*. Give these entries much less weight
    if any([re.search(n, name) for n in NAUGHTY_NAMES]):
        return 1
    return 10


def best_building_names(building_names):
    ctr = Counter()

    if len(building_names) <= 1:
        return building_names

    # Drop station IDs in "STATION (EW1)" --
    # most ATM landmarks don't include the station ID, so the
    # station ID prevents the station name from being selected as the
    # most canonical entry
    building_names = [
        re.sub(' STATION \\([A-Z]{2}[0-9]{1,2}( / [A-Z]{2}[0-9]{1,2})*\\)$', ' STATION', name)
        for name in building_names
    ]
    
    # Try to find the most "general" building name, i.e.
    # the name that can be embedded in most other names,
    # after discounting "THE"
    for s in building_names:
        for t in building_names:
            if transform(s) in t:
                ctr.update({s: estimate_weight(s)})
            elif transform(t) in s:
                ctr.update({t: estimate_weight(t)})

    # No embeddings...
    if len(ctr) <= 1:
        return building_names

    # If tied, return all
    top_two = ctr.most_common(2)
    if top_two[1][1] == top_two[0][1]:
        return sorted(building_names, key=lambda x: len(x))
    else:
        return [top_two[0][0]]

def cleanup(n):
    if 'NIL' in n:
        return []
    if 'MULTI STOREY CAR PARK' in n:
        return []
    if 'HDB PUBLIC SHELTERS' in n:
        return []
    if any([re.search('^HDB-', i) for i in n]):
        return []
    if any([re.search('MARKET (AND|&) (HAWKER|FOOD) CENTRE', i) for i in n]):
        return []
    # return n
    return best_building_names(n)

In [9]:
buildings_by_address = [
    (k, cleanup([v['BUILDING'] for v in vs]))
    for k, vs in collection_by_address.items()
]
buildings_by_address[0]
address_with_multiple_buildings = list(filter(lambda x: len(x[1]) > 1, buildings_by_address))
address_with_multiple_buildings[0:2], len(address_with_multiple_buildings)

([(('8', 'MARINA VIEW', '018960'),
   ['DBS ASIA SQUARE',
    'ASIA SQUARE TOWER 1',
    'CONSULATE OF THE REPUBLIC OF SLOVENIA',
    'CONSULATE OF SAINT VINCENT AND THE GRENADINES']),
  (('12', 'MARINA VIEW', '018961'),
   ['ASIA SQUARE TOWER 2', 'THE WESTIN SINGAPORE'])],
 992)

In [13]:
# Here, I tried fetching the list of ATMs from Onemap
# and using that data to clean the Onemap postcode entries.
# The effectiveness is only around 80% (500+ OCBC ATMs -> 100+ OCBC ATMs)
import json

ocbc = None

with open('./ocbc-atms.json') as f:
    ocbc = json.load(f)


In [34]:
import re

def maybe_add_ocbc(n):
    if re.search('^OCBC ', n):
        return n.upper().strip()
    return ('OCBC ' + n.upper()).strip()

ocbc_addresses = set([
    maybe_add_ocbc(o['landmark']) for o in ocbc['nearby_ocbc_atms']
])
sorted(ocbc_addresses)

['OCBC 112 KATONG',
 'OCBC 313@SOMERSET',
 'OCBC 7-4 DEPTFORD ROAD',
 'OCBC ADMIRALTY MRT STATION',
 'OCBC ADMIRALTY PLACE',
 'OCBC ALEXANDRA ROAD - FAIRPRICE',
 'OCBC ALEXANDRA VILLAGE BRANCH',
 'OCBC ALJUNIED AVENUE 2 - FAIRPRICE',
 'OCBC ALJUNIED BRANCH',
 'OCBC ALJUNIED MRT STATION',
 'OCBC AMARA HOTEL - FAIRPRICE',
 'OCBC AMK HUB',
 'OCBC AMOY STREET FOOD CENTRE',
 'OCBC ANG MO KIO AVENUE 1',
 'OCBC ANG MO KIO AVENUE 1 - 7-ELEVEN',
 'OCBC ANG MO KIO AVENUE 1 - FAIRPRICE',
 'OCBC ANG MO KIO AVENUE 10',
 'OCBC ANG MO KIO AVENUE 10 - 7-ELEVEN',
 'OCBC ANG MO KIO AVENUE 10 - CHEERS',
 'OCBC ANG MO KIO AVENUE 3',
 'OCBC ANG MO KIO AVENUE 3 - ESSO STATION',
 'OCBC ANG MO KIO AVENUE 6 - FAIRPRICE',
 'OCBC ANG MO KIO AVENUE 8 - NANYANG POLYTECHNIC',
 'OCBC ANG MO KIO BRANCH',
 'OCBC ANG MO KIO BUS INTERCHANGE',
 'OCBC ANG MO KIO CENTRAL',
 'OCBC ANG MO KIO CENTRAL BRANCH',
 'OCBC ANG MO KIO MRT STATION',
 'OCBC ASIA SQUARE TOWER 1',
 'OCBC BALESTIER BRANCH',
 'OCBC BALESTIER ROAD - CALTEX

In [35]:
import re

ocbc_atms = sorted([
    b
    for a in address_with_multiple_buildings
    for b in a[1]
    if re.search('^OCBC ', b)
])
ocbc_atms

['OCBC 313@SOMERSET',
 'OCBC 7-4 DEPTFORD ROAD',
 'OCBC ADMIRALTY MRT',
 'OCBC ADMIRALTY PLACE',
 'OCBC ALEXANDRA RD - FAIRPRICE',
 'OCBC ALJUNIED AVE 2 - FAIRPRICE',
 'OCBC ALJUNIED MRT',
 'OCBC AMARA HOTEL - FAIRPRICE',
 'OCBC AMK HUB',
 'OCBC AMOY STREET FOOD CENTRE',
 'OCBC ANG MO KIO AVE 1',
 'OCBC ANG MO KIO AVE 1 - 7 ELEVEN',
 'OCBC ANG MO KIO AVE 1 - FAIRPRICE',
 'OCBC ANG MO KIO AVE 10',
 'OCBC ANG MO KIO AVE 10 - 7 ELEVEN',
 'OCBC ANG MO KIO AVE 10 - CHEERS',
 'OCBC ANG MO KIO AVE 3',
 'OCBC ANG MO KIO AVE 3 - ESSO STATION',
 'OCBC ANG MO KIO AVE 6 - FAIRPRICE',
 'OCBC ANG MO KIO AVE 8 - NANYANG POLYTECHNIC',
 'OCBC ANG MO KIO BRANCH',
 'OCBC ANG MO KIO BUS INTERCHANGE',
 'OCBC ANG MO KIO MRT STATION',
 'OCBC ASIA SQUARE TOWER 1',
 'OCBC BALESTIER ROAD - CALTEX STATION',
 'OCBC BALESTIER ROAD - CHEERS',
 'OCBC BANK OF SINGAPORE',
 'OCBC BEACH ROAD - CALTEX STATION',
 'OCBC BEAUTY WORLD MRT STATION',
 'OCBC BEDOK BRANCH',
 'OCBC BEDOK BUS INTERCHANGE',
 'OCBC BEDOK MALL - FAIR

In [36]:
len(set(ocbc_addresses)), len(set(ocbc_atms)), len(set(ocbc_addresses) - set(ocbc_atms))

(525, 485, 220)

In [61]:
import re

def strings_match(s1, s2, allowed_substitutions):
    map_of_substitutions = {
        ' ': '[^A-Za-z0-9]*',
        '-': '[^A-Za-z0-9]*',
    }
    
    for ls in allowed_substitutions:
        for word in ls:
            if word:
                map_of_substitutions[word] = '(' + '|'.join([re.escape(w) for w in ls]) + ')'
    
    parts = re.split("\\b", s1)

    final_re = []
    for p in parts:
        if p in map_of_substitutions:
            final_re.append(map_of_substitutions[p])
        else:
            final_re.append(re.escape(p))

    # print(''.join(final_re))
    return re.search('^' + ''.join(final_re) + '$', s2)

STANDARD_SUBSTITUTIONS = [
    ['ST', 'STREET'],
    ['RD', 'ROAD'],
    ['AVE', 'AVENUE'],
    ['STATION', ''],
]

strings_match('BEDOK SOUTH MRT STATION', 'BEDOK SOUTH MRT ', STANDARD_SUBSTITUTIONS)
    

<re.Match object; span=(0, 16), match='BEDOK SOUTH MRT '>

In [63]:
len(set([
    a
    for a in ocbc_addresses
    if not any([strings_match(a, b, STANDARD_SUBSTITUTIONS) for b in ocbc_atms])
]))

102