In [1]:
# "k" values related to address
# The "addr:substreet*" spelling variants are all listed separately because
# each one is audited on its own.
address_k = [
    "addr:city",
    "addr:city:simc",
    "addr:country",
    "addr:county",
    "addr:door",
    "addr:housename",
    "addr:housenumber",
    "addr:inclusion",
    "addr:interpolation",
    "addr:place",
    "addr:postcode",
    "addr:state",
    "addr:street",
    "addr:substreet",
    "addr:substreet1",
    "addr:substreet:1",
    "addr:substreet:2",
    "addr:town",
    "addr:unit",
]

In [None]:
import os

# Remember the directory the kernel started in the first time this cell runs.
# A bare os.chdir("..") climbs one level higher on every re-run of the cell,
# which silently breaks the relative paths used by later cells.
try:
    _NOTEBOOK_START_DIR
except NameError:
    _NOTEBOOK_START_DIR = os.getcwd()

# Always land in the parent of the start directory, however often this runs.
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
from audit import audit_k_v
import pprint
import re

# Relative path of the sample file handed to the audit helper.
# NOTE(review): presumably an OpenStreetMap XML extract, judging by the
# ".osm" extension -- confirm.
SAMPLE = "oxford_sample.osm"

# Audit the sample for the three listed element names.
# NOTE(review): later cells index the result by tag key (e.g. "addr:street")
# and the printed values are sets, so this is presumably a mapping from each
# "k" to the set of its "v" values -- confirm against audit.audit_k_v.
all_k_v_dict = audit_k_v(SAMPLE, ["node", "way", "relation"])

In [3]:
# Auditing "k" values related to address
for each_k in address_k:
    # Heading line: the tag key being audited
    print(each_k)
    # Every distinct value recorded for that key
    pprint.pprint(all_k_v_dict[each_k])
    # Visual gap before the next key
    print("\n")

addr:city
set(['<different>',
     'Bergheim',
     'Besko',
     'Eynsham, Witney',
     'Genthin',
     'Headington, Oxfordshire',
     'Jardee',
     'Lexington',
     'Neuss',
     'Oxford',
     'Puconci',
     'Saint-Julien-des-Landes',
     'San Sebastian',
     'Villa Nueva',
     'Virginia Beach',
     'Woodbridge',
     u'Zuj\u016bnai'])


addr:city:simc
set(['0440325'])


addr:country
set(['DE', 'GB', 'US'])


addr:county
set(['Oxfordshire'])


addr:door
set(['Flats 13-24'])


addr:housename
set(['25',
     '33',
     'BLOCK J',
     'BLOCK V',
     'Barnwood Ho',
     'Bishops Gate',
     'CANTELOUP HOUSE',
     'Chestnut Place',
     'Courtney Pianos',
     'Damson House',
     'Edith Road Workshops',
     'Fitzgerald House',
     'Flat 11',
     'Flat 4',
     'Foundry House',
     'Furnace House',
     'G371',
     'Hayward House',
     'Jubilee House',
     'Knowle Cottage',
     'M&S',
     'Mary Powell House',
     'Matthew Arnold School',
     'Norham End',
     'Nor

### Updating house numbers

In [3]:
# Writing a function that updates a house number


num_capital = re.compile(r'\d[A-Z]')
num_single = re.compile(r'^\d+[a-z]?$')
num_comma = re.compile(r'^\d+[a-z]?(\s?,\s?\d+[a-z]?)*$')
num_semicolon = re.compile(r'^\d+[a-z]?(\s?;\s?\d+[a-z]?)*$')
num_dash = re.compile(r'^\d+\s?-\s?\d+$')


def _housenumber_sort_key(housenumber):
    """Return a key that orders house numbers by number first, then suffix."""
    digits = re.match(r'\d+', housenumber)
    if digits is None:
        # Not expected for listed numbers; keep such values sortable anyway
        return (0, housenumber)
    return (int(digits.group()), housenumber[digits.end():])


def update_housenumber(num_string):
    """Expand a raw "addr:housenumber" value into a list of house numbers.

    Args:
        num_string: the raw value, e.g. '304', '30,32', '315;315A' or '25-30'.

    Returns:
        A list of strings:
        - a single number ('304', '12b') gives a one-element list;
        - an ascending numeric range ('25-30') gives every number in the
          range, both ends included;
        - a comma or semicolon separated listing gives its items, ordered
          numerically and then by letter suffix;
        - anything else (names, letter ranges such as '48a-48d', descending
          ranges such as '30-25') gives a one-element list holding the input,
          so that it can be flagged for manual review.

    If the input contains a digit directly followed by a capital letter, the
    whole string is lower-cased before any other processing.
    """

    # Standardize each letter suffix into lowercase
    if num_capital.search(num_string):
        num_string = num_string.lower()

    # A single house number is already in the target format
    if num_single.search(num_string):
        return [num_string]

    # Numeric range: enumerate it (int() tolerates the optional whitespace)
    if num_dash.search(num_string):
        lower_bound, upper_bound = [int(part) for part in num_string.split("-")]
        if lower_bound > upper_bound:
            # A descending range would expand to an empty list, which carries
            # no information and breaks callers that inspect the first
            # element; hand the raw value back instead.
            return [num_string]
        return [str(number) for number in range(lower_bound, upper_bound + 1)]

    # Comma or semicolon listing: split into the individual numbers
    for pattern, separator in ((num_comma, ","), (num_semicolon, ";")):
        if pattern.search(num_string):
            housenumber_list = [item.strip() for item in num_string.split(separator)]
            # Sort numerically; a plain string sort would put '10' before '9'
            housenumber_list.sort(key=_housenumber_sort_key)
            return housenumber_list

    # Unrecognised format: return the value untouched
    return [num_string]

In [4]:
# Testing the function
# Inputs the function is able to standardize
print(update_housenumber('304'))
print(update_housenumber('30,32'))
print(update_housenumber('315,315A,315B'))
print(update_housenumber('25-30'))
print(update_housenumber('21 - 24'))
print("")
# Inputs that come back as a single, unexpanded element
print(update_housenumber('64-64a'))
print(update_housenumber('48A-48D'))
print(update_housenumber('G413'))
print(update_housenumber('Greentiles'))
print(update_housenumber('William Baker House'))

['304']
['30', '32']
['315', '315a', '315b']
['25', '26', '27', '28', '29', '30']
['21', '22', '23', '24']

['64-64a']
['48a-48d']
['G413']
['Greentiles']
['William Baker House']


In [5]:
## Writing a function that checks whether an updated house number follows 
## the pre-defined standard format (i.e. list of discrete numbers in string format)


def std_housenumber(num_string_list):
    """Tell whether a house number list is in the standard format.

    The standard format is a non-empty list in which every element is a
    single house number: digits optionally followed by one lowercase letter
    (the ``num_single`` pattern).

    Args:
        num_string_list: list of strings, typically the return value of
            update_housenumber.

    Returns:
        True if the list is non-empty and every element is a single house
        number, False otherwise (including for an empty list).
    """

    # An empty list holds no house number at all, so it cannot be standard
    if not num_string_list:
        return False

    # Check every element rather than trusting the length of the list
    return all(num_single.search(elem) is not None for elem in num_string_list)

In [6]:
# Testing the function
# Lists in the standard format
print(std_housenumber(['304']))
print(std_housenumber(['304b']))
print(std_housenumber(['21', '22', '23', '24']))
print(std_housenumber(['315', '315a', '315b']))
print("")
# Single leftovers that still need manual attention
print(std_housenumber(['64-64a']))
print(std_housenumber(['48a-48d']))
print(std_housenumber(['G413']))
print(std_housenumber(['William Baker House']))

True
True
True
True

False
False
False
False


### Updating postcodes

Auditing suggests that most postcodes follow a standard format, with a few exceptions. Hence, it will be more effective to identify these exceptions for manual checking and cleaning than to attempt systematic fixes.

In [14]:
## Writing a function that checks whether a postcode follows 
## the pre-defined standard format

# UK postcode layout: 1-2 area letters, a district made of one digit plus an
# optional second digit or letter, one whitespace character, then one digit
# and two letters (e.g. 'OX2 7BY', 'OX29 8DJ', 'M1 1AE', 'W1A 1AA').
postcode_std = re.compile(r'^[A-Z]{1,2}\d[A-Z\d]?\s\d[A-Z]{2}$')

def std_postcode(num_string):
    """Tell whether a postcode follows the standard UK layout.

    Args:
        num_string: the raw "addr:postcode" value.

    Returns:
        True if the value matches ``postcode_std``, False otherwise. Values
        that fail (missing space, foreign postcodes, over-long digit groups)
        are the ones to check by hand.
    """
    return postcode_std.search(num_string) is not None

In [15]:
# Testing the function
# Postcodes in the standard format
print(std_postcode('OX2 7BY'))
print(std_postcode('OX29 8DJ'))
print("")
# Postcodes that should be flagged
print(std_postcode('OX26LE'))
print(std_postcode('85150'))
print(std_postcode('38-524'))

True
True

False
False
False


### Updating street names

Some auditing reveals that many street names in the dataset do not follow the desired standard format (street type specified at the end). Due to context and cultural differences, however, no simple programmatic fix is possible for all such names. Abbreviations, on the other hand, can be fixed programmatically. Let's see which abbreviations need fixing.

In [17]:
# Matches the last whitespace-free token of a street name
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Tally how often each trailing token occurs among the audited street names
street_type_count = {}
for street_name in all_k_v_dict["addr:street"]:
    m = street_type_re.search(street_name)
    if not m:
        continue
    street_type = m.group()
    street_type_count[street_type] = street_type_count.get(street_type, 0) + 1

# Split the tokens by frequency: more than 5 occurrences counts as common
common_street_types = []
uncommon_street_types = []
for street_type in street_type_count:
    if street_type_count[street_type] <= 5:
        uncommon_street_types.append(street_type)
    else:
        common_street_types.append(street_type)

In [18]:
# Sort in place so the printed list is easy to scan
common_street_types.sort()
print(common_street_types)

['Avenue', 'Close', 'Court', 'Crescent', 'Drive', 'Lane', 'Place', 'Road', 'Street', 'Way']


In [19]:
# Sort in place so the printed list is easy to scan
uncommon_street_types.sort()
print(uncommon_street_types)

['2', '4', "Aldate's", 'Birkheide', 'Broadway', 'Calle', 'Chorefields', u'Christian-Schaurte-Stra\xdfe', 'Circle', 'Clements', u'Dorfstra\xdfe', 'Down', 'Driftway', 'Furze', 'Gardens', 'Giles', "Giles'", 'Glebe', 'Grates', 'Ground', 'Hill', 'Hollow', 'Lemerje', 'Lohweg', 'Mead', 'Meadow', 'Mews', 'Moulin', u'M\xfchlenfeld', 'Parade', 'Park', 'Phelps', 'Pike', 'Point', 'Quarter', 'Quay', 'Rd', 'Rise', 'Roundway', 'Row', 'Slade', 'Square', 'St', 'Terrace', 'Town', 'Turn', 'Valencia', 'Walk', 'Winnyards', 'Woodfield', 'Zielona', u'g.']


Abbreviations such as "Rd" and "St" need to be fixed.

In [20]:
# Writing a function that fixes abbreviated street types


def update_street(street_name):
    """Expand an abbreviated street type at the end of a street name.

    Args:
        street_name: the raw street name, e.g. 'Oxford Rd' or 'High St.'.

    Returns:
        The street name with a trailing 'St' or 'Rd' (with or without a
        final period) replaced by 'Street' or 'Road'. Any other name is
        returned unchanged.
    """

    mapping = { "St": "Street",
                "Rd" : "Road" }

    m = street_type_re.search(street_name)
    if m:
        # The pattern lets the token end in a period, so drop it before the
        # lookup; otherwise 'Rd.' and 'St.' would never be recognised.
        street_type = m.group().rstrip(".")
        if street_type in mapping:
            # Replace exactly the matched span, keeping whatever surrounds it
            street_name = (street_name[:m.start()]
                           + mapping[street_type]
                           + street_name[m.end():])

    return street_name

In [21]:
# Testing the function
# Abbreviated street types that get expanded
print(update_street('Oxford Rd'))
print(update_street('High St'))
print("")
# Names that must come back unchanged
print(update_street('Hill Top Road'))
print(update_street('Hillsborough Close'))
print(update_street('Holywell Street'))
print(update_street('Hume Bedford Pike'))

Oxford Road
High Street

Hill Top Road
Hillsborough Close
Holywell Street
Hume Bedford Pike


For the reasons mentioned above, there is no need to write a function that checks whether a street name follows the pre-defined standard format (such standardization does not seem applicable to this data).

### Putting it all together: Transforming address-related data

In [22]:
# Writing a function that transforms address-related data


# A key made of lowercase letters/underscores with exactly one colon
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')


def transform_address_k_v(addr_k, addr_v):
    """Transform one address-related key/value pair.

    Key handling:
        - any key starting with "addr:substreet" becomes "substreet";
        - any other key with exactly one colon loses its first five
          characters (the length of "addr:"; the prefix itself is not
          checked);
        - every other key becomes the empty string.

    Value handling (based on the transformed key):
        - "housenumber" values go through update_housenumber;
        - "street" and "substreet" values go through update_street;
        - all other values are passed through unchanged.

    Returns:
        A (transformed_key, transformed_value) tuple.
    """

    #######  Key  #######

    if addr_k.startswith("addr:substreet"):
        # All substreet spelling variants collapse into one key
        new_key = "substreet"
    elif lower_colon.search(addr_k):
        # Drop the leading prefix, keep what follows the colon
        new_key = addr_k[len("addr:"):]
    else:
        new_key = ""

    #######  Value  #######

    if new_key == "housenumber":
        new_value = update_housenumber(addr_v)
    elif new_key in ("street", "substreet"):
        new_value = update_street(addr_v)
    else:
        # Nothing to clean for the remaining keys
        new_value = addr_v

    return (new_key, new_value)