In [None]:
import os
# Switch the working directory to the parent folder; the relative data path
# used below is resolved against it.
# NOTE: the move is relative to the current directory, so re-running this
# cell moves up one more level each time.
os.chdir("..")

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re

# Whole string consists only of lowercase letters and underscores (no colon).
# Note that the empty string also matches.
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore parts separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Three lowercase/underscore parts separated by exactly two colons.
lower_colon_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')

# Relative path of the file passed to the audit functions below.
SAMPLE = "oxford_sample.osm"

In [2]:
## Writing a function that returns unique tags and their 
## respective occurrence in an XML document


def check_tags(file_name):
    """Count how often each element tag occurs in an XML document.

    The document is streamed with ``iterparse`` so that it never has to be
    held in memory as a whole.

    Args:
        file_name: Path of the XML file to scan.

    Returns:
        A dict mapping each tag name to its number of occurrences. The root
        element itself is not counted.
    """
    tags = {}

    # Binary mode leaves decoding to the XML parser, and the ``with`` block
    # closes the file even when parsing raises.
    with open(file_name, "rb") as osm_file:
        events = ET.iterparse(osm_file, events=("start",))
        _, root = next(events)  # Grabbing the root element

        # Only "start" events were requested, so every event is one element.
        for _, elem in events:
            tags[elem.tag] = tags.get(elem.tag, 0) + 1
            root.clear()  # Freeing up memory by clearing the root element

    return tags

In [3]:
check_tags(SAMPLE)

{'member': 1332,
 'nd': 39364,
 'node': 27319,
 'relation': 101,
 'tag': 20963,
 'way': 5035}

In [3]:
# Writing a function that returns unique attribute keys for each tag


def check_attributes(file_name):
    """Collect the attribute names used by each element tag in an XML document.

    Args:
        file_name: Path of the XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each tag name to the set of attribute
        names seen on elements with that tag. The root element is skipped,
        and tags whose elements never carry an attribute get no entry.
    """
    attrib_dict = defaultdict(set)

    # Binary mode leaves decoding to the XML parser, and the ``with`` block
    # closes the file even when parsing raises.
    with open(file_name, "rb") as osm_file:
        events = ET.iterparse(osm_file, events=("start",))
        _, root = next(events)  # Grabbing the root element

        # Attributes are already populated when the "start" event fires.
        for _, elem in events:
            if elem.attrib:  # guard so attribute-less tags create no entry
                attrib_dict[elem.tag].update(elem.keys())
            # Clear once per element (not once per attribute) to free memory
            root.clear()

    return attrib_dict

In [5]:
check_attributes(SAMPLE)

defaultdict(set,
            {'member': {'ref', 'role', 'type'},
             'nd': {'ref'},
             'node': {'changeset',
              'id',
              'lat',
              'lon',
              'timestamp',
              'uid',
              'user',
              'version'},
             'relation': {'changeset',
              'id',
              'timestamp',
              'uid',
              'user',
              'version'},
             'tag': {'k', 'v'},
             'way': {'changeset',
              'id',
              'timestamp',
              'uid',
              'user',
              'version'}})

In [2]:
# Writing a function that returns unique "k" and "v" values in a dictionary form


def audit_k_v(file_name, parent_tag_list):
    """Collect the unique "v" values seen for every "k" of nested <tag> elements.

    Args:
        file_name: Path of the XML file to scan.
        parent_tag_list: Container of element tag names (e.g. ``["node"]``);
            only <tag> elements found inside elements with one of these tags
            are audited.

    Returns:
        A dict mapping each "k" attribute value to the set of "v" attribute
        values it was paired with.

    Raises:
        KeyError: If an audited <tag> element lacks a "k" or "v" attribute.
    """
    k_v_dict = {}

    # Binary mode leaves decoding to the XML parser, and the ``with`` block
    # closes the file even when parsing raises.
    with open(file_name, "rb") as osm_file:
        events = ET.iterparse(osm_file, events=("start", "end"))
        _, root = next(events)  # First event is the root element's "start"

        for event, elem in events:
            # Children are only guaranteed to be attached once the parent's
            # "end" event fires; on "start" they may still be unparsed, which
            # silently dropped tags at read-chunk boundaries.
            if event != "end" or elem is root:
                continue
            if elem.tag not in parent_tag_list:
                continue

            for tag in elem.iter("tag"):
                k_v_dict.setdefault(tag.attrib["k"], set()).add(tag.attrib["v"])

            root.clear()  # Freeing up memory by clearing the root element

    return k_v_dict

### Unique "k" and "v" values in "node" tags

In [10]:
node_k_v_dict = audit_k_v(SAMPLE, ["node"])
pprint.pprint(node_k_v_dict)

{'CAB:User_restictions': set(['Limited to patients of practice.']),
 'FIXME': set(['continue', 'please confrim exact location']),
 'Site:use': set(['Outreach surgery']),
 'Support_service': set(['Citizans Advice Bureau']),
 'access': set(['customers', 'no', 'permissive', 'private']),
 'access:hgv': set(['@ (18:00-10:00)']),
 'accessfrom': set(['Hawkins Street']),
 'addr:city': set(['Eynsham, Witney', 'Oxford']),
 'addr:country': set(['GB']),
 'addr:county': set(['Oxfordshire']),
 'addr:door': set(['Flats 13-24']),
 'addr:housename': set(['Courtney Pianos',
                        'Flat 4',
                        'Foundry House',
                        'Furnace House',
                        'M&S',
                        'North Parade Chambers',
                        'Oxford Railway Station',
                        'SoJo',
                        'Stroud Court Lodge',
                        'Summertown Pavilion']),
 'addr:housenumber': set(['1',
                          '1,2,5,

### Unique "k" and "v" values in "way" tags

In [11]:
way_k_v_dict = audit_k_v(SAMPLE, ["way"])
pprint.pprint(way_k_v_dict)

{'BUFF_DIST': set(['-1.0']),
 'FIXME': set(['Tag with more precise incline value',
               'incomplete',
               'needs a multi-source alignment check',
               'needs naming',
               'stub']),
 'FIXME:nsl': set(['inferred dual-carriageway NSL - remove this tag once verified; if this is a Special Road, remove the source:maxspeed tag and add motorroad=yes',
                   'inferred dual-carriageway NSL - remove this tag once verified;if this is a Special Road, remove the source:maxspeed tag and add motorroad=yes']),
 'Fixme': set(['New building here needing a survey',
               'this a cycle lane if there at all now.']),
 'Id': set(['0']),
 'NPLG:USRN:1': set(['29724068']),
 'ORIG_FID': set(['0', '282']),
 'abutters': set(['garages', 'industrial', 'residential', 'retail']),
 'access': set(['customer',
                'customers',
                'destination',
                'no',
                'permissive',
                'private',
           

### Unique "k" and "v" values in "relation" tags

In [12]:
relation_k_v_dict = audit_k_v(SAMPLE, ["relation"])
pprint.pprint(relation_k_v_dict)

{'IBGE:GEOCODIGO': set(['2926707']),
 'TMC:cid_58:tabcd_1:Class': set(['Road']),
 'TMC:cid_58:tabcd_1:LCLversion': set(['8.00']),
 'TMC:cid_58:tabcd_1:LocationCode': set(['35520']),
 'addr:city': set(['Oxford']),
 'addr:housenumber': set(['11']),
 'addr:postcode': set(['OX1 1DW', 'OX1 1SS']),
 'addr:street': set(['Floyds Row']),
 'admin_level': set(['10', '8']),
 'amenity': set(['school', 'training_and_support_centre', 'university']),
 'area': set(['yes']),
 'ascent': set(['1120 m']),
 'authoritative': set(['yes']),
 'boundary': set(['administrative', 'national_park', 'protected_area']),
 'building': set(['university', 'yes']),
 'color': set(['#000099']),
 'colour': set(['#0000FF', '#F39200']),
 'colour:arrow': set(['black']),
 'colour:back': set(['white']),
 'colour:text': set(['black']),
 'construction': set(['roadworks']),
 'description': set(['780 15 Mile Rd to Maple & Orchard Lake via Oakland Park Towers',
                     'Abingdon - Cumnor - Botley - Oxford - Iffley - Rose H

### Unique "k" and "v" values in all tags

In [3]:
all_k_v_dict = audit_k_v(SAMPLE, ["node", "way", "relation"])

In [7]:
# Examining all "k" values
# sorted() builds a new alphabetically ordered list of the dict's keys; unlike
# keys().sort() it does not rely on keys() returning a list (Python 2 only).
all_k_list = sorted(all_k_v_dict)
all_k_list

['BUFF_DIST',
 'CAB:User_restictions',
 'FIXME',
 'FIXME:nsl',
 'Fixme',
 'IBGE:GEOCODIGO',
 'Id',
 'NPLG:USRN:1',
 'ORIG_FID',
 'Site:use',
 'Support_service',
 'TMC:cid_58:tabcd_1:Class',
 'TMC:cid_58:tabcd_1:LCLversion',
 'TMC:cid_58:tabcd_1:LocationCode',
 'abutters',
 'access',
 'access:disabled',
 'access:hgv',
 'accessfrom',
 'addr:city',
 'addr:city:simc',
 'addr:country',
 'addr:county',
 'addr:door',
 'addr:housename',
 'addr:housenumber',
 'addr:inclusion',
 'addr:interpolation',
 'addr:place',
 'addr:postcode',
 'addr:state',
 'addr:street',
 'addr:substreet',
 'addr:substreet1',
 'addr:substreet:1',
 'addr:substreet:2',
 'addr:town',
 'addr:unit',
 'adjacent',
 'admin_level',
 'aeroway',
 'alt_name',
 'amenity',
 'amenity:disused',
 'answer',
 'area',
 'area:highway',
 'artwork_type',
 'ascent',
 'atm',
 'attribution',
 'authoritative',
 'backrest',
 'barrier',
 'basin',
 'bicycle',
 'bicycle:no:times',
 'bicycle:note',
 'bicycle_parking',
 'board_type',
 'boat',
 'boundar

In [11]:
# Looking up "v" values for each "k"
all_k_v_dict["source"]

{'3dShapes',
 ';Oxford City Council/Oxfordshire County Council Transport publications;site surveys',
 'BAG',
 'Bing',
 'Bing and Local Knowledge',
 'Bing, 2007/4',
 'Bing, local knowledge',
 'Bing; OS_OpenData_StreetView',
 'Bing; local knowledge',
 'Bing; survey',
 'Bing; www.admin.ox.ac.uk',
 'Bing; www.oxfordshire.gov.uk',
 'Bing; www.pembrokebridgingcenturies.org',
 'Bing;OS OpenData StreetView',
 'Bing;OSM gps point cloud;Oxford City Council/Oxfordshire County Council Transport publications;site surveys',
 'Bing;OSM_gps_data',
 'Bing;OS_OpenData_StreetView',
 'Bing;OS_OpenData_StreetView;yahoo_imagery',
 'Bing;gps_survey',
 'Bing;inference',
 'Bing;local_knowledge',
 'Bing;local_knowledge;inference',
 'Bing;local_knowledge;website',
 'Bing;osm_gps_points',
 'Bing;osm_gps_points;local_knowledge',
 'Bing;photograph',
 'Bing;photograph;local_knowledge',
 'Bing;survey',
 'Bing;survey;yahoo_imagery',
 'CANVEC',
 'DOP from WMS at geodaten.bayern.de',
 'GPS',
 'GPS Survey',
 'GPS; NPE',


## Structural Issues

### 1. Multilevel appearance

There are some "k" values that have both one-level and two-level appearances (e.g., "amenity", "amenity:disused"). Some resolution is necessary. First, let's check all values that are subject to this problem.

In [9]:
# Getting "k" values that have both one-level and two-level appearances

# Keys built only from lowercase letters and underscores (i.e. no colon)
lower = re.compile(r'^([a-z]|_)*$')

# For every colon-free key, look for longer keys that extend it with
# "<key>:"; record the base key together with all of its extensions.
k_multilevel = set()
for base_key in all_k_list:
    if not lower.search(base_key):
        continue
    colon_prefix = base_key + ":"
    extended_keys = [
        candidate for candidate in all_k_list
        if candidate.startswith(colon_prefix)
    ]
    if extended_keys:
        k_multilevel.add(base_key)
        k_multilevel.update(extended_keys)
k_multilevel

{'access',
 'access:disabled',
 'access:hgv',
 'amenity',
 'amenity:disused',
 'area',
 'area:highway',
 'bicycle',
 'bicycle:no:times',
 'bicycle:note',
 'building',
 'building:levels',
 'building:levels:underground',
 'building:material',
 'building:part',
 'building:roof',
 'building:ruian:type',
 'building:type',
 'building:use',
 'bus',
 'bus:lanes:forward',
 'capacity',
 'capacity:disabled',
 'capacity:private',
 'colour',
 'colour:arrow',
 'colour:back',
 'colour:text',
 'cycleway',
 'cycleway:hazard',
 'cycleway:left',
 'cycleway:left:priority',
 'cycleway:left:width',
 'cycleway:note',
 'cycleway:otherside:width',
 'cycleway:right',
 'cycleway:right:priority',
 'cycleway:right:width',
 'disused',
 'disused:amenity',
 'disused:highway',
 'disused:shop',
 'footway',
 'footway:left',
 'frequency',
 'frequency:4A',
 'garden',
 'garden:type',
 'is_in',
 'is_in:state',
 'landuse',
 'landuse:former',
 'lanes',
 'lanes:bus:backward',
 'lanes:bus:forward',
 'lanes:forward',
 'lanes:psv

Let's take a look at what "v" values each of these problem "k" values contains.

In [16]:
# Rebind the name to an alphabetically ordered list of the problem keys
k_multilevel = sorted(k_multilevel)

# Show each problem key followed by the set of "v" values recorded for it
for multilevel_key in k_multilevel:
    print(multilevel_key)
    pprint.pprint(all_k_v_dict[multilevel_key])
    print("\n")

access
set(['customer',
     'customers',
     'destination',
     'no',
     'permissive',
     'private',
     'public',
     'steps',
     'unknown',
     'yes'])


access:disabled
set(['yes'])


access:hgv
set(['@ (18:00-10:00)', 'delivery'])


amenity
set(['Wellbeing_service',
     'advice_service',
     'arts_centre',
     'atm',
     'ballot_box',
     'bank',
     'bar',
     'bench',
     'bicycle_parking',
     'bicycle_rental',
     'biergarten',
     'binstore',
     'bureau_de_change',
     'cafe',
     'childcare',
     'clock',
     'college',
     'community_centre',
     'compound',
     'dentist',
     'doctors',
     'fast_food',
     'fountain',
     'fuel',
     'grave_yard',
     'hide',
     'hospice',
     'hospital',
     'kindergarten',
     'library',
     'motorcycle_parking',
     'parking',
     'parking_entrance',
     'parking_space',
     'pharmacy',
     'place_of_worship',
     'post_box',
     'post_office',
     'pub',
     'public_building',
     '

### 2. Multiple colons

There are also some "k" values that contain more than one colon. Some resolution is necessary here too. As before, let's check all values that are subject to this problem. Since many such "k" values already appear in the list above (those with multilevel appearance), we can focus only on the non-overlapping values.

In [20]:
# Getting "k" values that contain more than one colon

# Two leading lowercase/underscore parts, each followed by a colon; whatever
# comes after the second colon is not constrained.
lower_many_colons = re.compile(r'^([a-z]|_)*:([a-z]|_)*:')

# Keys already listed in k_multilevel are left out
# to prevent redundancy in correction.
k_many_colons = sorted(
    candidate for candidate in all_k_list
    if lower_many_colons.search(candidate) and candidate not in k_multilevel
)
k_many_colons

['addr:city:simc',
 'addr:substreet:1',
 'addr:substreet:2',
 'not:name:note',
 'organisation:affiliation:National',
 'organisation:affiliation:region']

Thankfully, there remain just a handful of problem values. Let's take a look at what "v" values each of these problem "k" values contains.

In [21]:
# Keep the report in alphabetical order
k_many_colons.sort()

# Show each problem key followed by the set of "v" values recorded for it
for many_colon_key in k_many_colons:
    print(many_colon_key)
    pprint.pprint(all_k_v_dict[many_colon_key])
    print("\n")

addr:city:simc
set(['0440325'])


addr:substreet:1
set(['12', 'Hilltop Court', 'York Place'])


addr:substreet:2
set(['Anchor Court'])


not:name:note
set(['looks a lot like a typo, not c100% confirmed'])


organisation:affiliation:National
set(['The Citizens Advice service/Mae gwasanaeth Cyngor ar Bopeth'])


organisation:affiliation:region
set(['Consortium of Oxfordshire CAB'])




## Classifying Problem "k" Values

It would be ideal to individually audit all these problem "k" values and their corresponding "v" values, and come up with new, more efficient labeling/classification schemes (ideally two-level, since this seems to be the standard followed and suggested by OpenStreetMap). But for actual implementation, we need a programmatic default method to resolve these problems. After all, the list of problem "k" values we see here is derived from a *sample* of the original data set, and we cannot be certain that it is exhaustive - hence the need for a programmatic solution.
 
In order to retain as much information in the original data set as possible, we need to deploy a data model/structure that is flexibly nest-able. With a flexible data model that (almost) fully captures information in the original data set, we can later resume our data cleaning if needed. The first step to developing such a flexible data model is to further classify the problem "k" values so that a solution can be devised for each category.

In [6]:
# Writing a function that classifies problem "k" values


def classify_k_multilevel(file_name):
    """Find "k" values that also serve as a prefix of longer, colon-separated keys.

    Only keys matching the ``lower``, ``lower_colon`` or ``lower_colon_colon``
    patterns are considered; any other key is ignored.

    Args:
        file_name: Path of the XML file whose node/way/relation tags are audited.

    Returns:
        A tuple ``(k_multilevel_one, k_multilevel_two)`` of sets:
        colon-free keys that prefix a one- or two-colon key, and one-colon
        keys that prefix a two-colon key.
    """
    # Every unique "k" value found under node, way and relation elements
    unique_keys = audit_k_v(file_name, ["node", "way", "relation"]).keys()

    # Bucket the keys by how many colons they contain
    no_colon_keys, one_colon_keys, two_colon_keys = [], [], []
    for key in unique_keys:
        if lower.search(key):
            no_colon_keys.append(key)
        elif lower_colon.search(key):
            one_colon_keys.append(key)
        elif lower_colon_colon.search(key):
            two_colon_keys.append(key)

    def is_prefix_of_any(key, longer_keys):
        # True when at least one longer key begins with "<key>:"
        prefix = key + ":"
        return any(longer.startswith(prefix) for longer in longer_keys)

    # Colon-free keys that reappear as the first level of a longer key
    k_multilevel_one = set(
        key for key in no_colon_keys
        if is_prefix_of_any(key, one_colon_keys)
        or is_prefix_of_any(key, two_colon_keys)
    )

    # One-colon keys that reappear as the first two levels of a two-colon key
    k_multilevel_two = set(
        key for key in one_colon_keys
        if is_prefix_of_any(key, two_colon_keys)
    )

    return (k_multilevel_one, k_multilevel_two)

In [7]:
k_multilevel_one, k_multilevel_two = classify_k_multilevel(SAMPLE)

In [8]:
# "k" values with no colon that appear in other "k" values with a colon or two
k_multilevel_one

{'access',
 'amenity',
 'area',
 'bicycle',
 'building',
 'bus',
 'capacity',
 'colour',
 'cycleway',
 'disused',
 'footway',
 'garden',
 'is_in',
 'landuse',
 'lanes',
 'lcn',
 'level',
 'maxspeed',
 'maxweight',
 'motor_vehicle',
 'name',
 'note',
 'oneway',
 'opening_hours',
 'operator',
 'parking',
 'proposed',
 'public_transport',
 'railway',
 'ref',
 'source',
 'tourist_bus',
 'wheelchair'}

In [9]:
# "k" values with one colon that appear in other "k" values with two colons
k_multilevel_two

{'addr:city',
 'building:levels',
 'cycleway:left',
 'cycleway:right',
 'not:name',
 'source:addr'}