In [14]:
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict

# Path of the .osm XML file that the cells below parse.
filename = "small_sample.osm"

In [6]:
def count_tags(filename):
    """Tally how often each element tag name occurs in an XML file.

    Args:
        filename: Path or file object accepted by ``ET.iterparse``.

    Returns:
        A plain dict mapping each tag name to its number of occurrences.
    """
    tally = {}
    for _, node in ET.iterparse(filename):
        name = node.tag
        # dict.get supplies the starting count for a tag seen for the first time.
        tally[name] = tally.get(name, 0) + 1
    return tally

# Pretty-print the per-tag occurrence counts for the sample file.
pprint.pprint(count_tags(filename))

{'member': 108,
 'nd': 19106,
 'node': 16924,
 'osm': 1,
 'relation': 10,
 'tag': 8515,
 'way': 1790}


In [10]:
# Keys made up solely of lowercase ASCII letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one colon separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Keys containing at least one of the listed punctuation/whitespace characters.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    The patterns are tried in the order ``lower``, ``lower_colon``,
    ``problemchars``; the first one that matches wins, and a key matching
    none of them is counted under ``'other'``. Elements whose tag is not
    ``"tag"`` leave the counters untouched.

    Args:
        element: Element exposing ``.tag`` and ``.get``.
        keys: Dict with integer counters under ``'lower'``, ``'lower_colon'``,
            ``'problemchars'`` and ``'other'``; it is updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.get('k')
    buckets = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for bucket, pattern in buckets:
        if pattern.search(key):
            keys[bucket] += 1
            break
    else:
        # No pattern matched.
        keys['other'] += 1
    return keys



def process_map(filename):
    """Count tag keys in an XML file by the category ``key_type`` assigns.

    Args:
        filename: Path or file object accepted by ``ET.iterparse``.

    Returns:
        Dict with integer counts under ``'lower'``, ``'lower_colon'``,
        ``'problemchars'`` and ``'other'``.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

# Pretty-print the result of process_map for the sample file.
pprint.pprint(process_map(filename))

{'lower': 3878, 'lower_colon': 4403, 'other': 234, 'problemchars': 0}


In [13]:
def process_map(filename):
    """Collect the distinct non-empty ``uid`` attribute values in an XML file.

    NOTE: this definition rebinds the name ``process_map``, replacing the
    key-classifying function of the same name defined in an earlier cell.

    Args:
        filename: Path or file object accepted by ``ET.iterparse``.

    Returns:
        A set of ``uid`` strings taken from every element that carries one.
    """
    uids = (elem.get('uid') for _, elem in ET.iterparse(filename))
    # Elements without a uid yield None; those and empty strings are skipped.
    return set(uid for uid in uids if uid)

# Number of distinct uids in the sample file. The parenthesized call prints the
# same text under Python 2 and is also valid syntax under Python 3.
print(len(process_map(filename)))

369


In [19]:
# Matches the last whitespace-free run of a street name (its "street type").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already in their accepted, spelled-out form.
expected = [
    "Street", "Avenue", "Boulevard", "Drive",
    "Court", "Place", "Square", "Lane",
    "Road", "Trail", "Parkway", "Commons",
]


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    Args:
        street_types: Mapping from street type to a set of street names;
            indexed with ``street_types[street_type]``, so a missing key must
            be creatable (e.g. a ``defaultdict(set)``). Updated in place.
        street_name: Street name to inspect.

    Returns:
        None. Names with no match, or whose type is listed in ``expected``,
        are ignored.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when the element's ``k`` attribute equals ``"addr:street"``.

    Raises:
        KeyError: If the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Group street names in an XML file by their unexpected street type.

    Every ``node`` and ``way`` element is scanned for descendant ``tag``
    elements whose key is ``addr:street`` (as decided by ``is_street_name``);
    each such tag's ``v`` value is handed to ``audit_street_type``.

    Args:
        osmfile: Path or file object accepted by ``ET.iterparse``.

    Returns:
        A ``defaultdict(set)`` filled in by ``audit_street_type``.
    """
    street_types = defaultdict(set)
    # Iterate on "end" events: at "start" the parser has only seen the opening
    # tag, so the element's children may not be attached yet and street tags
    # could be skipped. At "end" the element is fully populated.
    for _, elem in ET.iterparse(osmfile, events=("end",)):
        if elem.tag in ("node", "way"):
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types

# Run the street audit on the sample file and show the result as a plain dict.
street_types = audit(filename)
pprint.pprint(dict(street_types))

{'Dr.': set(['Tidal Dr.']),
 'Highway': set(['Middle Highway']),
 'Rd': set(['Airport Rd', 'Davisville Rd', 'Dexter Rd']),
 'Way': set(['Commerce Way', 'Minuteman Way'])}


In [None]:
# Abbreviated street types and the spelled-out word each is rewritten to.
# Every abbreviation is listed with and without a trailing period; "Dr."
# is included because the audit output above reports 'Tidal Dr.'.
mapping = {
    "St": "Street",
    "St.": "Street",
    "Rd": "Road",
    "Rd.": "Road",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Dr": "Drive",
    "Dr.": "Drive",
}

def update_name(name, mapping):
    """Rewrite the trailing street type of ``name`` using ``mapping``.

    Args:
        name: Street name to normalise.
        mapping: Dict from abbreviated street type to its replacement.

    Returns:
        ``name`` with the street type matched by ``street_type_re`` swapped
        for ``mapping[street_type]``. The name is returned unchanged when the
        pattern finds no match or the matched type is not a key of ``mapping``.
    """
    match = street_type_re.search(name)
    if match is None:
        # Nothing that looks like a street type (e.g. an empty string).
        return name

    street_type = match.group()
    if street_type in mapping:
        # Splice at the matched span rather than using str.replace, which
        # would rewrite the first occurrence of the same text earlier in the
        # name ("Stone St" -> "Streetone St"). Text after the match (a
        # trailing newline, which `$` tolerates) is preserved.
        name = name[:match.start()] + mapping[street_type] + name[match.end():]

    return name