In [2]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
import os
import pymongo

In [12]:
# Relative path of the OSM XML extract that the cells below parse.
shh_data = "brooklyn_new-york.osm"

In [14]:
#Parse through the file with ElementTree and count the number of unique element types to understand overall structure.
def count_tags(filename):
    """Count how often each element tag name occurs in an XML file.

    The file is streamed with ``ET.iterparse`` using its default events,
    and every element yielded by the parser contributes one to the tally
    kept under its tag name.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        A plain ``dict`` mapping tag name -> number of occurrences.
    """
    tallies = {}
    for _event, node in ET.iterparse(filename):
        # dict.get supplies 0 the first time a tag name is seen.
        tallies[node.tag] = tallies.get(node.tag, 0) + 1
    return tallies
# Tally the element types found in the extract and pretty-print the counts.
shh_tags = count_tags(shh_data)
pprint.pprint(shh_tags)

{'bounds': 1,
 'member': 14551,
 'nd': 3494969,
 'node': 2484785,
 'osm': 1,
 'relation': 1701,
 'tag': 2819240,
 'way': 490294}


In [15]:
import re

# Key made up solely of lowercase ASCII letters and underscores
# (the empty string also matches, because of the ``*``).
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, but with exactly one colon separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Key containing at least one of the listed punctuation/whitespace characters.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``<tag>`` element and bump its counter.

    Elements whose tag name is not ``"tag"`` are ignored.  For a ``<tag>``
    element, the element itself (and any nested ``<tag>`` descendants) is
    tested against ``lower``, ``lower_colon`` and ``problemchars`` in that
    order; the first pattern that matches decides the category, and keys
    matching none of them are counted as ``'other'``.

    Args:
        element: an ElementTree element.
        keys: dict with integer counters under ``'lower'``,
            ``'lower_colon'``, ``'problemchars'`` and ``'other'``.
            It is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    for tag_elem in element.iter('tag'):
        key_name = tag_elem.get('k')
        # Order matters: the first matching pattern wins.
        if lower.search(key_name):
            category = 'lower'
        elif lower_colon.search(key_name):
            category = 'lower_colon'
        elif problemchars.search(key_name):
            category = 'problemchars'
        else:
            category = 'other'
        keys[category] += 1
    return keys


def process_map(filename):
    """Tally the tag-key categories for every element in an XML file.

    Each element yielded by ``ET.iterparse`` is handed to ``key_type``,
    which increments the matching counter.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        dict with the counters ``'lower'``, ``'lower_colon'``,
        ``'problemchars'`` and ``'other'``.
    """
    tallies = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, node in ET.iterparse(filename):
        tallies = key_type(node, tallies)
    return tallies

# Classify every tag key in the extract and pretty-print the per-category totals.
shh_keys = process_map(shh_data)
pprint.pprint(shh_keys)

{'lower': 1052673,
 'lower_colon': 1745044,
 'other': 7239,
 'problemchars': 14284}


In [16]:
# People involved in the map editing.
def process_map(filename):
    """Collect the distinct ``uid`` attribute values found in an XML file.

    For every element yielded by ``ET.iterparse``, the element's direct
    children are inspected and the ``uid`` of each child that carries one
    is recorded.

    Note: this definition reuses the name ``process_map`` and therefore
    replaces the key-tallying function of the same name defined in an
    earlier cell.

    Args:
        filename: path (or file object) accepted by ``ET.iterparse``.

    Returns:
        A ``set`` of uid strings.
    """
    uids = set()
    for _event, parent in ET.iterparse(filename):
        uids.update(
            child.attrib['uid'] for child in parent if 'uid' in child.attrib
        )
    return uids
# Gather the distinct uid values from the extract; the cell's output is their count.
users = process_map(shh_data)
len(users)

1384

In [17]:
from collections import defaultdict

# Matches the trailing run of non-whitespace characters that begins at a word
# boundary, i.e. the candidate "street type" at the end of a street name.
# \S+ is greedy, so trailing punctuation such as "." or "," is part of the match.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types considered acceptable as-is; audit_street_type does not report
# names whose trailing token is in this list (exact, case-sensitive comparison).
expected = ["Avenue", "Boulevard", "Commons", "Court", "Drive", "Lane", "Parkway", 
                         "Place", "Road", "Square", "Street", "Trail"]

# Trailing token -> replacement street type, consumed by update_name.
# Lookups are exact and case-sensitive, hence separate 'Ave' and 'ave' entries.
# NOTE(review): 'Circle' and 'Highway' are replacement values here but are not
# in `expected`, so names rewritten to those types would still be reported by
# audit_street_type -- confirm whether that is intended.
# NOTE(review): 'Hwg' and 'Cr' -> "Court" look like data-specific variants -- verify.
mapping = {'Ave'  : 'Avenue',
           'Blvd' : 'Boulevard',
           'Dr'   : 'Drive',
           'Ln'   : 'Lane',
           'Pkwy' : 'Parkway',
           'Rd'   : 'Road',
           'Rd.'   : 'Road',
           'St'   : 'Street',
           'street' :"Street",
           'Ct'   : "Court",
           'Cir'  : "Circle",
           'Cr'   : "Court",
           'ave'  : 'Avenue',
           'Hwg'  : 'Highway',
           'Hwy'  : 'Highway',
           'Sq'   : "Square"}


In [18]:
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its trailing token if that token is unexpected.

    The trailing token is whatever ``street_type_re`` matches.  Names with
    no match, or whose token is listed in ``expected``, are left out.

    Args:
        street_types: mapping of token -> set of street names; must create
            missing sets on access (e.g. ``defaultdict(set)``).  Updated in
            place.
        street_name: the street name string to inspect.

    Returns:
        None.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

def is_street_name(elem):
    """Tell whether ``elem`` is an ``addr:street`` tag.

    Args:
        elem: object exposing an ``attrib`` mapping that contains ``'k'``.

    Returns:
        True when the ``k`` attribute equals ``"addr:street"``, else False.

    Raises:
        KeyError: if ``elem.attrib`` has no ``'k'`` entry.
    """
    key_name = elem.attrib['k']
    return key_name == "addr:street"

def audit(osmfile):
    """Group the unexpected street names of an OSM file by their trailing token.

    Every ``<tag>`` with ``k="addr:street"`` found under a ``<node>`` or
    ``<way>`` is passed to ``audit_street_type``, which files the street
    name under its trailing token unless that token is expected.

    Args:
        osmfile: path of the OSM XML file to read.

    Returns:
        ``defaultdict(set)`` mapping trailing token -> set of street names.
    """
    street_types = defaultdict(set)
    # The with-block guarantees the handle is closed, even if parsing fails.
    # Binary mode hands the raw bytes to the XML parser.
    with open(osmfile, "rb") as osm_file:
        # Use "end" events: on a "start" event only the element's attributes
        # are guaranteed to be available, so its <tag> children may not have
        # been parsed yet and street names could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

    return street_types

In [19]:
# Run the street-name audit over the whole extract.
shh_street_types = audit(shh_data)

In [20]:
# Copy the defaultdict returned by audit() into a plain dict before pretty-printing.
pprint.pprint(dict(shh_street_types))

{'1': set(['Graham Avenue #1']),
 '11217': set(['305 Schermerhorn St., Brooklyn, NY 11217']),
 '1st': set(['1st']),
 '218650358': set(['218650358']),
 '3': set(['Hanover Square #3']),
 '300': set(['Ste 300']),
 '4B': set(['Union Avenue 4B']),
 '500': set(['Main St., Suite 500']),
 '861': set(['861']),
 'A': set(['Avenue A']),
 'Alley': set(['Cortlandt Alley',
               'Exchange Alley',
               'Freeman Alley',
               'Grace Court Alley',
               'Harrison Alley',
               'Mac Dougal Alley',
               'Theater Alley']),
 'Americas': set(['Avenue Of The Americas',
                  'Avenue of the Americas',
                  'Avenue of the Americas\n']),
 'Atrium': set(['Broadway Atrium']),
 'Ave': set(['4th Ave', '5th Ave', '6th Ave', 'Norman Ave', 'Park Ave']),
 'Ave.': set(['Washington Ave.']),
 'Avene': set(['Nostrand Avene']),
 'Avenue,': set(['70th Avenue,']),
 'B': set(['Avenue B']),
 'Bayside': set(['Bayside']),
 'Blvd': set(['Marin Blvd', 

In [22]:
def update_name(name, mapping, regex):
    """Replace the street-type token of ``name`` using ``mapping``.

    Args:
        name: street name string.
        mapping: dict of token -> replacement text (exact, case-sensitive
            lookup).
        regex: compiled pattern that locates the token to replace.

    Returns:
        ``name`` with the text matched by ``regex`` substituted by its
        mapped replacement, or ``name`` unchanged when the pattern does not
        match or the matched token has no entry in ``mapping``.
    """
    match = regex.search(name)
    if match is None:
        return name

    token = match.group()
    if token not in mapping:
        return name

    return regex.sub(mapping[token], name)

# Preview update_name on every street name collected by the audit, printing
# "original => updated" for each one.
# Python 2 only: relies on dict.iteritems() and the print statement.
for street_type, ways in shh_street_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping, street_type_re)
        print name, "=>", better_name

Columbia Heights => Columbia Heights
Aviation Rd => Aviation Road
West 8th Steet => West 8th Steet
Linden Boulevard Outer Eb Rb => Linden Boulevard Outer Eb Rb
Bedford avenue => Bedford avenue
2nd avenue => 2nd avenue
Utica avenue => Utica avenue
Prospect Park Southwest => Prospect Park Southwest
Graham Avenue #1 => Graham Avenue #1
Greenwich Mews => Greenwich Mews
Washington Mews => Washington Mews
Willoughby => Willoughby
Brighton 2nd Path => Brighton 2nd Path
Brighton 10th Path => Brighton 10th Path
Brighton 1st Path => Brighton 1st Path
Avenue D => Avenue D
Avenue H => Avenue H
Avenue L => Avenue L
Avenue P => Avenue P
Avenue T => Avenue T
Avenue X => Avenue X
Northside Piers => Northside Piers
Avenue Of The Finest => Avenue Of The Finest
Exchange Alley => Exchange Alley
Cortlandt Alley => Cortlandt Alley
Mac Dougal Alley => Mac Dougal Alley
Grace Court Alley => Grace Court Alley
Freeman Alley => Freeman Alley
Harrison Alley => Harrison Alley
Theater Alley => Theater Alley
bus_stop