In [163]:
import codecs
import json
import re

import IPython.display as disp
#from lxml import etree as ET
import xml.etree.cElementTree as ET

from collections import defaultdict
from enum import IntEnum
from pprint import pprint

# P3: Wrangle OpenStreetMap Data

In [188]:
class AuditXml(object):
    """
    Used to audit openstreetmaps xml files.
    
    Examples
    ----------
    example_audit = AuditXml("example1.osm")
    example_audit.run()
    example_audit.summary()
    """
    class Options(IntEnum):
        """
        Enum options to pass into AuditXml during instantiation.
        The following options can be OR'd together.
        """
        tag_frequency = 1,
        key_frequency = 2,
        key_names = 4,
        street_analysis = 8
        
    
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]+')
    street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
    
    street_expected = ['Alley', 'Avenue', 'Boulevard', 'Center', 'Circle',
                       'Commons', 'Court', 'Drive', 'Highway', 'Lane',
                       'Parkway', 'Place', 'Plaza', 'Road', 'Square',
                       'Street', 'Terrace', 'Trail', 'Vista', 'Walk', 'Way']
    
    def __init__(self, filename):
        self.filename = filename
        self.tags_found_dict = {}
        self.keys_found_dict = {}
        self.key_names_audit = {"lower": 0,
                                "lower_colon": 0,
                                "problemchars": 0,
                                "other": 0}
        self.street_types = defaultdict(set)
        
        self.all_options = 0
        
        for option in self.Options:
            self.all_options += option
        
    def run(self, options=None):
        if options is None:
            options = self.all_options

        for _, elem in ET.iterparse(self.filename):
            if options & self.Options.tag_frequency:
                self.option_tag_frequency(elem)
            
            if options & self.Options.key_frequency:
                self.option_key_frequency(elem)
                
            if options & self.Options.key_names:
                self.option_key_names(elem)
                
            if options & self.Options.street_analysis:
                self.option_street_analysis(elem)

            elem.clear()
    
    def option_tag_frequency(self, elem):
        """
        Find out how what kind of tags and how frequent do they occur.
        """
        self.count_names(elem.tag, self.tags_found_dict)
    
    def option_key_frequency(self, elem):
        """
        Find out what kind of key value pairs are their according to the 
        Open Street Maps Standard.
        """
        if elem.tag == "tag":
            key = elem.attrib["k"]
            self.count_names(key, self.keys_found_dict)
    
    def option_key_names(self, elem):
        """
        Find out what types of keys are out there and if they pose any threat
        to converting into JSON.
        """
        if elem.tag == "tag":
            match = False
            key = elem.attrib["k"]
            if self.lower.search(key):
                self.key_names_audit["lower"] += 1
                match = True

            if self.lower_colon.search(key):
                self.key_names_audit["lower_colon"] += 1
                match = True

            if self.problemchars.search(key):
                self.key_names_audit["problemchars"] += 1
                match = True

            if not match:
                self.key_names_audit["other"] += 1
    
    def option_street_analysis(self, elem):
        """
        Inspect values associated with the addr:street key and find out what type
        of unexpected street types are out there.
        """
        if elem.tag == "tag":
            if elem.attrib['k'] == "addr:street":
                street_name = elem.attrib['v']
                m = self.street_type_re.search(street_name)
                if m:
                    street_type = m.group()
                    if street_type not in self.street_expected:
                        self.street_types[street_type].add(street_name)

    def summary(self, options=None):
        
        if options is None:
            options = self.all_options

        if options & self.Options.tag_frequency:
            print("Tags found and their frequencies:----------------------")
            pprint(sorted(self.tags_found_dict.items(), key=lambda t: -t[1]))
            print("")

        if options & self.Options.key_frequency:
            print("Keys found and their frequencies:--------------------\n")
            print("Sorted by Name:----------------------------------------")
            pprint(sorted(self.keys_found_dict.items(), key=lambda t: t[0].lower()))
            print("")
            print("Sorted by Frequency------------------------------------")
            pprint(sorted(audit.keys_found_dict.items(), key=lambda t: -t[1]))
            print("")

        if options & self.Options.key_names:
            print("Types of keys:---------------------------------------")
            pprint(self.key_names_audit)
            print("")

        if options & self.Options.street_analysis:
            print("\nStreet name analysis:--------------------------------")
            pprint(sorted(self.street_types.items(), key=lambda t: t[0].lower()))
            print("")
        
    @staticmethod
    def count_names(name, mydict):
        """Builds a frequency dictionary of names passed in"""
        if name in mydict:
            mydict[name] += 1
        else:
            mydict[name] = 1


In [189]:
# Run on sample set to verify functionality.
example_audit = AuditXml("example1.osm")
example_audit.run()
example_audit.summary()

Tags found and their frequencies:----------------------
[('tag', 64),
 ('node', 25),
 ('nd', 11),
 ('member', 3),
 ('way', 2),
 ('bounds', 1),
 ('relation', 1),
 ('osm', 1)]

Keys found and their frequencies:--------------------

Sorted by Name:----------------------------------------
[('addr:city', 4),
 ('addr:country', 1),
 ('addr:housename', 1),
 ('addr:housenumber', 6),
 ('addr:postcode', 5),
 ('addr:state', 1),
 ('addr:street', 6),
 ('addr:street:name', 1),
 ('addr:street:prefix', 1),
 ('addr:street:type', 1),
 ('amenity', 5),
 ('building', 1),
 ('building:levels', 1),
 ('chicago:building_id', 1),
 ('cuisine', 4),
 ('highway', 2),
 ('name', 6),
 ('outdoor_seating', 3),
 ('phone', 4),
 ('restriction', 1),
 ('shop', 1),
 ('smoking', 3),
 ('source', 1),
 ('takeaway', 3),
 ('type', 1)]

Sorted by Frequency------------------------------------
[('building', 485012),
 ('highway', 373635),
 ('name', 255138),
 ('addr:housenumber', 197174),
 ('addr:street', 184889),
 ('tiger:county', 179036

In [190]:
# Run on large set
audit = AuditXml("san-francisco-bay_california.osm")
audit.run()

In [191]:
audit.summary(audit.Options.tag_frequency)

Tags found and their frequencies:----------------------
[('nd', 11237495),
 ('node', 9572721),
 ('tag', 4551092),
 ('way', 928003),
 ('member', 61917),
 ('relation', 6975),
 ('bounds', 1),
 ('osm', 1)]



In [192]:
audit.summary(audit.Options.key_names)

Types of keys:---------------------------------------
{'lower': 2052979,
 'lower_colon': 2352583,
 'other': 145349,
 'problemchars': 181}



In [193]:
audit.summary(audit.Options.key_frequency)

Keys found and their frequencies:--------------------

Sorted by Name:----------------------------------------
[('', 1),
 ('145', 1),
 ('24h', 2),
 ('3', 1),
 ('_Acres_', 8),
 ('_OBJNAME_', 3),
 ('_Shape_Area_', 8),
 ('_Shape_Leng_', 8),
 ('abandoned', 7),
 ('abandoned:aeroway', 8),
 ('abandoned:amenity', 2),
 ('abandoned:highway', 9),
 ('abutters', 116),
 ('Access', 3),
 ('access', 15922),
 ('access:backward', 4),
 ('access:bicycle', 12),
 ('access:bicycles', 1),
 ('access:boat', 1),
 ('access:conditional', 7),
 ('access:dog', 1),
 ('access:dogs', 6),
 ('access:foot', 7),
 ('access:horse', 3),
 ('access:lanes', 1),
 ('access:motor_vehicle', 1),
 ('access:vehicle', 1),
 ('accuracy:east', 2),
 ('accuracy:ellipsoid', 2),
 ('accuracy:north', 2),
 ('Acres', 16),
 ('acres', 5360),
 ('add', 1),
 ('addr.source:housenumber', 82),
 ('addr:1:housenumber', 192),
 ('addr:1:street', 5),
 ('addr:2:housenumber', 81),
 ('addr:3:housenumber', 44),
 ('addr:4:housenumber', 29),
 ('addr:4:street', 1),
 ('

In [194]:
audit.summary(audit.Options.street_analysis)


Street name analysis:--------------------------------
[('0.1', set(['Ala 680 PM 0.1'])),
 ('1',
  set(['10795 Hwy 1',
       '2030 Hwy 1',
       'California Highway 16, House No. 1',
       'State Hwy 1',
       'Stewart Drive Suite #1',
       'W Of Us 101 @ Jct Sr 1'])),
 ('10', set(['San Mateo 35 PM 10', 'South St #10'])),
 ('100', set(['Woodside Road, Suite 100'])),
 ('101',
  set(['Highway 101',
       'Nw Quad Lincoln Ave / Us 101',
       'Se Quad Smith Ranch Rd / Us 101'])),
 ('10675', set(['10675'])),
 ('11', set(['Fairview Rd #11'])),
 ('110', set(['West Angela Street, Suite 110'])),
 ('110,', set(['Promenade Circle #110,'])),
 ('114', set(['West Evelyn Avenue Suite #114'])),
 ('116',
  set(['Hwy 116',
       'In "Y" Jct Of Sr 121 / Sr 116',
       'W Side Of Us 101 @ Sr 116'])),
 ('12',
  set(['480 Highway 12',
       'Hawkins St #12',
       'Main Street At Sr 12',
       'Rustic St #12'])),
 ('120', set(['E. Hwy 120', 'East Highway 120'])),
 ('12180142', set(['12180142']

## Summary of Audit:

From the audit, I am choosing to continue addressing street abbreviations by adding more conversions to the work done in Lesson 6. I am also choosing to convert additional "addr:(?i)[a-z]." keys under the "address" name.

I have also noticed that there is abundunt amount of keys prefixed with "tiger". After further research, I have discovered that it is data that was imported into openstreetmap provided by the US Census during the early stages in openstreet development. "tiger:county" will be accepted as "address:county" and "tiger:zip_left" will be accepted as "address:postcode" if no values for those keys exist already.

In [183]:
class ShapeXmlToJson(object):
    """Shapes passed in Open Street Maps xml file to JSON"""
    
    attrtovar_toplevel = set(["id", "visible"])
    """Attributes to convert into elements under the constructed member"""
    
    attrtovar_created = set(["version", "changeset", "timestamp", "user", "uid"])
    """Attributes to convert to elements under the created name inside the member"""

    re_c = re.compile
    problemchars = re_c(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    street_type_re = re_c(r'\b\S+\.?$', re.IGNORECASE)
    address_type = re_c(r'^addr:')
    
    
    corrections =   [(re_c("a[bv]e\.?n?[ui]?e?$", re.IGNORECASE), "Avenue"),
                     (re_c("blvd\.?$", re.IGNORECASE), "Boulevard"),
                     (re_c("bouleva.*$", re.IGNORECASE), "Boulevard"),
                     (re_c("circ?l?e?\.?$", re.IGNORECASE), "Circle"),
                     (re_c("ct\.?$", re.IGNORECASE), "Court"),
                     (re_c("court$", re.IGNORECASE), "Court"),
                     (re_c("ctr\.?$", re.IGNORECASE), "Center"),
                     (re_c("center$", re.IGNORECASE), "Center"),
                     (re_c("dr\.?$", re.IGNORECASE), "Drive"),
                     (re_c("drive$", re.IGNORECASE), "Drive"),
                     (re_c("expwy\.?$", re.IGNORECASE), "Expressway"),
                     (re_c("expressway$", re.IGNORECASE), "Expressway"),
                     (re_c("hwy\.?$", re.IGNORECASE), "Highway"),
                     (re_c("highway$", re.IGNORECASE), "Highway"),
                     (re_c("ln\.?$", re.IGNORECASE), "Lane"),
                     (re_c("lane$", re.IGNORECASE), "Lane"),
                     (re_c("pkwy\.?$", re.IGNORECASE), "Parkway"),
                     (re_c("pl\.?$", re.IGNORECASE), "Place"),
                     (re_c("rd\.?$", re.IGNORECASE), "Road"),
                     (re_c("road$", re.IGNORECASE), "Road"),
                     (re_c("st\.?$", re.IGNORECASE), "Street"),
                     (re_c("street$", re.IGNORECASE), "Street"),
                     (re_c("terrace$", re.IGNORECASE), "Terrace"),
                     (re_c("way$", re.IGNORECASE), "Way")]
    """Mapping of corrections of street types"""
    
    def __init__(self, infile, outfile=None, store_to_var=False, pretty=False):
        self.source = infile
        self.store_to_var = store_to_var
        self.pretty = pretty
        self.data = []
        if outfile is None:
            self.outfile = "{0}.json".format(self.source)
    
    def shape(self):
        """Converts XML to JSON and writes it to file suffixed with .json"""
        with codecs.open(self.outfile, "w") as fo:
            for _, elem in ET.iterparse(self.source):
                el = self.shape_element(elem)
                if el is not None:
                    if self.store_to_var:
                        self.data.append(el)
                    if self.pretty:
                        fo.write(json.dumps(el, indent=2)+"\n")
                    else:
                        fo.write(json.dumps(el) + "\n")
    
    @staticmethod
    def shape_element(element):
        '''
        Converts passed in XML tag to JSON with each member following
        the structure:
        {
        "id": "261114295", 
        "visible": "true", 
        "type": "node", 
        "pos": [41.9730791, -87.6866303], # Optional
        "created": {
            "changeset": "11129782", 
            "user": "bbmiller", 
            "version": "7", 
            "uid": "451048", 
            "timestamp": "2012-03-28T18:31:23Z"
            }
        }
        '''
        if element.tag == "node" or element.tag == "way":
            node = {}
            created = {}
            pos = [None,None]
            node_refs = []

            node["created"] = created

            node["type"] = element.tag

            for k,v in element.attrib.iteritems():
                if k in ShapeXmlToJson.attrtovar_created:
                    node["created"][k] = v
                elif k == "lat":
                    pos[0] = float(v)
                elif k == "lon":
                    pos[1] = float(v)
                elif k in ShapeXmlToJson.attrtovar_toplevel:
                    node[k] = v
                else:
                    raise KeyError(k)
            if (pos[0] is not None) and (pos[1] is not None):
                node["pos"] = pos

            for tag in element.iter("tag"):
                k,v = tag.attrib['k'], tag.attrib['v']
                k = k.lower().strip()
                v = v.strip()

                if ShapeXmlToJson.problemchars.search(k):
                    print("------------Problem inserting {}:{}--------------".format(k,v))
                    print("Node:")
                    pprint(node)
                    print("-------------------------------------------------")
                    continue
                elif k == "address":
                    print("------------Ignoring Address Key---------------")
                    print("{}:{}".format(k,v))
                    pprint(node)
                    continue
                elif k == "addr:street":
                    if "address" not in node:
                        node["address"] = {}
                    val = ShapeXmlToJson.street_name_convert(v)
                    node["address"]["street"] = val
                elif k == "addr:housenumber":
                    if "address" not in node:
                        node["address"] = {}
                    node["address"]["housenumber"] = v
                elif k.count(':') == 1 and ShapeXmlToJson.address_type.match(k):
                    # Convert only keys that have one colon and ignore the rest for now
                    if "address" not in node:
                        node["address"] = {}
                    node["address"][k.split(':')[1]] = v
                elif k == "tiger:county":
                    # Convert old tiger data
                    if "address" not in node:
                        node["address"] = {}
                    elif "county" in node["address"]:
                        continue
                    if v.count(',') > 0:
                        v = v.split(',')[0]

                    node["address"]["county"] = v
                elif k == "tiger:zip_left":
                    # Convert old tiger data
                    if "address" not in node:
                        node["address"] = {}
                    elif "postcode" in node["address"]:
                        continue

                    node["address"]["postcode"] = v       
                else:
                    node[k] = v

            for tag in element.iter("nd"):
                ref = tag.attrib["ref"]
                node_refs.append(ref)

            if len(node_refs) != 0:
                node["node_refs"] = node_refs
            
            element.clear()
            return node
        else:
            return None
        
    @staticmethod
    def street_name_convert(orig):
        """Checks and converts street names with abbreviated
        street types to the full spelling
        """
        corrected = orig
        m = ShapeXmlToJson.street_type_re.search(orig)
        if m:
            street_type = m.group()
            for re_tuple in ShapeXmlToJson.corrections:
                match = re_tuple[0].match(street_type)
                if match:
                    # Don't change if string to be replaced is the same
                    if street_type != re_tuple[1]:
                        corrected = ShapeXmlToJson.street_type_re.sub(re_tuple[1], orig)
                        print("{} -> {}".format(orig, corrected))
                    break
        return corrected

In [184]:

example1osm_shape = ShapeXmlToJson("example1.osm", pretty=True, store_to_var=True)
example1osm_shape.shape()

correct_first_elem = {
    "id": "261114295", 
    "visible": "true", 
    "type": "node", 
    "pos": [41.9730791, -87.6866303], 
    "created": {
        "changeset": "11129782", 
        "user": "bbmiller", 
        "version": "7", 
        "uid": "451048", 
        "timestamp": "2012-03-28T18:31:23Z"
    }
}

data = example1osm_shape.data
pprint(data)

assert data[0] == correct_first_elem
assert data[-1]["address"] == {
                                "street": "West Lexington Street", 
                                "housenumber": "1412"
                                  }
assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                "2199822370", "2199822284", "2199822281"]


North Lincoln Ave -> North Lincoln Avenue
North Lincoln blvd. -> North Lincoln Boulevard
North Lincoln bouleva. -> North Lincoln Boulevard
N. Lincoln Ave -> N. Lincoln Avenue
Baldwin Rd. -> Baldwin Road
West Lexington St. -> West Lexington Street
[{'created': {'changeset': '11129782',
              'timestamp': '2012-03-28T18:31:23Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '7'},
  'id': '261114295',
  'pos': [41.9730791, -87.6866303],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8448766',
              'timestamp': '2011-06-15T17:04:54Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '6'},
  'id': '261114296',
  'pos': [41.9730416, -87.6878512],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8581395',
              'timestamp': '2011-06-29T14:14:14Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '5'},
  '

In [185]:
sf_bay_area = ShapeXmlToJson("san-francisco-bay_california.osm")
sf_bay_area.shape()

------------Problem inserting sfgov.org:objectid:16--------------
Node:
{'address': {'city': 'San Francisco',
             'housenumber': '1390',
             'postcode': '94102',
             'state': 'CA',
             'street': 'Market Street'},
 'amenity': 'post_office',
 'created': {'changeset': '22611423',
             'timestamp': '2014-05-29T04:29:32Z',
             'uid': '501715',
             'user': 'rkuris',
             'version': '7'},
 'id': '61689054',
 'name': 'Fox Plaza',
 'phone': '(415) 931-1053',
 'pos': [37.7774401, -122.4169146],
 'postal_code': '94102',
 'type': 'node'}
-------------------------------------------------
------------Problem inserting sfgov.org:office_typ:Post Office--------------
Node:
{'address': {'city': 'San Francisco',
             'housenumber': '1390',
             'postcode': '94102',
             'state': 'CA',
             'street': 'Market Street'},
 'amenity': 'post_office',
 'created': {'changeset': '22611423',
             'timestamp

In [3]:
def css_styling():
    styles = open("../css/custom.css", "r").read()
    return disp.HTML(styles)
css_styling()