# P3: Wrangle OpenStreetMap Data
<p style="font-size:20px;font-style:italic">W. Alexander Jenkins</p><br/>
<p>Map Area: Savannah, Georgia, United States</p><br/>
<a href="https://www.openstreetmap.org/relation/119867">https://www.openstreetmap.org/relation/119867</a><br/>
<a href="https://mapzen.com/data/metro-extracts/#savannah-georgia">https://mapzen.com/data/metro-extracts/#savannah-georgia</a>

## Problems Encountered in the Map
<br/>
<li>House numbers are sometimes ranges or multiple numbers. There is one instance where the street is included in the house number field.</li>
<li>Street names have inconsistent types</li>
<li>There are duplicate items from the TIGER data import and the attempts to correct them</li>
<li>Phone number formats are inconsistent</li>
<li>Feature tags are related but one may not easily see the relations</li>
<li>Some OSM tags are deprecated or discarded features</li>

In [1]:
import xml.etree.cElementTree as ET
import re
import pprint

# Path of the OSM XML extract; the audit and shaping calls in this notebook all read this file.
datafile = 'savannah_georgia.osm'

"""
Returns a dictionary with the tag name as the key and number of times
this tag can be encountered in the map as value.
"""
def count_tags(filename):
    """Count how often each XML element name occurs in an OSM file.

    Args:
        filename: Path or open file object accepted by ``ET.iterparse``.

    Returns:
        dict mapping element name (e.g. ``'node'``, ``'way'``) to the
        number of times that element appears in the document.
    """
    tags = {}
    # With its default events, iterparse yields each element once, when its
    # closing tag has been read.
    for _, elem in ET.iterparse(filename):
        # dict.get replaces the `tag in tags.keys()` test, which builds a
        # throw-away list on every iteration under Python 2.
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the element name is needed: drop attributes and children so
        # a large extract does not accumulate in memory while counting.
        elem.clear()
    return tags
    
"""
Checks the "k" value for each "<tag>" and see if they can be valid keys
in MongoDB, as well as see if there are any other potential problems.

Returns a count of each of four tag categories in a dictionary:
  "lower", # of tags that contain only lowercase letters and are valid,
  "lower_colon", # of otherwise valid tags with a colon in their names,
  "problemchars", # of tags with problematic characters, and
  "other", # of tags that do not fall into the other three categories.
"""

# Key classes used to audit <tag k="..."> names before loading into MongoDB.
# Only lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Lowercase/underscore name containing exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# A single character from the set treated as problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the key of one ``<tag>`` element and bump its counter.

    Args:
        element: Parsed XML element; anything other than ``<tag>`` is
            ignored.
        keys: dict with the counters ``'lower'``, ``'lower_colon'``,
            ``'problemchars'`` and ``'other'``. It is updated in place.

    Returns:
        The same ``keys`` dict, so the call can be chained/reassigned.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    if lower.match(key):
        keys['lower'] += 1
    elif lower_colon.match(key):
        keys['lower_colon'] += 1
    # `problemchars` is an unanchored character class, so it has to be
    # *searched*: match() would only ever inspect the first character and
    # miss keys such as "name.en" or "bad key".
    elif problemchars.search(key):
        keys['problemchars'] += 1
    else:
        keys['other'] += 1
    return keys

def check_keys(filename):
    """Tally the key category of every ``<tag>`` found in an OSM file.

    Each parsed element is handed to ``key_type``, which increments one of
    the four counters when the element is a ``<tag>``.

    Returns:
        dict with the counters 'lower', 'lower_colon', 'problemchars'
        and 'other'.
    """
    tally = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, parsed in ET.iterparse(filename):
        tally = key_type(parsed, tally)
    return tally

# Pretty-print the per-element counts for the extract, then compute the
# tag-key category counts (the last expression is the cell's result).
pprint.pprint(count_tags(datafile))
check_keys(datafile)

{'bounds': 1,
 'member': 3115,
 'nd': 398680,
 'node': 333365,
 'osm': 1,
 'relation': 300,
 'tag': 141458,
 'way': 32203}


{'lower': 67181, 'lower_colon': 58905, 'other': 15372, 'problemchars': 0}

In [2]:
def count_other_keys(filename):
    """Count node/way tag keys that fit none of the three key classes.

    A key is skipped when it matches ``lower`` or ``lower_colon`` or
    contains a character from ``problemchars``; every remaining key is
    tallied.

    Args:
        filename: Path or open file object accepted by ``ET.iterparse``.

    Returns:
        dict mapping each unclassified key to its number of occurrences.
    """
    others = {}
    for _, element in ET.iterparse(filename):
        if element.tag not in ('node', 'way'):
            continue
        for tag in element.iter('tag'):
            key = tag.attrib['k']
            # problemchars is an unanchored character class: search() finds
            # an offending character anywhere in the key, whereas match()
            # would only look at the first character.
            if (lower.match(key) or lower_colon.match(key)
                    or problemchars.search(key)):
                continue
            # dict.get avoids the `key in others.keys()` list scan.
            others[key] = others.get(key, 0) + 1
    return others

# Occurrence count of each node/way tag key that count_other_keys leaves unclassified.
other_keys = count_other_keys(datafile)

def audit_keys(filename):
    """Count the well-formed tag keys used on nodes and ways.

    Only keys matching ``lower`` or ``lower_colon`` are tallied; all other
    keys are ignored.

    Returns:
        dict mapping key -> number of occurrences.
    """
    counts = {}
    for _event, parsed in ET.iterparse(filename):
        if parsed.tag not in ('node', 'way'):
            continue
        for child in parsed.iter('tag'):
            name = child.attrib['k']
            if not (lower.match(name) or lower_colon.match(name)):
                continue
            counts[name] = counts.get(name, 0) + 1
    return counts
               
# Occurrence count of each node/way tag key that matches `lower` or `lower_colon`.
unique_keys = audit_keys(datafile)

In [3]:
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function get_unique_users should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Return the element's ``uid`` attribute, or ``False`` if it has none."""
    return element.attrib.get('uid', False)


def get_unique_users(filename):
    """Collect the distinct ``uid`` values carried by elements of an OSM file.

    Elements for which ``get_user`` returns a falsy value are skipped.

    Returns:
        set of uid strings.
    """
    candidates = (get_user(parsed) for _, parsed in ET.iterparse(filename))
    return set(uid for uid in candidates if uid)

# Distinct contributor uids found anywhere in the extract.
unique_users = get_unique_users(datafile)

# Python 2 print statement reporting how many distinct uids were found.
print "There are " + str(len(unique_users)) + " unique users for Savannah, GA."

There are 310 unique users for Savannah, GA.


In [4]:
from collections import defaultdict

# Trailing run of non-whitespace characters that starts at a word boundary
# (optionally ending in '.') -- normally the last word of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# A value made up solely of digits.
house_number_re = re.compile(r'^[0-9]+$')
# Any single non-digit character (not referenced by the code shown in this notebook).
non_numeric_re = re.compile(r'[^0-9]')
# A run of one or more digits.
numeric_re = re.compile(r'[0-9]+')


# Street types that the street-name audit accepts without flagging.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Walk", "Way", "Expressway", "Circle"]

"""
Changes needed to fix the unexpected street types to the 
appropriate ones in the expected list. 
"""
# Abbreviated street type -> spelled-out form; every value is also in `expected`.
mapping = { "St": "Street",
            "Ave": "Avenue",
            "Blvd": "Boulevard",
            "Dr":"Drive",
            "Cir":"Circle",
            "Ct":"Court",
            "Pl":"Place",
            "Sq":"Square",
            "Ln":"Lane",
            "Rd":"Road",
            "Pkwy":"Parkway"
            }

def audit_street_type(street_types, street_name):
    """Record a street name whose street type is not in ``expected``.

    Args:
        street_types: mapping of street type -> set of street names
            (a ``defaultdict(set)`` in ``audit``); updated in place.
        street_name: value of an ``addr:street`` tag.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether a tag element's key is exactly ``addr:street``."""
    key = elem.attrib['k']
    return "addr:street" == key

def audit_house_num(numbers, housenumber):
    """Add ``housenumber`` to the ``numbers`` set unless it is purely numeric."""
    if not house_number_re.match(housenumber):
        numbers.add(housenumber)

def is_house_num(elem):
    """Tell whether a tag element's key is exactly ``addr:housenumber``."""
    key = elem.attrib['k']
    return "addr:housenumber" == key


def audit(osmfile):
    """Audit street types and house numbers of all nodes and ways.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        Tuple ``(street_types, numbers)``:
          * ``street_types`` -- ``defaultdict(set)`` mapping each street
            type not in ``expected`` to the street names that use it;
          * ``numbers`` -- set of house-number values that are not purely
            numeric.
    """
    street_types = defaultdict(set)
    numbers = set()
    # `with` closes the handle even if parsing fails (the file was
    # previously left open); binary mode leaves decoding to the XML parser.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: on "start" an element's children are not
        # guaranteed to have been parsed yet, so its <tag>s could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag not in ("node", "way"):
                continue
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                if is_house_num(tag):
                    audit_house_num(numbers, tag.attrib['v'])

    return street_types, numbers
# phone numbers

def need_update(name):
    """Return True when the street type found in ``name`` is not in ``expected``.

    Returns False when no street type can be extracted from ``name``.
    """
    found = street_type_re.search(name)
    return bool(found) and found.group() not in expected

"""
update_name (name, mapping)
ARGUMENTS:
  name:     street name (string)
  mapping:  changes needed to fix the unexpected street types (dict)
RETURNS: the fixed name (string)
"""
def update_name(name, mapping):
    """Replace the last word of a street name using ``mapping``.

    Args:
        name: Street name, e.g. ``"Bull St"``.
        mapping: dict of abbreviated street type -> replacement.

    Returns:
        The street name with its last word replaced. A last word that has
        no entry in ``mapping`` is returned unchanged instead of raising
        ``KeyError``; an abbreviation written with a trailing period
        (``"St."``) is looked up without the period.
    """
    # rpartition yields ('', '', name) when there is no space, so a
    # single-word name is treated as being entirely the street type.
    head, sep, ending = name.rpartition(' ')

    replacement = mapping.get(ending)
    if replacement is None:
        replacement = mapping.get(ending.rstrip('.'), ending)

    return head + sep + replacement

def fix_housenumber(housenumber):
    """Turn a house-number string into a tuple of number strings.

    A purely numeric value becomes a one-element tuple holding the value
    itself; anything else becomes the tuple of digit runs found in it.
    """
    if house_number_re.match(housenumber):
        return (housenumber,)
    return tuple(numeric_re.findall(housenumber))

# Phone number already in the normalized "NNN NNN NNNN" layout.
phone_re = re.compile(r'^[0-9]{3} [0-9]{3} [0-9]{4}$')

def format_phone(phonenumber):
    """Normalize a phone number to space-separated digit groups.

    All digits are extracted from ``phonenumber`` first. Then:
      * 7 digits       -> ``"NNN NNNN"``
      * 10 to 13 digits -> ``"NNN NNN NNNN"`` built from the last ten
        digits (any leading prefix digits are dropped)
      * anything else  -> ``None``
    """
    digits = "".join(numeric_re.findall(phonenumber))
    count = len(digits)
    if count == 7:
        return digits[:-4] + ' ' + digits[-4:]
    if 10 <= count < 14:
        return ' '.join((digits[-10:-7], digits[-7:-4], digits[-4:]))
    return None

# Run the audit: unexpected street types (with their street names) and non-numeric house numbers.
streets,numbers = audit(datafile)


In [5]:
import codecs
import json
"""
Transforms the shape of the data into the following model;
returns a list of dictionaries that look like this:
{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


# Key-classification patterns; identical to the ones compiled in the first cell.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element groups under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Convert a ``<node>`` or ``<way>`` element into a JSON-ready dict.

    The resulting document contains:
      * ``type``      -- ``"node"`` or ``"way"``;
      * ``created``   -- the attributes listed in ``CREATED``;
      * ``pos``       -- ``[lat, lon]`` as floats, only when the element
        has both coordinates;
      * every other element attribute as a top-level key;
      * tags whose key matches ``lower`` as top-level keys;
      * ``addr:*`` tags under ``address`` and ``gnis:*`` tags under
        ``gnis``; ``contact:*`` tags as top-level keys without the prefix
        (``contact:phone`` is normalized with ``format_phone``);
      * other single-colon tags under their full key;
      * ``node_refs`` -- the ``ref`` of every ``<nd>`` child (ways only).

    ``tiger:*`` tags, a tag named ``type``, keys containing a problematic
    character and keys with more than one colon are dropped.

    Args:
        element: Parsed XML element of any kind.

    Returns:
        The shaped dict, or ``None`` for elements other than node/way.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    created = {}
    for key, value in element.attrib.items():
        if key in CREATED:
            created[key] = value
        elif key not in ('lat', 'lon'):
            node[key] = value

    # Emit a position only when both coordinates exist. The previous
    # placeholder list ([0] + append) produced a bogus latitude of 0 for
    # an element that carried `lon` but no `lat`.
    if 'lat' in element.attrib and 'lon' in element.attrib:
        node['pos'] = [float(element.attrib['lat']),
                       float(element.attrib['lon'])]

    node['created'] = created

    addr = {}
    gnis = {}
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        val = tag.attrib['v']

        # A tag named "type" would overwrite the node/way marker set above.
        if key == 'type':
            continue
        # problemchars is an unanchored character class, so search() is
        # needed to detect an offending character anywhere in the key.
        if problemchars.search(key):
            continue

        # Fix data inconsistencies before storing the value.
        if is_house_num(tag):
            val = fix_housenumber(val)
        elif is_street_name(tag) and need_update(val):
            val = update_name(val, mapping)

        if lower.match(key):
            # NOTE(review): a plain `phone` tag lands here and is stored as
            # given; only `contact:phone` goes through format_phone below.
            node[key] = val
        elif lower_colon.match(key):
            # lower_colon guarantees exactly one colon.
            prefix, name = key.split(':')
            if prefix == 'tiger':
                continue
            elif prefix == 'addr':
                addr[name] = val
                node['address'] = addr
            elif prefix == 'gnis':
                gnis[name] = val
                node['gnis'] = gnis
            elif prefix == 'contact':
                if name == 'phone' and not phone_re.match(val):
                    val = format_phone(val)
                node[name] = val
            else:
                node[key] = val

    if element.tag == 'way':
        node_refs = [nd.attrib['ref'] for nd in element.iter('nd')]
        if node_refs:
            node['node_refs'] = node_refs
    return node


def format_data(file_in, pretty = False):
    """Shape every node/way of an OSM file and dump the result as JSON.

    One JSON document per shaped element is written to ``<file_in>.json``;
    the same documents are also collected in memory.

    Args:
        file_in: Path of the OSM XML file.
        pretty: When true, each document is written with a 2-space indent
            instead of on a single line.

    Returns:
        List of the shaped dicts, in parse order.
    """
    file_out = "{0}.json".format(file_in)
    # indent=None is json.dumps' default, i.e. the compact one-line form.
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                continue
            shaped.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return shaped

In [6]:
# Shape the extract; this also writes '<datafile>.json' as a side effect of format_data.
shaped_data = format_data(datafile)

In [11]:
from pymongo import MongoClient

# Connect to the MongoDB server at localhost:27017.
client = MongoClient("mongodb://localhost:27017")

# Work in database `osm_db`; documents live in its `osm` collection.
db = client.osm_db

# Remove every document currently in the collection before loading.
# NOTE(review): remove()/insert() are legacy PyMongo calls -- confirm they
# exist in the installed PyMongo version.
db.osm.remove({})

# Load all shaped node/way documents.
db.osm.insert(shaped_data)

# Fetch one stored document to inspect its shape.
db.osm.find_one()

{u'_id': ObjectId('56c939470be44463fd52c14d'),
 u'created': {u'changeset': u'31410812',
  u'timestamp': u'2015-05-24T03:04:10Z',
  u'uid': u'1962916',
  u'user': u'hokieengr',
  u'version': u'1'},
 u'id': u'3540439506',
 u'pos': [32.112124, -81.2340457],
 u'type': u'node'}

In [12]:
# Overview statistics of the loaded collection (Python 2 print statements).
print "Number of documents"
                                                
print db.osm.find().count()                                                
                                                
print "Number of nodes"
                                                
print db.osm.find({"type":"node"}).count()
                                                
print "Number of ways"
                                                
print db.osm.find({"type":"way"}).count()

print "Number of unique users"
                                                
print len(db.osm.distinct("created.user"))
                                                
print "Top 1 contributing user"
                                                
# Group documents by user name, sort by document count (descending), keep the first.
print [doc for doc in db.osm.aggregate([{"$group":{"_id":"$created.user",
                                    "count":{"$sum":1}}},
                         {"$sort":{"count":-1}},
                         {"$limit":1}])]
    
print "Documents possibly created by bots"

# Users whose name contains "bot", with their document counts, largest first.
bot_docs = [doc for doc in db.osm.aggregate([{'$match':{"created.user":{'$regex':'bot'}}},
                                             {'$group':{"_id":"$created.user",
                                                        "count":{'$sum':1}}},
                                             {'$sort':{"count":-1}}])]

print bot_docs

print "Total number of documents possibly created by bots"

# Same match and per-user grouping; a second $group sums the per-user counts into one total.
bot_docs = [doc for doc in db.osm.aggregate([{'$match':{"created.user":{'$regex':'bot'}}},
                                             {'$group':{"_id":"$created.user",
                                                        "count":{'$sum':1}}},
                                             {'$group':{"_id":"Total Bot Docs",
                                                        "total":{'$sum':"$count"}}}])]
print bot_docs

print "Number of different amenities"

print len(db.osm.distinct("amenity"))

# The ten most frequent values of the "amenity" field.
amenities = [doc for doc in db.osm.aggregate([{"$match": {"amenity":{"$exists":True}}},
                                              {"$group":{"_id":"$amenity",
                                                         "count":{"$sum":1}}},
                                              {"$sort":{"count":-1}},
                                              {"$limit":10}])]

print amenities



Number of documents
365568
Number of nodes
333365
Number of ways
32203
Number of unique users
299
Top 1 contributing user
[{u'count': 116189, u'_id': u'hokieengr'}]
Documents possibly created by bots
[{u'count': 32990, u'_id': u'woodpeck_fixbot'}, {u'count': 3764, u'_id': u'bot-mode'}, {u'count': 6, u'_id': u'xybot'}]
Total number of documents possibly created by bots
[{u'total': 36760, u'_id': u'Total Bot Docs'}]
Number of different amenities
48
[{u'count': 351, u'_id': u'place_of_worship'}, {u'count': 173, u'_id': u'parking'}, {u'count': 155, u'_id': u'school'}, {u'count': 74, u'_id': u'restaurant'}, {u'count': 49, u'_id': u'grave_yard'}, {u'count': 46, u'_id': u'bench'}, {u'count': 27, u'_id': u'fast_food'}, {u'count': 18, u'_id': u'library'}, {u'count': 17, u'_id': u'hospital'}, {u'count': 16, u'_id': u'fuel'}]


In [13]:
# Number of distinct values of the "service" field.
print len(db.osm.distinct("service"))

# Every "service" value with its document count, most frequent first.
print [doc for doc in db.osm.aggregate([{"$match": {"service":{"$exists":True}}},
                                        {"$group":{"_id":"$service",
                                    "count":{"$sum":1}}},
                         {"$sort":{"count":-1}}])]

8
[{u'count': 822, u'_id': u'parking_aisle'}, {u'count': 261, u'_id': u'spur'}, {u'count': 193, u'_id': u'driveway'}, {u'count': 127, u'_id': u'alley'}, {u'count': 13, u'_id': u'drive-through'}, {u'count': 9, u'_id': u'siding'}, {u'count': 5, u'_id': u'emergency_access'}, {u'count': 1, u'_id': u'Joe Collins Lane'}]
