In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"
import zipfile as zf
import requests, io
import shutil
import html5lib
from plotly.offline import iplot, iplot_mpl 
import plotly.graph_objs as go
%matplotlib inline
import bz2
import xml.etree.cElementTree as et
from collections import defaultdict
import re
import pprint
import csv
import codecs
import cerberus

In [2]:
url="https://s3.amazonaws.com/metro-extracts.mapzen.com/new-york_new-york.osm.bz2"
r=requests.get(url,timeout=1)

In [3]:
def count_tags(filename):
        tags=defaultdict()
        for event, element in et.iterparse(filename,events=("start",)):
            if element.tag not in tags.keys():
                tags[element.tag]=1
            elif element.tag in tags.keys():
                tags[element.tag]+=1
        return tags

with bz2.BZ2File(io.BytesIO(r.content)) as xml:
    tags=count_tags(xml)

In [5]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        if lower.match(element.attrib.get('k')):
            keys['lower']+=1
        elif lower_colon.match(element.attrib.get('k')):
            keys['lower_colon']+=1
        elif problemchars.match(element.attrib.get('k')):
            keys['problemchars']+=1
        else:
            keys['other']+=1
        pass
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in et.iterparse(filename):
        keys = key_type(element, keys)

    return keys

with bz2.BZ2File(io.BytesIO(r.content)) as xml:
    keys=process_map(xml)

In [6]:
def user_func(filename):
    users = set()
    for _, element in et.iterparse(filename):
        if element.tag in ["node","relation","way"]:
            user=element.attrib.get('user')
            if user not in users:
                users=users.union([user])
            else:
                pass
        
        else:
            pass

    return users
with bz2.BZ2File(io.BytesIO(r.content)) as xml:
    users=user_func(xml)
    

In [7]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")
def street_func(filename):
    street_names = defaultdict()
    for _, element in et.iterparse(filename):
        for i in element.iter("tag"):
            if i==None:
                continue
            else:
                if is_street_name(i):
                    street_name=i.attrib.get('v')
                    if street_name not in street_names:
                        street_names[street_name]=1
                    else:
                        street_names[street_name]+=1
                else:
                    continue

    return street_names
with bz2.BZ2File(io.BytesIO(r.content)) as xml:
    street_names=street_func(xml)

In [3]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = ["Airport","Alley","Avenue", "Boulevard","Bridge","Building","Circle", \
            "Close","Court","Concourse","Commerce", "Common","Commons","Crescent","Cross","Drive",\
            "Driveway","Expressway","Highway","Lane","Loop","Park","Parkway","Path",\
            "Place","Plaza""Ridge","Road","Route","Run","Slip","Square","Street","Suite",\
            "Terrace","Trace","Trail","Thruway","Turnpike","Walk","Walkway","Way"]

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def audit(osm_file):
    street_types = defaultdict(set)
    for event, elem in et.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types

with bz2.BZ2File(io.BytesIO(r.content)) as osm_file:
    street_type=audit(osm_file)

In [8]:
with open('C://Users/Zohaib/Desktop/Lectures/Udacity/Streets.txt','w') as f:
    pprint.pprint(street_type,f)

In [4]:
mapping = { "Americas\n":"Americas",
            "Ave.":"Avenue",
            "ave":"Avenue",
            "avenue":"Avenue",
            "Ave,":"Avenue",
            "Avene":"Avenue",
            "Aveneu":"Avenue",
            "Ave":"Avenue",
            "AVE.":"Avenue",
            "AVE":"Avenue",
            "AVenue":"Avenue",
            "AVENUE":"Avenue",
            "bl":"Boulevard",
            "bl":"Building",
            "Blv.":"Boulevard",
            "boulevard":"Boulevard",
            "Blvd.":"Boulevard",
            "Blvd":"Boulevard",
            "BLDG":"Building",
            "BLD":"Building",
            "Cir":"Circle",
            "Ct.":"Court",
            "Ct":"Court",
            "Ctr":"Center",
            "Crst":"Cresecent",
            "Cres":"Crescent",
            "Cmn":"Common",
            "Concrs":"Concourse",
            "Cv":"Cove",
            "drive":"Drive",
            "DRIVE":"Drive",
            "Dr.":"Drive",
            "Dr":"Drive",
            "EAST":"East",
            "E":"East",
            "Expy":"Expressway",
            "Grn":"Green",
            "HIGHWAY":"Highway",
            "Hwy":"Highway",
            "LANE":"Lane",
            "lane":"Lane",
            "Ldg":"Landing",
            "Ln":"Lane",
            "N":"North",
            "north":"North",
            "Pky":"Parkway",
            "Pkwy":"Parkway",
            "PLAZA":"Plaza",
            "PARKWAY":"Parkway",
            "Plz":"Plaza",
            "Pl":"Place",
            "Pl":"Place",
            "PLACE":"Place",
            "Pt":"Point",
            "Rd.": "Road",
            "Rd":"Road",
            "ROAD":"Road",
            "Rdg":"Ridge",
            "route":"Route",
            "route":"Route",
            "road":"Road",
            "St.": "Street",
            "St": "Street",
            "st.":"Street",
            "st ":"Street",
            "street":"Street",
            "STREET":"Street",
            "ST":"Street",
            "Ste.":"Suite",
            "Ste":"Suite",
            "STE":"Suite",
            "S":"South",
            "SOUTH":"South",
            "STREET":"Street",
            "Turnlike":"Turnpike",
            "Tunrpike":"Turnpike",
            "Tunpike":"Turnpike",
            "Tpke":"Turnpike",
            "Tirnpike":"Turnpike",
            "Ter":"Terrace",
            "Trce":"Trace",
            "WAY":"Way",
            "W.":"West",
            "W":"West",
            "west":"West",
            "WEST":"West"}
def update_name(name, mapping):
    map=''
    mapping.keys()
    for i in mapping.keys():
        name=re.sub('(?<![a-zA-Z0-9])(?<=''){}(?!\.)(?![a-zA-Z0-9\-])'.format(i),mapping[i],name)
    return name

for st_type, ways in street_type.items():
        for name in ways:
            better_name = update_name(name, mapping)

In [9]:
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

import sche
SCHEMA = sche.schema

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements
    list=['id','user','uid','version','lat','lon','timestamp','changeset']
    listw=['id','user','uid','version','timestamp','changeset']
    list1=['id','key','value','type']
    list2=['id','node_id','position']
    if element.tag == 'node':
       node_attribs.update(element.attrib)
       k=node_attribs.keys()
       for i in k:
           if i not in list:
               del node_attribs[i]
       for i in element.iter("tag"):
           temp_dict=defaultdict()
           if i==None:
               continue
           elif ':' in i.attrib.get('k'):
               temp_dict['id']=element.attrib.get('id')
               temp_dict['type']=i.attrib.get('k').partition(':')[0]
               temp_dict['key']=i.attrib.get('k').partition(':')[2]
               temp_dict['value']=i.attrib.get('v')
               tags.append(temp_dict)
           else:
               temp_dict['id']=element.attrib.get('id')
               temp_dict['key']=i.attrib.get('k')
               temp_dict['type']=default_tag_type
               temp_dict['value']=i.attrib.get('v')
               tags.append(temp_dict)
               
            
    elif element.tag == 'way':
        way_attribs.update(element.attrib)
        k=way_attribs.keys()
        for i in k:
            if i not in listw:
                del way_attribs[i]
        for a,i in enumerate(element.iter("nd")):
            temp_dict=defaultdict()
            if i==None:
                continue
            else:
                temp_dict['id']=element.attrib.get('id')
                temp_dict['node_id']=i.attrib.get('ref')
                temp_dict['position']=a
                way_nodes.append(temp_dict)
        for i in element.iter("tag"):
            temp_dict=defaultdict()
            if ':' in i.attrib.get('k'):
                temp_dict['id']=element.attrib.get('id')
                temp_dict['type']=i.attrib.get('k').partition(':')[0]
                temp_dict['key']=i.attrib.get('k').partition(':')[2]
                temp_dict['value']=i.attrib.get('v')
                tags.append(temp_dict)
            else:
                temp_dict['id']=element.attrib.get('id')
                temp_dict['key']=i.attrib.get('k')
                temp_dict['type']=default_tag_type
                temp_dict['value']=i.attrib.get('v')
                tags.append(temp_dict)

    
    if element.tag == 'node':
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

In [27]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = et.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.items())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))

class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""
    
    def writerow(self, row):
        print(row)
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-16') if type(v)=='str' else v) for k, v in row.items()
        })
        
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

In [28]:
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file,\
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file,\
         codecs.open(WAYS_PATH, 'w') as ways_file,\
        codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file,\
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


with bz2.BZ2File(io.BytesIO(r.content)) as osm_file:
    process_map(osm_file, validate=True)


{'id': 'id', 'lat': 'lat', 'lon': 'lon', 'user': 'user', 'uid': 'uid', 'version': 'version', 'changeset': 'changeset', 'timestamp': 'timestamp'}
{'id': 'id', 'key': 'key', 'value': 'value', 'type': 'type'}
{'id': 'id', 'user': 'user', 'uid': 'uid', 'version': 'version', 'changeset': 'changeset', 'timestamp': 'timestamp'}
{'id': 'id', 'node_id': 'node_id', 'position': 'position'}
{'id': 'id', 'key': 'key', 'value': 'value', 'type': 'type'}
{'id': '26769789', 'lat': '40.6995927', 'lon': '-74.1868914', 'version': '4', 'timestamp': '2016-07-25T17:17:46Z', 'changeset': '41015803', 'uid': '326503', 'user': 'wambag'}
{'id': '26769792', 'lat': '40.6962016', 'lon': '-74.1779077', 'version': '5', 'timestamp': '2016-07-12T22:13:28Z', 'changeset': '40698552', 'uid': '326503', 'user': 'wambag'}
{'id': '26769800', 'lat': '40.685869', 'lon': '-74.1908483', 'version': '5', 'timestamp': '2016-07-28T23:41:20Z', 'changeset': '41096881', 'uid': '326503', 'user': 'wambag'}
defaultdict(None, {'id': 26769800

defaultdict(None, {'id': 28437775, 'key': 'created_by', 'type': 'regular', 'value': 'Merkaartor 0.12'})
{'id': '28437792', 'lat': '40.6995968', 'lon': '-74.3609606', 'version': '5', 'timestamp': '2013-11-20T03:21:46Z', 'changeset': '19005923', 'uid': '584325', 'user': 'bhousel'}
{'id': '28437871', 'lat': '40.700287', 'lon': '-74.3540598', 'version': '5', 'timestamp': '2008-09-21T23:57:58Z', 'changeset': '678037', 'uid': '60905', 'user': 'John Peterson'}
defaultdict(None, {'id': 28437871, 'key': 'created_by', 'type': 'regular', 'value': 'Merkaartor 0.12'})
{'id': '28437885', 'lat': '40.7006369', 'lon': '-74.3514816', 'version': '4', 'timestamp': '2008-09-21T23:57:58Z', 'changeset': '678037', 'uid': '60905', 'user': 'John Peterson'}
defaultdict(None, {'id': 28437885, 'key': 'created_by', 'type': 'regular', 'value': 'Merkaartor 0.12'})
{'id': '28437897', 'lat': '40.7015805', 'lon': '-74.3483488', 'version': '4', 'timestamp': '2008-09-21T23:57:59Z', 'changeset': '678037', 'uid': '60905', '

defaultdict(None, {'id': 30979324, 'key': 'created_by', 'type': 'regular', 'value': 'JOSM'})
{'id': '30979327', 'lat': '40.6920237', 'lon': '-74.0202303', 'version': '3', 'timestamp': '2013-08-03T03:17:11Z', 'changeset': '17199164', 'uid': '632378', 'user': '3yoda'}
{'id': '30979329', 'lat': '40.6925622', 'lon': '-74.0200032', 'version': '4', 'timestamp': '2013-08-03T03:17:11Z', 'changeset': '17199164', 'uid': '632378', 'user': '3yoda'}
{'id': '30979333', 'lat': '40.6931376', 'lon': '-74.019728', 'version': '3', 'timestamp': '2013-08-03T03:17:11Z', 'changeset': '17199164', 'uid': '632378', 'user': '3yoda'}
{'id': '30979335', 'lat': '40.6932313', 'lon': '-74.019438', 'version': '3', 'timestamp': '2013-08-03T03:17:11Z', 'changeset': '17199164', 'uid': '632378', 'user': '3yoda'}
{'id': '30979337', 'lat': '40.6932325', 'lon': '-74.019268', 'version': '4', 'timestamp': '2013-08-03T03:17:11Z', 'changeset': '17199164', 'uid': '632378', 'user': '3yoda'}
{'id': '30979375', 'lat': '40.6911278', 

{'id': '39076485', 'lat': '40.7696971', 'lon': '-73.7609616', 'version': '12', 'timestamp': '2009-08-15T06:03:50Z', 'changeset': '2149593', 'uid': '14293', 'user': 'KindredCoda'}
{'id': '39076486', 'lat': '40.7668825', 'lon': '-73.7593737', 'version': '11', 'timestamp': '2009-08-15T06:04:41Z', 'changeset': '2149593', 'uid': '14293', 'user': 'KindredCoda'}
{'id': '39076490', 'lat': '40.7624294', 'lon': '-73.7570906', 'version': '4', 'timestamp': '2014-09-29T22:29:28Z', 'changeset': '25757341', 'uid': '1376118', 'user': 'ChrisZontine'}
defaultdict(None, {'id': 39076490, 'key': 'ref', 'type': 'regular', 'value': '31W'})
defaultdict(None, {'id': 39076490, 'key': 'highway', 'type': 'regular', 'value': 'motorway_junction'})
{'id': '39076495', 'lat': '40.7613962', 'lon': '-73.7557236', 'version': '3', 'timestamp': '2012-04-22T20:03:32Z', 'changeset': '11388484', 'uid': '113450', 'user': 'nfgusedautoparts'}
{'id': '39076496', 'lat': '40.7604206', 'lon': '-73.7533827', 'version': '3', 'timestam

UnicodeEncodeError: 'charmap' codec can't encode character '\u044f' in position 33: character maps to <undefined>