# P3: Cleaning of the OpenStreetMap Data

In [None]:
# P3-Cleaning-of-the-OpenStreet-Map-Data
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow
import pprint
import re
import pprint
import csv
import codecs
import cerberus
import schema

# Names of the CSV files produced by the conversion, one per output table.
NODES_PATH = 'nodes.csv'
NODE_TAGS_PATH = 'nodes_tags.csv'
WAYS_PATH = 'ways.csv'
WAY_NODES_PATH = 'ways_nodes.csv'
WAY_TAGS_PATH = 'ways_tags.csv'

# Validation schema, taken from the project-local "schema.py" module imported
# above. It is the default schema argument of validate_element().
SCHEMA = schema.schema

# Regular expressions used while auditing and cleaning tag keys and values.

# Trailing run of non-space characters (starting at a word boundary) at the
# end of a string, e.g. "St." in "Main St.".
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Whole string consists only of lowercase letters and underscores (may be empty).
lOWER = re.compile(r'^([a-z]|_)*$')
# Conventionally cased alias, consistent with LOWER_COLON / PROBLEMCHARS.
# The original ``lOWER`` spelling is kept so existing references keep working.
LOWER = lOWER
# String starts with "<lowercase/underscore>:<lowercase/underscore>", e.g. "addr:street".
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# A single character that is treated as problematic inside a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Five digits, optionally followed by "-" and four more digits.
postcode_re = re.compile(r'^\d{5}(-\d{4})?$')

# Column order of each CSV file. Keep every list in the same order as the
# columns of the matching table in the SQL schema.
NODE_FIELDS = [
    'id',
    'lat',
    'lon',
    'user',
    'uid',
    'version',
    'changeset',
    'timestamp',
]
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = [
    'id',
    'user',
    'uid',
    'version',
    'changeset',
    'timestamp',
]
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# ================================================== #
#               Cleaning Functions                   #
# ================================================== #
# Street types written out in full.
# NOTE(review): this list is not referenced by the cleaning functions in this
# section; presumably it was used while auditing the data.
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
    "Freeway",
    "Loop",
    "Park",
    "Way",
    "Plaza",
    "Speedway",
]

# Single words (abbreviations or misspellings) and the word that replaces
# them; update_street() applies this table word by word.
mapping_streetType = {
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Dr": "Drive",
    "Fwy": "Freeway",
    "Plaze": "Plaza",
    "Rd": "Road",
    "St": "Street",
    "Stree": "Street",
    "Ste": "Suite",
    "Hwy": "Highway",
    "Pkwy": "Parkway",
}

# Complete street values that are replaced wholesale; update_street() checks
# this table before it expands abbreviations.
mapping_streetName = {
    "Beechnut": "Beechnut Street",
    "Blossom": "Blossom Street",
    "Driscoll": "Driscoll Street",
    "Durham": "Durham Drive",
    "San Felipe": "San Felipe Street",
    "Graustark": "Graustark Street",
    "Hidalgo": "Hidalgo Street",
    "Hillcroft": "Hillcroft Avenue",
    "Larchmont": "Larchmont Road",
    "Maroneal": "Maroneal Street",
    "Richmond": "Richmond Avenue",
    "Riverway": "Riverway Drive",
    "Chimney Rock": "Chimney Rock Road",
    "Pine Valley": "Pine Valley Drive",
    "Welford": "Welford Drive",
    "Westheimer": "Westheimer Road",
    "Westhimer": "Westheimer Road",
    "Meyerland Plaza, Houston, TX 77096": "Beechnut Street",
    "77027": "Weslayan Street",
    "Southwest Freeway 59": "Southwest Freeway",
    "Meyerland Plaza Mall": "Beechnut Street",
}

# Malformed postcode values and the postcode that replaces them;
# consulted by update_postcode().
mapping_postcode = {
    "Weslayan Street": "77027",
    "7-": "77478",
}
def get_key_tagType(key, default):
    """Split a raw tag key into its tag type and its key.

    Args:
        key (string): the unprocessed key, e.g. "addr:street"
        default (string): tag type to use when the key contains no colon

    Returns:
        tuple: ``(tag_type, key)``. When the raw key contains a colon, the
        text before the first colon is the tag type and everything after it
        is the key; otherwise the result is ``(default, key)``.
        ``(None, None)`` is returned when the raw key begins with a
        character matched by PROBLEMCHARS.
    """
    # NOTE(review): match() only tests the first character of the key; a
    # search() would reject problem characters anywhere in it. Confirm the
    # intent before changing, since callers must cope with (None, None).
    if PROBLEMCHARS.match(key):
        return None, None
    tag_type, colon, remainder = key.partition(':')
    if not colon:
        return default, key
    return tag_type, remainder

def update_street(name, mapping_streetName, mapping_streetType):
    """Return a cleaned-up version of a street name.

    Args:
        name (string): the unprocessed street name
        mapping_streetName (dictionary): complete street values mapped to
            their corrected street name (typos and other one-off cases)
        mapping_streetType (dictionary): single words (abbreviations) mapped
            to the word that replaces them

    Returns:
        string: the replacement from ``mapping_streetName`` when ``name`` is
        one of its keys; otherwise ``name`` with every whitespace-separated
        word found in ``mapping_streetType`` replaced. In the second case
        runs of whitespace are collapsed to single spaces.
    """
    # One-off corrections replace the whole value and are not processed further.
    if name in mapping_streetName:
        return mapping_streetName[name]
    # Expand abbreviations word by word; unknown words are kept as they are.
    return " ".join(mapping_streetType.get(word, word) for word in name.split())

def update_postcode(postcode, mapping_postcode):
    """
    Return the cleaned postal code for a raw tag value.

    Args:
        postcode (string): the unprocessed postal code
        mapping_postcode (dictionary): malformed postal codes mapped to
            their corrected value
    Returns:
        string: ``postcode`` unchanged when it matches ``postcode_re`` or has
        no entry in ``mapping_postcode``; otherwise the mapped correction
    """
    # Well-formed codes pass straight through.
    if postcode_re.match(postcode):
        return postcode
    # Malformed codes are corrected only when a replacement is known.
    return mapping_postcode.get(postcode, postcode)
# ================================================== #
#               Formatting Data Function             #
# ================================================== #
def _shape_tags(element, attribs, default_tag_type):
    """Build the cleaned secondary-tag rows for one node or way element."""
    rows = []
    for tag in element.iter("tag"):
        tag_type, key = get_key_tagType(tag.attrib['k'], default_tag_type)
        if key is None:
            # get_key_tagType() rejected the key. Skip the tag instead of
            # failing on the slicing/prefix checks below.
            continue
        value = tag.attrib['v']
        # Cleaning street names
        if key.startswith('street'):
            value = update_street(value, mapping_streetName, mapping_streetType)
        # Cleaning post codes
        if key.startswith('postcode'):
            value = update_postcode(value, mapping_postcode)
        rows.append({'id': attribs['id'],
                     'key': key,
                     'value': value,
                     'type': tag_type})
    return rows


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape a node or way XML element into a Python dict.

    Args:
        element: parsed XML element; only 'node' and 'way' tags are shaped
        node_attr_fields (list): attribute names copied from a node element
        way_attr_fields (list): attribute names copied from a way element
        problem_chars: accepted for interface compatibility.
            NOTE(review): it is not used here; key filtering is done by
            get_key_tagType() with the module-level PROBLEMCHARS.
        default_tag_type (string): tag type for keys without a colon

    Returns:
        dict or None:
            ``{'node': attribs, 'node_tags': [...]}`` for a node,
            ``{'way': attribs, 'way_nodes': [...], 'way_tags': [...]}`` for a
            way, ``None`` for any other element. Tags whose key is rejected
            by get_key_tagType() are left out of the tag lists.

    Raises:
        KeyError: when the element lacks one of the requested attributes,
            or a child lacks its 'k', 'v' or 'ref' attribute.
    """
    if element.tag == 'node':
        node_attribs = {attr: element.attrib[attr] for attr in node_attr_fields}
        return {'node': node_attribs,
                'node_tags': _shape_tags(element, node_attribs, default_tag_type)}

    if element.tag == 'way':
        way_attribs = {attr: element.attrib[attr] for attr in way_attr_fields}
        # 'position' is the zero-based order of the <nd> reference within the way.
        way_nodes = [{'id': way_attribs['id'],
                      'node_id': nd.attrib['ref'],
                      'position': position}
                     for position, nd in enumerate(element.iter("nd"))]
        return {'way': way_attribs,
                'way_nodes': way_nodes,
                'way_tags': _shape_tags(element, way_attribs, default_tag_type)}

    return None

# ================================================== #
#               Helper Functions                     #
# ================================================== #


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate every fully parsed element of osm_file whose tag is in tags."""

    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event belongs to the document root; keep a handle on it so
    # it can be cleared after each element has been handed out.
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception if element does not match schema.

    Args:
        element: the shaped element to check
        validator: object exposing ``validate(element, schema)`` and an
            ``errors`` mapping (process_map passes a cerberus.Validator)
        schema: schema handed to ``validator.validate``

    Raises:
        Exception: when validation does not return True; the message names
            the first field in ``validator.errors`` and its errors.
    """
    if validator.validate(element, schema) is True:
        return
    # Only the first offending field is reported. items() is used instead of
    # the Python 2-only iteritems() so this also runs on Python 3.
    field, errors = next(iter(validator.errors.items()))
    message_string = "\nElement of type '{0}' has the following errors:\n{1}"
    error_string = pprint.pformat(errors)

    raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    On Python 2 the csv module works on byte strings, so unicode values are
    encoded as UTF-8 before a row is written. On Python 3 the csv module
    accepts text directly and rows are passed through unchanged.
    """

    def writerow(self, row):
        """Write one row dict, encoding unicode values first where needed."""
        if str is bytes:  # true on Python 2 only
            text_type = type(u'')
            row = {
                k: (v.encode('utf-8') if isinstance(v, text_type) else v)
                for k, v in row.items()
            }
        super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write every row through writerow() so each one gets encoded."""
        for row in rows:
            self.writerow(row)
# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Convert the OSM XML file file_in into the five CSV files.

    Each node and way element is shaped with shape_element(), checked with
    validate_element() when validate is True, and written to the CSV files
    named by the *_PATH constants.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_out, \
            codecs.open(NODE_TAGS_PATH, 'w') as node_tags_out, \
            codecs.open(WAYS_PATH, 'w') as ways_out, \
            codecs.open(WAY_NODES_PATH, 'w') as way_nodes_out, \
            codecs.open(WAY_TAGS_PATH, 'w') as way_tags_out:

        node_csv = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tag_csv = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        way_csv = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_node_csv = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tag_csv = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Every file starts with its header row.
        for csv_writer in (node_csv, node_tag_csv, way_csv, way_node_csv, way_tag_csv):
            csv_writer.writeheader()

        checker = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, checker)

            kind = element.tag
            if kind == 'node':
                node_csv.writerow(shaped['node'])
                node_tag_csv.writerows(shaped['node_tags'])
            elif kind == 'way':
                way_csv.writerow(shaped['way'])
                way_node_csv.writerows(shaped['way_nodes'])
                way_tag_csv.writerows(shaped['way_tags'])

if __name__ == '__main__':
    # Run the small sample with validation switched on first; once that
    # passes, convert the full OSM extract without validation.
    # process_map('sample.osm', validate=True)
    process_map('HoustonSW.osm', validate=False)
    