# PART 1: DATA WRANGLING

In [5]:
import xml.etree.cElementTree as ET

# Directory that holds the OSM extract and every file derived from it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH = "/Users/tamtrinh/Documents/workspace/DAND/P3_Wrangle_OpenStreetMap/"
OSM_FILE = PATH+"richmond_virginia.osm"  # full extract
SAMPLE_FILE = PATH+"richmond_sample.osm"  # reduced file written by the sampling cell below

# Size check that was run once and then disabled:
#print(len(list(ET.iterparse(OSM_FILE))))
# The call returned over 1.6 million. With its default events, iterparse
# yields one item per closed element, so that figure counts XML elements
# rather than lines of the file.

## Wrangling Prep: Downsize File to Sample Size
The original data file was very large (131.7 MB), which made it difficult to work with. It was therefore downsized to a sample file (66.6 MB) by taking every other top-level element. This sample file was still fairly large, so I also created a much smaller file by taking every 100th top-level element of the original file and used it to explore the data before going back to the sample file to perform the actual cleaning.

In [3]:
# Creating a Sample File:

# Sampling stride: every k-th top-level element is kept (k = 2 keeps every other one).
k = 2 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Stream the elements of an OSM XML file whose tag is listed in ``tags``.

    The file is parsed incrementally. A matching element is yielded once its
    closing tag has been read, and the root element is cleared as soon as the
    consumer asks for the next item, so a yielded element has to be fully
    used (or serialised) before the generator is advanced.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parser = iter(ET.iterparse(osm_file, events=('start', 'end')))
    root = next(parser)[1]  # the first event belongs to the root element
    for kind, node in parser:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Write the sample file: an XML declaration and an <osm> root wrapping
# every k-th top-level element of OSM_FILE (k is set earlier in this cell).
# NOTE(review): str literals are written to a file opened in binary mode,
# which relies on Python 2 string semantics.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    
    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            
            # Serialise the element while it is still intact; get_element
            # clears the root once the loop asks for the next element.
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [44]:
# Second, much smaller sample file.
SAMPLE_FILE_100 = PATH+"richmond_sample_100.osm"

# Rebinds the notebook-wide k that the previous sampling cell set to 2.
k = 100 # Parameter: take every k-th top level element

# Same writing procedure as for SAMPLE_FILE. The elements are read from
# OSM_FILE (the full extract), not from SAMPLE_FILE.
with open(SAMPLE_FILE_100, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    
    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

## Wrangling Prep: Data Overview

In [6]:
# Count Element Tags:
# Count through element tags to understand an overview of the data

import xml.etree.cElementTree as ET
import pprint


def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: Path of (or open file object for) the XML document.

    Returns:
        dict mapping every tag name found in the document to the number of
        elements carrying that tag.
    """
    tags = {}
    # With its default events, iterparse reports each element once, when its
    # closing tag has been read.
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed; dropping the element's content keeps
        # memory use down on large files.
        elem.clear()
    return tags

In [7]:
count_tags(SAMPLE_FILE)

{'member': 1540,
 'nd': 350226,
 'node': 300985,
 'osm': 1,
 'relation': 231,
 'tag': 159299,
 'way': 31070}

In [9]:
# Find the number of unique users who contributed to the map

def get_unique_users(filename):
    """Return the number of distinct 'uid' attribute values in an XML file.

    Elements without a 'uid' attribute, or with an empty one, are ignored.
    """
    uids = set()
    for _, node in ET.iterparse(filename):
        uid = node.get('uid')
        if not uid:
            continue
        uids.add(uid)

    return len(uids)

In [10]:
get_unique_users(SAMPLE_FILE)

357

## Wrangling Problem 1: Fixing Keys with a Space to an Underscore
A search through the keys of the elements revealed that most keys were well formed, consisting only of lower-case letters, optionally with a colon. However, a couple of the keys contained a problematic character (a space), which was replaced with an underscore.

In [29]:
# Find out the tag types

import re

# Key made up only of lowercase letters and underscores (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Two such parts separated by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character treated as problematic in a key: listed punctuation, space, tab, CR or LF.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the key of a <tag> element and bump the matching counter.

    Args:
        element: Parsed XML element; anything but a 'tag' element is ignored.
        keys: dict holding the counters 'problemchars', 'lower_colon',
            'lower' and 'other'. It is updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != 'tag':
        return keys

    name = element.attrib['k']
    # The first pattern that matches decides the category, so order matters.
    checks = (
        ('problemchars', problemchars),
        ('lower_colon', lower_colon),
        ('lower', lower),
    )
    category = 'other'
    for label, pattern in checks:
        if pattern.search(name):
            category = label
            break
    keys[category] += 1
    return keys


def process_map(filename):
    """Tally the key categories of all <tag> elements in an XML file.

    Returns:
        dict with the counters 'lower', 'lower_colon', 'problemchars' and
        'other', as filled in by key_type.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _, node in ET.iterparse(filename):
        counts = key_type(node, counts)

    return counts


In [30]:
process_map(SAMPLE_FILE)

{'lower': 70776, 'lower_colon': 85141, 'other': 3380, 'problemchars': 2}

In [33]:
def fix_problemchars(osmfile):
    """Preview the key clean-up on a file without changing anything.

    For every <tag> element whose key contains a problematic character, the
    key is printed before and after all such characters are replaced with
    underscores.

    Args:
        osmfile: Path of (or open file object for) the OSM XML document.
    """
    for _, element in ET.iterparse(osmfile):
        if element.tag != 'tag':
            continue
        key = element.attrib['k']
        if problemchars.search(key):
            # sub() replaces every problematic character; replacing only the
            # first matched character would leave other kinds untouched.
            fixed = problemchars.sub('_', key)
            # Single-argument print calls behave the same on Python 2 and 3.
            print('BEFORE')
            print(key)
            print('AFTER')
            print(fixed)


In [34]:
fix_problemchars(SAMPLE_FILE)

BEFORE
commit message
AFTER
commit_message
BEFORE
commit message
AFTER
commit_message


In [61]:
# modified function to be called in Main Function
def el_fix_problemchars(element):
    """Replace problematic characters in the keys of an element's <tag> children.

    Every character matched by ``problemchars`` in a tag key is replaced
    with an underscore (e.g. "commit message" -> "commit_message").

    Args:
        element: Parsed XML element whose direct children are inspected.

    Returns:
        The same element, modified in place.
    """
    for child in element:
        if child.tag != 'tag':
            continue
        key = child.attrib['k']
        if problemchars.search(key):
            # sub() handles keys that mix several problematic characters,
            # not just repeats of the first one found.
            child.attrib['k'] = problemchars.sub('_', key)

    return element
          


## Wrangling Problem 2: Update Street Types
To make the street types more consistent, the abbreviated street names were updated to the full spelling (e.g., Ave to Avenue and Tnpk to Turnpike). 

In [46]:
from collections import defaultdict

# Matches the final run of non-whitespace characters of a street name
# (optionally ending in a period), i.e. its last word such as "St" or "Ave.".
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already spelled out; the audit does not report these.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]


def audit_street_type(street_types, street_name):
    """Record a street name under its street type when that type is unexpected.

    Args:
        street_types: defaultdict(set) keyed by street type; updated in place.
        street_name: Street name to inspect, e.g. "Brook Rd".
    """
    match = street_type_re.search(street_name)
    if not match:
        return  # no trailing token to classify
    suffix = match.group()  # the last token, e.g. "Rd"
    if suffix in expected:
        return  # already a full, expected street type
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether a <tag> element stores a street address.

    Args:
        elem: Element exposing an ``attrib`` mapping that has a 'k' entry.

    Returns:
        True when the 'k' attribute equals "addr:street", otherwise False.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect the street names whose street type is not in ``expected``.

    Args:
        osmfile: Path of the OSM XML file to audit.

    Returns:
        defaultdict(set) mapping each unexpected street type (the last token
        of the name) to the set of street names ending with it.
    """
    street_types = defaultdict(set)
    # 'with' closes the file even when parsing raises.
    with open(osmfile, "rb") as osm_file:
        # Listen for 'end' events: on a 'start' event iterparse does not
        # guarantee that the element's children have been parsed yet, so
        # <tag> children could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


In [48]:
street_types = audit(SAMPLE_FILE)

In [49]:
# Abbreviated street type -> full spelling, used when updating street names.
mapping = { "Ave": "Avenue",
            "Dr": "Drive",
            "Pkwy": "Parkway",
            "Rd": "Road",
            "St": "Street",
            "Tnpk": "Turnpike"
            }

def update_name(name, mapping):
    """Expand abbreviated street types in a street name.

    Args:
        name: Street name, e.g. "Brook Rd".
        mapping: dict from abbreviation (e.g. "Rd") to full spelling
            (e.g. "Road"). Abbreviations are expected to start and end with
            a letter, digit or underscore.

    Returns:
        The name with every abbreviation that occurs as a whole word replaced
        by its full spelling. Names without such a word come back unchanged.
    """
    for abbreviation, full_spelling in mapping.items():
        # \b on both sides restricts the match to whole words, so "St" no
        # longer rewrites "Street" into "Streetreet" or "Stony" into
        # "Streetony". A period that closes the abbreviation ("St.") is
        # consumed together with it.
        pattern = r'\b' + re.escape(abbreviation) + r'\b(?:\.(?=\s|$))?'
        # A function replacement inserts the full spelling literally.
        name = re.sub(pattern, lambda _match, text=full_spelling: text, name)

    return name

In [87]:
def update_street_types(osmfile):
    """Show the audited street names next to their updated spelling.

    Returns:
        The 4-tuple ("old: ", old_names, "updated: ", updated_names), where
        both lists follow the same order and updated_names holds the result
        of update_name for each old name.
    """
    audited = audit(osmfile)
    before = [street for group in audited.keys() for street in audited[group]]
    after = [update_name(street, mapping) for street in before]
    return "old: ", before, "updated: ", after

In [93]:
update_street_types(SAMPLE_FILE)

('old: ',
 ['Shockoe Slip',
  'Walnut Alley',
  'S. Addison St',
  'Coalfield Rd',
  'Brook Rd',
  'Whittall Way',
  'WC Commons Way',
  'West Broad Village',
  'Midlothian Turnpike',
  'Augusta Ave',
  'Park Ave',
  'Brushwood Ave',
  'Breezy Point Circle',
  'Chamberlayne Pkwy',
  'Braceland Dr',
  'Midlothian Tnpk',
  'Washington Highway'],
 'updated: ',
 ['Shockoe Slip',
  'Walnut Alley',
  'S. Addison Street',
  'Coalfield Road',
  'Brook Road',
  'Whittall Way',
  'WC Commons Way',
  'West Broad Village',
  'Midlothian Turnpike',
  'Augusta Avenue',
  'Park Avenue',
  'Brushwood Avenue',
  'Breezy Point Circle',
  'Chamberlayne Parkway',
  'Braceland Drive',
  'Midlothian Turnpike',
  'Washington Highway'])

In [69]:
# modified function to be called in Main Function 
def el_update_name(element, mapping):
    """Expand abbreviated street types in the addr:street tags of a node or way.

    Args:
        element: Parsed XML element. Only 'node' and 'way' elements are
            processed; any other element is returned untouched.
        mapping: dict from abbreviation (e.g. "Rd") to full spelling
            (e.g. "Road"). Abbreviations are expected to start and end with
            a letter, digit or underscore.

    Returns:
        The same element, with the 'v' attribute of its addr:street tags
        updated in place.
    """
    if element.tag not in ("node", "way"):
        return element

    for tag in element.iter("tag"):
        if tag.attrib.get('k') != "addr:street":
            continue
        street = tag.attrib.get('v')
        if street is None:
            continue  # nothing to update on a tag without a value
        for abbreviation, full_spelling in mapping.items():
            # Whole-word match only: a plain substring replace would turn
            # "Main Street" into "Main Streetreet" and "Park Avenue" into
            # "Park Avenuenue". A period closing the abbreviation is consumed.
            pattern = r'\b' + re.escape(abbreviation) + r'\b(?:\.(?=\s|$))?'
            # A function replacement inserts the full spelling literally.
            street = re.sub(pattern, lambda _match, text=full_spelling: text, street)
        tag.attrib['v'] = street

    return element

## Wrangling Problem 3: Standardizing Phone Numbers
Certain locations, such as businesses and buildings, had phone numbers listed. However, the phone numbers came in various formats (e.g., +1 804 123 4567 and (804) 123-4567). To make the phone numbers more standardized, special characters were removed and the spaces separating groups of digits were changed to hyphens (e.g., 1-804-123-4567).

In [72]:
# Characters stripped from phone numbers: listed punctuation (including +, parentheses and the period), tab, CR and LF.
symbols_re = re.compile(r'[=\+\(\)/&<>;\'"\?%#$@\,\.\t\r\n]') # looks for special characters
# A single space character; each one is turned into a hyphen.
space_re = re.compile(r'[ ]') # looks for space

In [73]:
# Update phone number to standardized form (eg. (804) 123-4567 to 804-123-4567)

def update_phone(number):
    """Standardise a phone number to hyphen-separated groups of digits.

    Numbers written with digits and separators only are regrouped by their
    digits, so "(804) 123-4567", "804.123.4567", "(804)123-4567" and
    "8041234567" all become "804-123-4567", and an 11-digit number with a
    leading 1 becomes "1-804-123-4567". Anything else (e.g. vanity numbers
    such as "(804) 270-SPIN") is cleaned the simpler way: special characters
    are removed and spaces are turned into hyphens.

    Args:
        number: Phone number as found in the data.

    Returns:
        The standardised phone number string.
    """
    digits = re.sub(r'\D', '', number)
    has_letters = re.search(r'[A-Za-z]', number) is not None
    if not has_letters:
        if len(digits) == 10:
            return '-'.join((digits[:3], digits[3:6], digits[6:]))
        if len(digits) == 11 and digits.startswith('1'):
            return '-'.join((digits[0], digits[1:4], digits[4:7], digits[7:]))

    # Not a plain 10/11-digit number: remove symbols, hyphenate spaces.
    updated_number = re.sub(symbols_re, '', number)
    updated_number = re.sub(space_re, '-', updated_number)
    return updated_number

In [74]:
def process_phone(osmfile):
    """Preview phone-number standardisation for the nodes of an OSM file.

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        The 4-tuple ('Before: ', originals, 'After: ', updated), where both
        lists follow document order and ``updated`` holds the result of
        update_phone for each original 'phone' tag value.
    """
    num_before = []
    num_after = []
    # 'with' closes the file even when parsing raises.
    with open(osmfile, "rb") as osm_file:
        # Listen for 'end' events: on a 'start' event iterparse does not
        # guarantee that the node's <tag> children have been parsed yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag != "node":
                continue
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == 'phone':
                    num_before.append(tag.attrib['v'])
                    num_after.append(update_phone(tag.attrib['v']))
    return 'Before: ', num_before, 'After: ', num_after

In [157]:
process_phone(SAMPLE_FILE)

('Before: ',
 ['804 276-3193',
  '(804) 864-2992',
  '(804) 643-8818',
  '(804) 644-2466',
  '(804) 644-5465',
  '(804) 643-8080',
  '804-648-3734',
  '(804) 343-3647',
  '(804) 367-2323',
  '804-323-2550',
  '804-796-4242',
  '804-323-2600',
  '+1 804-359-6688',
  '804.644.4411',
  '(804) 232-3446',
  '(804)355-6805',
  '8043677909',
  '(804) 550-0819',
  '(804) 554-5411',
  '(804) 780-2537',
  '804-346-2100',
  '(804) 270-SPIN',
  '(804) 747-1030',
  '(804) 965-0880',
  '804.290.4407',
  '(804) 360-2006',
  '(804) 360-3111',
  '(804) 364-2707',
  '(804) 360-5909',
  '(804) 447-4763',
  '(804) 355-0146',
  '(804) 342-6006',
  '804-340-5563',
  '(804) 359-8500',
  '804-257-5510',
  '+1 804-644-9091',
  '804-864-5363',
  '804-217-9883',
  '(804) 864-5336',
  '(804) 359-0603',
  '+1 804-649-1042',
  '(804) 353-0106',
  '(804) 308-8576',
  '(804) 254-3838',
  '+1 804-342-1272',
  '+1 (804) 204-1755',
  '+1 804-342-8990',
  '+1 804-353-8365',
  '+1 804-353-8518',
  '(804) 359-6646',
  '+1 

In [75]:
# modified function to be called in Main Function
def el_process_phone(element):
    """Standardise the 'phone' tag values of a node element in place.

    Elements other than 'node' are returned untouched.

    Returns:
        The same element that was passed in.
    """
    if element.tag != "node":
        return element

    phone_tags = (tag for tag in element.iter("tag") if tag.attrib['k'] == 'phone')
    for tag in phone_tags:
        tag.attrib['v'] = update_phone(tag.attrib['v'])

    return element

## Wrangling Wrap-up: Convert XML to CSV
The data has been audited and cleaned and is now ready to be converted to CSV format. 

In [24]:
# Validation schema describing the shaped output for one element:
# one dict per node/way plus lists of tag rows and way-node rows.
schema_1= {
    # Attributes of a single node
    'node': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'lat': {'required': True, 'type': 'float', 'coerce': float},
            'lon': {'required': True, 'type': 'float', 'coerce': float},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    # One row per <tag> child of a node
    'node_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    },
    # Attributes of a single way
    'way': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    # One row per <nd> child of a way
    'way_nodes': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'node_id': {'required': True, 'type': 'integer', 'coerce': int},
                'position': {'required': True, 'type': 'integer', 'coerce': int}
            }
        }
    },
    # One row per <tag> child of a way
    'way_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                # 'required' used to be listed twice in this rule
                'type': {'required': True, 'type': 'string'}
            }
        }
    }
}


In [78]:
import csv
import codecs
import re
import xml.etree.cElementTree as ET
import cerberus
import P3_Schema # P3_Schema is a local file, it is being imported/accesed in this file

# Input locations (same values as in the first cell of the notebook).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH = "/Users/tamtrinh/Documents/workspace/DAND/P3_Wrangle_OpenStreetMap/"
OSM_FILE = PATH+"richmond_virginia.osm"  
SAMPLE_FILE = PATH+"richmond_sample.osm"

# Output CSV files written by the main function, one per table.
NODES_PATH = PATH+"nodes_t.csv" # these are the different csv files that will be created/written
NODE_TAGS_PATH = PATH+"nodes_tags_t.csv"
WAYS_PATH = PATH+"ways_t.csv"
WAY_NODES_PATH = PATH+"ways_nodes_t.csv"
WAY_TAGS_PATH = PATH+"ways_tags_t.csv"

# Key that starts with "<lowercase/underscore part>:<lowercase/underscore part>".
# There is no end anchor, so anything may follow the second part.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character treated as problematic in a key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = P3_Schema.schema # could also use the schema pasted above by assigning "schema_1" instead of "P3_Schema.schema"
# P3_Schema is a local module; "schema" is a variable defined in it.
# The shaped data should follow the structure described by the schema;
# the output of shape_element is validated against it before being written to csv.
                     
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# shapes elements into the structure of the schema to save to csv later, 
# aka sort the elements into the corresponding dicts and lists
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: Parsed 'node' or 'way' element.
        node_attr_fields: Attribute names copied from a node.
        way_attr_fields: Attribute names copied from a way.
        problem_chars: Compiled pattern; a <tag> child whose key matches it
            is left out of the result.
        default_tag_type: Tag type used for keys without a "type:" prefix.

    Returns:
        For a node: {'node': attribs, 'node_tags': tags}.
        For a way: {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}.
        For any other element: None.
        Every tag is a dict with 'id', 'key', 'value' and 'type'; every way
        node is a dict with 'id', 'node_id' and 'position' (0-based order of
        the <nd> child within the way).

    Raises:
        KeyError: if the element lacks one of the requested attributes.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        tags = _shape_tags(element, default_tag_type, problem_chars)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        way_nodes = []
        for child in element:
            if child.tag == 'nd':
                way_nodes.append({
                    'id': way_attribs['id'],
                    'node_id': child.attrib['ref'],
                    'position': len(way_nodes),
                })
        tags = _shape_tags(element, default_tag_type, problem_chars)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


def _shape_tags(element, default_tag_type, problem_chars):
    """Build the tag rows for the <tag> children of a node or way."""
    tags = []
    for child in element:
        if child.tag != 'tag':
            continue
        key = child.attrib['k']
        if problem_chars.search(key):
            # Skip the tag entirely: appending it without a 'key' would
            # produce an incomplete row.
            continue
        tag = {
            'id': element.attrib['id'],
            'value': child.attrib['v'],
            'type': default_tag_type,
        }
        if LOWER_COLON.search(key):
            # Split at the first colon only, for nodes and ways alike:
            # "addr:street:name" -> type "addr", key "street:name".
            tag['type'], tag['key'] = key.split(':', 1)
        else:
            tag['key'] = key
        tags.append(tag)
    return tags
                


In [79]:
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element of the file whose tag is in ``tags``.

    Elements are produced once their closing tag has been parsed. The root
    element is cleared when the consumer requests the next item, so each
    yielded element has to be processed before advancing the generator.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    root = next(events)[1]  # the first event belongs to the root element
    for event, elem in events:
        finished = event == 'end'
        if finished and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Check a shaped element against the schema; raise ValidationError on mismatch.

    Args:
        element: Shaped element as returned by shape_element.
        validator: Object whose ``validate(element, schema)`` returns True on
            success and which exposes the failures through ``errors``.
        schema: Schema to validate against.
    """
    if validator.validate(element, schema) is True:
        return

    # Only the first failing field is reported.
    field, errors = next(validator.errors.iteritems())
    details = []
    for name, problem in errors.iteritems():
        text = problem if isinstance(problem, str) else ", ".join(problem)
        details.append("{0}: {1}".format(name, text))
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise cerberus.ValidationError(template.format(field, "\n".join(details)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing."""

    def writerow(self, row):
        """Write one row dict, encoding every unicode value as UTF-8 bytes."""
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write each row dict in ``rows`` through writerow."""
        for entry in rows:
            self.writerow(entry)



In [80]:
# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Clean every node and way of an OSM file and write them to the csv files.

    Args:
        file_in: OSM XML file to process.
        validate: When True, each shaped element is checked against the
            schema before it is written.
    """
    # Open the five output files; they are closed when the block is left.
    with codecs.open(NODES_PATH, 'w') as nodes_out, \
         codecs.open(NODE_TAGS_PATH, 'w') as node_tags_out, \
         codecs.open(WAYS_PATH, 'w') as ways_out, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_out, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_out:

        node_csv = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tag_csv = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        way_csv = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_node_csv = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tag_csv = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Column names go first in every file.
        for writer in (node_csv, node_tag_csv, way_csv, way_node_csv, way_tag_csv):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            # Cleaning steps, in order: tag keys, street types, phone numbers.
            cleaned = el_fix_problemchars(element)
            cleaned = el_update_name(cleaned, mapping)
            cleaned = el_process_phone(cleaned)

            shaped = shape_element(cleaned)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                node_csv.writerow(shaped['node'])
                node_tag_csv.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                way_csv.writerow(shaped['way'])
                way_node_csv.writerows(shaped['way_nodes'])
                way_tag_csv.writerows(shaped['way_tags'])



In [81]:
# Run the cleaning and csv export on the sample file with schema validation on.
# process_map has no return statement, so dict_list ends up as None.
dict_list = process_map(SAMPLE_FILE, validate=True)

In [17]:
# Test write csv: scratch check that is not part of the export itself.
# Writes a header and one hand-made node row to test_1.csv using the same
# writer class and field list as the node export.
node_attribs = {'id':'1', 'lat':'2', 'lon':'3', 'user':'4', 'uid':'5', 'version':'6', 'changeset':'7', 'timestamp':'8'}
with codecs.open(PATH+'test_1.csv', 'w') as nodes_file:
    nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
    nodes_writer.writeheader()
    nodes_writer.writerow(node_attribs)