#P3 Data Wrangling with MongoDB

###Map Area: San Francisco, CA

#0. Procedure
###Count types of tags in the dataset

Count how often each XML tag occurs, to get a rough sense of the data.

In [10]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

In [11]:
def count_tags(filename):
    """Tally how often each element tag occurs in an XML file.

    filename is passed straight to ET.iterparse.
    Returns a dict mapping tag name -> number of occurrences.
    """
    tally = {}
    for _, node in ET.iterparse(filename):
        # dict.get supplies the starting count for a tag seen for the first time.
        tally[node.tag] = tally.get(node.tag, 0) + 1
    return tally

In [12]:
# Tag histogram for the San Francisco extract.
print(count_tags('san-francisco.osm'))

{'node': 1410191, 'nd': 1677325, 'bounds': 1, 'member': 26246, 'tag': 949435, 'relation': 1687, 'way': 154315, 'osm': 1}


###Count number of tag types

In [17]:
# Key made up only of lowercase ASCII letters and underscores (the empty string
# matches as well).
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one colon somewhere in the key, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# At least one character from the listed punctuation / whitespace set.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a <tag> element and update the tallies.

    Elements other than <tag> leave keys untouched. The first matching
    category wins, in the order lower, lower_colon, problemchars, other. For
    the last two categories the individual key is also counted in
    keys['problemchars_dict'] / keys['other_dict'].

    Returns the same keys dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    name = element.get('k')
    if lower.search(name):
        keys['lower'] += 1
        return keys
    if lower_colon.search(name):
        keys['lower_colon'] += 1
        return keys

    if problemchars.search(name):
        counter_key, detail_key = 'problemchars', 'problemchars_dict'
    else:
        counter_key, detail_key = 'other', 'other_dict'

    keys[counter_key] += 1
    per_key = keys[detail_key]
    per_key[name] = per_key.get(name, 0) + 1
    return keys


def process_map(filename):
    """Run key_type over every element of an XML file and return the tallies.

    The result holds one counter per key category plus, for the
    "problemchars" and "other" categories, a dict counting each key.
    """
    tallies = {
        "lower": 0,
        "lower_colon": 0,
        "problemchars": 0,
        "problemchars_dict": {},
        "other": 0,
        "other_dict": {},
    }
    for _event, node in ET.iterparse(filename):
        tallies = key_type(node, tallies)
    return tallies

In [19]:
# Classify every tag key of the extract; the cells below reuse the result.
tag_types_dict = process_map('san-francisco.osm')

In [20]:
# All counters together with the per-key detail dictionaries.
print(tag_types_dict)

{'lower': 445870, 'other_dict': {'seamark:daymark:construction': 47, 'addr:street:source': 2, 'gnis:ST_num': 226, 'alt_name2': 1, 'seamark:buoy_special_purpose:status': 5, 'tiger:COUNTYFP': 1, 'seamark:fog_signal:group': 13, 'seamark:radar_transponder:category': 1, 'tiger:zip_right_3': 22, 'seamark:buoy_safe_water:colour': 1, 'tiger:zip_right_5': 1, 'tiger:zip_right_4': 7, 'seamark:daymark:source_date': 57, 'seamark:buoy_lateral:name': 7, 'tiger:zip_right_8': 1, 'service:bicycle:rental': 2, 'seamark:buoy_special_purpose:category': 6, 'tiger:zip_left_7': 1, 'tiger:zip_left_6': 1, 'tiger:zip_left_5': 1, 'tiger:zip_left_4': 16, 'tiger:zip_left_3': 50, 'seamark:platform:source': 1, 'tiger:zip_left_1': 642, 'seamark:daymark:shape': 57, 'seamark:buoy_lateral:system': 9, 'seamark:landmark:source_date': 1, 'parking:condition:both': 16, 'seamark:light:information': 1, 'seamark:beacon_special_purpose:status': 30, 'addr:1:housenumber': 1, 'seamark:fog_signal:sequence': 10, 'Building': 1, 'FG:area

In [21]:
# Keys that fit none of the three patterns. Most of them look like seamark
# (nautical chart standard) keys, so they can be ignored.
print(tag_types_dict['other_dict'])

{'seamark:daymark:construction': 47, 'addr:street:source': 2, 'gnis:ST_num': 226, 'alt_name2': 1, 'seamark:buoy_special_purpose:status': 5, 'tiger:COUNTYFP': 1, 'seamark:fog_signal:group': 13, 'seamark:radar_transponder:category': 1, 'tiger:zip_right_3': 22, 'seamark:buoy_safe_water:colour': 1, 'tiger:zip_right_5': 1, 'tiger:zip_right_4': 7, 'seamark:daymark:source_date': 57, 'seamark:buoy_lateral:name': 7, 'tiger:zip_right_8': 1, 'service:bicycle:rental': 2, 'seamark:buoy_special_purpose:category': 6, 'tiger:zip_left_7': 1, 'tiger:zip_left_6': 1, 'tiger:zip_left_5': 1, 'tiger:zip_left_4': 16, 'tiger:zip_left_3': 50, 'seamark:platform:source': 1, 'tiger:zip_left_1': 642, 'seamark:daymark:shape': 57, 'seamark:buoy_lateral:system': 9, 'seamark:landmark:source_date': 1, 'parking:condition:both': 16, 'seamark:light:information': 1, 'seamark:beacon_special_purpose:status': 30, 'addr:1:housenumber': 1, 'seamark:fog_signal:sequence': 10, 'Building': 1, 'FG:area': 4, 'tiger:LINEARID': 7, 'NHS'

In [22]:
# Number of distinct keys that ended up in the "other" category.
print(len(tag_types_dict['other_dict']))

241


###Auditing Postal codes and Street names

In [5]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict

# Matches the trailing run of non-whitespace characters that starts at a word
# boundary -- i.e. the last "word" of a street name, including a final period.
# The pattern contains no letters, so re.IGNORECASE has no effect on it.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that are considered fine; audit_street_type does not report them.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# Abbreviation -> full street type. None of the audit functions in this cell
# reads this dict.
mapping = { "St": "Street",
            "St.": "Street",
            "Rd.": "Road",
            "Ave": "Avenue"
            }


def audit_street_type(street_types, street_name):
    """File street_name under its last word unless that word is expected.

    street_types maps street type -> set of street names and is updated in
    place. Names in which street_type_re finds nothing are ignored.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return

    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def audit_postal_code(postal_codes, postal_code):
    """Record a postal code and echo it the first time it is seen.

    Args:
        postal_codes: set of codes collected so far; updated in place.
        postal_code: raw ``addr:postcode`` value to record.
    """
    if postal_code in postal_codes:
        return
    postal_codes.add(postal_code)
    # Parenthesised single-argument form: same output as the Python 2 print
    # statement, and valid syntax on Python 3 as well.
    print(postal_code)
        

def is_street_name(elem):
    """Return True when elem's ``k`` attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def is_postal_code(elem):
    """Return True when elem's ``k`` attribute is exactly "addr:postcode"."""
    key = elem.attrib['k']
    return key == "addr:postcode"


def audit(osmfile, audit_streets=False):
    """Collect the distinct postal codes (or unusual street types) of an OSM file.

    Args:
        osmfile: path of the OSM XML file to scan.
        audit_streets: when False (the default) the ``addr:postcode`` values
            are audited and the set of distinct codes is returned; when True
            the ``addr:street`` values are audited instead and the
            street type -> street names mapping is returned.

    Returns:
        A set of postal code strings, or a defaultdict(set) keyed by street
        type when audit_streets is True.
    """
    street_types = defaultdict(set)
    postal_codes = set()
    # The with-block closes the file even when parsing stops part-way through.
    with open(osmfile, "r") as osm_file:
        # "end" events are used because only then is an element guaranteed to
        # carry all of its <tag> children; on "start" they may still be missing.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag not in ("node", "way"):
                continue
            for tag in elem.iter("tag"):
                if audit_streets:
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                elif is_postal_code(tag):
                    audit_postal_code(postal_codes, tag.attrib['v'])

    return street_types if audit_streets else postal_codes

In [6]:
# Postal codes audit: p_codes receives what audit() returns for the extract;
# audit_postal_code prints each code the first time it is encountered.
p_codes = audit('san-francisco.osm')

94117
94102
94111
94710
94618
94611
94103
94704
94107
94002
94702
94703
94501
94709
94103-3124
94158
94115
94536
94706
94609
94556
94110
94132
94541
94124
94108
94131
94112
94121
94118
94123
94134
94114
94133
94116
94105
94122
94577
94804
94070
94109
94104
94030
94063
94607
94612
94612-2202
94904
94544
94404
94587
94213
94619
94602
94019
94080
94610
94608
94066
94939
94560
94555
94605
94403
94801
94014
94596
94549
94027
94061
94901
94065
94523
94606-3636
94606
94401
94502
94301
94925
94578
1087
94087
94705
94598
94113
90214
94707
94597
94920
93710
CA
94601
94595
94530
94143
94062
94129
94188
94044
94805
94303
94127
94025
CA 94133
94965
94002-3585
94402
94010
CA 94030
94118-4504
94563
CA:94103
94013
94015
CA 94544
94579
94166
94545
94546
94519
94130
94941
94549-5506
94720-1076
515
94301-2019
94017
94621
94603
94117-9991
94115 
94112 
94121 
94708
9412
94720
94613
95498
941234
94552


In [12]:
# Inspect the Python type of one (arbitrary) collected postal code.
type(list(p_codes)[0])

str

###Postal codes

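- surrounding whitespace --> trim
- "CA" / "CA:" prefix --> strip from the front
- ZIP+4 extension (e.g. "-3124") --> strip from the back
- values that still do not have 5 characters (3, 4 or 6 digits, bare "CA") --> drop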

In [14]:
# Show every code whose raw length is not five characters, followed by that
# length.
for code in p_codes:
    if len(code) == 5:
        continue
    print('--------')
    print(code)
    print(len(code))
    print(' ')

--------
CA 94544
8
 
--------
CA 94133
8
 
--------
CA:94103
8
 
--------
94612-2202
10
 
--------
94118-4504
10
 
--------
1087
4
 
--------
94720-1076
10
 
--------
CA 94030
8
 
--------
94549-5506
10
 
--------
9412
4
 
--------
515
3
 
--------
941234
6
 
--------
94115 
6
 
--------
94606-3636
10
 
--------
94301-2019
10
 
--------
94112 
6
 
--------
CA
2
 
--------
94002-3585
10
 
--------
94121 
6
 
--------
94103-3124
10
 
--------
94117-9991
10
 


In [23]:
# Dry run of the clean-up rule on every code that is not five characters long.
for code in p_codes:
    if len(code) == 5:
        continue
    print('--------')
    # str.strip('CA:') removes leading/trailing 'C', 'A' and ':' characters.
    cleaned = code.strip('CA:').strip()
    dash_at = cleaned.find('-')
    if dash_at > 0:
        print(cleaned[0:dash_at])
    elif len(cleaned) == 5:
        print(cleaned)
    else:
        print('not 5 ' + cleaned)

--------
94544
--------
94133
--------
94103
--------
94612
--------
94118
--------
not 5 1087
--------
94720
--------
94030
--------
94549
--------
not 5 9412
--------
not 5 515
--------
not 5 941234
--------
94115
--------
94606
--------
94301
--------
94112
--------
not 5 
--------
94002
--------
94121
--------
94103
--------
94117


In [None]:
def update_postal_code(code):
    """Normalise a raw ``addr:postcode`` value to a bare 5-digit ZIP code.

    Surrounding whitespace, a "CA" / "CA:" prefix and a ZIP+4 extension are
    dropped, e.g. "CA 94133" -> "94133" and "94612-2202" -> "94612".

    Args:
        code: raw postal code string (None or '' is treated as invalid).

    Returns:
        The 5-digit code, or '' when the value contains no stand-alone run of
        exactly five digits (e.g. "1087", "941234", "CA").
    """
    if not code:
        return ''
    # Exactly five digits that are not part of a longer digit run. Checking
    # only the length would let any 5-character string through unvalidated.
    found = re.search(r'(?<!\d)\d{5}(?!\d)', code)
    return found.group() if found else ''

###Street types

In [28]:
# Street-type audit. The cells below treat st_types as a mapping from street
# type to the set of street names carrying it, so audit() has to be returning
# its street_types collection (not postal_codes) when this cell is run.
st_types = audit('san-francisco.osm')

In [34]:
# Alphabetical list of every street type that was collected.
print(sorted(st_types))

['1', '1.3', '120', '12180142', '15th', '170', '2', '24th', '3.2', '300', '4.5', '41276', '9th', 'A', 'Abenue', 'Academy', 'Alameda', 'Alcatraz', 'Alley', 'Arguello', 'Ave', 'Ave.', 'Bay', 'Bluxome', 'Blvd', 'Blvd,', 'Blvd.', 'Boulavard', 'Boulvard', 'Bradshaw', 'Brannan', 'Bridge', 'Bridgeway', 'Broadway', 'California', 'Center', 'Circle', 'Clement', 'Clemente', 'Columbus', 'Cortland', 'Cragmont', 'Cres', 'Crescent', 'Ct', 'Ctr', 'Cut', 'D', 'Dr', 'Dr.', 'E', 'East', 'Embarcadero', 'Everett', 'F', 'Fillmore', 'G', 'Geary', 'H', 'Harrison', 'Highway', 'Hwy', 'I-580', 'I-580)', 'Ic', 'Int', 'Judah', 'King', 'Las', 'Leimert', 'Leslie', 'Lindbergh', 'Ln', 'Ln.', 'Lugano', 'M', 'Mall', 'Market', 'Market/Castro', 'Market/Noe', 'Mission', 'Ness', 'North', 'Oakridge', 'Ora', 'Palms', 'Park', 'Path', 'Peak', 'Pl', 'Plaza', 'Plz', 'Post', 'Powell', 'Pulgas', 'Rd', 'Rd.', 'Real', 'Rhein', 'Rock', 'Schwerin', 'Sobrante', 'Southgate', 'Spencer', 'St', 'St.', 'Steet', 'Steps', 'Sutter', 'Telegraph'

###Some invalid numeric street names

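- a house, suite or unit number placed in the street field (e.g. '12th Street #120')
- entries containing 'PM' (e.g. 'ALA 260 PM 1.3'), which look like highway post-mile references rather than street names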

###A's

- Abenue
- variants of avenue

###B's

- variants of blvd and broadway

###C's

- variants of crescent and court

###D's

- variants of drive

###H's

- variants of highway

###L's

- variants of lane (and, under P, of plaza)

###R's

- variants of road 

###S's 

- variants of streets

In [40]:
# For each collected street type, list the street names that end with it.
for street_type in sorted(st_types):
    print(street_type)
    print('---------')
    print(st_types[street_type])
    print('')

1
---------
set(['W Of Us 101 @ Jct Sr 1'])

1.3
---------
set(['ALA 260 PM 1.3'])

120
---------
set(['12th Street #120'])

12180142
---------
set(['12180142'])

15th
---------
set(['15th'])

170
---------
set(['California Street, Suite 170'])

2
---------
set(['San Francisco Bicycle Route 2'])

24th
---------
set(['24th'])

3.2
---------
set(['ALA 84 PM 3.2'])

300
---------
set(['Mission Street #300'])

4.5
---------
set(['SF 80 PM 4.5'])

41276
---------
set(['Upton St 41276'])

9th
---------
set(['9th'])

A
---------
set(['Avenue A', 'Upton St #A'])

Abenue
---------
set(['Columbus Abenue'])

Academy
---------
set(['California College Prep Academy'])

Alameda
---------
set(['Alameda', 'The Alameda'])

Alcatraz
---------
set(['Alcatraz'])

Alley
---------
set(["Kahn's Alley", 'Redfield Alley', 'Hodges Alley'])

Arguello
---------
set(['Arguello'])

Ave
---------
set(['Greenwood Ave', 'Earl Ave', 'West Portal Ave', 'Telegraph Ave', 'Grand Ave', 'University Ave', '45th Ave', 'W 25th 

###Street types cleaning

In [43]:
# Maps a trailing street-type token (exact, case-sensitive match) to the form
# update_street_name rewrites it to.
mapping = { "Ave.": "Ave",
            "Abenue": "Ave",
            "ave": "Ave",
            "avenue": "Ave",
            "Blvd,": "Blvd",
            "Blvd.": "Blvd",
            "Boulavard": "Blvd",
            "Boulvard": "Blvd",
            "broadway": "Broadway",
            "bush": "Bush",
            "Cres": "Crescent",
            "Ctr": "Center",
            "Dr.": "Dr",
            "Hwy": "Highway",
            "Ln.": "Ln",
            "Plz": "Plaza",
            "parkway": "Parkway", 
            "Rd.": "Rd",
            "Steet": "St",
            "St.": "St",
            "st": "St",
            "street": "St",
            "square": "Square",
            "sutter": "Sutter" }


def update_street_name(name, mapping):
    """Rewrite the trailing street-type token of a street name.

    Args:
        name: raw street name, e.g. "Columbus Abenue".
        mapping: dict of street-type token -> replacement; matching is exact
            and case-sensitive.

    Returns:
        name with its last token replaced when that token is a key of
        mapping; otherwise name unchanged (also when no token is found).
    """
    found = street_type_re.search(name)
    if found is None:
        # Nothing that looks like a last word (e.g. an empty name).
        return name

    old = found.group()
    if old not in mapping:
        return name

    # Splice the replacement in at the matched position only. str.replace
    # would also rewrite earlier occurrences of the same characters
    # (e.g. "Post st" -> "PoSt St").
    return name[:found.start()] + mapping[old] + name[found.end():]


# Preview the effect of the mapping on every audited street name.
for ways in st_types.values():
    for name in ways:
        # The cleaning function in this cell is update_street_name; the
        # previous call to update_name referred to a name that is not defined.
        better_name = update_street_name(name, mapping)
        print('%s => %s' % (name, better_name))

Leimert => Leimert
Willow Rd => Willow Rd
Se Quad I-680 / Rudgear Rd => Se Quad I-680 / Rudgear Rd
Ascot Rd => Ascot Rd
Ygnacio Valley Rd => Ygnacio Valley Rd
San Mateo Rd => San Mateo Rd
Marshlands Rd => Marshlands Rd
Rollins Rd => Rollins Rd
broadway => Broadway
Alameda => Alameda
The Alameda => The Alameda
Bay and Powell => Bay and Powell
Powell => Powell
Columbus Abenue => Columbus Ave
bush => Bush
Indian Rock Path => Indian Rock Path
Oak Street Path => Oak Street Path
Mendocino Path => Mendocino Path
Arden Path => Arden Path
Parnassus Path => Parnassus Path
Oakridge => Oakridge
E Of Center St @ I-580 => E Of Center St @ I-580
San Clemente => San Clemente
Avenue D => Avenue D
Avenue H => Avenue H
Bay => Bay
12th Street #120 => 12th Street #120
Alemany Boulvard => Alemany Blvd
Bancroft Steps => Bancroft Steps
24th => 24th
Kahn's Alley => Kahn's Alley
Redfield Alley => Redfield Alley
Hodges Alley => Hodges Alley
Market/Castro => Market/Castro
Broadway => Broadway
North Broadway => No

###Converting to JSON and preparing for MongoDB 

This step is done with data_prep.py.

In [1]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import pdb


# Tag-key patterns:
#   lower        - only lowercase ASCII letters and underscores
#   lower_colon  - the same alphabet with exactly one colon, e.g. "addr:street"
#   problemchars - at least one of the listed punctuation/whitespace characters
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Last "word" of a street name: the trailing run of non-whitespace characters
# that starts at a word boundary. re.IGNORECASE has no effect on this pattern.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Element attributes that shape_element groups under the "created" key.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

# Maps a trailing street-type token (exact, case-sensitive match) to the form
# update_street_name rewrites it to. Unlike the earlier cell, the fully spelled
# types ("Avenue", "Street", ...) are listed too.
mapping = { "Ave.": "Ave",
            "Abenue": "Ave",
            "ave": "Ave",
            "avenue": "Ave",
            "Avenue": "Ave",
            "Blvd,": "Blvd",
            "Blvd.": "Blvd",
            "Boulavard": "Blvd",
            "Boulvard": "Blvd",
            "Boulevard": "Blvd",
            "broadway": "Broadway",
            "bush": "Bush",
            "Cres": "Crescent",
            "Ctr": "Center",
            "Court": "Ct",
            "Dr.": "Dr",
            "Drive": "Dr",
            "Hwy": "Highway",
            "Ln.": "Ln",
            "Lane": "Ln",
            "Plz": "Plaza",
            "parkway": "Parkway", 
            "Rd.": "Rd",
            "Road": "Rd",
            "Steet": "St",
            "St.": "St",
            "st": "St",
            "street": "St",
            "Street": "St",
            "square": "Square",
            "sutter": "Sutter" }


def update_street_name(name, mapping):
    """
    converts a single entry of street name to the appropriate
    name by rewriting its last word
    INPUT: streetname, mapping (last word -> replacement, exact match)
    OUTPUT: name with its last word replaced when that word is a key of
            mapping; otherwise name unchanged (also when no last word is found)
    """
    found = street_type_re.search(name)
    if found is None:  # cannot find the last word
        return name

    # Debugging trace of the original implementation, printed whenever a last
    # word was found.
    print('last word case')

    old = found.group()
    if old not in mapping:
        return name

    # Splice the replacement in at the matched position only. str.replace
    # would also rewrite earlier occurrences of the same characters
    # (e.g. "Post st" -> "PoSt St").
    return name[:found.start()] + mapping[old] + name[found.end():]
    

def update_postal_code(code):
    """
    converts a single entry of postal code to the appropriate
    postal code: surrounding whitespace, a "CA" / "CA:" prefix and a ZIP+4
    extension are dropped
    INPUT: postal code (None or '' is treated as invalid)
    OUTPUT: 5-digit postal code, or '' when the value contains no stand-alone
            run of exactly five digits (e.g. "1087", "941234", "CA")
    """
    if not code:
        return ''
    # Exactly five digits that are not part of a longer digit run. Checking
    # only the length would let any 5-character string through unvalidated.
    found = re.search(r'(?<!\d)\d{5}(?!\d)', code)
    return found.group() if found else ''
    
    
def shape_element(element):
    """
    converts a top-level "node" or "way" element into the dictionary that is
    written to the JSON file; street names and postal codes are cleaned on
    the way
    INPUT: ElementTree element
    OUTPUT: dict for "node"/"way" elements, None for every other element

    The print calls are the debugging trace of the original implementation.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    node = {}
    node['created'] = {}
    node['address'] = {}
    node['node_refs'] = []
    # NOTE(review): ways carry no lat/lon attributes in the branches below, so
    # their "pos" stays [0, 0] -- confirm that downstream queries expect this.
    node['pos'] = [0, 0]

    for tag in element.iter():
        if tag.tag == 'tag':  # secondary tag: a key/value pair
            key = tag.get('k')
            value = tag.get('v')

            # NOTE(review): a key such as "type" or "id" is written to the same
            # top-level slot as the element's own attributes; whichever is
            # processed last wins.
            if lower.search(key):
                node[key] = value

            elif lower_colon.search(key):
                # Require the full "addr:" prefix; comparing only the first
                # four characters would also capture keys like "address:city".
                if key.startswith('addr:'):
                    # A plain "address" tag (handled by the branch above) may
                    # have replaced the sub-dict with a string; start over.
                    if not isinstance(node['address'], dict):
                        node['address'] = {}

                    field = key[5:]
                    # Each cleaner runs once; its result is both traced and
                    # stored.
                    if field == 'street':
                        print('street case')
                        value = update_street_name(value, mapping)
                        print(value)
                    elif field == 'postcode':
                        print('postcode')
                        value = update_postal_code(value)
                        print(value)

                    node['address'][field] = value
                else:
                    node[key] = value

            elif problemchars.search(key):
                print('problematic %s' % key)
            else:
                print('else %s' % key)

        else:
            # The element itself or one of its non-<tag> children (e.g. <nd>).
            node['type'] = element.tag

            for k, v in tag.attrib.items():
                if k in CREATED:
                    node['created'][k] = v
                elif k == 'lat':
                    node['pos'][0] = float(v)
                elif k == 'lon':
                    node['pos'][1] = float(v)
                elif k == 'ref' and element.tag == 'way':
                    node['node_refs'].append(v)
                else:
                    node[k] = v

    # Drop the containers that stayed empty.
    if node['address'] == {}:
        del node['address']

    if len(node['node_refs']) == 0:
        del node['node_refs']

    return node


def process_map(file_in, pretty=False):
    """Shape every element of file_in and write one JSON document per element.

    The output goes to "<file_in>.json"; elements for which shape_element
    returns nothing are skipped. With pretty set, each document is indented by
    two spaces. Returns the list of shaped documents.
    """
    file_out = "{0}.json".format(file_in)
    # json.dumps treats indent=None exactly like an omitted indent.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return data

###Old working version

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


# Tag-key patterns:
#   lower        - only lowercase ASCII letters and underscores
#   lower_colon  - the same alphabet with exactly one colon, e.g. "addr:street"
#   problemchars - at least one of the listed punctuation/whitespace characters
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element groups under the "created" key.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Turn a top-level "node" or "way" element into a dictionary.

    Attributes listed in CREATED go under "created", lat/lon into "pos",
    <nd ref> values of ways into "node_refs", "addr:*" tags into "address";
    other attributes and tags whose key is lowercase (optionally with a single
    colon) become plain key/value pairs. Tags with other keys are only
    reported.

    Returns the dictionary, or None for every other element. The numeric print
    calls are the debugging trace of the original implementation.
    """
    node = {}
    print(11111)
    print(element)
    print(22222)
    node['created'] = {}
    node['address'] = {}
    node['node_refs'] = []
    node['pos'] = [0, 0]
    if element.tag != "node" and element.tag != "way":
        return None

    for tag in element.iter():
        print(22333)
        print(tag.attrib)

        if tag.tag == 'tag':
            # Secondary tag: handled here only, so that its own "k"/"v"
            # attributes are not copied into the document by the attribute
            # loop below.
            key = tag.get('k')
            if key is None:
                # Malformed <tag> without a key: nothing to record.
                continue
            value = tag.get('v')
            if lower.search(key):
                print('lower %s %s' % (key, value))
                node[key] = value
                print(node)
            elif lower_colon.search(key):
                print('lower colon %s %s' % (key, value))
                # Require the full "addr:" prefix; comparing only the first
                # four characters would also capture keys like "address:city".
                if key.startswith('addr:'):
                    # A plain "address" tag may have replaced the sub-dict
                    # with a string; start over instead of failing.
                    if not isinstance(node['address'], dict):
                        node['address'] = {}
                    node['address'][key[5:]] = value
                else:
                    node[key] = value
                print(node)
            elif problemchars.search(key):
                print('problematic %s' % key)
            else:
                print('else %s' % key)
            print(23333)
            continue

        # The element itself or one of its non-<tag> children (e.g. <nd>).
        node['type'] = element.tag
        for k, v in tag.attrib.items():
            if k in CREATED:
                node['created'][k] = v
            elif k == 'lat':
                node['pos'][0] = float(v)
            elif k == 'lon':
                node['pos'][1] = float(v)
            elif k == 'ref' and element.tag == 'way':
                node['node_refs'].append(v)
            else:
                print(33331)
                node[k] = v

    print(33333)
    print(node)
    print(44444)
    # Drop the containers that stayed empty.
    if node['address'] == {}:
        del node['address']

    if len(node['node_refs']) == 0:
        del node['node_refs']
    return node


def process_map(file_in, pretty=False):
    """Shape every element of file_in and write one JSON document per element.

    The output goes to "<file_in>.json"; elements for which shape_element
    returns nothing are skipped. With pretty set, each document is indented by
    two spaces. Returns the list of shaped documents.
    """
    file_out = "{0}.json".format(file_in)
    # json.dumps treats indent=None exactly like an omitted indent.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return data

def test():
    """Run process_map on 'example.osm' and check its first and last document."""
    # NOTE: when running this on a larger dataset, call process_map with
    # pretty=False. pretty=True adds extra whitespace to the output, which makes
    # the file significantly larger.
    data = process_map('example.osm', True)

    print('TEST')
    first = data[0]
    print(first)
    expected_first = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": {
            "changeset": "11129782",
            "user": "bbmiller",
            "version": "7",
            "uid": "451048",
            "timestamp": "2012-03-28T18:31:23Z",
        },
    }
    assert first == expected_first

    print('TEST')
    last = data[-1]
    print(last)
    expected_address = {
        "street": "West Lexington St.",
        "housenumber": "1412",
    }
    expected_refs = [
        "2199822281", "2199822390", "2199822392", "2199822369",
        "2199822370", "2199822284", "2199822281",
    ]
    assert last["address"] == expected_address
    assert last["node_refs"] == expected_refs


if __name__ == "__main__":
    test()

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


# Tag-key patterns:
#   lower        - only lowercase ASCII letters and underscores
#   lower_colon  - the same alphabet with exactly one colon, e.g. "addr:street"
#   problemchars - at least one of the listed punctuation/whitespace characters
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element groups under the "created" key.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Turn a top-level "node" or "way" element into a dictionary.

    Attributes listed in CREATED go under "created", lat/lon into "pos",
    <nd ref> values of ways into "node_refs", "addr:*" tags into "address";
    other attributes and tags whose key is lowercase (optionally with a single
    colon) become plain key/value pairs. Tags with other keys are only
    reported.

    Returns the dictionary, or None for every other element. The numeric print
    calls are the debugging trace of the original implementation.
    """
    node = {}
    print(11111)
    print(element)
    print(22222)
    node['created'] = {}
    node['address'] = {}
    node['node_refs'] = []
    node['pos'] = [0, 0]
    if element.tag != "node" and element.tag != "way":
        return None

    for tag in element.iter():
        print(22333)
        print(tag.attrib)

        if tag.tag == 'tag':  # secondary tag: a key/value pair
            key = tag.get('k')
            value = tag.get('v')
            if lower.search(key):
                print('lower %s %s' % (key, value))
                node[key] = value
                print(node)
            elif lower_colon.search(key):
                print('lower colon %s %s' % (key, value))
                # Require the full "addr:" prefix; comparing only the first
                # four characters would also capture keys like "address:city".
                if key.startswith('addr:'):
                    # A plain "address" tag may have replaced the sub-dict
                    # with a string; start over instead of failing.
                    if not isinstance(node['address'], dict):
                        node['address'] = {}
                    node['address'][key[5:]] = value
                else:
                    node[key] = value
                print(node)
            elif problemchars.search(key):
                print('problematic %s' % key)
            else:
                print('else %s' % key)
            continue

        # The element itself or one of its non-<tag> children (e.g. <nd>).
        node['type'] = element.tag
        for k, v in tag.attrib.items():
            if k in CREATED:
                node['created'][k] = v
            elif k == 'lat':
                node['pos'][0] = float(v)
            elif k == 'lon':
                node['pos'][1] = float(v)
            elif k == 'ref' and element.tag == 'way':
                node['node_refs'].append(v)
            else:
                node[k] = v

    print(33333)
    print(node)
    print(44444)
    # Drop the containers that stayed empty.
    if node['address'] == {}:
        del node['address']

    if len(node['node_refs']) == 0:
        del node['node_refs']
    return node


def process_map(file_in, pretty=False):
    """Shape every element of file_in and write one JSON document per element.

    The output goes to "<file_in>.json"; elements for which shape_element
    returns nothing are skipped. With pretty set, each document is indented by
    two spaces. Returns the list of shaped documents.
    """
    file_out = "{0}.json".format(file_in)
    # json.dumps treats indent=None exactly like an omitted indent.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return data

def test():
    """Run process_map on 'example.osm' and check its first and last document."""
    # NOTE: when running this on a larger dataset, call process_map with
    # pretty=False. pretty=True adds extra whitespace to the output, which makes
    # the file significantly larger.
    data = process_map('example.osm', True)

    print('TEST')
    first = data[0]
    print(first)
    expected_first = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": {
            "changeset": "11129782",
            "user": "bbmiller",
            "version": "7",
            "uid": "451048",
            "timestamp": "2012-03-28T18:31:23Z",
        },
    }
    assert first == expected_first

    print('TEST')
    last = data[-1]
    print(last)
    expected_address = {
        "street": "West Lexington St.",
        "housenumber": "1412",
    }
    expected_refs = [
        "2199822281", "2199822390", "2199822392", "2199822369",
        "2199822370", "2199822284", "2199822281",
    ]
    assert last["address"] == expected_address
    assert last["node_refs"] == expected_refs


if __name__ == "__main__":
    test()