In [None]:
import os
# Move the working directory up one level; relative paths used later
# (e.g. the sample file name) are resolved from there.
# NOTE(review): not idempotent — re-running this cell moves up one more
# level each time. Presumably the parent is the project root; confirm.
os.chdir("..")

In [2]:
import xml.etree.cElementTree as ET
import audit as AD
import transform as TF
import pprint
import re

# Key-shape patterns used by shape_element to decide how a tag key is nested.
# "lower": only lowercase letters/underscores (no colon).
lower = re.compile(r'^([a-z]|_)*$')
# "lower_colon": two such segments separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# "lower_colon_colon": three such segments separated by two colons.
lower_colon_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')
# "problemchars": matches any single character from the listed set
# (punctuation and whitespace); shape_element skips tag keys that contain one.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Input OSM file, given relative to the current working directory.
SAMPLE = "oxford_sample.osm"
# Element attributes that shape_element groups under the "created" sub-dict.
CREATED = [ "version", "changeset", "timestamp", "user", "uid" ]

# Two collections of tag keys; shape_element stores the value of a key found
# in them under an extra "basic_info" level instead of directly.
# NOTE(review): presumably these are keys that also appear as a prefix of
# deeper "a:b" / "a:b:c" keys — confirm against audit.classify_k_multilevel.
k_multilevel_one, k_multilevel_two = AD.classify_k_multilevel(SAMPLE)

In [3]:
## Defining a new class for the deeper nested structure 
## of the output dictionary ("document" in MongoDB)


class AutoVivification(dict):
    '''
    Dictionary whose missing keys spring into existence on read access,
    mimicking perl's autovivification.

    Looking up an absent key stores a new, empty instance of the same
    class under that key and returns it, so nested assignments such as
    d["a"]["b"] = 1 work without creating the intermediate levels first.

    Reference:
    http://stackoverflow.com/questions/2600790/multiple-levels-of-collection-defaultdict-in-python
    '''
    def __getitem__(self, item):
        # Existing keys behave exactly like a plain dict lookup.
        if item in self:
            return dict.__getitem__(self, item)
        # Missing key: create the nested level, remember it, hand it back.
        child = type(self)()
        self[item] = child
        return child

In [4]:
def shape_element(element):
    '''
    Shape one OSM XML element into a nested dictionary ("document").

    Only "node", "way" and "relation" elements are shaped; any other
    element yields None.

    Args:
        element: an ElementTree element. Its "tag", "nd" and "member"
            children are read here, so the element should be fully parsed
            (e.g. taken from an iterparse "end" event).

    Returns:
        An AutoVivification dict, or None for other element types.
        The dict contains:
          - "category": the element tag
          - "created": the attributes listed in CREATED
          - "pos": [lat, lon] as floats, when a coordinate is present
          - every other attribute under its own name
          - the "tag" children, nested according to the shape of their key
          - "node_refs": the "ref" values of the "nd" children, if any
          - "member": one dict per "member" child, if any

    Raises:
        KeyError: if a "tag" child lacks "k"/"v", an "nd" child lacks
            "ref", or a "member" child lacks "ref"/"type".
        ValueError: if "lat" or "lon" cannot be converted to float.
    '''
    if element.tag not in ["node", "way", "relation"]:
        return None

    doc = AutoVivification()

    # Identify the category
    doc["category"] = element.tag

    # Pre-create "pos" as a [lat, lon] list when either coordinate exists,
    # so a lone "lon" cannot auto-vivify "pos" into a nested dict.
    if "lat" in element.attrib or "lon" in element.attrib:
        doc["pos"] = [0, 0]

    # Process same-level attributes
    for attrib_name, attrib_val in element.attrib.items():
        if attrib_name in CREATED:
            doc["created"][attrib_name] = attrib_val
        elif attrib_name == "lat":
            doc["pos"][0] = float(attrib_val)
        elif attrib_name == "lon":
            doc["pos"][1] = float(attrib_val)
        else:
            doc[attrib_name] = attrib_val

    # Process lower-level "tag" attributes
    if element.find("tag") is not None:
        for tag in element.iter("tag"):
            key = tag.attrib["k"]
            value = tag.attrib["v"]

            # Keys containing problematic characters are dropped entirely
            if problemchars.search(key):
                continue

            # Clean and shape data related to address
            if key.startswith("addr:"):
                addr_key, addr_val = TF.transform_address_k_v(key, value)
                doc["address"][addr_key] = addr_val

            # Clean and shape data related to amenity
            elif key.startswith("amenity"):
                amenity_key, amenity_val = TF.transform_amenity_k_v(key, value)
                if key == "amenity:disused":
                    doc["disused"][amenity_key] = amenity_val
                else:
                    doc[amenity_key] = amenity_val

            ########################################################
            #####  (Add more field-specific transformations!)  #####
            ########################################################

            # Shape other data into default model.
            # Keys listed in k_multilevel_one / k_multilevel_two get an
            # extra "basic_info" level instead of a direct value.
            elif lower.search(key):
                if key in k_multilevel_one:
                    doc[key]["basic_info"] = value
                else:
                    doc[key] = value
            elif lower_colon.search(key):
                # The pattern guarantees exactly one colon
                key_upper, key_lower = key.split(":")
                if key in k_multilevel_two:
                    doc[key_upper][key_lower]["basic_info"] = value
                else:
                    doc[key_upper][key_lower] = value
            elif lower_colon_colon.search(key):
                # The pattern guarantees exactly two colons
                key_upper, key_middle, key_lower = key.split(":")
                doc[key_upper][key_middle][key_lower] = value

    # Process lower-level "nd" attributes
    if element.find("nd") is not None:
        doc["node_refs"] = [nd.attrib["ref"] for nd in element.iter("nd")]

    # Process lower-level "member" attributes
    if element.find("member") is not None:
        doc["member"] = []
        for mb in element.iter("member"):
            mb_dict = {
                "ref": mb.attrib["ref"],
                "type": mb.attrib["type"],
            }
            # Some members have an empty "role" (or none at all): omit it
            role = mb.attrib.get("role")
            if role:
                mb_dict["role"] = role
            doc["member"].append(mb_dict)

    return doc

### Testing the shaping process

In [5]:
# Writing a function that turns an OSM file into a list of Python dictionaries


def make_dict(osm_file):
    '''
    Parse an OSM XML file into a list of shaped dictionaries.

    Elements are shaped on their "end" event: iterparse only guarantees
    that an element's children are fully parsed at that point. On a
    "start" event, child "tag"/"nd"/"member" elements may be missing, so
    shaping there can silently drop data.

    Args:
        osm_file: path (or file object) of the OSM XML file to read.

    Returns:
        A list with one document per element for which shape_element
        returns a non-empty result, in file order.
    '''
    events = ET.iterparse(osm_file, events=("start", "end"))
    _, root = next(events)  # Grabbing the root element (first "start" event)

    data = []
    for event, element in events:
        # Only fully parsed elements are safe to shape
        if event != "end":
            continue
        el = shape_element(element)
        if el:
            data.append(el)
            root.clear()  # Freeing up memory by clearing the root element

    return data

In [6]:
# Shape the whole sample file into an in-memory list of documents
data = make_dict(SAMPLE)

In [7]:
# Number of documents produced from the sample
len(data)

32455

In [8]:
# Audit a slice of the dictionary entries (or "documents" in MongoDB).
# NOTE(review): the saved output below shows a 'type' key and no 'category'
# key, whereas shape_element stores the element tag under "category" — the
# output looks stale; re-run this cell to refresh it.
pprint.pprint(data[30000:30050])

[{'barrier': 'hedge',
  'created': {'changeset': '13937575',
              'timestamp': '2012-11-19T20:53:21Z',
              'uid': '782788',
              'user': 'GordonFS',
              'version': '1'},
  'id': '191626788',
  'node_refs': ['2018439996', '2022094208', '2022094210', '2022094212'],
  'type': 'way'},
 {'barrier': 'hedge',
  'created': {'changeset': '13937575',
              'timestamp': '2012-11-19T20:59:01Z',
              'uid': '782788',
              'user': 'GordonFS',
              'version': '1'},
  'id': '191627468',
  'node_refs': ['2022102649', '2022099932'],
  'type': 'way'},
 {'barrier': 'wire_fence',
  'created': {'changeset': '13947869',
              'timestamp': '2012-11-20T17:28:50Z',
              'uid': '782788',
              'user': 'GordonFS',
              'version': '1'},
  'id': '191749080',
  'node_refs': ['2023296845',
                '2023296859',
                '2023296867',
                '2023296862',
                '2023296866',
    

### Saving into a JSON file

In [8]:
import codecs
import json


def process_map(file_in, pretty=False):
    '''
    Shape every element of an OSM file and write the results as JSON.

    The output goes to "<file_in>.json", one JSON document per shaped
    element, each followed by a newline.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when True, each document is indented by 2 spaces;
            otherwise each document is written compactly.
    '''
    out_path = "{0}.json".format(file_in)
    # indent=None is json.dumps' default, i.e. the compact form
    indent = 2 if pretty else None
    with codecs.open(out_path, "w") as sink:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            sink.write(json.dumps(shaped, indent=indent) + "\n")

In [11]:
# Transform the sample and save it as JSON: process_map writes one document
# per line to "<input file name>.json"
process_map(SAMPLE)