In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow
import re

## Get sample OSM 

In [None]:
OSM_FILE = "stockholm_sweden.osm"  
SAMPLE_FILE = "sample.osm"

In [None]:
k = 400 # Parameter: take every k-th top level element

# NOTE: get_element() is defined in the cell below this one, so that cell
# has to be executed first on a fresh kernel.
with open(SAMPLE_FILE, 'wb') as output:
    # The file is opened in binary mode and ET.tostring(..., encoding='utf-8')
    # yields a byte string, so the fixed markup is written as bytes as well.
    # On Python 2 a b'' literal is an ordinary str, so output is unchanged.
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [None]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in ``tags``.

    The file is parsed incrementally with ``ET.iterparse``. After an element
    has been handed to the caller, the document root is cleared so that
    already-processed children are dropped from it (presumably to limit
    memory use on large extracts).

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event belongs to the document root; keep a handle on it.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()

## Count tags

In [None]:
# Tally how often each element tag occurs in the sample file.
tags = {}
for event, elem in ET.iterparse(SAMPLE_FILE):
    # dict.get supplies 0 the first time a tag name is seen.
    tags[elem.tag] = tags.get(elem.tag, 0) + 1

In [None]:
# Display the tag-name -> occurrence-count mapping built in the previous cell.
tags

## Tag key types

In [None]:
# Key consisting only of lowercase letters and underscores, e.g. "highway".
lower = re.compile(r'^([a-z]|_)*$')
# Same character set split by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# A single character that is considered problematic inside a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``<tag>`` element and count it.

    Elements whose tag is not ``"tag"`` are ignored. For tag elements exactly
    one counter in ``keys`` is incremented, checked in this order:
    ``'lower'``, ``'lower_colon'``, ``'problemchars'``, ``'other'``. Keys that
    end up in ``'other'`` are also printed.

    :param element: parsed XML element
    :param keys: dict holding the four counters; it is updated in place
    :return: the same ``keys`` dict
    :raises KeyError: if a ``<tag>`` element has no ``k`` attribute
    """
    if element.tag == "tag":
        key = element.attrib['k']
        if lower.match(key):
            keys['lower'] += 1
        elif lower_colon.match(key):
            keys['lower_colon'] += 1
        # problemchars is an unanchored character class, so it must be
        # searched for: match() would only ever look at the first character.
        elif problemchars.search(key):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
            print(key)
    return keys

In [None]:
# Start all four counters at zero, then classify every element of the sample.
keys = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
for _, element in ET.iterparse(SAMPLE_FILE):
    keys = key_type(element, keys)

keys

## Auditing Street names, not meaningful for Stockholm data

In [None]:
from collections import defaultdict

In [None]:
# Last whitespace-free token of a street name, with an optional trailing dot.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Module-level tally: street type -> number of occurrences.
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    """Increment the counter for the last token of ``street_name``.

    ``street_types`` is updated in place; names for which the pattern finds
    nothing (for example an empty string) leave it untouched.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    street_types[found.group()] += 1

def print_sorted_dict(d):
    """Print every ``key: value`` pair of ``d``, ordered case-insensitively by key.

    :param d: mapping from string keys to integer values
    """
    for key in sorted(d, key=lambda s: s.lower()):
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and matches the call style used elsewhere in the file.
        print("%s: %d" % (key, d[key]))

def is_street_name(elem):
    """Return True when ``elem`` is a ``<tag>`` whose key is ``addr:street``."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:street"

def audit(osm_file):
    """Tally the street types found in ``osm_file`` and print the totals.

    Counts are added to the module-level ``street_types`` mapping, so they
    accumulate across repeated calls.
    """
    street_names = (
        elem.attrib['v']
        for _, elem in ET.iterparse(osm_file)
        if is_street_name(elem)
    )
    for street_name in street_names:
        audit_street_type(street_types, street_name)
    print_sorted_dict(street_types)

## Shaping element

In [None]:
# Key that starts with "<lowercase/underscore>:<lowercase/underscore>".
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# A single character that is considered problematic inside a key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


# Make sure the fields order in the csvs matches the column order in the sql table schema
#'id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp'

# Node columns that are read even when a node has no uid/user attribute.
NODE_FIELDS_NOUSR = ['id', 'lat', 'lon', 'changeset', 'timestamp', 'version']
# Full node column list: the columns above followed by the two user columns.
NODE_FIELDS = NODE_FIELDS_NOUSR + ['uid', 'user']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']

WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [None]:
def _copy_attribs(element, fields, optional=('uid', 'user')):
    """Copy the XML attributes named in ``fields`` from ``element`` into a dict.

    Fields listed in ``optional`` are left out when the element does not
    carry them; any other missing field raises KeyError.
    """
    copied = {}
    for field in fields:
        if field in optional and field not in element.attrib:
            continue
        copied[field] = element.attrib[field]
    return copied


def _shape_tags(element, problem_chars, default_tag_type):
    """Turn the ``<tag>`` children of ``element`` into a list of row dicts."""
    shaped = []
    for tag in element.iter('tag'):
        raw_key = tag.attrib['k']
        # search() rather than match(): the pattern is an unanchored character
        # class, so match() would only inspect the first character of the key.
        if problem_chars.search(raw_key):
            continue
        value = tag.attrib['v']
        if LOWER_COLON.match(raw_key):
            # "addr:street" -> type "addr", key "street"; only the first colon
            # splits, so any further colons stay inside the key.
            tag_type, key = raw_key.split(":", 1)
        else:
            tag_type, key = default_tag_type, raw_key
        if key == 'postcode':
            value = transfer_postcode(value)
        shaped.append({'id': element.attrib['id'],
                       'key': key,
                       'value': value,
                       'type': tag_type})
    return shaped


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    :param element: parsed XML element
    :param node_attr_fields: attribute names copied for ``node`` elements
    :param way_attr_fields: attribute names copied for ``way`` elements
    :param problem_chars: compiled regex; tags whose key contains a match
        anywhere are dropped
    :param default_tag_type: ``type`` value for tag keys without a
        ``prefix:`` part
    :return: ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        ``None`` for any other element
    :raises KeyError: if a requested attribute other than ``uid``/``user``
        is missing, or a tag lacks ``k``/``v``

    ``uid`` and ``user`` are treated as optional for both nodes and ways and
    are simply left out of the attribute dict when absent. Values of tags
    whose key is ``postcode`` are passed through ``transfer_postcode``.
    """
    if element.tag == 'node':
        return {'node': _copy_attribs(element, node_attr_fields),
                'node_tags': _shape_tags(element, problem_chars, default_tag_type)}

    if element.tag == 'way':
        # 'position' is the zero-based index of the <nd> reference in the way.
        way_nodes = [{'id': element.attrib['id'],
                      'node_id': nd.attrib['ref'],
                      'position': position}
                     for position, nd in enumerate(element.iter("nd"))]
        return {'way': _copy_attribs(element, way_attr_fields),
                'way_nodes': way_nodes,
                'way_tags': _shape_tags(element, problem_chars, default_tag_type)}

    return None

## Fix inconsistent postcodes

In [None]:
def transfer_postcode(postcode):
    """Normalise a postcode to the ``'NNN NN'`` layout.

    The first run of three digits that is followed, after optional
    whitespace, by two digits is extracted and re-joined with one space.

    :param postcode: raw postcode string
    :return: the normalised postcode, or ``postcode`` unchanged when it
        contains no such digit pattern (instead of silently returning
        ``None``, which would replace the original value at the call site)

    >>> transfer_postcode("11619")
    '116 19'
    >>> transfer_postcode("116 19")
    '116 19'
    >>> transfer_postcode("unknown")
    'unknown'
    """
    match = re.search(r"(\d{3})\s*(\d{2})", postcode)
    if match is None:
        return postcode
    return "%s %s" % match.groups()


# manual test: both calls are expected to print "116 19"
print(transfer_postcode("11619"))
print(transfer_postcode("116  19"))

In [None]:
# Output CSV paths for a run on the sample extract.
# NOTE: the next cell rebinds these same five names, so when the notebook is
# executed top to bottom the values below are overwritten before
# process_map() is ever called.
NODES_PATH = "nodes_sample.csv"
NODE_TAGS_PATH = "nodes_tags_sample.csv"
WAYS_PATH = "ways_sample.csv"
WAY_NODES_PATH = "ways_nodes_sample.csv"
WAY_TAGS_PATH = "ways_tags_sample.csv"

In [None]:
# Output CSV paths for a run on the full extract. These assignments replace
# whatever was bound to the same names earlier in the session; process_map()
# reads the names when it is called, so later calls write to these files.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

## Write data to csv

In [None]:
import csv
class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that byte-encodes unicode values before writing."""

    def writerow(self, row):
        """Write one row, encoding every unicode value as ISO-8859-1.

        Characters that cannot be represented in ISO-8859-1 are dropped
        (``'ignore'`` error handler); all other values pass through untouched.
        """
        # NOTE(review): the 'ignore' handler loses characters silently -
        # confirm that ISO-8859-1 is really the encoding the CSV consumer expects.
        encoded_row = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('ISO-8859-1', 'ignore')
            encoded_row[field] = value
        super(UnicodeDictWriter, self).writerow(encoded_row)

    def writerows(self, rows):
        """Write each row of ``rows`` through :meth:`writerow`."""
        for single_row in rows:
            self.writerow(single_row)

In [None]:
import codecs
def process_map(file_in):
    """Stream the node and way elements of ``file_in`` into five CSV files.

    The output locations are taken from the module-level ``*_PATH`` names at
    call time. Each file starts with a header row; every shaped node or way
    then contributes one row to its main table plus zero or more rows to the
    related tag / way-node tables.
    """
    with codecs.open(NODES_PATH, 'wb') as nodes_out, \
         codecs.open(NODE_TAGS_PATH, 'wb') as node_tags_out, \
         codecs.open(WAYS_PATH, 'wb') as ways_out, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_out, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_out:

        nodes_csv = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tags_csv = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        ways_csv = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_nodes_csv = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tags_csv = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        for csv_writer in (nodes_csv, node_tags_csv, ways_csv,
                           way_nodes_csv, way_tags_csv):
            csv_writer.writeheader()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if element.tag == 'node':
                nodes_csv.writerow(shaped['node'])
                node_tags_csv.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_csv.writerow(shaped['way'])
                way_nodes_csv.writerows(shaped['way_nodes'])
                way_tags_csv.writerows(shaped['way_tags'])

In [None]:
# Convert the sample extract to CSV; the files are written to whatever the
# *_PATH constants are bound to at this point.
process_map(SAMPLE_FILE)

In [None]:
# Convert the full extract to CSV. This writes to the same *_PATH files as
# any earlier process_map() call unless the constants were rebound in between.
process_map(OSM_FILE)

In [None]:
 def check(FileName, needle='scuba_diving'):
     """Report whether any line of a text file contains ``needle``.

     Every matching line is printed as it is found.

     :param FileName: path of the file to scan
     :param needle: substring to look for; defaults to ``'scuba_diving'``
     :return: True if at least one line contained ``needle``, else False
     """
     found = False
     # open() inside a with-block replaces the Python 2-only file() builtin
     # and makes sure the handle is closed once the scan is finished.
     with open(FileName) as datafile:
         for line in datafile:
             if needle in line:
                 print(line)
                 found = True
     return found


# Look for the search term in the raw extract and in the generated node CSV,
# printing "true" or "false" for each file.
for path in (OSM_FILE, "nodes.csv"):
    print("true" if check(path) else "false")