In [1]:
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict
import csv
import codecs
import cerberus
import schema
import sqlite3

In [2]:
OSM_FILE = "federal_district.osm"
SAMPLE_FILE = "federal_district_sample.osm"

### Create sample file from the original Federal District OSM
##### Functions
_From project P3: Limpando os dados do OpenStreetMap. 2. Detalhes do projeto_

In [3]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

##### Callers

In [4]:
k = 10 # Parameter: take every k-th top level element

with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

### Check out the tags occurences on the file
##### Functions
_from lesson "Case Study: OpenStreetMap Data", "3. Quiz: Iterative Treatment". Adapted_

In [5]:
def count_tags(filename):
    """Count the number of tags in the file"""
    
    tags = defaultdict(int)
        
    for event, elem in ET.iterparse(filename):
            tags[elem.tag] += 1
    return tags

def count_attribs(filename):
    """Count the number of tags attributes in the file"""
    
    attribs = defaultdict(int)
        
    for event, elem in ET.iterparse(filename):
        for tag_attribute in elem.attrib:
                attribs[tag_attribute] += 1
    return attribs

##### Callers

In [6]:
# count the occurrences of tags and tags attributes
print "tags mapping:"
pprint.pprint(dict(count_tags(OSM_FILE)))
print ""
print "attributes mapping:"
pprint.pprint(dict(count_attribs(OSM_FILE)))

tags mapping:
{'bounds': 1,
 'member': 6848,
 'nd': 588469,
 'node': 446164,
 'osm': 1,
 'relation': 957,
 'tag': 220336,
 'way': 94703}

attributes mapping:
{'changeset': 541824,
 'generator': 1,
 'id': 541824,
 'k': 220336,
 'lat': 446164,
 'lon': 446164,
 'maxlat': 1,
 'maxlon': 1,
 'minlat': 1,
 'minlon': 1,
 'ref': 595317,
 'role': 6848,
 'timestamp': 541825,
 'type': 6848,
 'uid': 541824,
 'user': 541824,
 'v': 220336,
 'version': 541825}


### Check the k attribute values
##### Functions

In [7]:
def count_k_values(filename):
    """Find k values on the tags and count the number of each occurrence"""
    
    k_values = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if (element.tag == 'tag') and ('k' in element.attrib):
                    k_values[element.attrib['k']] += 1
 
    return k_values

##### Callers

In [8]:
# count the occurrences of each "k" attribute  of <tag>
counter = count_k_values(OSM_FILE)
print "number of distinct k attribute: ", len(counter)
pprint.pprint(sorted( ((v,k) for k,v in counter.iteritems()), reverse=True))

number of distinct k attribute:  666
[(64589, 'highway'),
 (29440, 'name'),
 (21011, 'building'),
 (13639, 'oneway'),
 (12127, 'source'),
 (10975, 'surface'),
 (7018, 'amenity'),
 (5834, 'maxspeed'),
 (3381, 'landuse'),
 (2915, 'noexit'),
 (2681, 'addr:street'),
 (2470, 'shop'),
 (2414, 'leisure'),
 (2009, 'building:levels'),
 (1822, 'access'),
 (1790, 'power'),
 (1756, 'lanes'),
 (1651, 'service'),
 (1350, 'ref'),
 (1126, 'natural'),
 (1065, 'addr:city'),
 (1032, 'addr:housenumber'),
 (969, 'type'),
 (966, 'barrier'),
 (899, 'junction'),
 (812, 'waterway'),
 (797, 'note'),
 (774, 'short_name'),
 (762, 'phone'),
 (750, 'addr:postcode'),
 (698, 'layer'),
 (697, 'addr:suburb'),
 (628, 'bicycle'),
 (624, 'sport'),
 (613, 'place'),
 (575, 'inep'),
 (563, 'operator'),
 (547, 'bridge'),
 (542, 'cuisine'),
 (496, 'crossing'),
 (491, 'restriction'),
 (462, 'parking'),
 (457, 'foot'),
 (432, 'boundary'),
 (410, 'addr:place'),
 (366, 'website'),
 (353, 'office'),
 (350, 'aeroway'),
 (324, 'heigh

 (1, 'name:ace'),
 (1, 'name:ab'),
 (1, 'microbrewery'),
 (1, 'maxwidth'),
 (1, 'lift_gate:type'),
 (1, 'leisure_2'),
 (1, 'leisure_1'),
 (1, 'layer_1'),
 (1, 'lawyer'),
 (1, 'landmark'),
 (1, 'inscription'),
 (1, 'industrial'),
 (1, 'iata'),
 (1, 'hoops'),
 (1, 'health_facility:type'),
 (1, 'handrail'),
 (1, 'golf:par'),
 (1, 'golf:course'),
 (1, 'golf'),
 (1, 'genus'),
 (1, 'generator:method'),
 (1, 'fuel:biogas'),
 (1, 'fuel:biodiesel'),
 (1, 'free_flying:site'),
 (1, 'floor:material'),
 (1, 'flagpole'),
 (1, 'flag:type'),
 (1, 'flag:country'),
 (1, 'fire_hydrant:type'),
 (1, 'female'),
 (1, 'fax'),
 (1, u'endere\xe7o'),
 (1, 'disused:amenity'),
 (1, 'direction'),
 (1, 'diet:lactose_free'),
 (1, 'diameter_crown'),
 (1, 'destination:lanes'),
 (1, 'currency:USD'),
 (1, 'currency'),
 (1, 'content'),
 (1, 'contact:instagram'),
 (1, 'collection_times'),
 (1, 'coffee'),
 (1, 'clock'),
 (1, 'capital_ISO3166-1'),
 (1, 'capital'),
 (1, 'capacity:women'),
 (1, 'button_operated'),
 (1, 'bridge

### Note:
* every tag with lat has a long attribute
* changeset, id, uid and user matches, timestamp and version matches, but (changeset, id, uid and user) and (timestamp and version) does not (541824 != 541825)
    * _this is due to the "osm" tag, that has the version and timestamp but is about the OSM export file, not the inserted tag itself_
* k and v attributes matches
* total ref matches with total (nd and member). nd and member are the tags that have ref attributes
* every tag "tag" has a "k" and "v" attribute

### Check for certain patterns in the tags attributes

##### Regular Expressions

In [9]:
lower = re.compile(r'^([a-z]+_?[a-z]*)*[a-z]+$') #sequence of lower words that can be spaced with _ (no repetitions) and no _ at the beginning end of the string. 
                                                 #Ex: code_for_udacity <- match
                                                 # code__for_udacity or code_for_udacity_ <- does not match

lower_colon = re.compile(r'^(([a-z]+_?[a-z]*)*[a-z]+:)+(([a-z]+_?[a-z]*)*[a-z]+)$') # same of lower with : ou multiple
                                                                                            # Ex: code:for:udacity,  <- match
                                                                                            # code::for_udacity <- does not match

problemchars = re.compile(r'[^a-zA-Z0-9_\s]')
numbers = re.compile(r'^([0-9])*$')
upper = re.compile(r'^([A-Z]+_?[A-Z]*)*[A-Z]+$')

##### Functions
_from lesson "Case Study: OpenStreetMap Data", "6. Quiz: Tags types". Adapted_

In [10]:
def key_type(element, keys, attribute):
    """Get the number of ocurrences of regular expressions patterns in the tags attributes"""
    
    if attribute in element.attrib:       
        attribute_value = element.attrib[attribute]
   
        if lower.search(attribute_value):
            keys['lower'] += 1
        elif lower_colon.search(attribute_value):
            keys['lower_colon'] += 1
        elif upper.search(attribute_value):
            keys['upper'] += 1
        elif numbers.search(attribute_value):
            keys['numbers'] += 1
        elif problemchars.search(attribute_value):
            keys['problemchars'] += 1
        else:
            keys['others'] += 1
                 
    return keys

def get_attr_list(filename):
    """Get the list of attributes on the xml file tags"""
    
    attr_list = set()
    for _, element in ET.iterparse(filename):
        for attribute in element.attrib:
            attr_list.add(attribute)
        
    return attr_list

def process_map(filename):
    """Create a list of dictionaries with the regular expressions patterns on the tags attributes"""
    
    keys_list = defaultdict(list)
    attributes = get_attr_list(filename)
    
    for attribute in attributes:
        keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "numbers" : 0, "upper": 0, "others" : 0}
        for _, element in ET.iterparse(filename):
            search = key_type(element, keys, attribute)
        keys_list[attribute] = search

    return keys_list

##### Callers

In [11]:
# check for certain patterns in the tags attributes
keys = process_map(OSM_FILE)
pprint.pprint(dict(keys))   

{'changeset': {'lower': 0,
               'lower_colon': 0,
               'numbers': 541824,
               'others': 0,
               'problemchars': 0,
               'upper': 0},
 'generator': {'lower': 0,
               'lower_colon': 0,
               'numbers': 0,
               'others': 0,
               'problemchars': 1,
               'upper': 0},
 'id': {'lower': 0,
        'lower_colon': 0,
        'numbers': 541824,
        'others': 0,
        'problemchars': 0,
        'upper': 0},
 'k': {'lower': 208547,
       'lower_colon': 11438,
       'numbers': 0,
       'others': 30,
       'problemchars': 311,
       'upper': 10},
 'lat': {'lower': 0,
         'lower_colon': 0,
         'numbers': 0,
         'others': 0,
         'problemchars': 446164,
         'upper': 0},
 'lon': {'lower': 0,
         'lower_colon': 0,
         'numbers': 0,
         'others': 0,
         'problemchars': 446164,
         'upper': 0},
 'maxlat': {'lower': 0,
            'lower_colon': 0,
 

### Check potential problems on attributes values

##### Functions

In [12]:
def get_attr_re_test(filename, attribute, re_test):
    """Return distinct list (set) of the selected attribute that matches a regular expression"""
    
    list_attr_re_test = set()
    
    for _, element in ET.iterparse(filename):
        if attribute in element.attrib: 
            if re_test.search(element.attrib[attribute]):
                    list_attr_re_test.add(element.attrib[attribute])
        
    return list_attr_re_test

def count_char_re_test(filename, attribute, re_test): 
    """Count the number of occurrences of the matched character on regular expression"""
    
    count_char_re_test = defaultdict(int)
    
    for _, element in ET.iterparse(filename):
            if attribute in element.attrib: 
                if re_test.search(element.attrib[attribute]):
                    m = re_test.findall(element.attrib[attribute])
                    for n in m:
                        count_char_re_test[n] += 1
        
    return count_char_re_test

def print_sorted_dict_by_value(d):
    """Print dictionary sorted by value"""
    
    sort = sorted( ((v,k) for k,v in d.iteritems()), reverse=True)
    for item in sort:
        print item[0], " : ", item[1]

##### Callers

In [13]:
# get the attribute value for given regular expression
print "generator attribute problemchars value: ", get_attr_re_test(OSM_FILE, "generator", problemchars)
print "version attribute problemchars value: ", get_attr_re_test(OSM_FILE, "version", problemchars)
print "user attribute number value: ", get_attr_re_test(OSM_FILE, "user", numbers)

generator attribute problemchars value:  set(['osmconvert 0.8.5'])
version attribute problemchars value:  set(['0.6'])
user attribute number value:  set(['2058'])


In [14]:
# get the number of occurrences of each charactere of "user" attribute that matches problemchars regular expression
users_problemchars_count = count_char_re_test(OSM_FILE, "user", problemchars)
print "users problemchars values count: " 
print_sorted_dict_by_value(users_problemchars_count)    

users problemchars values count: 
8700  :  í
5170  :  á
2151  :  !
711  :  -
693  :  เ
507  :  é
462  :  и
281  :  ú
232  :  я
231  :  ວ
231  :  э
231  :  Δ
231  :  ʟ
231  :  ɢ
66  :  š
37  :  Í
25  :  ô
18  :  ç
17  :  ã
4  :  е
2  :  с
2  :  р
2  :  н
2  :  л
2  :  й
2  :  А
1  :  у
1  :  о
1  :  к
1  :  д
1  :  г
1  :  в
1  :  б
1  :  а
1  :  П
1  :  З
1  :  â


In [15]:
# get the number of occurrences of each charactere of "v" attribute that matches problemchars regular expression
v_problemchars_count = count_char_re_test(OSM_FILE, "v", problemchars)
print "v problemchars values count: " 
print_sorted_dict_by_value(v_problemchars_count)   

v problemchars values count: 
6582  :  -
2333  :  í
2177  :  á
2101  :  /
2038  :  ã
1990  :  .
1648  :  :
1454  :  ;
1238  :  ç
947  :  é
810  :  ó
705  :  â
662  :  ú
397  :  ,
362  :  +
342  :  Á
290  :  ô
281  :  (
280  :  ê
280  :  )
178  :  õ
147  :  а
136  :  и
127  :  '
124  :  ª
111  :  р
83  :  л
77  :  ا
70  :  з
61  :  н
61  :  е
60  :  Б
57  :  @
51  :  ر
47  :  ل
46  :  о
44  :  à
39  :  Â
38  :  с
38  :  &
35  :  я
35  :  Ó
34  :  É
34  :  º
30  :  ա
30  :  д
29  :  ی
29  :  ي
27  :  ा
27  :  |
26  :  ب
26  :  %
24  :  ი
24  :  і
24  :  г
24  :  "
21  :  س
20  :  ز
20  :  т
19  :  ა
18  :  ف
18  :  Í
18  :  #
17  :  ة
17  :  к
15  :  й
14  :  რ
14  :  ्
14  :  ր
13  :  ி
13  :  ि
13  :  י
13  :  ն
13  :  Ф
12  :  र
12  :  य
12  :  م
12  :  у
11  :  ლ
11  :  ე
11  :  Ã
11  :  ?
10  :  西
10  :  巴
10  :  ร
10  :  া
10  :  स
10  :  ल
10  :  ज
10  :  א
10  :  ь
10  :  в
10  :  $
9  :  เ
9  :  ্
9  :  و
9  :  ن
9  :  ի
8  :  ་
8  :  ్
8  :  ी
8  :  ब
8  :  د
8  :  ר
8  :  ז
8 

In [16]:
# get the value of the "k" attribute that matches the upper regular expression
print "k attribute uppers value: ", get_attr_re_test(OSM_FILE, "k", upper)

# get the number of occurrences of each charactere of "k" attribute that matches problemchars regular expression
k_problemchars_count = count_char_re_test(OSM_FILE, "k", problemchars)
print "k problemchars values count: " 
print_sorted_dict_by_value(k_problemchars_count)
    
# find "k" attribute with problemchar
for _, element in ET.iterparse(OSM_FILE):
    if "k" in element.attrib:
        if (u'\xe7' in element.attrib["k"]) or ('-' in element.attrib["k"]):
            print "k with problemchar: ", element.attrib["k"]
    

k attribute uppers value:  set(['GEOCODIG_D', 'GEOCODIG_M', 'FIXME', 'BUI'])
k problemchars values count: 
11904  :  :
24  :  -
1  :  ç
k with problemchar:  name:zh-yue
k with problemchar:  name:bat-smg
k with problemchar:  name:cbk-zam
k with problemchar:  name:be-tarask
k with problemchar:  capital_ISO3166-1
k with problemchar:  endereço
k with problemchar:  voltage-high
k with problemchar:  voltage-high
k with problemchar:  ISO3166-1
k with problemchar:  ISO3166-1:alpha2
k with problemchar:  ISO3166-1:alpha3
k with problemchar:  ISO3166-1:numeric
k with problemchar:  name:bat-smg
k with problemchar:  name:be-tarask
k with problemchar:  name:cbk-zam
k with problemchar:  name:fiu-vro
k with problemchar:  name:nds-nl
k with problemchar:  name:roa-tara
k with problemchar:  name:zh-classical
k with problemchar:  name:zh-min-nan
k with problemchar:  name:zh-yue
k with problemchar:  ISO3166-2
k with problemchar:  ISO3166-2
k with problemchar:  ISO3166-2


### Note:
* minlat, minlon, maxlat, maxlon, lat, lon problemchars relative to minus (-) and point (.). Not really a problem
* timestamp problemchars due to minus (-)
* changeset, id, uid, ref, type as expected
* "v" has several problemchars and upper, but it is like a free field to the value of the key
* generator and version problemchars are due to dot(.)
* there is a user with just numbers, the user 2058
* there are a lot of users with special caracteres but it is not a big deal since users are a free form of creation and people like to use special characteres in their nicknames
* "k" problemchars are due to names and value 'endereço'
* the "v" value has a lot of problemchars. Most of them are portuguese and other languages specific characteres. Our focus will be on these characteres: 

/ . : ; , + ( ) ' &amp; " # $ ! = ] >
    * / used mainly for web pages. found in addresses (not a problem), opening hours (24/7) (not a problem), multivalorated telephones
    * .(dot) used in web pages, e-mails, addresses, abbreviations, zipcodes
    * : used for wikipedia, websites, hours
    * ; used as separator
    * , used as separator
    * + used on telephones and {network : + bike}
    * ( used on telephones, wikipedia and details
    * ) used on telephones, wikipedia and details
    * ' mainly on english names
    * & places names
    * " details
    * # colors
    * (dollar) prices
    * ! notes
    * = webpages
    * ] typo: Datacenter Banco do Brasil]
    * > typo: Esta via NUNCA é usada no sentido da EPCL->EPIA!
    

### Check attributes values with problemchar
##### Functions

In [17]:
def get_attr_values(filename, attribute, character):
    """Get attribute values by character"""
    
    list_attr_values = []
    for _, element in ET.iterparse(filename):
        if attribute in element.attrib:
            if character in element.attrib[attribute]:
                list_attr_values.append(element.attrib[attribute])
    return list_attr_values   

##### Callers

In [18]:
# define chars to be checked
problem_chars = ['/', '.', ':', ';', ',', '+', '(',')', "'", '&', '"', '#', "$", "!", "=", "]", ">"]

# get the attribute values that has the problem_chars
for problem_char in problem_chars:
    print problem_char, " ---------------------------------------------"
    for value in get_attr_values(OSM_FILE, "v", problem_char):
        print value

/  ---------------------------------------------
http://upload.wikimedia.org/wikipedia/commons/3/3c/Bandeira_do_Distrito_Federal_%28Brasil%29.svg
http://www.brasilia.df.gov.br/
http://www.formosa.go.gov.br
http://upload.wikimedia.org/wikipedia/commons/d/dc/Bandeira_de_Taguatinga_%28DF%29.svg
http://www.taguatinga.df.gov.br/
http://novageracao.academiaweb.com.br
http://www.novositedogiraffas.com
http://www.supercei.com.br
608/609 Norte
Escola Parque EQS 210/211 Sul
http://www.popobrasil.com
http://www.correios.com.br
24/7
http://www.pisttache.com.br
http://www.tuttisapori.com.br
http://www.tokyorestaurante.com.br
http://www.mansoori.com.br/portugues/localizacao.cfm
http://www.mansoori.com.br/portugues/localizacao.cfm
http://www.icesp.br/
http://www.faculdadedarwin.com
http://www.facitec.br/
http://www.anhanguera.com/graduacao/localidades/brasilia_jk.php
Ligia Artesanato / Capim Dourado
http://cinecultura.com.br/
http://www.hotelstpaul.com.br
http://itaucinemas.com.br
modulo 24/26
3399-2

http://jkshoppingdf.com.br
Quadra de Volei/Futevolei
24/7
http://www.clubedochoro.com.br/
SAIN Q. 04 lote s/n
CNB 10/CNB 11
EQSW 103/104
EQSW 304/504
EQRSW 5/6
EQRSW 3/4
EQRSW 1/2
EQRSW 6/7
EQRSW 7/8
http://serpro.gov.br
http://www.unieuro.edu.br/
http://www.ucb.br
SOF SUL Quadra 7/8
CLN 207/208
EQL 6/8
http://www.detran.df.gov.br/
http://www.borgeslandeiro.com.br/
CNB 4/CNB 5
CNB 9/CNB 10
CNB 5/CNB 6
CNB 7/CNB 8
CNB 1/CNB 2
SCS Quadra 2/3
CNB 8/CNB 9
SCRN 708/709 Bloco B
SCRN 708/709 Bloco C
SCRN 708/709 Bloco D
SCRN 708/709 Bloco G
SCRN 708/709 Bloco G
SCRN 708/709 Bloco F
Terracap: http://www.terracap.df.gov.br/terracapweb/edital/croquis_112013/59a63.jpg
http://www.terracap.df.gov.br/terracapweb/edital/croquis_112013/59a63.jpg
QNG 46 A/E
Clínica da Família 104/105
QSD 09/11
http://www.acibdf.com.br
http://www.villagealvorada1.com.br/
DEC 88.940 de 07/11/1983
DEC/SN 13/12/2002
http://www.saobernardodf.com.br
http://cinedrivein.com/
http://www.adbraz.com.br
Correios 208/408 Sul
Sudoes

Ed. Via Itamaraty
Ed. Palazzo D'Alberi
Ed. Min. Marcelo Pimentel
Ed. Helsinki
http://www.inmet.gov.br
70.686-420
70.752-060
70.686-405
70.687-145
70.687-135
70.687-140
70.687-310
70.687-305
70.686-710
70.86-705
70.687-330
70.686-750
70.686-625
70.686-620
70.686-615
70.686-610
70.686-605
70.686-525
70.687-210
70.687-215
70.687-245
70.687-120
70.687-110
70.686-550
70.686-440
70.686-535
70.686-510
70.686-190
70.686-195
70.686-180
70.686-185
70.686-155
70.687-150
70.686-820
70.686-815
70.686-805
70.686-065
70.686-055
70.686-715
70.686-720
70.687-230
http://jkshoppingdf.com.br
dp15_delegadochefe@pcdf.gov.br
Cond. Vivendas Serranas
70.686-070
70.686-075
http://www.clubedochoro.com.br/
SAIN Q. 04 lote s/n
Rua Mal. Hermes da Fonseca
http://serpro.gov.br
http://www.unieuro.edu.br/
http://www.ucb.br
http://www.detran.df.gov.br/
http://www.borgeslandeiro.com.br/
Contorno deduzido a partir de várias fontes, conferir.
Contorno deduzido a partir de várias fontes, conferir.
Contorno deduzido a partir

R: Ipanema
R: G-25D
R: Leblon
R: Pão de Açucar
Av: I
R: Copacabana
R: Jacob
R: Jacob
R: G-29D
AV: F Um
Av: G
R: Miguel Angelo Rollebom
Av: H
Av: I
R: Miguel Angelo Rollebom
R: Quinze
R: Dezoito
R: Quatro
R: Nove
R: Super Quadra 2
R: Cinco
R: Três
R: Dois
R: Quatro
R: Dois
R: Um
R: Quatro
R: Três
R: Vinte e Um
R: Cinco
R: Dois
R: Oito
R: Seis
R: Seis
R: Dois
R: Sete
R: Cinco
R: Quatro
R: Dois
R: Seis
R: Dois
R: Três
R: Quatro
R: Dois
R: Três
R: Três
R: Vinte e Dois
R: Vinte
R: Vinte e Quatro
R: Dois
R: Cinco
R: Dezenove
R: Dezesseis
R: Vinte e Três
R: Vinte e Três
R: Quatorze
R: Dezessete
AV: Mil e Um
R: Dezesseis
R: Dezessete
R: Quinze
R: Quartoze
R: Dez
R: Quarenta e três
R: Vinte
R: Trinta e um
R: Sete
R: Seis
R: Cinco
R: Quarenta e Quatro
R: Vinte e Sete
R: Vinte e Oito
AV: Dois
R: Um
R: Vinte e Oito
R: Dezesseis
R: Duque de Caxias
AV: Três
R: Santos Dumont
R: Quatro
AV: Um
R: Vinte e Sete
R: Vinte e Sete
R: São Mateus
R: Trinta e um
R: Dezenove
R: Vinte e Oito
R: Vinte Um
R: Três
R

BR-070;DF-095
Mo-Sa 10:00-22:30; Su 12:00-22:30
Mo-Fr 7:30-23:00;Sa 8:00-14:00
IBGE;Bing low resolution
federal;estadual
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
Tu-Fr 09:00-21:00; Sa-Su 08:00-20:00
FCA; VLI
FCA; VLI
FCA; VLI
Ville de Montagne - Quadra 20;Ville de Montagne
soccer;basketball
BR-070;DF-095
federal;estadual
FCA; VLI
FCA; VLI
FCA; VLI
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
federal;estadual
BR-040;BR-050
BR-070;DF-095
BR-070;DF-095
Tu-Th 09:00-17:00; Fr-Su 09:00-18:00
Tu-Th 09:00-17:00; Fr-Su 09:00-18:00
Mo-Sa 09:00-21:00; Su 12:00-20:00
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
BR-060;DF-075
basketball;tennis
basketball;tennis
BR-060;DF-075
Yahoo hires; osm-gpx
BR-060;DF-075
BR-070;DF-095
BR-070;DF-095
BR-070;DF-095
IBGE;survey
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
BR-040;BR-050
R;G-8B
federal;estadual
federal;est

+55 61 33271108
+55 61 3345-8669
+55 61 33018000
+Açaí
+55 61 32084128
+556135510001
+55 61 30337707
+55 61 3329-8000
+55 61 34271730
+55 61 30325774
+55 61 3581-1396
+55-61-3434-3707
+55 61 33582061
+55 61 33580458
+55 61 34585063
+55 61 33837828
+55 61 3364-3442
+55 61 3427-1240
+55 61 32421542
+55 61 39101045
+55 61 3910-1049
+55 61 33688282;+55 61 33688283
+55 61 3346 0231
+55 61 3445 5900
+55 61 3299-7629
+55 61 8151-7723
+55 61 3629 9900
+55 61 32740591
+55 61 30479494
+55 61 33497001
+55 61 32747113
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+Bike
+55 61 40039846
+55 61 30456166
+Bike
+55 61 40039846
+55 61 39668300
+55 61 38794932
+55 61 3327-1604
+556132445797
+556133490106
+55 (61) 3107-8901
+55 61 33269170
+55 61 39648482
+55 61 30420202
+55 61 3387-4177
+55 61 33268078
+55 61 81363429
+55 61 33474913
+Bike
+55 61 40039846
+Bike
+55 61 40039846

pt:Gama (Distrito Federal)
pt:Formosa (Goiás)
pt:Taguatinga (Distrito Federal)
pt:Itapoã (Distrito Federal)
pt:Autódromo Internacional Nelson Piquet (Brasília)
(61) 99309-0100
pt:Águas Claras (Distrito Federal)
(61)3335-8776
(61)3427-1539
(61)3427-2009
Torre de treinamento(JICA Tokyo Tower)
Torre de treinamento(JICA Yokohama Tower)
pt:Samambaia (Distrito Federal)
(61) 33820446
pt:Guará (Distrito Federal)
(61) 3381-3444
(61) 3381-3837 ou 3573-4344
(61) 3381-3444
Monumento Solarius (Chifrudo)
pt:Sobradinho (Distrito Federal)
pt:Planaltina (Distrito Federal)
pt:Santa Maria (Distrito Federal)
pt:Cruzeiro (Distrito Federal)
pt:Jardim Botânico (Distrito Federal)
pt:São Sebastião (Distrito Federal)
pt:Varjão (Distrito Federal)
(61) 3363-9705
pt:Colégio Dom Bosco (Brasília)
Centro Olímpico da UnB (CO)
(61) 3427-1100
(61) 3427-1201
(61)3233-2496
(61) 3427-3535
UNEPXMIL 48hs (Rastreadores)
pt:A Justiça (escultura)
Rua 3 (Acampamento Rabelo)
+55 (61) 3107-8901
Atenção: é um bairro que não está oc

Alfredo's
McDonald's
Caixa d'Água
Snook Drink's
Family's Hair
McDonald's
Fróe's Burguer
Morgana's Caldos
Brokado's
Domino's Pizza
O'Rilley Irish Pub
C'est la Vie
Gate's Lounge Bar
Habib's
McDonald's
McDonald's
McDonald's
McDonald's
DriveThru McDonald's Taguatinga Centro
'A Justiça'
Thiago's Show de Carnes
bike'n FIX
D'Vilela Restaurante e Café
Bob's
L'Entrecôte de Paris
Fran's Café
Açougue Boi D'ouro
Parque Olhos d'Água
Domino's Pizza
D'Lav
Mitto's
Paper's Line
C'est si bon
D'Vilella
Du'Cheff
McDonald's
Habib's
Kabana's
Minas' cuisine ("cozinha mineira") with cheese bread ("pão-de-queijo") on almost every dish. Great beers and drinks to go along. Authentic brazilian restaurant dedicated to traditional brazilian dishes.
Green's Restaurante Natural
Bito's Bar
Good's
Moreno's Restaurante
Rosa's Café
D'Lurdes
Fran's Café
Bob's
MacDonald's
Point D'Cavalcante
Mariart's Festas
Virginia D'Arc
Jhonny's Car
Neiva's Auto
Brito's Restaurante
Maison Maxim's Cozinhas & Armários
Domino's Pizza
McDona

### Check different street names abbreviations

##### Regular Expressions

In [19]:
street_type_re = re.compile(r'^\S+\.?', re.IGNORECASE)

##### Functions
_from lesson 5 "Data Quality", "8. Exemplo Utilizando Nossas Melhores Práticas". Adapted_

In [20]:
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    """Get the first word of the attribute and add counter to the list"""
    
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type] += 1

def print_sorted_dict_by_key(d):
    """Print the dictionary sorted by key value"""
    
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

def is_street_name(elem):
    """Check if the element is from the tag 'tag' and the attribute 'k' is addr:street"""
    
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit_street_k(filename):
    """List the first word (with or without dot) of the addr:street attribute with the number of occurrences"""
    
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])    
    print_sorted_dict_by_key(street_types)    
    

##### Callers

In [21]:
# get the first word and number of occurrences of attribute "v" when "k" == "addr:street"
audit_street_k(OSM_FILE)

2: 1
203: 1
506/507: 1
508: 1
508-509: 1
508/509: 1
510: 1
713/913: 1
8: 1
A.O.S.: 6
Academia: 1
ADE: 1
AE: 4
AENW: 1
Aeroporto: 1
Alameda: 3
AR: 2
Area: 1
ASBAC: 1
AV.COMERCIAL: 1
Avenida: 282
Bloco: 1
BR-020: 1
BR060: 2
BR070: 2
Brasília: 1
C1: 1
C7: 2
CA: 45
Campus: 11
CCSW: 13
Centro: 4
Chácara: 1
CL: 1
Cln: 19
CLN: 247
CLNW: 6
CLRN: 2
Cls: 64
CLS: 58
Clsw: 2
CLSW: 64
CMA: 1
CNB: 14
CNF: 2
CNM: 10
Colonia: 1
Cond.: 16
Condomínio: 50
Conjunto: 1
conjunto: 1
CRN: 1
CRNW: 1
CRS: 1
Crs: 1
Cruzeiro: 1
CSA: 1
CSB: 1
CSG: 1
CSG04: 1
DF: 1
DF-095: 1
DF-140: 1
ED.: 1
Edificio: 2
Eixo: 21
EPIA: 1
EPTG: 9
EQ: 4
EQL: 2
EQN: 5
EQNM: 1
EQNO: 2
EQNP: 3
Eqs: 1
EQS: 6
EQSW: 2
Esplanada: 1
EST: 1
Estacionamento: 2
Estrada: 9
Estância: 2
Etapa: 1
Feira: 1
Ginásio: 1
HCGN: 5
Hospital: 1
Incra: 1
INCT: 1
INMET: 1
L2: 3
Marginal: 1
ML: 1
Núcleo: 19
Palácio: 1
Parque: 1
Pistão: 2
Praça: 7
Q: 2
Q.: 2
Q.I.: 1
Qd.: 1
QE: 7
QI: 44
QI33: 1
QL: 1
QMSW: 14
Qn: 12
QN: 34
QNA: 6
QNB: 2
QND: 2
QNF: 1
QNG: 3
Qnl: 2

### Note:
* The Brasilia city has a lot of abbreviations on addresses. This is the common way of describing addresses on the city, so the most relevant abbreviations to correct are: Rua, Avenida, Quadra, Condominio, Edificio, Setor e Companhia. 

### Find street abbreviations on the tags
##### Functions

In [22]:
def get_street_name(street_name):
    """Get the first word (with or without '.'), as defined by the regular expression, of the string"""
    
    m = street_type_re.search(street_name)
    if m:
        return m.group()
    
def is_tag(element):
    """Check if the tag of the element is 'tag'"""
    
    return element.tag == 'tag'
    
    
def audit_street(filename, values_list):
    """List how many tag attribute "v" words matches the defined list by tag attribute 'k'"""
    
    street_names_attr = defaultdict(dict)
    for _, element in ET.iterparse(filename):
        if is_tag(element):
            for attribute in element.attrib:
                attr_value = get_street_name(element.attrib[attribute])
                if attr_value.lower() in values_list:
                    if attr_value in street_names_attr[element.attrib['k']]:
                        street_names_attr[element.attrib['k']][attr_value] += 1
                    else:
                        street_names_attr[element.attrib['k']][attr_value] = 1

    return street_names_attr  

##### Callers

In [23]:
# get the "v" attribute matches occurrences on street abbreviation list by "k" attribute
street_abbr = ['avenida', 'av', 'av.', 'av:', 'rua, ''r', 'r.', 'r:', 'quadra', 'q', 'q.', 'qd', 'qd.', 'qd:', 'condominio', u'condomínio', 'cond', 'cond.', 'cond:', 'edificio', u'edifício', 'ed', 'ed.', 'ed:', 'setor', 'st', 'st.', 'st:', 'companhia', 'cia', 'cia.', 'cia:']
pprint.pprint(dict(audit_street(OSM_FILE, street_abbr)))

{'addr:full': {u'Avenida': 2},
 'addr:housename': {u'Condom\xednio': 1,
                    u'Ed.': 1,
                    u'Edif\xedcio': 51,
                    'QD.': 1,
                    'Quadra': 1},
 'addr:housenumber': {'Ed.': 1, 'Q': 1, 'Qd': 1, 'Quadra': 6},
 'addr:place': {u'Avenida': 1, 'Setor': 10},
 'addr:street': {'Avenida': 282,
                 u'Cond.': 16,
                 u'Condom\xednio': 50,
                 'ED.': 1,
                 'Edificio': 2,
                 'Q': 2,
                 'Q.': 2,
                 'QUADRA': 1,
                 'Qd.': 1,
                 u'Quadra': 348,
                 'Setor': 19},
 'addr:suburb': {u'Condom\xednio': 1, 'Setor': 102},
 'alt_name': {'Avenida': 3,
              u'Condom\xednio': 3,
              'Edificio': 1,
              u'Edif\xedcio': 5,
              'Quadra': 2,
              'Setor': 11},
 'description': {'Condominio': 1, u'Condom\xednio': 1, u'Edif\xedcio': 1},
 'destination': {'Setor': 1},
 'from': {'Se

### Check "v" values patterns

##### Regular Expressions

In [24]:
postal_code_re = re.compile(r'^\d{5}-\d{3}$')
phone_re = re.compile(r'^(\+55 61 9?\d{4}-\d{4})(;\+55 61 9?\d{4}-\d{4})*$') # one or more telephone occurences with ; as delimiter
email_re = re.compile(r'^[\w.-]+@[_\w-]+\.[_\w-]+(\.[_\w-]+)*$')
site_re = re.compile(r'^https?://[_\w-]+\.[_\w-]+((\.|/)?[_\w-]+)*/?$')
opening_hours_re = re.compile(r'(^([A-Z][a-z](-[A-Z][a-z])? \d\d:\d\d-\d\d:\d\d)+(; ?([A-Z][a-z](-[A-Z][a-z])? \d\d:\d\d-\d\d:\d\d))*$)|(^24/7$)')

##### Functions

In [25]:
def match_pattern(value, pattern_re, pattern_counter):
    """Count the number of matches and not matches of a value by given regular expression"""
    
    m = pattern_re.match(value)
    if m:
        pattern_counter['match'] += 1
    else:
        pattern_counter['no match'] += 1
        print value
    return pattern_counter
    
def is_attribute(element, k_value):
    """Check if the element tag attribute "k" matches the k_value"""
    
    return (element.tag == 'tag') and (element.attrib['k'] == k_value)
    
def check_patterns(filename, k_value, pattern_re):
    """Check number of regular expression patterns occurrences on the xml file by "k" attribute value"""
    
    pattern_counter = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if is_attribute(element, k_value):
            pattern_counter = match_pattern(element.attrib['v'], pattern_re, pattern_counter)
    return pattern_counter

##### Callers

In [26]:
# look for the number of postal code correct pattern on the "k" attribute "addr:postcode"
attribute = "addr:postcode"
pattern_re = postal_code_re
print check_patterns(OSM_FILE, attribute, pattern_re)

71.961-540
71060230
72225971
71050041
71996075
71060230
72215-035‎
73100210
71215267
70716901
72910000
70390100
71060230
71060230
71060230
71060230
71060230
71060230
71060230
71060230
71060230
71060230
71060230
71060230
70.919-970
72231206
72215058
72215032
71735300
70855 520
72130095
72280184
71735300
72215963
70875510
71250005
70852520
71705521
73062507
72110789
70046900
70040906
70660045
87200110
70675426
70675427
70347090
70853530
70757040
715035-02
71503505
71060230
70740776
70670405
70658472
70.686-420
70.752-060
70.686-405
70.687-145
70.687-135
70.687-140
70.687-310
70.687-305
70.686-710
70.86-705
70.687-330
70.686-750
70.686-625
70.686-620
70.686-615
70.686-610
70.686-605
70.686-525
70.687-210
70.687-215
70.687-245
70.687-120
70.687-110
70.686-550
70.686-440
70.686-535
70.686-510
70.686-190
70.686-195
70.686-180
70.686-185
70.686-155
70.687-150
70.686-820
70.686-815
70.686-805
70.686-065
70.686-055
70.686-715
70.686-720
70.687-230
70.686-070
70.686-075
72255203
71929360
7074076

In [27]:
# look for the number of postal code correct pattern on the "k" attribute "postal_code"
attribute = "postal_code"
pattern_re = postal_code_re
print check_patterns(OSM_FILE, attribute, pattern_re)

71901300
71996075
73040135
73040133
73045170
73040134
73045155
73040137
73050182
73045152
73050185
73050187
73050186
73050184
73045173
73040132
73045151
73045169
73050196
73045153
73040136
73050176
73050163
73045174
73050173
73040138
73045154
73050166
73050177
73050164
73045171
73045172
73050178
73050194
73045167
73050193
73050174
73045172
73050175
73050181
73050166
73050190
73045171
73050162
73050183
73050161
71906500
71680348
73040131
73045175
73050191
73050192
73050195
73050189
73050188
73045168
73050165
defaultdict(<type 'int'>, {'no match': 57, 'match': 124})


In [28]:
# look for the number of telephone correct pattern on the "k" attribute "phone"
attribute = "phone"
pattern_re = phone_re
print check_patterns(OSM_FILE, attribute, pattern_re)

(61) 99309-0100
61 3367-4814
(61)3335-8776
(61)3427-1539
(61)3427-2009
+55 61 33271108
32342113
3361-0813
3399-2250
3399-2031/ 3064
(61) 33820446
(61) 3381-3444
(61) 3381-3837 ou 3573-4344
(61) 3381-3444
+55 61 33018000
+55 61 32084128
+556135510001
61 33191111
+55 61 30337707
+55 61 34271730
+55 61 30325774
613556-7622
6133840074
6132740534
6135567744
6132743007
6185761017
6133855718
6133844420
6135564669
6133851054
6184173870
6133843666
6135561005
6133842636
6139676716
6133855404
6133842267
6134842165
6135560095
6133843675
34845419
6134845876
6133849275
6135560102
6135563983
6133844574
6184125474
615564660
6139679732
6133843788
6135564030
6191060648
6199973864
6196220454 ou 84825119
92316199
6192192324
6133846494
6130416775
6130224274
6134842910
6133851770
6135569645
6130365663
6133841644
6133857759
6133844954
6132017493
6134847635
6133851900
6130377604
6133853343
6130368449
6133850888
6130410585
6135560534
6133851722
6134848985
615561083
6130337171
6133845857
6133840227
6135562525
6

61-3901-7530, 61-3447-8156
3272 2948 e 3273 9233
3214 4777
+55 61 33127000
+55 61 32223999
+55 61 3443 5500
+55 61 32429933
(61) 3426.0400
+55 61 33403747
3245-4555 3245-4611
3242 9088
3905 8632
+55 61 3368 7224
+55 61 3248 1672
+55 (61) 3427-2101
(61) 33820094
+55 61 33610404
34353471 34353457
+55 61 35749700
+556133889956
+55 61 3445 5888
3394 9349
+55 61 3340 5545
33541838
+55 61 34272579
+55 61 34272313
+55 61 33912822
+55 61 32238276
3393-6361
+556120246300
+55 61 32167500
3399-5476
3399-7186/5440
61 35614141
+55 61 31078901
61 34879000
6132076521
61 3462-8800
3395 9100 e 3395 9128
6134353002
61 3378-9200 3378-9228
+55 (61) 3107-8901
+55 61 31078901
+55 (61) 3107-8901
+55 (61) 3357-4721
+55 (61) 3427-4081
+55 (61) 3427-1512
+55 (61)3427-2052
+55 (62) 3981-1192
(61) 3432-2425
61) 3432-2108 / 3981-1149
(61) 3642-4176
(61) 3642 1483 / 3981 1150
+55 61 33690881
+55 61 32420542
3901-7556
3901-7559
3901 7568
+55 61 31032226
+55 61 32134848
+55 61 30128000
+55 61 81725233
+55 61 34454400

In [29]:
# look for the number of telephone correct pattern on the "k" attribute "alt_phone"
attribute = "alt_phone"
pattern_re = phone_re
print check_patterns(OSM_FILE, attribute, pattern_re)

+55 61 3427 4017
+55 61 3366 2660
defaultdict(<type 'int'>, {'no match': 2})


In [30]:
# look for the number of email correct pattern on the "k" attribute "email"
attribute = "email"
pattern_re = email_re
print check_patterns(OSM_FILE, attribute, pattern_re)

defaultdict(<type 'int'>, {'match': 19})


In [31]:
# look for the number of email correct pattern on the "k" attribute "contact:email"
attribute = "contact:email"
pattern_re = email_re
print check_patterns(OSM_FILE, attribute, pattern_re)

defaultdict(<type 'int'>, {'match': 2})


In [32]:
# look for the number of website correct pattern on the "k" attribute "website"
attribute = "website"
pattern_re = site_re
print check_patterns(OSM_FILE, attribute, pattern_re)

www.santander.com.br
www.felts.com.br
www.anac.gov.br
viaesteticabeleza.com.br
www.photomidia.com.br
www.zascar.com.br
www.jjempilhadeiras.com.br
facebook.com/restaurantemanati
www.abrapa.com.br
www.pastelariarossoni.com
www.estudiomappa.com
www.planejamento.gov.br
www.planejamento.gov.br
www.superadega.com.br
nacaoaventureira.com.br
www.felicittashopping.com.br
www.cartoriocolorado.com.br
www.cartoriodetaguatinga.com.br
www.adeb.com.br
www.icesamambaia.org.br
defaultdict(<type 'int'>, {'no match': 20, 'match': 346})


In [33]:
# look for the number of opening hours correct pattern on the "k" attribute "opening_hours"
attribute = "opening_hours"
pattern_re = opening_hours_re
print check_patterns(OSM_FILE, attribute, pattern_re)

06:00-18:00
06:00-20:00
18:00-23:00
07:00 - 19:30
Mo-Sa 12:00-15:00,19:00-23:00; Su 12:00-17:00
08:00-17:30
07:30-18:00
08:00-18:00
Mo-Sa 11:00-15:30;17:00-20:00
12:00-15:00
Mo-Sa 08:00-22:00, Su 08:00-18:00
Mo-Fr 07:00:21:00; Sa-Do 07:00-20:00
17:30-23:30
09:00-18:00
11:00 - 01:00
Mo-Sa 11:30-15:30, 18:30-23:30
08:00-18:00
11:00-15:00
Mo-Su 08:0-18:00
07:00-19:00
Th 10:00-22:00; Su, Tu-We, Fr 19:00-19:30, 21:00-21:30
Mo-Fr 06:00-23:00; Sa 08:00-14:00,16:00-20:00; Su 09:00-13:00
10:00-22:00
Mo-Su 11:00-14:00,18:00-22:00
08:00-18:00
07:30 - 21:30
08:00-18:00
08:00-18:00
08:30-22:30
16:00-23:30
09:00-17:00
Su-Mo, Fr 19:00-21:00
09:00-17:00
08:00 ate´17:30
mo-mo 16:20 22:00
Mo-Fr 7:30-23:00;Sa 8:00-14:00
Seg a Sex de 11:00 às 14:00
06:00-22:00
09:00-19:00
13:00-19:00
08:00-18:00
07:00-13:30;17:30-22:30
07:30-18:30
08:00-22:00
07:00-23:00
08:00 ATÉ 18:00
Seg à Sex de 08:00 às 20:00
defaultdict(<type 'int'>, {'no match': 47, 'match': 256})


### Find how many unique users have contributed to the map
##### Functions
_from lesson "Case Study: OpenStreetMap Data", "7. Quiz: Investigando Usuários". Adapted_

In [34]:
def get_attribute(element, tag_attrib):
    """Get attribute value from ET element"""
    
    return element.attrib[tag_attrib]


def get_unique_attribute(filename, tag_attrib):
    """Get a set of unique tag attribute value"""
    
    unique_attrib = set()
    for _, element in ET.iterparse(filename):
        if tag_attrib in element.attrib:
            unique_attrib.add(get_attribute(element, tag_attrib))

    return unique_attrib

##### Callers

In [35]:
# get the user id and the number of unique contribute users on the OSM xml file
user_attrib = 'uid'
users = get_unique_attribute(OSM_FILE, user_attrib)
print "unique contributers: ", len(users)
pprint.pprint(users)

unique contributers:  730
set(['100064',
     '1010353',
     '1016290',
     '1020923',
     '102648',
     '1031962',
     '103253',
     '103464',
     '1039852',
     '1069176',
     '107257',
     '108084',
     '109705',
     '1113277',
     '11374',
     '1138855',
     '115579',
     '118021',
     '1193351',
     '1193851',
     '1196357',
     '1202866',
     '1204081',
     '1206082',
     '1206540',
     '1208237',
     '1208664',
     '1211056',
     '121406',
     '1217944',
     '1218134',
     '1227959',
     '12293',
     '1231595',
     '1240849',
     '124232',
     '12448',
     '1249085',
     '1249205',
     '1267083',
     '128186',
     '12910',
     '12966',
     '130065',
     '130472',
     '1305259',
     '1306',
     '1310899',
     '13203',
     '133003',
     '13413',
     '1342842',
     '1342943',
     '1351096',
     '1352904',
     '1363315',
     '1373505',
     '1393857',
     '1420318',
     '142197',
     '1426385',
     '1434429',
     '144314',


### Fix "k" values

##### Functions

In [36]:
def replace_value(element, new_value, attribute):
    """Replace element attribute value"""
    
    element.attrib[attribute] = new_value
    return element
            
def fix_osm_k(filename, new_filename, k_mapping, attribute="k", tags=('node', 'way', 'relation')):
    """Update "k" attribute values on a new OSM file"""
    
    with open(new_filename, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-08-10T15:02:03Z">\n  ')

        for _, element in ET.iterparse(filename, events=("end",)):
            if k_mapping["addr:postcode"] in element.attrib.values():
                element = replace_value(element, "addr:postcode", attribute)
            elif k_mapping["email"] in element.attrib.values():
                element = replace_value(element, "email", attribute)
            elif k_mapping["addr:street"] in element.attrib.values():
                element = replace_value(element, "addr:street", attribute)
            
            if element.tag in tags:
                output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

##### Callers

In [37]:
# Fix OSM k values on a new file
k_mapping = {"addr:postcode" : "postal_code", "email" : "contact:email", "addr:street" : u"endere\xe7o"}
new_filename = "federal_district_fixed_k.osm"
fix_osm_k(OSM_FILE, new_filename, k_mapping)

##### Check the result

In [38]:
# Check if the old "k" values were updated
for _, element in ET.iterparse(new_filename):
    if "k" in element.attrib:
        if element.attrib["k"] in ("postal_code", "contact:email", u"endere\xe7o"):
            print "old k found: ", element.attrib["k"]

### Fix "v" values
##### Regular Expressions

In [39]:
phone_fix_re = re.compile(r'(\+?55? ?\(?61?\)? ?)?(9?\d{4}[-| |\.]*\d{4})')

##### Functions

In [40]:
def fix_postcode(postcode):
    """Apply pattern to the postcode"""
    
    postcode = re.sub(r'[^\d]', '', postcode)
    if len(postcode) == 8:
        postcode = postcode[:5] + '-' + postcode[5:]
    return postcode

def fix_phone(phones):
    """Apply default string to the phone"""
    
    new_phones = ""
    for i, phone in enumerate(phones):
        part_phone = re.sub(r'[^\d]', '', phone[1])
        if part_phone[:4] != "0800": 
            if len(part_phone) == 9:
                part_phone = part_phone[:5] + '-' + part_phone[5:]
            else:
                part_phone = part_phone[:4] + '-' + part_phone[4:]
        
        new_phone = '+55 61 ' + part_phone
        if i == 0:
            new_phones = new_phone
        else:
            new_phones = new_phones + ";" + new_phone        
    return new_phones

def fix_website(website):
    """Apply 'http://www.' to the website string"""
    
    website = re.sub(r'(^(https?://)?(www\.)?)', '', website)
    return 'http://www.' + website
  
def apply_re_search(value, re_test):
    """Check if the regular expression is found on the string"""
    
    m = re_test.search(value)
    if m:
        return True
    else:
        return False
    
           
def fix_osm_v(filename, new_filename, attribute="v", tags=('node', 'way', 'relation')):
    """Update "v" attribute values on a new OSM file"""
    
    with open(new_filename, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-08-10T15:02:03Z">\n  ')

        for _, element in ET.iterparse(filename, events=("end",)):
            if "addr:postcode" in element.attrib.values():
                if not apply_re_search(element.attrib["v"], postal_code_re):
                    element.attrib["v"] = fix_postcode(element.attrib["v"])
            elif "phone" in element.attrib.values():
                m = phone_fix_re.findall(element.attrib["v"])
                if m:
                    element.attrib["v"] = fix_phone(m)
            elif "alt_phone" in element.attrib.values():
                m = phone_fix_re.findall(element.attrib["v"])
                if m:
                    element.attrib["v"] = fix_phone(m)        
            elif "website" in element.attrib.values():
                if not apply_re_search(element.attrib["v"], site_re):
                    element.attrib["v"] = fix_website(element.attrib["v"])
            
            if element.tag in tags:
                output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

##### Callers

In [41]:
# Fix OSM v values on a new file
old_filename = "federal_district_fixed_k.osm"
new_filename = "federal_district_fixed_k_v.osm"
fix_osm_v(old_filename, new_filename)

##### Check the result

In [42]:
# look for the number of postal code correct pattern on the "k" attribute "addr:postcode"
attribute = "addr:postcode"
pattern_re = postal_code_re
print check_patterns(new_filename, attribute, pattern_re)

7086705
73088
72231
defaultdict(<type 'int'>, {'no match': 3, 'match': 928})


In [43]:
# look for the number of telephone correct pattern on the "k" attribute "phone"
attribute = "phone"
pattern_re = phone_re
print check_patterns(new_filename, attribute, pattern_re)

+55 61 08006120
defaultdict(<type 'int'>, {'no match': 1, 'match': 761})


In [44]:
# look for the number of telephone correct pattern on the "k" attribute "phone"
attribute = "alt_phone"
pattern_re = phone_re
print check_patterns(new_filename, attribute, pattern_re)

defaultdict(<type 'int'>, {'match': 2})


In [45]:
# look for the number of website correct pattern on the "k" attribute "website"
attribute = "website"
pattern_re = site_re
print check_patterns(new_filename, attribute, pattern_re)

defaultdict(<type 'int'>, {'match': 366})


### Fix Addresses
##### Constants

In [46]:
expected = ['avenida', 'rua', 'quadra', u'condomínio', u'edifício', 'setor', 'companhia']

street_mapping = { 'avenida': 'Avenida', 
                'av': 'Avenida', 
                'av.': 'Avenida',
                'av:': 'Avenida', 
                'rua': 'Rua',
                'r': 'Rua',
                'r.': 'Rua',
                'r:': 'Rua',
                'quadra': 'Quadra',
                'q': 'Quadra',
                'q.': 'Quadra',
                'q:': 'Quadra',
                'qd': 'Quadra',
                'qd.': 'Quadra',
                'qd:': 'Quadra',
                'condominio': u'Condomínio',
                u'condomínio': u'Condomínio',   
                'cond': u'Condomínio',
                'cond.': u'Condomínio',
                'cond:': u'Condomínio',
                'condominio': u'Condomínio',
                'edificio': u'Edifício',
                u'edifício': u'Edifício',
                'ed': u'Edifício',
                'ed.': u'Edifício',
                'ed:': u'Edifício',
                'edificio': u'Edifício',
                'setor': 'Setor',
                'st': 'Setor',
                'st.': 'Setor',
                'st:': 'Setor',
                'companhia': 'Companhia',
                'cia': 'Companhia',
                'cia.': 'Companhia',
                'cia:': 'Companhia'
              }

##### Functions
_from lesson "Case Study: OpenStreetMap Data", "10. Quiz: Melhorando os Nomes de Ruas". Adapted_

In [47]:
def is_street_name(element):
    """Return true if tag attribute "k" is equal to one of street names attributes"""
    
    return (element.attrib["k"] == "addr:housenumber") or (element.attrib["k"] == "addr:street") \
            or (element.attrib["k"] == "name") or (element.attrib["k"] == "addr:housename") \
            or (element.attrib["k"] == "alt_name")

def update_name(string, name):
    """Replace first word on a string"""
    
    return re.sub(r'^\S+\.?', name, string)

def fix_osm_street(filename, new_filename,tags=('node', 'way', 'relation')):
    """Update street names abbreviations on a new OSM file"""
    
    with open(new_filename, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-08-10T15:02:03Z">\n  ')

        for _, element in ET.iterparse(filename, events=("end",)):
            if is_tag(element):
                if is_street_name(element):
                    street_name = get_street_name(element.attrib["v"]).lower()
                    if street_name in street_mapping:
                        element.attrib["v"] = update_name(element.attrib["v"], street_mapping[street_name])
            
            if element.tag in tags:
                output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

##### Callers

In [48]:
# Fix OSM street names on a new file
old_filename = "federal_district_fixed_k_v.osm"
new_filename = "federal_district_fixed.osm"
fix_osm_street(old_filename, new_filename)

##### Check the result

In [49]:
# get the "v" attribute matches occurrences on street abbreviation list by "k" attribute
street_abbr = ['avenida', 'av', 'av.', 'av:', 'rua, ''r', 'r.', 'r:', 'quadra', 'q', 'q.', 'qd', 'qd.', 'qd:', 'condominio', u'condomínio', 'cond', 'cond.', 'cond:', 'edificio', u'edifício', 'ed', 'ed.', 'ed:', 'setor', 'st', 'st.', 'st:', 'companhia', 'cia', 'cia.', 'cia:']
pprint.pprint(dict(audit_street(new_filename, street_abbr)))

{'addr:full': {u'Avenida': 2},
 'addr:housename': {u'Condom\xednio': 1, u'Edif\xedcio': 52, 'Quadra': 2},
 'addr:housenumber': {u'Edif\xedcio': 1, 'Quadra': 8},
 'addr:place': {u'Avenida': 1, 'Setor': 10},
 'addr:street': {'Avenida': 282,
                 u'Condom\xednio': 66,
                 u'Edif\xedcio': 3,
                 u'Quadra': 354,
                 'Setor': 19},
 'addr:suburb': {u'Condom\xednio': 1, 'Setor': 102},
 'alt_name': {'Avenida': 3,
              u'Condom\xednio': 3,
              u'Edif\xedcio': 6,
              'Quadra': 2,
              'Setor': 11},
 'description': {'Condominio': 1, u'Condom\xednio': 1, u'Edif\xedcio': 1},
 'destination': {'Setor': 1},
 'from': {'Setor': 2},
 'leisure': {'quadra': 1},
 'name': {'Avenida': 1102,
          'Companhia': 15,
          u'Condom\xednio': 219,
          u'Edif\xedcio': 258,
          'Quadra': 1132,
          'Setor': 315},
 'name:pt': {u'Condom\xednio': 4, 'Ed': 1, u'Edif\xedcio': 1, 'Quadra': 1},
 'note': {'Ed.': 8

### Check postal code consistency

##### Functions

In [50]:
def check_postalcode(postalcode):
    """Check if the postal code is within DF range (70000 to 72799 and 73000 to 73699)"""
    
    postalcode = postalcode.split("-")
    postalcode = int(postalcode[0])
    if not ((postalcode >= 70000 and postalcode <= 72799) or \
        (postalcode >= 73000 and postalcode <= 73699)):
            return postalcode
    
def is_postal(element):
    """Return if tag attribute 'k' is 'addr:postcode'"""
    
    return (element.tag == "tag") and (element.attrib["k"] == "addr:postcode")

def check_consistency(filename):
    """Check postal code consistency"""
    
    postal_code_list = []
    for _, element in ET.iterparse(filename):
        if is_postal(element):
            if check_postalcode(element.attrib["v"]):
                postal_code_list.append(check_postalcode(element.attrib["v"]))
    return postal_code_list

##### Callers

In [51]:
# Assert postal codes within Federal District postal code range
postal_code_consistency = check_consistency(new_filename)
print len(postal_code_consistency)
print postal_code_consistency

57
[72880, 72880, 72876, 72876, 72876, 72876, 72910, 72910, 87200, 73803, 73802, 73803, 73802, 73803, 73803, 73803, 73803, 73803, 72878, 7086705, 73803, 73803, 73803, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73809, 73803, 73803, 73803, 73801, 72870, 73803, 72900, 73803, 73803, 73803, 73803, 73803, 73803, 72878, 72870, 72870, 72878]


### Write csv file
##### Functions
_from lesson "Case Study: OpenStreetMap Data", "11. Quiz: Melhorando os Nomes de Ruas". Adapted_

##### Constants

In [52]:
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [53]:
SCHEMA = schema.schema

In [54]:
def get_key(string):
    """Get k attribute value separated by the first ':' """
    
    k = string.split(":", 1)
    try:
        k_val = k[1]
        type_val = k[0]
    except:
        k_val = k[0]
        type_val = 'regular'
        
    return k_val, type_val

def shape_tags(element, element_id):
    """Shape "tag" XML tag to Python dict"""
    
    element_dict = {}
    k, type_val = get_key(element.attrib['k'])
    
    element_dict['id'] = int(element_id)
    element_dict['key'] = k
    element_dict['type'] = type_val
    element_dict['value'] = element.attrib['v']
    
    return element_dict


def format_element(element, element_dict, fields):
    """Shape node or way XML element to Python dict"""
    
    for key in element.attrib:
        if key in fields:
            if SCHEMA[element.tag]['schema'][key]['type'] == 'integer':
                element_dict[key] = int(element.attrib[key])
            elif SCHEMA[element.tag]['schema'][key]['type'] == 'float':
                element_dict[key] = float(element.attrib[key])
            elif SCHEMA[element.tag]['schema'][key]['type'] == 'string':
                element_dict[key] = element.attrib[key]
                
    return element_dict

def shape_nd(element, element_id, position):
    """Shape nd XML tag to Python dict"""
    
    element_dict = {}
    
    element_dict['id'] = int(element_id)
    element_dict['node_id'] = int(element.attrib['ref'])
    element_dict['position'] = position
    
    return element_dict

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS):
    """Shape xml elements to list of dictionaries""" 
    
    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []

    if element.tag == 'node':
        node_attribs = format_element(element, node_attribs, node_attr_fields)
         
        for tag in element:
            if tag.tag == 'tag':
                tags.append(shape_tags(tag, element.attrib['id']))
        
        return {'node': node_attribs, 'node_tags': tags}
    
    elif element.tag == 'way':
        way_attribs = format_element(element, way_attribs, way_attr_fields)
                    
        position = 0
        for tag in element:
            if tag.tag == 'tag':
                tags.append(shape_tags(tag, element.attrib['id']))
                
            if tag.tag == 'nd':
                way_nodes.append(shape_nd(tag, element.attrib['id'], position))                
                position += 1
        
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)

    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
        codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
        codecs.open(WAYS_PATH, 'w') as ways_file, \
        codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
        codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        #nodes_writer.writeheader()
        #node_tags_writer.writeheader()
        #ways_writer.writeheader()
        #way_nodes_writer.writeheader()
        #way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    if el['node_tags']:
                        node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    if el['way_nodes']:
                        way_nodes_writer.writerows(el['way_nodes'])
                    if el['way_tags']:
                        way_tags_writer.writerows(el['way_tags'])

##### Callers

In [55]:
OSM_PATH = "federal_district_fixed.osm"
process_map(OSM_PATH, validate=False)

### Create tables
##### Script

In [56]:
QUERY = """CREATE TABLE nodes (
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE nodes_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id)
);

CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE ways_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id)
);

CREATE TABLE ways_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id)
);"""

##### Execution

In [57]:
db = sqlite3.connect("DF_OSM2.db")
c = db.cursor()

# all SQL commands (split on ';')
sqlCommands = QUERY.split(';')

# Execute every command from the input file
for command in sqlCommands:
    try:
        c.execute(command)
    except OperationalError, msg:
        print "Command skipped: ", msg

db.commit()
db.close()

### Import csvs on DF_OSM.db
_Procedure executed on sqlite3 terminal_

In [None]:
sqlite> .mode csv
sqlite> .import nodes.csv nodes
sqlite> .import nodes_tags.csv nodes_tags
sqlite> .import ways.csv ways
sqlite> .import ways_tags.csv ways_tags
sqlite> .import ways_nodes.csv ways_nodes