In [1]:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import re

In [3]:
filename = 'Juiz de Fora Region.osm'

def get_root(filename):
    tree = ET.parse(filename)
    return tree.getroot()

root = get_root(filename)

In [3]:
def get_users(root):
    'Return the set of unique users who contributed to this data set'
    
    users = set()
    
    element_types = ['node', 'way', 'relation']
    
    for element_type in element_types:
        for element in root.findall(element_type):
            users.add(element.get('user'))
    
    return users

In [4]:
def get_attrib_values_set(xpath, elem):
    """Look for xpath in elem and add all corresponding values to a set.
    Expects elements found on xpath to have a 'v' attribute."""
    
    return_set = set()
    
    for element in root.findall(xpath):
        return_set.add(element.get('v'))

    return return_set

In [42]:
def get_street_names_in_way(elem):
    """Look for 'way' elements that are likely streets and return a set of their names."""

    names = set()
    
    for way in elem.findall('.//way'):
        if way.find('.//tag[@k="highway"]') is not None:
            name_elem = way.find('.//tag[@k="name"]')
            if name_elem is not None:
                names.add(name_elem.get('v'))
    
    return names

In [7]:
def get_street_types_from_postal():
    """Access the Brazilian Postal Services website and extract official approved
    street name types (e.g. street, avenue etc.)"""
    types = []
    
    URL = 'http://www.buscacep.correios.com.br/sistemas/buscacep/buscaCep.cfm'
    
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    select = soup.find(name='select', attrs={'name':'Tipo'})
    for option in select.find_all(name='option', string=True):
        types.append(option['value'])
    
    return types

In [8]:
official_street_types = get_street_types_from_postal()

* Some street names do not begin with indication "Rua", "Avenida" etc.
* Some streets are encoded as "nodes" and some as "ways"

In [43]:
sorted_way_names = sorted(way_names)

NameError: name 'way_names' is not defined

In [10]:
# Check which collected names don't begin with an official street type

names_to_check = set()
complete_names = addrst_names.union(way_street_names)

for name in complete_names:
    first_word = name.split(' ')[0]
    if first_word not in official_street_types:
        names_to_check.add(name)

In [11]:
for n in sorted(names_to_check):
    print n

1a Travessa Vicente Chagas
2a Travessa Vicente Chagas
AMG-1625
AMG-3005
AMG-3010
AMG-3020
AMG-3030
AMG-3055
AMG-3060
AMG-3060 / Avenida Juiz de Fora
AMG-3060 / Praça Pio XII
AMG-3060 / Rua Constância de Castro
AMG-3060 / Rua Doutor Augusto Gonçalves
AMG-3060 / Rua Doutor Dilermano Cruz
AMG-3060 / Rua Marciano Loures
AMG-3060 / Rua Silva Jardim
AMG-525
AVENIDA DIAS PAES
Acceso Expominas
Acesso
Acesso AMG-3015
Acesso Avenida Itamar Franco
Acesso BR 116
Acesso BR-116
Acesso BR-267
Acesso Bretas e Posto
Acesso Fazenda Guinilha e Mato Dentro
Acesso Igreja da Glória
Acesso MG-126
Acesso Norte
Acesso Parque de Exposições
Acesso ROTA S.A
Acesso Rodoviária de Juiz de Fora - Miguel Mansur
Acesso Rua Senador Salgado Filho
Acesso Saída
Acesso Sesi
Acesso Votorantin Metais
Acesso ao CAS
Acesso ao Cemitério
Acesso ao Cemitério Municipal
Acesso ao Cemitério Parque da Saudade
Acesso ao Cemitério de São Pedro
Acesso ao HPS - Hospital de Pronto Atendimento
Acesso ao Hospital Militar
Acesso ao IFET - Ins

In [25]:
def clean_name_from_postal(name, city):
    """Access Brazilian Postal Services and check whether the given name
    is a street name. If found, we update the name with its correct complete version.
    Else, return the unchanged name."""
    
    # Prepare name for form
    prepped_name = name.encode('ISO-8859-1')
    
    URL = 'http://www.buscacep.correios.com.br/sistemas/buscacep/resultadoBuscaCep.cfm'
    post_data = {
        'UF': 'MG',
        'Localidade': city,
        'Logradouro': prepped_name
    }
    
    SUCCESS = 'DADOS ENCONTRADOS COM SUCESSO.'
    FAILURE = 'LOGRADOURO NAO ENCONTRADO.'
    
    r = requests.post(URL, post_data)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    result_message = soup.find('div', class_='ctrlcontent').find('p').string
    
    if result_message == SUCCESS:
        table = soup.find(name='table', class_='tmptabela')
        name_found = table.find('td').string   # The first 'td' has what we're looking for
    else:
        return name
    
    # Some street names have notes after '-'. Remove them
    name_found = name_found.split('-')[0]
    
    return name_found.strip()

In [13]:
def get_keys_from_street_names(root):
    """Search all elements and find ones whose underlying 'name' tag begins with
    one of the official street types. Then, return a set of all keys in the
    corresponding element."""

    k = set()
    
    for elem in root.findall('.//'):
        if elem.find('.//tag[@k="name"]') is not None:   # has a 'name' tag
            for under_elem in elem.findall('.//tag[@k]'):
                k.add(under_elem.get('k'))
    
    return k

In [14]:
keys_from_st_names = get_keys_from_street_names(root)

In [5]:
def get_all_tag_kv(root):
    """Get all keys and values from 'tag' elements and return them in a dictionary.
    Each value in the dict is a set containing all values found for that key."""

    dic = {}
    
    for elem in root.findall('.//tag'):
        k, v = elem.get('k'), elem.get('v')
        if k not in dic.keys():
            dic[k] = set()
        dic[k].add(v)
    
    return dic

In [6]:
all_kv = get_all_tag_kv(root)

In [18]:
sorted(all_kv.keys())

['4wd_only',
 'Correios',
 'FIXME',
 'GEOCODIG_D',
 'GEOCODIG_M',
 'IBGE:BACIA',
 'IBGE:CD_ADMINIS',
 'IBGE:GEOCODIGO',
 'IBGE:NASCENTE',
 'IGAM:BACIA',
 'ISO3166-2',
 'LAYER',
 'NOME',
 'PREFEITURA',
 'abandoned',
 'access',
 'addr:city',
 'addr:country',
 'addr:floor',
 'addr:housename',
 'addr:housenumber',
 'addr:postcode',
 'addr:street',
 'addr:suburb',
 'admin_level',
 'aeroway',
 'alt_name',
 'alt_ref',
 'amenity',
 'area',
 'artwork_type',
 'atm',
 'barrier',
 'bench',
 'bicycle',
 'boat',
 'border_type',
 'boundary',
 'brand',
 'bridge',
 'bridge:structure',
 'building',
 'building:levels',
 'bus',
 'cables',
 'capacity',
 'census',
 'collection_times',
 'construction',
 'contact:email',
 'contact:phone',
 'covered',
 'craft',
 'created_by',
 'crop',
 'crossing',
 'crossing:bell',
 'crossing:light',
 'crossing_ref',
 'cuisine',
 'cutting',
 'cycleway',
 'delivery',
 'denomination',
 'description',
 'description:pt',
 'destination',
 'destination:forward',
 'destination:lanes'

In [19]:
def clean_CEP(cep):
    """Correct formatting of the provided CEP (postal code)."""
    
    #correct_cep = re.compile('\d{5}-\d{3}')
    missing_dash = re.compile('\d{8}')
    extra_period = re.compile('\d{2}\.\d{3}-\d{3}')
    if missing_dash.search(cep):
        cep = cep[:5] + '-' + cep[-3:]
    elif extra_period.search(cep):
        cep = cep[:2] + cep[3:]
    
    return cep

In [20]:
addr_keys = [k for k in all_kv.keys() if k.startswith('addr:')]
addr_keys

['addr:street',
 'addr:housenumber',
 'addr:housename',
 'addr:floor',
 'addr:suburb',
 'addr:postcode',
 'addr:country',
 'addr:city']

In [91]:
cities = sorted([v for v in all_kv['addr:city']])
cities

['Anta',
 'Anta (Distrito de Sapucaia)',
 'Anta (distrito de Sapucaia)',
 'Barbacena',
 u'Barra de S\xe3o Francisco (distrito)',
 'Barroso',
 'Bom Jardim de Minas',
 u'Cana\xe3',
 'Cantagalo',
 'Chiador',
 'Chiador MG',
 'Coimbra',
 'Congonhas',
 'Conselheiro Lafaiete',
 'Cordeiro',
 'Coronel Xavier Chaves',
 u'Divin\xe9sia',
 'Dores de Campos',
 'Entre Rios de Minas',
 u'Erv\xe1lia',
 u'Er\xe1lia',
 'Guiricema',
 'Itaocara',
 'Itaverava',
 'Juiz de Fora',
 'Juiz de Fora-MG',
 'Levy Gasparian',
 'Lobo Leite',
 'Macuco',
 'Mar de Espanha',
 'Matias Barbosa',
 'Nossa Senhora da Aparecida',
 'Nossa Senhora de Aparecida',
 'Ouro Branco',
 u'Paula C\xe2ndido',
 'Rio Pomba',
 'Santos Dumont',
 'Sapucaia',
 'Sarandi',
 u'Silveir\xe2nia',
 'Sumidouro',
 'Sumidouro RJ',
 u'S\xe3o Jo\xe3o Nepomuceno',
 u'S\xe3o Jo\xe3o del rei',
 u'S\xe3o Jo\xe3o del-Rei',
 u'S\xe3o Sebasti\xe3o do Alto',
 'Tocantins',
 u'Tr\xeas Rios',
 u'Ub\xe1',
 'Vicosa',
 'Visconde do Rio Branco',
 u'Vi\xe7osa',
 u'Vi\xe7os

In [22]:
countries = sorted([v for v in all_kv['addr:country']])
countries

['BR', 'Brasil']

In [23]:
suburbs = sorted([v for v in all_kv['addr:suburb']])
suburbs

[u'Amaz\xf4nia',
 u'Ang\xe9lica',
 'Barbosa Lage',
 'Barreira',
 'Batatal',
 'Bela Vista',
 'Boa Vista',
 'Campus UFV',
 'Cascatinha',
 'Caxias',
 'Centro',
 'Cidade do Sol',
 'Copasa',
 'Democrata',
 'Distrito Industrial',
 'Dom Oscar',
 u'Esta\xe7\xe3o',
 u'Euclidel\xe2ndia',
 'Francisco Bernardino',
 u'Jardim Am\xe9rica',
 u'J\xf3quei Clube',
 'Linhares',
 'Lourdes',
 u'Louri\xe7al',
 'Maravilha',
 u'Mariano Proc\xf3pio',
 'Martelos',
 u'Monte L\xedbano',
 u'Morro da Gl\xf3ria',
 u'Nossa Senhora de F\xe1tima',
 'Nova Era',
 'Nova Macuco',
 'Novo Horizonte',
 u'Para\xedso',
 u'Parque de Exposi\xe7\xf5es Edgar  Rodrigues Lutterbach',
 u'Parque de Exposi\xe7\xf5es Edgar Rodrigues Lutterbach',
 'Passos',
 'Reta',
 'Santa Helena',
 'Santa Irene',
 'Santa Teresa',
 'Santa Terezinha',
 'Santos Anjos',
 u'Sumar\xe9',
 u'S\xe3o Bernardo',
 u'S\xe3o Mateus',
 u'S\xe3o Pedro',
 u'S\xe3o Sebasti\xe3o',
 'Teixeiras',
 u'Turv\xe3o',
 'Vitorino Braga',
 'Volta da Ferradura',
 'Volta do Umbigo',
 '

In [7]:
street = sorted([v for v in all_kv['addr:street']])
for x in street:
    print x

Alameda Engenheiro Gentil Forn
Alameda Salvaterra
Av. PH Rolfs s/n
Avenida Astolfo Dutra
Avenida Barão do Rio Branco
Avenida Bernardes Filho
Avenida Bias Fortes
Avenida Brasil
Avenida Cardoso Saraiva
Avenida Carlos Alves
Avenida Cucui
Avenida Deusdedith Salgado
Avenida Djalma Beda Coube
Avenida Dom Orione
Avenida Dom Pedro II
Avenida Dom Silverio
Avenida Doutor Carlos Soares
Avenida Doutor Paulo Japiassu Coelho
Avenida Ernesto Lopes
Avenida Garibaldi Campinhos
Avenida General Atratino Cortes Coutinho
Avenida Getúlio Vargas
Avenida Governador Roberto Silveira
Avenida Governador Valadares
Avenida Itamar Franco
Avenida José Carlos Boareto
Avenida José Malaquias
Avenida José Maria dos Santos
Avenida Juiz de Fora
Avenida Macuco
Avenida Minas Gerais
Avenida P. H. Rolfs
Avenida Peter Henry Rolfs
Avenida Peter Henry Rolfs, s/nº - Campus Universitário
Avenida Prefeito Alberto da Silva Lavinas
Avenida Prefeito Mário Rodrigues Pereira
Avenida Pres. Costa e Silva
Avenida Presidente Costa e Silva
A

In [25]:
for k in sorted(all_kv.keys()):
    for v in all_kv[k]:
        print k+': '+v

4wd_only: recommended
Correios: CEE JFA
FIXME: Precisa verificar se este prédio inteiro realmente é um posto de combustível
FIXME: Arrumar traçado via survey
FIXME: Arrumar survey
FIXME: Esta via precisa ser realinhada.
GEOCODIG_D: 313800510
GEOCODIG_D: 312040920
GEOCODIG_D: 312900415
GEOCODIG_D: 310250610
GEOCODIG_D: 315790610
GEOCODIG_D: 312900410
GEOCODIG_D: 316920810
GEOCODIG_D: 313390710
GEOCODIG_D: 313550617
GEOCODIG_D: 310370210
GEOCODIG_D: 314390620
GEOCODIG_D: 311800710
GEOCODIG_D: 310460110
GEOCODIG_D: 315520715
GEOCODIG_D: 311800715
GEOCODIG_D: 315080215
GEOCODIG_D: 310210015
GEOCODIG_D: 316920807
GEOCODIG_D: 312595210
GEOCODIG_D: 314390610
GEOCODIG_D: 316250015
GEOCODIG_D: 311830410
GEOCODIG_D: 311020210
GEOCODIG_D: 314390630
GEOCODIG_D: 312140710
GEOCODIG_D: 317130315
GEOCODIG_D: 310210010
GEOCODIG_D: 317130310
GEOCODIG_D: 313550620
GEOCODIG_D: 312200910
GEOCODIG_D: 314390615
GEOCODIG_D: 310550910
GEOCODIG_D: 316990115
GEOCODIG_D: 313540715
GEOCODIG_D: 310550915
GEOCODIG_D

In [26]:
elev = [v for v in all_kv['ele']]

In [27]:
def clean_elev(elev):
    """Standardize formatting for elevation values by removing measurement units."""
    
    elev_re = re.compile('\d+')
    return elev_re.search(elev).group()

In [65]:
def remove_extra_spaces(string):
    return re.sub(' {2,}', ' ', string)

In [66]:
def clean_suburb(sub):
    return remove_extra_spaces(sub.capitalize())

In [30]:
num = sorted([v for v in all_kv['addr:housenumber']])
for x in num:
    print x

0
01/sala 404
05
10
100
1000
101
1014
102
106
1067
108
11
110
1112
1115
112
115
119
120
1229
1276
1283
129
1301
138
140-166
148
15
150
1522
1531
155
160
1600
165
1655
167
170
170-224
171
173
175
176
18
180
1805
181
184
1847
185
1850
187
19
190
195
197
2
2.587
20
200
2000
2001
204
2067
2102
219
223
229
23
230
2305
2406
241
245
247
2499
25
250
251
2510
2519
2555
261
2691
2725
273
28
286
291
3
300
310
311
330
333
3353
3408
349
35
352
355
356
36
3600
375
376
3760
3820
384
389
40
4001
401
411
415
42
42/54
45
450
455
461
48
49
490
496
5
500
5000
506
520
526
53
5300
540
558
581
6
60
600
610
635
649
677
681
70
700
701
710
740
760
765
777
78
79
790
8
810
82
821
828
844
860
876
88
881
892
908
924
95
99
Km 1
S/N
km 126
km 159
km 7
s/ nº
s/n


In [31]:
def clean_house_number(num):
    """Standardize format for house numbers."""
    
    if re.search('[Ss]/.*[Nn]', num):    # "sem número" (has no house number)
        num = 's/n'
    elif re.search('km', num):    # change km to uppercase
        num = num.capitalize()
    else:
        num = num.replace('.', '')    # remove periods in numbers
    
    return num

In [32]:
def clean_opening_hours(hours):
    """Standardize format for opening hours."""

    one_digit_hour_re = re.compile('(?<!\d)(\d{1})(:\d{2})')
    two_hours_re = re.compile(u'(\d{2}:\d{2})[aàsátée \-]+(\d{2}:\d{2})[Hh]?')
    
    def add_leading_zero(match_obj):
        return '0'+match_obj.group(1)+match_obj.group(2)
    
    def standardize_with_dash(match_obj):
        return match_obj.group(1)+'-'+match_obj.group(2)
        
    hours = one_digit_hour_re.sub(add_leading_zero, hours)
    hours = two_hours_re.sub(standardize_with_dash, hours)
    
    return hours

In [212]:
def clean_phone(phone_number):
    """Standardize format for phone numbers: +XXXXXXXXXXXX.
    Returns either a list of cleaned phone numbers or None
    if phone number is invalid."""
    
    phones = phone_number.split(';')
    cleaned_phones = []
    
    for phone in phones:
    
        if phone.isalpha():
            return None
        else:
            # Remove some unwanted characters
            phone = re.sub('[ \.\-()]', '', phone)

            if phone.startswith('0') and not phone.startswith('0800'):
                phone = phone[1:]

            if phone.startswith('55'):
                phone = '+' + phone
            elif re.search('^\d{10,11}$', phone):
                phone = '+55' + phone

            cleaned_phones.append(phone)
    
    return cleaned_phones

In [42]:
def is_phone_ok(phone):
    format_re = re.compile('\+55\d{10,11}')
    if format_re.search(phone) is not None:
        return True
    else:
        return False




In [57]:
cities = sorted([v for v in all_kv['addr:city']])
for x in cities:
    if x != clean_city(x):
        print x, "/", clean_city(x)

Anta (Distrito de Sapucaia) / Anta
Anta (distrito de Sapucaia) / Anta
Barra de São Francisco (distrito) / Barra de São Francisco
Chiador MG / Chiador
Erália / Ervália
Juiz de Fora-MG / Juiz de Fora
Sumidouro RJ / Sumidouro
São João del rei / São João del-Rei
Vicosa / Viçosa
Viçosa - MG / Viçosa
juiz de Fora / Juiz de Fora


In [215]:
def compare_clean_and_unclean(fieldname, clean_function, all_kv):
    values = sorted([v for v in all_kv[fieldname]])
    for value in values:
        cleaned_value = clean_function(value)
        if value != cleaned_value:
            print value, "/", cleaned_value

In [217]:
compare_clean_and_unclean('addr:postcode', clean_CEP)

28500000 / 28500-000
28540000 / 28540-000
28545000 / 28545-000
28550000 / 28550-000
28570000 / 28570-000
35490000 / 35490-000
36.205-276 / 36205-276
36.213-000 / 36213-000
36.400-000 / 36400-000
36033340 / 36033-340
36037812 / 36037-812
36083070 / 36083-070
36085040 / 36085-040
36180000 / 36180-000
36300000 / 36300-000
36301046 / 36301-046
36330000 / 36330-000
36400000 / 36400-000
36415000 / 36415-000
36440000 / 36440-000
36500000 / 36500-000
36520000 / 36520-000
36525000 / 36525-000
36544000 / 36544-000
36546000 / 36546-000
36550000 / 36550-000
36555000 / 36555-000
36570000 / 36570-000
36680000 / 36680-000


In [58]:
def clean_city(city):
    
    typos = {
        u'Erália': u'Ervália',
        u'Vicosa': u'Viçosa',
        u'São João del rei': u'São João del-Rei'
    }
    
    if city in typos.keys():
        city = typos[city]
    else:
        # Remove parentheses
        if '(' in city:
            city, _, _ = city.partition('(')
            city = city.strip()

        # Remove state
        state_pattern = 'MG|RJ'
        state_match = re.search(state_pattern, city)
        if state_match is not None:
            city = state_pattern.sub('', city)
        city = city[0].upper() + city[1:].strip(' -')

    return city

In [63]:
addrst_names = get_attrib_values_set('.//tag[@k="addr:street"]', root)
way_street_names = get_street_names_in_way(root)

all_street_names = addrst_names.union(way_street_names)

In [64]:
for n in sorted(all_street_names):
    print n

1a Travessa Vicente Chagas
2a Travessa Vicente Chagas
AMG-1625
AMG-3005
AMG-3010
AMG-3020
AMG-3030
AMG-3055
AMG-3060
AMG-3060 / Avenida Juiz de Fora
AMG-3060 / Praça Pio XII
AMG-3060 / Rua Constância de Castro
AMG-3060 / Rua Doutor Augusto Gonçalves
AMG-3060 / Rua Doutor Dilermano Cruz
AMG-3060 / Rua Marciano Loures
AMG-3060 / Rua Silva Jardim
AMG-525
AVENIDA DIAS PAES
Acceso Expominas
Acesso
Acesso AMG-3015
Acesso Avenida Itamar Franco
Acesso BR 116
Acesso BR-116
Acesso BR-267
Acesso Bretas e Posto
Acesso Fazenda Guinilha e Mato Dentro
Acesso Igreja da Glória
Acesso MG-126
Acesso Norte
Acesso Parque de Exposições
Acesso ROTA S.A
Acesso Rodoviária de Juiz de Fora - Miguel Mansur
Acesso Rua Senador Salgado Filho
Acesso Saída
Acesso Sesi
Acesso Votorantin Metais
Acesso ao CAS
Acesso ao Cemitério
Acesso ao Cemitério Municipal
Acesso ao Cemitério Parque da Saudade
Acesso ao Cemitério de São Pedro
Acesso ao HPS - Hospital de Pronto Atendimento
Acesso ao Hospital Militar
Acesso ao IFET - Ins

In [74]:
def audit_street_names(names):
    
    bad_names = set()
    
    unwanted_chars_re = re.compile('[;.\-]')
    official_types = get_street_types_from_postal()
    
    for name in names:
        first_word = name.split(' ')[0]
        if unwanted_chars_re.search(first_word) is not None or\
           first_word not in official_street_types:
            bad_names.add(name)
    
    return bad_names

In [75]:
bad_names = audit_street_names(all_street_names)

In [92]:
three_letter_words = set()
for name in sorted(all_street_names):
    words = name.split(' ')
    for word in words:
        if len(word) <= 3:
            three_letter_words.add(word)

for w in sorted(three_letter_words):
    print w


-
/
04
06
1
10
105
11
116
12
126
13
134
14
149
15
16
18
19
1a
1°
1º
2
20
21
22
23
24
25
26
267
28
29
294
2a
3
3,1
30
31
35
37
4
447
5
6
7
8
89
9
?
A
A.
Aad
Aci
Acy
Ade
Ado
Al
Ali
Am
Ami
Ana
Ane
Ani
Ant
Ari
Ary
Av.
Azi
Aço
B
B.
BR
BR-
Bem
Boa
Bom
Bon
C
C.
CAS
Ce.
Cel
Cid
D
D.
DAS
DE
DO
DOS
DSC
Da
Das
De
Del
Der
Dez
Di
Dia
Dib
Do
Dom
Dr
Dr.
E
E.
E1
E2
EFL
Eda
Ede
El
Eli
Ely
Eni
Epa
Eva
Ex
Eça
F
F.
Fé
G
G.
Gal
Gil
Gê
H
H.
HPS
Hon
I
II
IV
IX
Ida
Ide
Ipe
Ipê
Ito
Ivo
J
J.
JK
Juv
Jós
K
KM
Kin
Km
L
L.
LBI
Lao
Lau
Luz
Lys
Léa
Léo
M
M.
MG
Mar
Max
Me.
Mel
Meu
Mãe
N
N.
Nei
Nhá
Ns
O
Od
Os
Oti
Oto
P
P.
PH
Pai
Pau
Paz
Pio
Pt
Pés
Q
R
R.
RIo
RUA
RUa
Rad
Rei
Rey
Rio
Riv
Ru
Rua
Rui
Rus
Ruy
S
S.
S.A
SAU
Sad
San
Sao
Sem
Sir
Sol
Sul
Sá
São
Sãp
T
T.
Tav
Tia
Tim
Tom
Tv
Tv.
U
U.
UFV
Ubá
Um
V
V.
VI
Vai
Val
Van
Vaz
Via
Von
W
X
XI
XII
XV
Y
Z
Zen
Zé
a
ao
da
das
de
del
do
dos
e
km
ll
o
os
pau
s/n
sem
so
à


In [93]:
def capitalize_proper_name(name):
    """Capitalize proper names according to PT-BR conventions."""
    
    exceptions = ['do', 'da', 'dos', 'das', 'de', 'o', 'a', 'ao', 'para', 'e', u'à', 's/n']
    
    words = name.lower().split(' ')
    cap_name = [words[0].capitalize()]
    for word in words[1:]:
        cap_name.append(word.capitalize() if word not in exceptions else word)
    return " ".join(cap_name)

In [207]:
def clean_street_name(name, street_types):
    
    subs = {}
    subs.update(dict.fromkeys(['R.', u'RUa', 'Ru', 'Rua.', u'Rus', u'Ruia'], 'Rua'))
    subs.update(dict.fromkeys(['Tv.', 'Tracessa', 'Trafessa', 'Travessia', u'Travressa'], 'Travessa'))
    subs.update(dict.fromkeys([u'Av.', 'Avenido'], 'Avenida'))
    subs.update(dict.fromkeys(['Estr.'], 'Estrada'))
    
    name = remove_extra_spaces(name)

    # If there's no space after first word, correct it
    no_space_after_street_re = re.compile('^(R\.|Rua)[^ .].*')
    match_no_space = no_space_after_street_re.search(name)
    if match_no_space is not None:
        name = re.sub(match_no_space.group(1), 'Rua ', name)
    
    # Substitute typos
    words = name.split(' ')
    if words[0] in subs.keys():
        name = subs[words[0]] + ' ' + ' '.join(words[1:])
    
    # Correct capitalization of uppercase names
    if name.isupper():
        for street_type in official_street_types:
            if name.lower().startswith(street_type.lower()):
                name = capitalize_proper_name(name)
    
    # Clean highway names
    highway_re = re.compile('(BR|MG|RJ)[ \-] *(\d{3})')
    if highway_re.search(name) is not None:
        name = highway_re.sub(lambda match: match.group(1) + '-' + match.group(2), name)
    
    return name

In [208]:
for name in sorted(bad_names):
    cleaned = clean_street_name(name, official_street_types)
    if name != cleaned:
        print name+' / '+cleaned

AVENIDA DIAS PAES / Avenida Dias Paes
Acesso BR 116 / Acesso BR-116
Av. Cel. José Guilherme De Almeida / Avenida Cel. José Guilherme De Almeida
Av. Comendador Jacinto Souza Lima / Avenida Comendador Jacinto Souza Lima
Av. PH Rolfs s/n / Avenida PH Rolfs s/n
Av. Rivelli / Avenida Rivelli
Avenido Diogo Braga Filho / Avenida Diogo Braga Filho
BR- 267 / BR-267
Estr. Mariana / Estrada Mariana
Estr. Piranga / Estrada Piranga
LADEIRA AUGUSTO GABRI / Ladeira Augusto Gabri
MG 126 Mar de Espanha / Bicas / MG-126 Mar de Espanha / Bicas
MG 126, Bicas / Mar de Espanha / MG-126, Bicas / Mar de Espanha
MG 126, Mar de Espanha / Bicas / MG-126, Mar de Espanha / Bicas
MG 126, Mar de Espanha Sapucaia / MG-126, Mar de Espanha Sapucaia
MG 126, Santa Helena / Bicas / MG-126, Santa Helena / Bicas
PRAÇA PADRE NÉLSON TAFURI / Praça Padre Nélson Tafuri
R.   José Varanda / Rua José Varanda
R.  Abdala Adad / Rua Abdala Adad
R.  Alice de Souza Matos / Rua Alice de Souza Matos
R.  Alvaro Dias / Rua Alvaro Dias
R.  

In [177]:
def get_typo_candidates(names_list):

    candidates = {k: set() for k in ['Rua', 'Travessa', 'Avenida', 'Estrada']}
    
    for name in sorted(names_list):
        first_word = name.split(' ')[0]
        candidate_key = ''
        
        if name.lower().startswith('r'):
            candidate_key = 'Rua'
        elif name.lower().startswith('tra'):
            candidate_key = 'Travessa'
        elif name.lower().startswith('av'):
            candidate_key = 'Avenida'
        elif name.lower().startswith('est'):
            candidate_key = 'Estrada'

        if candidate_key:
            candidates[candidate_key].add(first_word)
        
    return candidates

In [211]:
for name in sorted(bad_names):
    cleaned = clean_street_name(name, official_street_types)
    if name != cleaned:
        if 'BR' in name or 'MG' in name or 'RJ' in name:
            print name+' / '+cleaned

Acesso BR 116 / Acesso BR-116
BR- 267 / BR-267
LADEIRA AUGUSTO GABRI / Ladeira Augusto Gabri
MG 126 Mar de Espanha / Bicas / MG-126 Mar de Espanha / Bicas
MG 126, Bicas / Mar de Espanha / MG-126, Bicas / Mar de Espanha
MG 126, Mar de Espanha / Bicas / MG-126, Mar de Espanha / Bicas
MG 126, Mar de Espanha Sapucaia / MG-126, Mar de Espanha Sapucaia
MG 126, Santa Helena / Bicas / MG-126, Santa Helena / Bicas
RUA JOÃO JANUZZI SOBRINHO / Rua João Januzzi Sobrinho
RUA LUIZ BRAHIM / Rua Luiz Brahim


In [24]:
addrstreets = []

for node in root.findall('.//node'):
    if node.find('.//tag[@k="addr:street"]') is not None:
        addrstreets.append(node)

for node in addrstreets:
    #print "Node attributes:", node.attrib
    for tag in node.findall('.//tag'):
        is_name = tag.get('k') == 'name'
        if is_name:
            print "Name: "+tag.get('v')

Name: Bazar Vander
Name: Rodoviária de Conselheiro Lafaiete
Name: Bar do Bigode e Xororó
Name: Goiabas
Name: Trigofrios
Name: Armazém Choperia
Name: Residencial Baviera
Name: Bar do Bené
Name: Estação São Pedro
Name: Cervejaria Barbante
Name: 99º Cia de PM
Name: Bar e Lanchonete 100% Caipira
Name: Igreja Matriz de Santo Antonio
Name: Restaurante Jair Neto
Name: Supermercado Casa Barros
Name: Posto Barros
Name: Posto Axtra
Name: Umami
Name: Idade da Pedra
Name: Bar Universitario
Name: Samurai - Japanese Food
Name: Fritas Lanches
Name: Bertus
Name: Mr Tugas
Name: Suprema
Name: Privilège
Name: Bar do Pirata
Name: Bar do Bode
Name: W 100
Name: Museu Ferroviário
Name: Victory
Name: Pet Center
Name: Churrascaria Chimarron
Name: Distribuidora Modesto
Name: Maxi Pão
Name: Hospital São Vicente de Paulo
Name: Sítio Encosta do Sol
Name: Fio Fio
Name: Panificadora Barbosa Lage
Name: Papex
Name: Oswaldo Cruz
Name: No Bugs - Empresa Júnior de Informática
Name: Banco do Brasil
Name: Burger King
Name:

In [40]:
clean_name_from_postal('Romuaaldo', 'Juiz de Fora')

'Romuaaldo'

In [41]:
print clean_name_from_postal(u'Barao do Rio Branco', 'Juiz de Fora')

Avenida Barão do Rio Branco


In [88]:
way_names = get_street_names_in_way(root)

In [90]:
ignore_words = ['Rua', 'Avenida', 'do', 'da', 'dos', 'das', 'de', 'o', 'a', 'ao', 'para', 'e', u'à', 's/n']

#way_names = [u'Rua Paraiba', u'Rua Paraíba']

for name in way_names:
    for name_to_compare in way_names:
        if name != name_to_compare:  # we don't want to compare it to itself
            #print 'Comparing', name, 'to', name_to_compare
            ratio = difflib.SequenceMatcher(lambda x: x == ' ', name, name_to_compare).ratio()
            #print ratio
            if ratio > 0.9:
                print name, '/', name_to_compare

Rua Vicente Vieira da Mota / Rua Vicente Vieira Mota
Rua Padre Alberto / Rua Padre Adalberto
Rua Melo Branco / Rua Melo Franco
Antônio de Oliveira / Rua Antônio de Oliveira


KeyboardInterrupt: 

In [78]:
difflib.SequenceMatcher(lambda x: x == ' ', u'Rua Paraíba', u'Rua Paraiba').ratio()

0.9090909090909091

In [94]:
city_names = set()

for node in root.findall('.//node'):
    city = node.find('.//tag[@k="addr:city"]')
    if city is not None:
        city_names.add(city.get('v'))

# for node in city_names:
#     #print "Node attributes:", node.attrib
#     for tag in node.findall('.//tag'):
#         is_name = tag.get('k') == 'name'
#         if is_name:
#             print "Name: "+tag.get('v')

In [96]:
for name in city_names:
    print name

Guiricema
Ervália
São Sebastião do Alto
Viçosa
Canaã
Levy Gasparian
Juiz de Fora
Macuco
Sarandi
Barbacena
Coronel Xavier Chaves
Ubá
Coimbra
Silveirânia
São João Nepomuceno
Dores de Campos
Cordeiro
Erália
Visconde do Rio Branco
Itaocara
Cantagalo
Juiz de Fora-MG
Itaverava
São João del-Rei
juiz de Fora
Rio Pomba
