In [2]:
# import all libraries
import os
import collections
import pprint
import xml.etree.cElementTree as ET
import re
import codecs
import csv
import cerberus
import copy
import schema
import pprint

In [3]:
"""In street addresses I observed that there are inconsistencies in street types like Avenue has represented
differently (Ave, Ave. ,Av) which can lead to inaccurate results, so I have cleaned and handled these inconsistencies 
in street types using below functions."""

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Cove", "Alley", "Park", "Way", "Walk" "Circle", "Highway", 
            "Plaza", "Path", "Center", "Mission"]

mapping = { "Ave": "Avenue",
            "Ave.": "Avenue",
            "Avenue": "Avenue",
            "Av": "Avenue",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Bl": "Boulevard",
            "Boulevard": "Boulevard",
            "Ct": "Court",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Hwy": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "Pl": "Place",
            "Plz": "Plaza",
            "Park": "Parkway",
            "Pky": "Parkway",
            "Rd": "Road",
            "Rd.": "Road",
            "St": "Street",
            "St.": "Street",
            "st": "Street",
            "street": "Street",
            "wy": "Way"
            }

""""function audit_street_type search the input street name and if it's matched and within the expected list 
then add this as a key and add the string into the set""""

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


# This function checks whether it is a street name 
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


# This function will return list which satisfies the above two function conditions.
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

# This function will update the old street name with a new updated onnes.
def update_name(name, mapping, regex):
    m = regex.search(name)
    if m:
        st_type = m.group()
        if st_type in mapping:
            name = re.sub(regex, mapping[st_type], name)
    return name



#update and print unclean list of street types to a better name.

LA_st_types = audit( "LA_sample.osm")

for street_type, ways in LA_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping, street_type_re)
        print name, "=>", better_name

East Broadway => East Broadway
South Broadway => South Broadway
Cam Rainbow => Cam Rainbow
Camino Cielo => Camino Cielo
Via Del Cielo => Via Del Cielo
Eagle Ridge => Eagle Ridge
Coltrane => Coltrane
Via De Maranatha => Via De Maranatha
Los Cerritos Ln => Los Cerritos Lane
Flowerwood Ln => Flowerwood Lane
Green View Ln => Green View Lane
Lynden Ln => Lynden Lane
Cazador Ln => Cazador Lane
Thornbury Ln => Thornbury Lane
Daisy Ln => Daisy Lane
Wick Ln => Wick Lane
Hillcrest Ln => Hillcrest Lane
Acacia Ln => Acacia Lane
Bamboo Ln => Bamboo Lane
Sugar Pine Ln => Sugar Pine Lane
Gum Tree Ln => Gum Tree Lane
Mardavido Ln => Mardavido Lane
Los Alisos North Ln => Los Alisos North Lane
Jamies Ln => Jamies Lane
Sweetgrass Ln => Sweetgrass Lane
N Stage Coach Ln => N Stage Coach Lane
Rossiter Ln => Rossiter Lane
Barsky Ln => Barsky Lane
Womsi Ln => Womsi Lane
Norstar Ln => Norstar Lane
Green Canyon Ln => Green Canyon Lane
Hamilton Ln => Hamilton Lane
Avocado Vista Ln => Avocado Vista Lane
Gracey Ln

In [5]:
# Inconsistent Postal Codes

"""In postal code I observed that postal codes are mostly of 5 digits and as per audit we found that there are some postal codes
where four digit extra portion or state abbrebiation like 'CA' is coming ,which needs to be taken care.
   So in the below function I have taken care of the inconsistent postal codes"""

def audit_zipcode(invalid_zipcodes, zipcode):
    twoDigits = zipcode[0:2]
    
    if twoDigits != 90 or twoDigits != 91  or not twoDigits.isdigit():
        invalid_zipcodes[twoDigits].add(zipcode)
        
def is_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_zip(osmfile):
    osm_file = open(osmfile, "r")
    invalid_zipcodes = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zipcode(tag):
                    audit_zipcode(invalid_zipcodes,tag.attrib['v'])

    return invalid_zipcodes

osm_file = "LA_sample.osm"
LA_zipcode = audit_zip(osm_file)
pprint.pprint(dict(LA_zipcode))

{'90': set(['90002-3024',
            '90006-4005',
            '90012',
            '90013',
            '90016',
            '90017',
            '90022',
            '90023',
            '90024',
            '90028',
            '90040',
            '90045',
            '90049',
            '90065',
            '90069',
            '90077',
            '90095',
            '90240',
            '90265',
            '90277',
            '90503',
            '90620',
            '90631',
            '90680',
            '90710',
            '90712',
            '90731',
            '90731-7415',
            '90802',
            '90806',
            '90807',
            '90813',
            '90815']),
 '91': set(['91007',
            '91030',
            '91102',
            '91103',
            '91104',
            '91105',
            '91106',
            '91107',
            '91214',
            '91303-2211',
            '91304',
            '91321',
            '91365',
            

In [6]:
"""Using update_zip function I have cleaned inconsistencies in zip code and changed those extra four digital zip codes
to standard 5 digit zip code and removed string CA."""

#update unclean list of zip codes to the corrected zipcodes.
def update_zip(zipcode):
    zipChar = re.findall('[a-zA-Z]*', zipcode)
    if zipChar:
        zipChar = zipChar[0]
    zipChar.strip()
    if zipChar == "CA":
        updateZip = re.findall(r'\d+', zipcode)
        if updateZip:
            return (re.findall(r'\d+', zipcode))[0]
    else:
        return (re.findall(r'\d+', zipcode))[0]

#print updated zip codes.
for street_type, ways in LA_zipcode.iteritems():
    for name in ways:
        better_name = update_zip(name)
        print name, "=>", better_name

91801 => 91801
91784 => 91784
91786 => 91786
91102 => 91102
91103 => 91103
91106 => 91106
91107 => 91107
91104 => 91104
91105 => 91105
91770 => 91770
91304 => 91304
91776 => 91776
91007 => 91007
91752 => 91752
91506 => 91506
91367 => 91367
91303-2211 => 91303
91365 => 91365
91502 => 91502
91739 => 91739
91711 => 91711
91710 => 91710
91730 => 91730
91733 => 91733
91321 => 91321
91737 => 91737
91701 => 91701
91761 => 91761
91767 => 91767
91764 => 91764
91765 => 91765
91740 => 91740
91741 => 91741
91030 => 91030
91606 => 91606
91214 => 91214
90024 => 90024
90069 => 90069
90022 => 90022
90023 => 90023
90028 => 90028
90065 => 90065
90806 => 90806
90807 => 90807
90802 => 90802
90620 => 90620
90277 => 90277
90731-7415 => 90731
90095 => 90095
90013 => 90013
90012 => 90012
90017 => 90017
90016 => 90016
90503 => 90503
90077 => 90077
90002-3024 => 90002
90240 => 90240
90006-4005 => 90006
90631 => 90631
90813 => 90813
90265 => 90265
90815 => 90815
90040 => 90040
90680 => 90680
90045 => 90045
90731