In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import csv

In [8]:
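# NOTE: audit() below reads the notebook-level name `osm_file`; uncomment this
# line (and make sure chicago.osm is present) before calling it.
#osm_file = open("chicago.osm", "r")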

In [3]:
# Matches the final run of non-whitespace characters at the end of a string,
# which this audit treats as the street type (e.g. "Ave" or "St.").
# \S already matches a period, so the trailing \.? is redundant but harmless.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Tally of street type -> number of occurrences; unseen keys start at 0.
street_types = defaultdict(int)

In [4]:
def audit_street_type(street_types, street_name):
    """Count the street type found at the end of ``street_name``.

    The street type is whatever ``street_type_re`` matches in the name. When
    there is a match, the counter for that text in ``street_types`` is
    incremented in place; when there is none, nothing happens.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    street_types[match.group()] += 1

In [5]:
def print_sorted_dict(d):
    """Print each ``key: value`` pair of ``d`` on its own line.

    Lines are ordered by the lower-cased key, so "ave" and "Ave" end up next
    to each other. Values are formatted with ``%d`` and must be integers.
    """
    # A lambda (rather than str.lower) keeps this working for unicode keys too.
    for k in sorted(d, key=lambda s: s.lower()):
        # Parenthesised single-argument print is valid on Python 2 and 3 alike.
        print("%s: %d" % (k, d[k]))

In [6]:
def is_street_name(elem):
    """Return True when ``elem`` is a ``<tag>`` element whose ``k`` is "addr:street".

    ``elem`` is expected to expose ``tag`` and ``attrib`` the way ElementTree
    elements do. A ``<tag>`` element without a ``k`` attribute is simply not a
    street name, so it yields False instead of raising KeyError.
    """
    return (elem.tag == "tag") and (elem.attrib.get('k') == "addr:street")

In [7]:
def audit(osmfile=None):
    """Tally and print the street types found in an OSM XML file.

    Args:
        osmfile: Path or open file object to parse. When omitted, the
            notebook-level ``osm_file`` is used, which keeps the original
            zero-argument call working.

    Every ``addr:street`` tag value is fed to ``audit_street_type``, which
    updates the module-level ``street_types`` counter; the sorted tally is
    printed at the end.
    """
    source = osm_file if osmfile is None else osmfile
    for event, elem in ET.iterparse(source):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
        # The element is fully processed once its end event has been handled;
        # dropping its contents keeps memory use down on large extracts.
        elem.clear()
    print_sorted_dict(street_types)

In [None]:
def ensure_float(v):
    """Return ``v`` converted to a float, or None when ``is_number`` rejects it."""
    return float(v) if is_number(v) else None

In [2]:
import math

In [4]:
def audit_population_density(input_file, tolerance=10):
    """Report rows whose stated population density disagrees with population/area.

    Args:
        input_file: Iterable of dict-like rows with the keys
            'populationTotal', 'areaLand', 'populationDensity' and 'name'.
        tolerance: Largest absolute difference between the stated and the
            calculated density that is still accepted (default 10).

    Rows that cannot be checked (missing column, non-numeric value such as
    'NULL', or zero area) are skipped silently. Nothing is returned; suspect
    rows are reported on stdout.
    """
    for row in input_file:
        try:
            population = float(row['populationTotal'])
            area = float(row['areaLand'])
            population_density = float(row['populationDensity'])
            calculated_density = population / area
            if math.fabs(calculated_density - population_density) > tolerance:
                # Two spaces after "for" reproduce the original output exactly.
                print('Possibly bad population density for  %s' % row['name'])
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Only data problems are skipped; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            continue

In [8]:
# Run the density audit over cities.csv. The with-block closes the file once
# the audit has consumed every row.
with open('cities.csv') as cities_csv:
    input_file = csv.DictReader(cities_csv)
    # The first three rows after the header are skipped before auditing.
    for _ in range(3):
        next(input_file)  # built-in next() works on Python 2 and 3
    audit_population_density(input_file)

Possibly bad population density for  Ketchikan Alaska


In [9]:
import pprint

In [None]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in range, as described above,
  write that line to the output_good file
- if the value of the field is not a valid year, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only the 
'process_file' with 3 arguments (inputfile, output_good, output_bad).
"""

In [10]:
def _write_rows(path, fieldnames, rows):
    """Write ``rows`` (dicts) to ``path`` as CSV, preceded by a header line."""
    with open(path, "w") as out:
        writer = csv.DictWriter(out, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_file(input_file, output_good, output_bad):
    """Split the rows of ``input_file`` by validity of 'productionStartYear'.

    Args:
        input_file: Path of the CSV to read; needs 'URI' and
            'productionStartYear' columns.
        output_good: Path that receives rows whose year lies in 1886-2014;
            their 'productionStartYear' is replaced by the bare year.
        output_bad: Path that receives rows whose year is missing, not
            numeric, or out of range; these rows are written unchanged.

    Rows whose URI does not contain 'dbpedia.org' (or is missing entirely) are
    discarded. Both output files get the input's header.
    """
    good_data = []
    bad_data = []
    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for row in reader:
            # A short row leaves URI as None; treat it like any non-dbpedia row.
            if 'dbpedia.org' not in (row['URI'] or ''):
                continue
            try:
                # The year is taken from the first four characters of the value.
                year = int(row['productionStartYear'][:4])
            except (TypeError, ValueError):
                # None (short row) or non-numeric text such as 'NULL'.
                bad_data.append(row)
                continue
            if 1886 <= year <= 2014:
                row['productionStartYear'] = year
                good_data.append(row)
            else:
                bad_data.append(row)

    _write_rows(output_good, header, good_data)
    _write_rows(output_bad, header, bad_data)

In [11]:
# Split autos.csv into goodautos.csv / badautos.csv based on productionStartYear.
process_file('autos.csv', 'goodautos.csv', 'badautos.csv')

In [2]:
def skip_lines(input_file, skip):
    """Advance the iterator ``input_file`` past its next ``skip`` items.

    The skipped items are discarded. StopIteration propagates if the iterator
    runs out before ``skip`` items have been consumed.
    """
    for _ in range(skip):
        next(input_file)

In [3]:
def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Anything ``float`` accepts counts as a number, which includes strings such
    as 'nan' and 'inf'. Values ``float`` cannot handle at all (e.g. None, as
    csv.DictReader produces for short rows) are reported as not-a-number
    instead of raising TypeError.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [4]:
def is_array(s):
    """Return True when ``s`` contains an opening brace or bracket ('{' or '[')."""
    return any(opener in s for opener in ('{', '['))

In [16]:
# Column audited by the cells below, and the bounds that audit_float_field
# reads as globals for its range check.
fieldname = 'wgs84_pos#lat'
minval = -90
maxval = 90

In [18]:
def audit_float_field(v, counts):
    """Classify one raw field value that is expected to hold a float.

    Args:
        v: Raw string value; surrounding whitespace is ignored.
        counts: Dict with integer entries 'nulls', 'empties' and 'arrays',
            updated in place for the matching category.

    Values that are neither NULL, empty nor array-like are reported on stdout
    when they are not numeric or fall outside the module-level
    ``minval``..``maxval`` range. The bounds themselves count as in range.
    """
    v = v.strip()
    if v == "NULL":
        counts['nulls'] += 1
    elif v == '':
        counts['empties'] += 1
    elif is_array(v):
        counts['arrays'] += 1
    elif not is_number(v):
        print('Found non number: %s' % v)
    else:
        v = float(v)
        # Inclusive comparison: a value exactly on a bound is valid. NaN fails
        # every comparison and is therefore still reported.
        if not (minval <= v <= maxval):
            print('Found out of range value: %s' % v)

In [22]:
# Audit the `fieldname` column of cities.csv and summarise what was found.
counts = {'nulls': 0, 'empties': 0, 'arrays': 0}
nrows = 0
# The with-block closes the file as soon as every row has been audited.
with open('cities.csv') as cities_csv:
    input_file = csv.DictReader(cities_csv)
    skip_lines(input_file, 3)
    for row in input_file:
        audit_float_field(row[fieldname], counts)
        nrows += 1
# Single-argument print calls produce the same text on Python 2 and 3.
print('Number of cities: %d' % nrows)
print('Nulls: %d' % counts['nulls'])
print('Empties: %d' % counts['empties'])
print('Numbr of arrays: %d' % counts['arrays'])

Found out of range value: 200.0
Number of cities: 39
Nulls: 0
Empties: 2
Numbr of arrays: 1


In [5]:
# Columns of cities.csv whose value types are audited (passed to audit_file).
FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

In [15]:
def audit_file(filename, fields):
    """Collect the set of value types seen in each of ``fields``.

    Args:
        filename: Path of the CSV file to audit. The first three rows after
            the header are skipped.
        fields: Column names to inspect.

    Returns:
        Dict mapping each field name to a set of types:
        ``type(None)`` for 'NULL' or empty values, ``list`` for array-like
        values (per ``is_array``), ``int`` or ``float`` for numeric values,
        and ``str`` for everything else.
    """
    fieldtypes = {field: set() for field in fields}
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        skip_lines(reader, 3)
        for row in reader:
            # Iterate the caller's fields, not the module-level FIELDS list,
            # so the function honours its own argument.
            for field in fields:
                value = row[field]
                if value == 'NULL' or value == '':
                    fieldtypes[field].add(type(None))
                elif is_array(value):
                    fieldtypes[field].add(list)
                elif is_number(value):
                    # Numeric text that int() rejects (e.g. "1.5") is a float.
                    try:
                        int(value)
                    except ValueError:
                        fieldtypes[field].add(float)
                    else:
                        fieldtypes[field].add(int)
                else:
                    fieldtypes[field].add(str)
    return fieldtypes

In [16]:
# Collect the value types seen per column; the bare name on the last line makes
# the notebook display the result.
types = audit_file('cities.csv', FIELDS)
types

{'areaCode': {int, NoneType, str},
 'areaLand': {float, list, NoneType},
 'areaMetro': {float, NoneType},
 'areaUrban': {float, NoneType},
 'elevation': {int, list, NoneType},
 'governmentType_label': {NoneType, str},
 'homepage': {NoneType, str},
 'isPartOf_label': {list, NoneType, str},
 'maximumElevation': {NoneType},
 'minimumElevation': {NoneType},
 'name': {list, NoneType, str},
 'populationDensity': {float, list, NoneType},
 'populationTotal': {int, NoneType},
 'timeZone_label': {NoneType, str},
 'utcOffset': {int, list, NoneType, str},
 'wgs84_pos#lat': {float, int, list, NoneType, str},
 'wgs84_pos#long': {float}}

In [21]:
def fix_area(area):
    """Convert a raw area value to a float, or None when there is no value.

    Args:
        area: Raw string from the data file. May be 'NULL', empty, a plain
            number, or a brace-wrapped list of alternatives separated by '|',
            e.g. '{5.5e+07|5.55e+07}'.

    Returns:
        None for 'NULL', '' or None. For a brace-wrapped list, the alternative
        written with the most characters (presumably the most significant
        digits); ties go to the later one. Otherwise ``float(area)``.

    Raises:
        ValueError: if the selected text is not a valid float.
    """
    if area is None or area == 'NULL' or area == '':
        return None
    if area[0] == '{':
        # Works for one, two or more alternatives; a single braced value such
        # as '{1.2e+07}' no longer falls through to float('{...}').
        candidates = area[1:-1].split('|')
        best = candidates[0]
        for candidate in candidates[1:]:
            if len(candidate) >= len(best):
                best = candidate
        return float(best)
    return float(area)

In [26]:
def process_area(filename, fieldname):
    """Return the ``fieldname`` column of ``filename`` cleaned by ``fix_area``.

    Args:
        filename: Path of the CSV file; the first three rows after the header
            are extra metadata and are skipped.
        fieldname: Name of the column to clean.

    Returns:
        List with one entry per data row: a float or None, as produced by
        ``fix_area``.

    Raises:
        KeyError: if ``fieldname`` is not a column of the file.
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the extra metadata rows; built-in next() works on Python 2 and 3.
        for _ in range(3):
            next(reader)
        for line in reader:
            data.append(fix_area(line[fieldname]))
    return data

In [28]:
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(process_area('cities.csv', 'areaMetro'))

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 7070000000.0, None, None, None]


In [20]:
def fix_name(name):
    """Return the name(s) held in a raw "name" value as a list.

    Args:
        name: Raw value from the data file: 'NULL', None, a single name, or
            a brace-wrapped list of names separated by '|', e.g. '{A|B}'.

    Returns:
        [] for 'NULL' or None; the individual names for a brace-wrapped list;
        otherwise a one-element list holding ``name`` unchanged.
    """
    if name is None or name == 'NULL':
        return []
    # Only a value wrapped in braces is a list. A name that merely contains a
    # brace is kept intact rather than losing its first and last characters.
    if name.startswith('{') and name.endswith('}'):
        return name[1:-1].split('|')
    return [name]

True

In [29]:
def check_loc(point, lat, longi):
    """Check that ``point`` agrees with the separate latitude/longitude values.

    Args:
        point: Combined value of the form "<lat> <long>" (single space).
        lat: Value of the "wgs84_pos#lat" field.
        longi: Value of the "wgs84_pos#long" field.

    Returns:
        True when the first two space-separated parts of ``point`` equal
        ``lat`` and ``longi`` as strings; False otherwise, including when
        ``point`` has fewer than two parts (e.g. "NULL").
    """
    parts = point.split(" ")
    if len(parts) < 2:
        # Nothing to compare against, so the location cannot match.
        return False
    return (parts[0] == lat) and (parts[1] == longi)

In [30]:
def process_latlong(filename):
    """Print every row of ``filename`` whose "point" disagrees with its lat/long.

    Args:
        filename: Path of the CSV file; needs the columns "name", "point",
            "wgs84_pos#lat" and "wgs84_pos#long". The first three rows after
            the header are extra metadata and are skipped.

    Nothing is returned; mismatching rows are reported on stdout.
    """
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the extra metadata rows; built-in next() works on Python 2 and 3.
        for _ in range(3):
            next(reader)
        for line in reader:
            matches = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"])
            if not matches:
                print("{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]))