In [1]:
# import libraries used in this project 

import os
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
from collections import defaultdict
import bson
import pymongo

In [2]:
# Data used - Weekly OSM Metro Extracts from:
# https://mapzen.com/metro-extracts/
# https://s3.amazonaws.com/metro-extracts.mapzen.com/los-angeles_california.osm.bz2

DATADIR = "data"
DATAFILE = "los-angeles_california.osm"

LA_DATA = os.path.join(DATADIR, DATAFILE)

In [None]:
# count tags in the data set

def count_tags(filename):
    tags = defaultdict(int)
    for line in ET.iterparse(filename, events=("start",)):
        new_tag = line[1].tag
        tags[new_tag] += 1
    return tags

la_tags = count_tags(LA_DATA)

In [None]:
pprint.pprint(la_tags)

In [10]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k_value) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k_value) is not None:
            keys["problemchars"] += 1
        else:
            keys['other'] += 1

    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

la_keys = process_map(LA_DATA)

In [11]:
# check keys in "tag" 

def check_tag_key(osmfile):
    osm_file = open(osmfile, "r")
    key_list = []
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'].find("addr:")!=-1 and tag.attrib['k'] not in key_list:
                    key_list.append(tag.attrib['k'])

    return key_list       

tag_key_list = check_tag_key(LA_DATA)
print tag_key_list

['addr:city', 'addr:street', 'addr:country', 'addr:postcode', 'addr:housenumber', 'addr:state', 'addr:full', 'addr:housename', 'addr:street_direction_prefix', 'licensee:addr:city', 'licensee:addr:state', 'licensee:addr:suite', 'licensee:addr:street', 'licensee:addr:postcode', 'licensee:addr:housenumber', 'addr:floor', 'addr:suite', 'addr:housenumber_1', 'addr:street_1', 'licensee:addr:street_direction_prefix', 'addr:county', 'addr:unit', 'addr:streetnumber', 'addr:door', 'addr:province', 'addr:sitenumber', 'addr:flats', 'addr:neighbourhood', 'addr:interpolation', 'addr:odd', 'addr:even', 'addr:source', 'addr:source_url', 'addr:street_2', 'addr:housenumber_2', 'addr:street2', 'addr:east_west', 'addr:north_south', 'addr:complete', 'addr:housenumber:max', 'addr:housenumber:min', 'disused:addr:city', 'alt_addr:street', 'alt_addr:housenumber', 'addr:inclusion', 'alt_addr:postcode', 'addr:suburb', 'addr:housenumber_3', 'addr:housenumber_4', 'addr:housenumber_5', 'addr:house', 'addr:place', '

In [None]:
# check s

def check_city(osmfile):
    osm_file = open(osmfile, "r")
    city_list = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k']=="addr:city":
                    city_list[tag.attrib['v']]+=1

    return city_list       

city_list = check_city(LA_DATA)
print city_list

In [5]:
# check city

def check_city(osmfile):
    osm_file = open(osmfile, "r")
    city_list = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k']=="addr:city":
                    city_list[tag.attrib['v']]+=1

    return city_list       

city_list = check_city(LA_DATA)
print city_list

defaultdict(<type 'int'>, {'Cuatro Vientos, Madrid': 1, 'Valdemoro': 37, u'Fuencemill\xe1n': 1, 'Mostoles': 10, 'Camarena': 1, 'Madrd': 1, u'Villaviciosa de Od\xf3n': 7, 'Collado Mediano': 3, 'Yuncler': 2, 'Arganda del Rey': 82, 'alcala de henares': 1, 'Villalbilla': 9, 'Buitrago de Lozoya': 1, 'El Molar': 1, u'Valsa\xedn': 1, 'Majadahonda': 6, 'Becerril de la Sierra': 13, 'Casarrubios del Monte': 2, 'Rozas de Madrid': 1, 'Pozuelo del Rey': 9, 'Las Matas': 1, 'Santa Cruz de Retamar': 1, u'Torrej\xf3n de Ardoz': 15, 'Collado Villalba': 51, 'Villamanrique de Tajo': 1, 'Navalcarnero': 3, 'La Lastrilla': 2, 'Villaviciosa de Odon': 1, u'San Agust\xedn del Guadalix': 1, 'Algete': 1, 'Daganzo de Arriba': 1, u'Oca\xf1a': 33, u'San Sebasti\xe1n de los Reyes': 9, 'Pozuelo de Alascon': 1, 'Ciempozuelos': 7, u'Legan\xe9s': 34, 'Cerceda': 4, 'Miraflores de la Sierra': 3, 'Collado-Villalba': 1, 'Navacerrada': 363, 'Paracuellos de Jarama': 9, 'Zarzalejjo': 1, 'Humanes': 1, 'Alovera': 20, 'Moralzarzal

In [160]:
city_mapping = {
    "santa Monica": "Santa Monica",
    "Los Angeles-Venice":"Los Angeles",
    "Venice": "Los Angeles",
    "West Los Angeles":"Los Angeles",
    "Marina Del Rey CA":"Marina Del Rey",
    "Marina del Rey":"Marina Del Rey",
    "Marina del Ray":"Marina Del Rey",
    "Venice CA":"Los Angeles"}

In [239]:
def update_city(city_name, city_mapping):
    for key in city_mapping.keys():
        if city_name.find(key)!=-1:
            #city_name = city_name.replace(key,city_mapping[key])
            city_name = city_mapping[key]
            break
    return city_name

In [162]:
for city in city_list:
    better_city = update_city(city,city_mapping)
    print city, "=>", better_city

santa Monica => Santa Monica
Pacific Palisades => Pacific Palisades
Marina Del Rey CA => Marina Del Rey
Los Angeles-Venice => Los Angeles
Marina Del Rey => Marina Del Rey
Santa Monica => Santa Monica
Venice => Los Angeles
West Los Angeles => Los Angeles
Los Angeles => Los Angeles
Marina del Rey => Marina Del Rey
Marina del Ray => Marina Del Rey
Venice CA => Los Angeles


In [82]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons","Highway"]

In [146]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types                    

In [117]:
#List unformatted street type 
def all_street_type(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    street_type=tag.attrib["v"].strip().split(" ")[-1]
                    street_types[street_type]+=1
    return street_types

print all_street_type(SMC_DATA)

defaultdict(<type 'int'>, {'Boulevard': 66, 'Ln': 1, 'St.': 1, 'Way': 3, 'ave': 2, 'Highway': 4, 'Promenade': 3, 'North': 1, 'Bvd': 1, 'access': 1, 'Road': 1, 'Marina': 1, 'Dr': 2, 'Center': 1, 'Bd.': 1, 'Drive': 2, 'Pico': 1, 'Place': 1, 'Ave': 3, 'South': 1, 'Blvd.': 1, '1180': 1, 'Walk': 3, 'Street': 20, '1101': 1, 'Blvd': 5, 'Broadway': 4, 'Avenue': 23})


In [118]:
# print problem street type
print audit(SMC_DATA).keys()

['Promenade', 'Bd.', 'North', 'Center', 'Ln', 'St.', 'Bvd', '1180', 'Ave', 'access', 'Marina', 'Pico', '1101', 'Way', 'Broadway', 'Blvd', 'ave', 'Walk', 'Dr', 'South', 'Blvd.']


In [126]:
print audit(SMC_DATA)['1101']
print audit(SMC_DATA)['1180']

set(['15th Street Ste. 1101'])
set(['Santa Monica Boulevard, Suite 1180'])


In [192]:
street_mapping = {
    "Ln": "Lane",
    "St.":"Street",
    "Ave": "Avenue",
    "ave":"Avenue",
    "Bd":"Boulevard",
    "Blvd":"Boulevard",
    "Blvd.":"Boulevard",
    "Bvd":"Boulevard",
    "access":"Access",
    "Dr":"Drive",
    "15th Street Ste. 1101":"15th Street",
    "Santa Monica Boulevard, Suite 1180":"Santa Monica Boulevard"}

In [193]:
def update_street(name, street_mapping):
    for key in mapping.keys():
        if name.find(key)!=-1:
            name = name.replace(key,street_mapping[key])
            break
    return name

In [195]:
smc_st_types = audit(SMC_DATA)

for st_type, ways in smc_types.iteritems():
        for name in ways:
            better_name = update_street(name, street_mapping)
            print name, "=>", better_name

Walnut Ln => Walnut Lane
Main St. => Main Street
Ocean Way => Ocean Way
Admiralty Way => Admiralty Way
Olive ave => Olive Avenue
Pacific Coast Highway => Pacific Coast Highway
Santa Monica Pier access => Santa Monica Pier Access
Donald Douglas Loop North => Donald Douglas Loop North
Santa Monica Bvd => Santa Monica Boulevard
Third Street Promenade => Third Street Promenade
Via Marina => Via Marina
Entrada Dr => Entrada Drive
Civic Center => Civic Center
Ocean Bd. => Ocean Boulevard.
West Pico => West Pico
Olive Ave => Olive Avenue
Ohio Ave => Ohio Avenue
Montana Ave => Montana Avenue
Donald Douglas Loop South => Donald Douglas Loop South
Olympic Blvd. => Olympic Boulevard
Santa Monica Boulevard, Suite 1180 => Santa Monica Boulevard
Ocean Front Walk => Ocean Front Walk
15th Street Ste. 1101 => 15th Street
Pico Blvd => Pico Boulevard
Washington Blvd => Washington Boulevard
Wilshire Blvd => Wilshire Boulevard
W Washington Blvd => W Washington Boulevard
Santa Monica Blvd => Santa Monica Bo

In [196]:
# Mapping variables were updated: see full list above
for st_type, ways in smc_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, street_mapping)
        # Only show changed street names in way nodes 
        # since there are a lot more in this city data
        if name != better_name:
            print name, "=>", better_name
# One street would need to be cleaned a lot: 1833 8th Street Northwestalbuquerque Nm 87102
# But almost all have been cleaned pretty well.

Ocean Bd. => Ocean Boulevard.
Walnut Ln => Walnut Lane
Main St. => Main Street
Santa Monica Bvd => Santa Monica Boulevard
Santa Monica Boulevard, Suite 1180 => Santa Monica Boulevard
Olive Ave => Olive Avenue
Ohio Ave => Ohio Avenue
Montana Ave => Montana Avenue
Santa Monica Pier access => Santa Monica Pier Access
15th Street Ste. 1101 => 15th Street
Pico Blvd => Pico Boulevard
Washington Blvd => Washington Boulevard
Wilshire Blvd => Wilshire Boulevard
W Washington Blvd => W Washington Boulevard
Santa Monica Blvd => Santa Monica Boulevard
Olive ave => Olive Avenue
Entrada Dr => Entrada Drive
Olympic Blvd. => Olympic Boulevard


In [197]:
def check_postcode(osmfile):
    osm_file = open(osmfile, "r")
    key_list = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'].find("addr:")!=-1:
                    key_list[tag.attrib['k']]+=1

    return key_list       

postcode_list = check_postcode(SMC_DATA)
print postcode_list
    

defaultdict(<type 'int'>, {'addr:housenumber': 149, 'addr:interpolation': 1, 'addr:city': 121, 'addr:postcode': 149, 'addr:full': 1, 'addr:housename': 12, 'addr:state': 90, 'addr:country': 11, 'addr:street': 155})


In [170]:
def check_postcode(osmfile):
    osm_file = open(osmfile, "r")
    postcode_list = defaultdict(int)
    problem_postcode = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k']=="addr:postcode":
                    postcode_list[tag.attrib['v']]+=1

    return postcode_list       

postcode_list = check_postcode(SMC_DATA)
print postcode_list

defaultdict(<type 'int'>, {'90025-9998': 1, 'CA 90291': 1, '90401-2405': 1, '90025': 24, '90291': 13, '90292': 28, 'CA 90405': 3, '90401': 20, '90402': 5, '90403': 9, '90404': 17, '90405': 11, '90064': 4, '90272': 4, 'CA 90272': 1, '90291-3879': 1, 'CA 90404': 1, '90064-1508': 1, '90049': 2, 'CA 90401': 1, '90272-3719': 1})


In [173]:
# Covers cases encountered in cleaning 
def check_5_digits(new_postal):
    # Make sure postal code is 5 digits
    if len(str(new_postal)) == 5:
        return new_postal
    # Else return postal code with descriptive error message attached.
    else:
        return "NOT 5 DIGITS:"+str(new_postal)     

def update_postcode(postal):
    # Try to convert numbers as intended. 
    # No need to check for conditions with if statements if the vast majority will be valid.
    try:
        new_postal = int(postal)
        # Check for 5 sigits with another function.
        return check_5_digits(new_postal)
    except ValueError as value_error:
        # Check if it's a string with a hyphen containing 4 trailing digits'
        if '-' in postal:
            # Grab first digits and convert to int.
            postal = int(postal.split('-')[0])
            # Check for 5 sigits with another function.
            return check_5_digits(postal)
        if ' ' in postal:
            postals = postal.split(' ')
            new_postals = [p for p in postals if len(p)==5]
            try:
                # Check the first entry for a valid 5 digit number
                new_postals = int(new_postals[0])
                return check_5_digits(new_postals)
            # Otherwise return the error thrown and values.
            except Exception as err:
                return err, new_postals
        else:
            # Return the ValueError thrown and value if it doesn't meet these criteria
            return ValueError + ":" + str(postal)

In [181]:
for postcode in postcode_list.keys():
    better_postcode = update_postcode(postcode)
    print postcode,"=>", better_postcode

90025-9998 => 90025
CA 90291 => 90291
90401-2405 => 90401
90025 => 90025
90291 => 90291
90292 => 90292
CA 90405 => 90405
90401 => 90401
90402 => 90402
90403 => 90403
90404 => 90404
90405 => 90405
90064 => 90064
90272 => 90272
CA 90272 => 90272
90291-3879 => 90291
CA 90404 => 90404
90064-1508 => 90064
90049 => 90049
CA 90401 => 90401
90272-3719 => 90272


In [182]:
#check state and country
def check_state(osmfile):
    osm_file = open(osmfile, "r")
    state_list = defaultdict(int)
    problem_postcode = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k']=="addr:state":
                    state_list[tag.attrib['v']]+=1
    return state_list  

state_list= check_state(SMC_DATA)
print state_list

defaultdict(<type 'int'>, {'CA': 88, 'CALIFORNIA': 1, 'CA,': 1}) defaultdict(<type 'int'>, {'US': 11})


In [183]:
def update_state(state_name):
    if state_name!="CA":
        state_name="CA"
    return state_name

for state in state_list:
    better_state = update_state(state)
    print state,"=>",better_state

CA => CA
CALIFORNIA => CA
CA, => CA


In [None]:
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    node['created'] = {}
    node['address'] = {}
    node['pos']=[]
    node['node_refs'] = []
    
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type']=element.tag
        if "id" in element.attrib.keys():
            node['id']=element.attrib['id']
        if "visible" in element.attrib.keys():
            node['visible']=element.attrib['visible']
        
        for elem in CREATED:
            if elem in element.attrib.keys():
                node['created'][elem]=element.attrib[elem]
                
        if "lat" in element.attrib:
            node["pos"].append(float(element.attrib["lat"]))
        if "lon" in element.attrib:
            node["pos"].append(float(element.attrib["lon"]))
        
        for tag in element.iter('tag'):
            if not(problemchars.search(tag.attrib['k'])):
                if tag.attrib['k'] == "addr:country":
                    node["address"]["country"]=tag.attrib['v']
                if tag.attrib['k'] == "addr:state":
                    node["address"]["state"]=update_state(tag.attrib['v'])
                if tag.attrib['k'] == "addr:city":
                    node["address"]["city"]=update_city(tag.attrib['v'],city_mapping)
                if tag.attrib['k'] == "addr:housenumber":
                    node["address"]["housenumber"]=tag.attrib['v']
                if tag.attrib['k'] == "addr:postcode":
                    node["address"]["postcode"]=update_postcode(tag.attrib['v'])
                if tag.attrib['k'] == "addr:street":
                    node["address"]["street"]=update_street(tag.attrib['v'],street_mapping)
                if tag.attrib['k'].find("addr")==-1:
                    node[tag.attrib['k']]=tag.attrib['v']
            
        for nd in element.iter("nd"):
            node["node_refs"].append(nd.attrib["ref"]) 
        
        if node["address"] =={}:
            node.pop("address", None)
        
        if node["node_refs"]==[]:
            node.pop("node_refs",None)
        
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [229]:
data = process_map("data/santa-monica_california.osm", False)

In [230]:
## Functions to load our database and collection in pymongo

from pymongo import MongoClient

# Function to return a database of the name specified.
# We want a database named 'project' in this case.
def get_db(db_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

## Function to return the collection we want to use in MongoDB
def get_collection(db, collection):
    collections_db = db[collection]
    return collections_db

## Function to insert json data into MongoDB
def insert_data(json_data, db_collection):
    with open(json_data, 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/
        ## /loading-several-text-files-into-mongodb-using-pymongo
        for each_line in f.readlines():
            db_collection.insert(json.loads(each_line))
    print("Complete.")

In [231]:
def map_aggregate(db, collection, pipeline):
    db_collection = db[collection]
    result = db_collection.aggregate(pipeline)
    return result

In [232]:
# Get 'project' database
db = get_db('project')

# Get 'cities' collection in the 'project' database
# Put honolulu and albuquerque city data in this collection.
db_cities = get_collection(db, 'cities')

In [233]:
smc_json_data = 'data/santa-monica_california.osm.json'  
insert_data(smc_json_data, db_cities)

Complete.


In [234]:
db.cities

Collection(Database(MongoClient('localhost', 27017), u'project'), u'cities')

In [235]:
def make_city_pipeline(city):
    pipeline = [{"$match":{"created.user":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                     "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('Santa Monica')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

cursor = db.points.aggregate(...)

# Option 1
print(list(cursor))

SyntaxError: invalid syntax (<ipython-input-235-2ab6febccb84>, line 19)

In [282]:
#top 5 contirbutors#
result1 = db.cities.aggregate([{"$match":{"created.user":{"$exists":1}}},
                {"$group": 
                 {"_id": {"City":"$city_name",
                          "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }])

pprint.pprint(list(result1))

[{u'Count': 13242, u'User': u'techlady'},
 {u'Count': 8784, u'User': u'Rovastar'},
 {u'Count': 6808, u'User': u'StellanL'},
 {u'Count': 5206, u'User': u'bdiscoe'},
 {u'Count': 2876, u'User': u'mdapol'}]


In [283]:
result2 = db.cities.aggregate([{"$match":{"address.city":"Santa Monica"}},
                {"$group": 
                 {"_id": {"postcode":"$address.postcode"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "postcode":"$_id.postcode",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},])

pprint.pprint(list(result2))

[{u'Count': 38, u'postcode': 90401},
 {u'Count': 32, u'postcode': 90404},
 {u'Count': 28, u'postcode': None},
 {u'Count': 16, u'postcode': 90405},
 {u'Count': 8, u'postcode': 90403},
 {u'Count': 2, u'postcode': 90402}]


In [269]:
#number of documents

db.cities.find().count()  

72368

In [271]:
db.cities.find({"type":"node"}).count()

62954

In [272]:
db.cities.find({"type":"way"}).count()

9406

In [276]:
len(db.cities.distinct("created.user"))

278

In [287]:
# Top 10 appearing amenities
                                                
result3 = db.cities.aggregate([{"$match":{"amenity":{"$exists":1}}}, {"$group":{"_id":"$amenity",
"count":{"$sum":1}}}, {"$sort":{"count":1}}, {"$limit":10}])

pprint.pprint(list(result3))

[{u'_id': u'fountain', u'count': 2},
 {u'_id': u'bus_station', u'count': 2},
 {u'_id': u'community_centre', u'count': 2},
 {u'_id': u'child care', u'count': 2},
 {u'_id': u'cemetery', u'count': 2},
 {u'_id': u'arts_centre', u'count': 2},
 {u'_id': u'car_rental', u'count': 2},
 {u'_id': u'courthouse', u'count': 2},
 {u'_id': u'college', u'count': 2},
 {u'_id': u'townhall', u'count': 4}]


In [300]:
## Find the most popular places of worship
result4 =db.cities.aggregate([{"$match":{"amenity":{"$exists":1},"amenity":"place_of_worship"}},
                       {"$group":{"_id": {"City":"$city_name",
                                          "Religion":"$religion"},
                                  "count":{"$sum":1}}},
                       {"$project":{"_id":0,
                                    "City":"$_id.City",
                                    "Religion":"$_id.Religion",
                                    "Count":"$count"}},
                       {"$sort":{"Count":-1}}])

pprint.pprint(list(result4))

[{u'Count': 92, u'Religion': u'christian'},
 {u'Count': 4, u'Religion': u'buddhist'},
 {u'Count': 2},
 {u'Count': 2, u'Religion': u'jewish'}]


In [304]:
# most popular food#

result5 =db.cities.aggregate([{"$match":{"amenity":{"$exists":1},
                       "amenity":"restaurant"}},
            {"$group":{"_id":{"City":"$city_name","Food":"$cuisine"},
                       "count":{"$sum":1}}},
            {"$project":{"_id":0, 
                         "Food":"$_id.Food", "Count":"$count"}},
            {"$sort":{"Count":-1}},
            {"$limit":6}])
    
pprint.pprint(list(result5))

[{u'Count': 70},
 {u'Count': 16, u'Food': u'american'},
 {u'Count': 10, u'Food': u'italian'},
 {u'Count': 10, u'Food': u'mexican'},
 {u'Count': 8, u'Food': u'burger'},
 {u'Count': 4, u'Food': u'pizza'}]
