In [1]:
import xml.etree.ElementTree as ET
import time
import pprint
import re
import codecs
import json
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.London

fname = r'C:\Users\YJ\Documents\1) Learning\Udacity - Data Analyst\Submissions\004\Central London Sample.osm'

In [2]:
## This steps analyses how many different parent tags are there in the data

In [3]:
element_tags={}
for event, element in ET.iterparse(fname,events = ('start',)):
    if list(element):
        if element.tag not in element_tags:
            element_tags[element.tag]=1
        else:
            element_tags[element.tag]+=1   
pprint.pprint(element_tags)

{'node': 4240, 'osm': 1, 'relation': 296, 'way': 5253}


In [4]:
## As we can see there are 4 main different parent elements which we will have to deal with in the sample set.
## Diving deeper, we will be using the details function below to extract information about each parent element
## Specifically, we will investigate the following:
## 1) What element attributes do each of these parent element has
## 2) What are the data types of the element attributes
## 3) What are the tags of the children of each element
## 4) What element attributes of the children element
## 5) What are the data types of the children element attributes

In [5]:
# Following 2 functions check if element has child elements and if element has attributes

In [6]:
def gotchildren(element):
    return list(element)

In [7]:
def gotattrib(element):
    return dict(element.attrib)

In [8]:
def summary(element,element_attrib,element_attrib_type,child_tags,child_attrib,child_attrib_type):
    #check if element have children
    if gotchildren(element):
        for child in element:
            #count child tags
            if child.tag not in child_tags:
                child_tags[child.tag]=1
            else:
                child_tags[child.tag]+=1
            #child attribs
            if gotattrib(child):
                for key in child.attrib.keys():    
                    if key not in child_attrib:
                        child_attrib[key]=1
                    else:
                        child_attrib[key]+=1

                    if type(child.attrib[key]) not in child_attrib_type:
                        child_attrib_type[type(child.attrib[key])]=1
                    else:
                        child_attrib_type[type(child.attrib[key])]+=1 


    #check if element has attributes
    if gotattrib(element):
        for key in element.attrib.keys():
            if type(element.attrib[key]) not in element_attrib_type:
                element_attrib_type[type(element.attrib[key])]=1
            else:
                element_attrib_type[type(element.attrib[key])]+=1

            if key not in element_attrib:
                element_attrib[key]=1
            else:
                element_attrib[key]+=1
    
    return element_attrib,element_attrib_type,child_tags,child_attrib,child_attrib_type

In [9]:
## Looking at the parent tag of 'node' a little closer,

In [10]:
def node_details(tagname):
    element_attrib= dict()
    element_attrib_type = dict()
    child_tags = dict()
    child_attrib = dict()
    child_attrib_type = dict()

    for event, element in ET.iterparse(fname,events = ('start',)):

        if element.tag==tagname:

            element_attrib,element_attrib_type,child_tags,child_attrib,child_attrib_type = summary(element,element_attrib,element_attrib_type,child_tags,child_attrib,child_attrib_type)

    print "element_attrib : {}".format(element_attrib)
    print "element_attrib_type : {}".format(element_attrib_type)
    print "child_tags : {}".format(child_tags)
    print "child_attrib : {}".format(child_attrib)
    print "child_attrib_type : {}".format(child_attrib_type) 


In [11]:
node_details('node')

element_attrib : {'changeset': 24718, 'uid': 24718, 'timestamp': 24718, 'lon': 24718, 'version': 24718, 'user': 24718, 'lat': 24718, 'id': 24718}
element_attrib_type : {<type 'unicode'>: 113, <type 'str'>: 197631}
child_tags : {'tag': 16232}
child_attrib : {'k': 16232, 'v': 16232}
child_attrib_type : {<type 'unicode'>: 25, <type 'str'>: 32439}


In [12]:
## We note that there are 2 different data types in the parent element of "node".  We can easily modify it such as change the unicode 
## into 'str' data using the following function

In [13]:
import unicodedata
def uniToStr(data):
    return unicodedata.normalize('NFKD', data).encode('ascii','ignore')

In [14]:
## Next we analyse the element.tag == 'way'

In [15]:
node_details('way')

element_attrib : {'changeset': 5259, 'uid': 5259, 'timestamp': 5259, 'version': 5259, 'user': 5259, 'id': 5259}
element_attrib_type : {<type 'unicode'>: 34, <type 'str'>: 31520}
child_tags : {'tag': 19692, 'nd': 33905}
child_attrib : {'k': 19692, 'ref': 33905, 'v': 19692}
child_attrib_type : {<type 'unicode'>: 94, <type 'str'>: 73195}


In [16]:
# We can see the parent node "way" is similar to that of "node" other than it has 2 different types of child tags instead of 1
# Another consistent theme is that element attrib for both "node" and "way" seems to be a system generated details 
# as details are well formatted

In [17]:
for event, element in ET.iterparse(fname,events = ('start',)):
    if element.tag=='way':
        if gotattrib(element):
            print element.attrib

{'changeset': '27537218', 'version': '20', 'uid': '1016290', 'timestamp': '2014-12-17T19:33:05Z', 'id': '1530592', 'user': 'Amaroussi'}
{'changeset': '27327472', 'version': '10', 'uid': '1016290', 'timestamp': '2014-12-08T07:15:32Z', 'id': '2263976', 'user': 'Amaroussi'}
{'changeset': '33116469', 'version': '10', 'uid': '322039', 'timestamp': '2015-08-05T01:15:27Z', 'id': '2484356', 'user': 'MacLondon'}
{'changeset': '24187652', 'version': '10', 'uid': '1016290', 'timestamp': '2014-07-16T18:58:32Z', 'id': '2599449', 'user': 'Amaroussi'}
{'changeset': '21043812', 'version': '18', 'uid': '1016290', 'timestamp': '2014-03-11T12:32:37Z', 'id': '2644766', 'user': 'Amaroussi'}
{'changeset': '15667557', 'version': '5', 'uid': '508', 'timestamp': '2013-04-09T16:45:02Z', 'id': '2837144', 'user': 'Welshie'}
{'changeset': '20105566', 'version': '15', 'uid': '508', 'timestamp': '2014-01-20T15:30:08Z', 'id': '2876892', 'user': 'Welshie'}
{'changeset': '32552936', 'version': '24', 'uid': '88164', 'ti

In [18]:
# Looking at the attributes of the child element "tag" of the "way" element 
# Following fuction gives us a list of the unique keys in the way->tag->attrib_key

In [19]:
way_child_attrib_data=dict()
valueK = []
valueV = []

for event, element in ET.iterparse(fname,events = ('start',)):
    if element.tag=='way':
        if gotchildren(element):
            for child in element:
                if gotattrib(child) and child.tag=='tag':
                    for key in child.attrib.keys():
                        if key == 'k': #and (child.attrib[key].startswith('addr:') or child.attrib[key] in ['postal_code','building']):
                            if child.attrib[key] not in valueK:
                                valueK.append(child.attrib[key])
                        #elif key == 'v' and child.attrib[key] not in valueV:
                            #valueV.append(child.attrib[key])

pprint.pprint(valueK)


['lit',
 'ref',
 'name',
 'lanes',
 'oneway',
 'source',
 'highway',
 'surface',
 'maxspeed',
 'operator',
 'sidewalk',
 'note:highway',
 'abutters',
 'minimum',
 'foot',
 'bicycle',
 'cycleway',
 'junction',
 'wikipedia',
 'class:bicycle:commute',
 'source:ref',
 'postal_code',
 'note',
 'cycleway:right',
 'gauge',
 'layer',
 'usage',
 'bridge',
 'tracks',
 'railway',
 'voltage',
 'frequency',
 'electrified',
 'track_detail',
 'passenger_lines',
 'source:name',
 'lcn',
 'cycleway:left',
 'cycleway:left:width',
 'cycleway:right:width',
 'source:cycleway',
 'tunnel',
 'horse',
 'oneway:bicycle',
 'fixme',
 'created_by',
 'footway',
 'maxweight',
 'incorrect_name',
 'access',
 'old_name',
 'source:old_name',
 'busway',
 'name:ru',
 'alt_name',
 'psv',
 'not:name',
 'building',
 'bus',
 'area',
 'sidewalk:width',
 'construction',
 'traffic_calming',
 'leisure',
 'hgv',
 'service',
 'landuse',
 'reg_ref',
 'motor_vehicle',
 'water',
 'natural',
 'hazmat',
 'tiger:cfcc',
 'tiger:tlid',
 'ti

In [20]:
# Investigating the address specifically we note that there are a few fields in specific that has address details
# Namely, keys that begins with 'addr:' or contains 'postal_code'

In [21]:
way_child_attrib_data=dict()
address = dict()

for event, element in ET.iterparse(fname,events = ('start',)):
    if element.tag=='way':
        if gotchildren(element):
            for child in element:
                if gotattrib(child) and child.tag=='tag':
                    if child.attrib['k'].startswith('addr:') or child.attrib['k'] in ['postal_code']:
                        if child.attrib['k'] not in address:
                            address[child.attrib['k']] = []
                        if child.attrib['v'] not in address[child.attrib['k']]:
                            address[child.attrib['k']].append(child.attrib['v'])

pprint.pprint(address)


{'addr:city': ['London',
               'Bermondsey',
               'London Borough of Southwark',
               'Heist-op-den-Berg',
               u'Pozna\u0144',
               u'O\u0142awa',
               'London Borough of Lambeth',
               'Ladzin',
               'Lexington',
               'Virginia Beach',
               'Milwaukee',
               'Bergheim',
               'Quetzaltenango',
               'Ronse',
               'Legnica',
               u'Jelenia G\xf3ra',
               'Norfolk',
               'Santo Domingo',
               'Reinheim',
               'Naic',
               u'Klon\xf3w',
               u'\u0141\xf3d\u017a',
               u'Le\u017cajsk',
               'Botolan'],
 'addr:city:simc': ['0358894'],
 'addr:country': ['GB', 'DE', 'BE', 'PL'],
 'addr:housename': ['1 Portsmouth Street',
                    'Drysdale Building',
                    'Munro House',
                    'Dunkirk House',
                    'James Clerk Max

In [22]:
# We note that there does not seem to be much abbreviation used for the street name and generally, there are no inconsistencies.
# However, there are a few problems withe the data:

# 1) Some postcodes are in numeric terms which is wrong given that London postcodes are alphanumeric
# 2) There are multiple field for postal codes.  We should be able to consolidate them
# 3) We are getting addresses of DE BE PL.  Given this is a London map, we should be strictly expecting GB only
# 4) There should not be a addr:state field for any of London data
# 5) Neither should there be a addr:county  field 


In [23]:
#removing all the countries that are not 'GB' and merge 'postal_code' with 'addr:postcode'
way_child_attrib_data=dict()
address = dict()

for event, element in ET.iterparse(fname,events = ('start',)):
    if element.tag=='way':
        if gotchildren(element):
            tempaddress=dict()
            for child in element:
                if gotattrib(child) and child.tag=='tag':
                    if child.attrib['k'].startswith('addr:') or child.attrib['k'] in ['postal_code']:
                        if child.attrib['k'] not in tempaddress:
                            tempaddress[child.attrib['k']] = []
                        if child.attrib['v'] not in tempaddress[child.attrib['k']]:
                            tempaddress[child.attrib['k']]=child.attrib['v']
            if 'addr:country' in tempaddress:
                if tempaddress['addr:country'] != 'GB':
                    continue
                else:
                    for key in tempaddress:
                        if key not in address:
                            address[key]=[]
                        if tempaddress[key] not in address[key]:
                            address[key].append(tempaddress[key])
            else:
                for key in tempaddress:
                    if key not in address:
                        address[key]=[]
                    if tempaddress[key] not in address[key]:
                        address[key].append(tempaddress[key])


for key in address:
    if key=='postal_code':
        for each in address[key]:
            address['addr:postcode'].append(each)
address.pop('postal_code')
pprint.pprint(address)

{'addr:city': ['London',
               'Bermondsey',
               'London Borough of Southwark',
               'Heist-op-den-Berg',
               u'Pozna\u0144',
               u'O\u0142awa',
               'London Borough of Lambeth',
               'Ladzin',
               'Lexington',
               'Virginia Beach',
               'Milwaukee',
               'Quetzaltenango',
               'Legnica',
               u'Jelenia G\xf3ra',
               'Norfolk',
               'Santo Domingo',
               'Naic',
               u'Klon\xf3w',
               u'Le\u017cajsk',
               'Botolan'],
 'addr:city:simc': ['0358894'],
 'addr:country': ['GB'],
 'addr:housename': ['1 Portsmouth Street',
                    'Drysdale Building',
                    'Munro House',
                    'Dunkirk House',
                    'James Clerk Maxwell Building',
                    'Sanctuary Buildings',
                    'Ludgate House',
                    'Old War Office',

In [24]:
# After this preliminary sense check, we will be able to import the data into MongoDB

In [25]:
mapping_last={}
mapping_else={}

def update_name(name, mapping,last_trigger):
    if last_trigger == 1:
        namelist = name.split()
        name_type = namelist[len(namelist)-1]
        if name_type in mapping.keys():
            name_type = mapping[name_type]
            print name
        namelist[len(namelist)-1] = name_type

        name = " ".join(namelist)
        return name
    elif last_trigger ==0: 
        namelist = name.split()
        for name_type in namelist[0:-1]:
            if name_type in mapping.keys():
                namelist[namelist.index(name_type)] = mapping[name_type]
                print name

        name = " ".join(namelist)
        return name

In [26]:
address_double_colon= re.compile(r'^addr:([a-zA-Z]+?):')

def shape_element(element):
    node = {}
    address={}
    created={"version":None,
            "changeset":None,
            "timestamp":None,
            "user":None,
            "uid":None}
    node_refs=[]
    parking=[]
    pos = [None,None]
    #print element.tag, element.attrib
    if element.tag == "node" or element.tag == "way" :
        node['type']=element.tag
        for key in element.attrib:
            #print key, element.attrib[key]
            if key in created.keys():
                created[key]=element.attrib[key]
            elif key == 'lat':
                pos[0] = float(element.attrib[key])
            elif  key =='lon':
                pos[1] = float(element.attrib[key])
            elif key in ['id','visible']:
                node[key]=element.attrib[key]
        #print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
        for child in element:
            if child.tag == 'nd':
                if child.get('ref'):
                    node_refs.append(child.attrib['ref'])
            if child.tag =='tag':
                if child.attrib['k']=='parking':
                    parking.append(child.attrib['v'])
                if child.attrib['k'].startswith('addr:'):
                    
                    if address_double_colon.search(child.attrib['k']):
                        pass
                    else:
                        mykey = re.findall('^addr:([a-zA-Z]*)',child.attrib['k'])
                        address[mykey[0]] = child.attrib['v']
        node['pos']=pos
        node['created']=created
        if address:
            if 'street' in address:
                address['street']=update_name(address['street'],mapping_last,1)
                address['street']=update_name(address['street'],mapping_else,0)
            node['address'] =address
        if node_refs:
            node['node_refs'] =node_refs   
        if parking:
            node['parking']=parking
        return node
    else:
        return None

In [27]:
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    file_in = file_in+".osm"
    #data = []
    db.London.drop()
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                #data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
                db.London.insert_one(el)
    #return data

In [28]:
fname1 = r'C:\Users\YJ\Documents\1) Learning\Udacity - Data Analyst\Submissions\004\Central London'
data = process_map(fname1, True)

In [29]:
# A sample set of data with address in MongoDB

In [30]:
address = db.London.find({ "address" : { "$exists" : True } })
for i in address[0:1]:
    pprint.pprint(i)

{u'_id': ObjectId('57b4f3b8675bef219074cc01'),
 u'address': {u'housenumber': u'31', u'street': u'University Street'},
 u'created': {u'changeset': u'13220663',
              u'timestamp': u'2012-09-23T14:20:55Z',
              u'uid': u'38784',
              u'user': u'Tom Morris',
              u'version': u'11'},
 u'id': u'108042',
 u'pos': [51.5235442, -0.1355991],
 u'type': u'node'}


In [31]:
# Looking at the most common words used for street names we noticed that certain words that are spelled differently 

In [32]:
def common_streetnames(limit):
    aggaddr = db.London.aggregate([
            {'$match':{'address.street':{'$exists':1}}},
            {'$group':{'_id':'$address.street'}}
        ])

    worddict=dict()
    for row in aggaddr:

        for each in row['_id'].split():
            if each in worddict:
                worddict[each]+=1
            else:
                worddict[each]=1

                import operator
    sorted_worddict = sorted(worddict.items(), key=operator.itemgetter(1))
    pprint.pprint(sorted_worddict[-limit:])

common_streetnames(30)

[(u'Grove', 12),
 (u'High', 13),
 (u'John', 14),
 (u'Park', 14),
 (u'Upper', 16),
 (u'St.', 16),
 (u'Bridge', 16),
 (u'Green', 16),
 (u'street', 17),
 (u'Saint', 17),
 (u'Yard', 17),
 (u'Gardens', 22),
 (u'Old', 23),
 (u'New', 24),
 (u'Terrace', 26),
 (u'Close', 29),
 (u'Great', 32),
 (u'Walk', 36),
 (u'Way', 38),
 (u'Row', 39),
 (u'St', 43),
 (u'Mews', 47),
 (u'Drive', 49),
 (u'Avenue', 53),
 (u'Court', 56),
 (u'Lane', 80),
 (u'Square', 100),
 (u'Place', 157),
 (u'Road', 190),
 (u'Street', 979)]


In [33]:
# We wll be able to merge "St", "St.",'street' into "Street"
# Also St. could be read as Saint and could appear anywhere in the name.  We will use mapping_else to correct for that.

In [34]:
mapping_last={u'St': "Street",u'St.': "Street","street":"Street"}
mapping_else={u'St':u'St.',u'Saint':u'St.','Katharines':"Katharine's"}

In [35]:
fname1 = r'C:\Users\YJ\Documents\1) Learning\Udacity - Data Analyst\Submissions\004\Central London'
data = process_map(fname1, True)

Saint Katherine's Way
St Katharines Dock
St Katharines Dock
St John's Square
St John Street
St Katharine's Way
Stamford street
St Martin's Place
St Bride Street
Soho St.
Saint Giles High Street
Bow street
St Martin's Lane
St James
St James
Amwell street
St James's Street
Cleveland street
Cleveland street
Holland street
Shoreditch High St
Museum street
St James's Street
St James's Street
St James's Street
Isabella street
St Martin's Lane
St Martin's Lane
St Martin's Lane
St Martin's Court
Saint Martin's Court
Cowcross street
Floral street
St Swithins Lane
St Swithins Lane
St Chad's Street
Southwark street
St Katherine's Way
St John Street
Cleveland street
Upper Saint Martin's Lane
Saint Mark Street
Saint Mark Street
Saint John Street
St John's Lane
St Martin's Lane
St Giles High Street
Saint John's Lane
St John's Lane
Stamford street
St Andrew Street
St John Street
Upper Tachbrook St
St Cross Street
St Cross Street
Saint George's Square
St John's Lane
Upper St Martin's Lane
Central St
S

In [36]:
common_streetnames(100)

[(u'Boulevard', 4),
 (u'Red', 4),
 (u'Dean', 4),
 (u'Mount', 4),
 (u'1', 4),
 (u'Mill', 4),
 (u'Cumberland', 4),
 (u'Oak', 4),
 (u'Artillery', 4),
 (u'Fields', 4),
 (u'Woburn', 4),
 (u'Euston', 4),
 (u'Grafton', 4),
 (u'Eaton', 4),
 (u'Connaught', 4),
 (u'Bow', 4),
 (u'Bermondsey', 4),
 (u'Southwark', 4),
 (u'Montague', 5),
 (u'Holborn', 5),
 (u"Martin's", 5),
 (u'White', 5),
 (u'Landing', 5),
 (u'Tower', 5),
 (u'Royal', 5),
 (u'Quay', 5),
 (u"James's", 5),
 (u'Horse', 5),
 (u'King', 5),
 (u'Church', 5),
 (u'Wharf', 5),
 (u'Little', 5),
 (u'Palace', 5),
 (u'Bedford', 5),
 (u'Devonshire', 5),
 (u'Oxford', 5),
 (u'Wall', 5),
 (u'Cambridge', 6),
 (u'James', 6),
 (u'Charles', 6),
 (u'road', 6),
 (u'Circle', 6),
 (u'Belgrave', 6),
 (u'Lambeth', 6),
 (u'Cross', 6),
 (u'Hyde', 6),
 (u'Finsbury', 6),
 (u'York', 6),
 (u'Garden', 6),
 (u'Wascana', 6),
 (u'Victoria', 6),
 (u'Marylebone', 7),
 (u'Sloane', 7),
 (u'East', 7),
 (u'Buildings', 7),
 (u'Queen', 8),
 (u'Crescent', 8),
 (u'Grosvenor', 8),

In [37]:
# Looking at the top 100 words used we noticed that there are certain words that are not english characters e.g.
# u'\u0443\u043b\u0438\u0446\u0430' and u'Stra\xdfe'

In [38]:
aggaddr = db.London.aggregate([
        {'$match':{'address.street':{'$exists':1}}},
        {'$group':{'_id':'$address.street'}}
    ])
for row in aggaddr:
    if u'Stra\xdfe' in row['_id'].split() or u'\u0443\u043b\u0438\u0446\u0430'in row['_id'].split():
        print row['_id']


Нижняя улица
Mainzer Straße
Brauweiler Straße
Frankfurter Straße
улица Красный Октябрь
Наличная улица
Fladnitzer Straße
Langenbochumer Straße
Железноводская улица
улица Дзержинского
Thüringer Straße
Июльская улица
Цветочная улица
Grünenberger Straße
Gösslinger Straße
улица Сергеева-Ценского
Полевая улица
Сосновая улица
Кленовая улица
Groß-Bieberauer Straße
Транспортная улица


In [39]:
import re

def delete_address(phrase):
    regx = re.compile(phrase)

    query = {
        'address.street': regx
    }
    result = db.London.find(query)
    for i in result:
        pprint.pprint(i)
    db.London.delete_many(query)

In [40]:
delete_address(u'Stra\xdfe')
delete_address(u'\u0443\u043b\u0438\u0446\u0430')

{u'_id': ObjectId('57b4f553675bef219081a38d'),
 u'address': {u'city': u'Saarbr\xfccken',
              u'housenumber': u'21',
              u'postcode': u'66121',
              u'street': u'Th\xfcringer Stra\xdfe'},
 u'created': {u'changeset': u'38671197',
              u'timestamp': u'2016-04-18T16:08:02Z',
              u'uid': u'344561',
              u'user': u'FahRadler',
              u'version': u'4'},
 u'id': u'92886754',
 u'node_refs': [u'4130843909'],
 u'pos': [None, None],
 u'type': u'way'}
{u'_id': ObjectId('57b4f566675bef21908204ff'),
 u'address': {u'city': u'Herten',
              u'housenumber': u'6',
              u'postcode': u'45701',
              u'street': u'Robert-Koch-Stra\xdfe'},
 u'created': {u'changeset': u'38629589',
              u'timestamp': u'2016-04-16T19:25:47Z',
              u'uid': u'593899',
              u'user': u'hpduwe',
              u'version': u'4'},
 u'id': u'219867372',
 u'node_refs': [u'4126871932'],
 u'pos': [None, None],
 u'type': u'way'

In [41]:
# With the delete_address function we are able to remove addresses that are not in english characters

In [42]:
# We have cleaned up significant portions of the data however, there are other areas that require a closer look such as 
# cross referencing the results against lat-long positions (e.g. house numbers should move sequentially in a particular 
# direction) and check for spelling error for the street names etc

In [43]:
# We can see the following counts:

# 1) Number of documents in the database
print db.London.find().count()

449611


In [44]:
# 2) Number of documents which has "way" type in the database
print db.London.find({"type":"way"}).count()

78852


In [45]:
# 3) Number of documents which has "node" type in the database
print db.London.find({"type":"node"}).count()

370759


In [46]:
# 4) Number of documents which has address in the database
print db.London.find({ "address" : { "$exists" : True } }).count()

19022


In [47]:
# We note that are 835 contributors to the whole London data set
aggtot = db.London.aggregate([
        {'$match':{'address':{'$exists':1}}},
        {'$group':{'_id':'$created.user','address':{'$sum':1}}},
        {'$sort':{'address':-1}}
    ])
print len(list(aggtot))

828


In [48]:
# However, looking at the top 10 contributors, 
aggtopten = db.London.aggregate([
        {'$match':{'address':{'$exists':1}}},
        {'$group':{'_id':'$created.user','address':{'$sum':1}}},
        {'$sort':{'address':-1}},
        {'$limit':10}
    ])
toptenaddr = 0.
for i in aggtopten:
    toptenaddr +=i['address']
top10ratio = toptenaddr/db.London.find({ "address" : { "$exists" : True } }).count()

print '{:2.2f}% of addresses are contributed by the top ten contributors'.format(top10ratio*100)

59.98% of addresses are contributed by the top ten contributors


In [49]:
# Looking at all addresses which has parking 
others = db.London.aggregate([
        {'$match':{'parking':{'$exists':1}}},
        {'$match':{'address':{'$exists':1}}},
        {'$project':{'_id':'$parking',"address":'$address'}}
    ])

In [50]:
for i in others:
    pprint.pprint(i)

{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'multi-storey'],
 u'address': {u'city': u'London',
              u'housenumber': u'50',
              u'postcode': u'EC3R 6DT',
              u'street': u'Lower Thames Street'}}
{u'_id': [u'multi-storey'],
 u'address': {u'city': u'London',
              u'housenumber': u'1',
              u'postcode': u'E1 8LP',
              u'street': u'Shorter Street'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'], u'address': {u'city': u'London', u'country': u'GB'}}
{u'_id': [u'surface'],