In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        if element.tag == "node":
            node["pos"] = []
            node["pos"].append(float(element.get("lat")))
            node["pos"].append(float(element.get("lon")))
            node["type"] = "node"
        else:
            node["type"] = "way"
        for k in element.keys():
            if k in CREATED:
                if "created" not in node:
                    node["created"] = {}
                node["created"][k] = element.get(k)
            elif k not in ["lat", "lon"]:
                node[k] = element.get(k)
                
        for e in element.iter():
            if e.tag == "tag":
                k_value = e.get("k")
                v_value = e.get("v")
                if re.search(problemchars, k_value):
                    continue
                m = re.search(lower_colon, k_value)
                if m:
                    splits = k_value.split(":")
                    if k_value.startswith("addr:"):
                        if "address" not in node:
                            node["address"] = {}
                        node["address"][splits[1]] = v_value
                else:
                    node[k_value] = v_value
            if e.tag == "nd":
                if "node_refs" not in node:
                    node["node_refs"] = []
                node["node_refs"].append(e.get("ref"))
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    #data = []
    with codecs.open(file_out, "w") as fo:
        #context = ET.iterparse(file_in)
        #context = iter(context)
        #event,root = context.next()
        for _, element in ET.iterparse(file_in):
        #for event, element in context:
            el = shape_element(element)
            if el:
                #data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
            if element.tag == "node" or element.tag == "way":
                element.clear()
    #return data

In [None]:
process_map("new-delhi_india.osm")

In [1]:
import pymongo

In [2]:
from pymongo import MongoClient

In [3]:
client = MongoClient("mongodb://localhost:27017")

In [4]:
db = client.test

In [5]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'test')

In [None]:
#agg = db..aggregate([{"$match":{"created.user":"sourabh2910"}}])

In [6]:
db.delhi.find_one()

{u'_id': ObjectId('5753b18fb397fe16bb193dd9'),
 u'capital': u'2',
 u'created': {u'changeset': u'39241310',
  u'timestamp': u'2016-05-11T12:48:13Z',
  u'uid': u'290271',
  u'user': u'Zverik',
  u'version': u'48'},
 u'id': u'16173236',
 u'is_capital': u'country',
 u'is_in': u'National Capital Region, NCR, India',
 u'is_in:iso_3166_2': u'IN-DL',
 u'name': u'New Delhi',
 u'name:bat-smg': u'Naujas\u0117s Del\u0117s',
 u'name:be-tarask': u'\u041d\u044c\u044e-\u0414\u044d\u043b\u0456',
 u'name:cbk-zam': u'Nueva Delhi',
 u'name:fiu-vro': u'New Delhi',
 u'name:zh-min-nan': u'Sin Delhi',
 u'name:zh-yue': u'\u65b0\u5fb7\u91cc',
 u'place': u'city',
 u'population': u'249998',
 u'pos': [28.6138967, 77.2159562],
 u'type': u'node',
 u'wikipedia': u'en:New Delhi'}

mongoimport.exe --db test --collection delhi --file "H:\ADATA_ANALYST_NANODEGREE...\new_de....json"

In [None]:
pprint.pprint(list(agg))

In [7]:
db.delhi.distinct("address.postcode")

[u'122001',
 u'110003',
 u'110025',
 u'110011',
 u'110070',
 u'122018',
 u'110067',
 u'122002',
 u'110017',
 u'110002',
 u'110060',
 u'100006',
 u'2242',
 u'11008',
 u'110007',
 u'201014',
 u'201301',
 u'110042',
 u'250003',
 u'201307',
 u'110065',
 u'110055',
 u'110054',
 u'110015',
 u'110078',
 u'201001',
 u'122011',
 u'201011',
 u'110092',
 u'110021',
 u'122016',
 u'110016',
 u'110085',
 u'110047',
 u'110075',
 u'110043',
 u'1100016',
 u'020626',
 u'420420',
 u'110029',
 u'1100049',
 u'110074',
 u'110048',
 u'110096',
 u'121003',
 u'110005',
 u'122017',
 u'201002',
 u'201010',
 u'110034',
 u'110008',
 u'110088',
 u'110001',
 u'110035',
 u'110091',
 u'Sunpat House Village',
 u'122003',
 u'110032',
 u'110094',
 u'110093',
 u'110084',
 u'110014',
 u'203202',
 u'201303',
 u'110058',
 u'110062',
 u'110076',
 u'110087',
 u'2013010',
 u'101301',
 u'201308',
 u'110006',
 u'201005',
 u'110018',
 u'10089',
 u'110089',
 u'201306',
 u'110026',
 u'110009',
 u'122105',
 u'110019',
 u'110077',
 u'

Validity/Accuracy Issues with postcode
- Not all digits
- Not all 6 digits
Correction Needed
- Don't put errorneous data
- If only space is problem, remove space

In [8]:
db.delhi.distinct("amenity")

[u'cafe',
 u'bank',
 u'fuel',
 u'hospital',
 u'post_office',
 u'atm',
 u'pharmacy',
 u'bus_station',
 u'school',
 u'restaurant',
 u'fast_food',
 u'pub',
 u'place_of_worship',
 u'post_box',
 u'telephone',
 u'public_building',
 u'cinema',
 u'parking',
 u'bureau_de_change',
 u'police',
 u'embassy',
 u'college',
 u'university',
 u'fire_station',
 u'taxi',
 u'kindergarten',
 u'bar',
 u'toilets',
 u'grave_yard',
 u'biergarten',
 u'library',
 u'Ksan Ghat',
 u'veterinary',
 u'marketplace',
 u'dentist',
 u'doctors',
 u'club',
 u'theatre',
 u'Netaji Nagar Market',
 u'Suvidha Market, Netaji Nagar',
 u'Ayurvedic Hospital',
 u'residential',
 u'House',
 u'car_rental',
 u'driving_school',
 u'drinking_water',
 u'swimming_pool',
 u'prison',
 u'fountain',
 u'waste_basket',
 u'Garbage Collection Units',
 u'vending_machine',
 u'clinic',
 u'architect',
 u'recycling',
 u'traffic education',
 u'car_wash',
 u'arts_centre',
 u'community_centre',
 u'bench',
 u'shelter',
 u'parking_entrance',
 u'parking_space',


Consistency Issue
- doctors <-> clinic
- school <-> School
- pub <-> bar

In [9]:
db.delhi.distinct("historic")

[u'monument',
 u'archaeological_site',
 u'ruins',
 u'memorial',
 u'Shiv Mandir Temple',
 u'city_gate',
 u'castle',
 u'wayside_shrine',
 u'fort',
 u'tomb',
 u'yes',
 u'mon']

This should be converted to a single flag to denote that is is a historical site.

In [None]:
# Number of historic sites
# Numbers of schools
# Number of Universities
# Numbers of hospitals
# Find all "tourism": "attraction" places
# restaurant of chinese cuisine "cuisine": "asian", "amenity": "restaurant",

In [26]:
agg = db.delhi.aggregate(
    [
        {"$match":{"historic":{"$exists":"true"}} },
        {"$group":{"_id": "historic", "count": {"$sum":1} } }
    ]
)

In [27]:
import pprint
pprint.pprint(list(agg))

[{u'_id': u'historic', u'count': 189}]


In [32]:
agg = db.delhi.aggregate(
    [
        {"$match":{"tourism":"attraction"}},
        {"$project":{"name":1, "_id":0}}
    ]
)

In [33]:
pprint.pprint(list(agg))

[{u'name': u'South City - I Gate'},
 {u'name': u'11 Murthi'},
 {u'name': u'Teen Murti'},
 {u'name': u'Rashtrapati Bhawan'},
 {u'name': u'IFFCO Chowk'},
 {u'name': u'Zoo - National Zoological Park'},
 {u'name': u'Police Memorial'},
 {u'name': u'Jantar Mantar'},
 {u'name': u'Diwan-e-Aam'},
 {u'name': u'Diwan-e-Khaas'},
 {u'name': u'Purana Qila'},
 {u'name': u'Jaipur Column'},
 {u'name': u'Khooni Darwaza'},
 {u'name': u'Vijaymandal'},
 {u'name': u'Secret Passage'},
 {u'name': u'Sri Aurobindo Bust'},
 {u'name': u'Red Fort'},
 {u'name': u'Jama Masjid'},
 {u'name': u'Lotus Temple'},
 {u'name': u'India Gate'},
 {u'name': u'Mystery Rooms'},
 {u'name': u'Pragati Maidan'},
 {u'name': u'Delhi Haat'},
 {u'name': u'Salimgarh Fort'},
 {u'name': u'Qudsia Bagh'},
 {},
 {u'name': u'Qutub Minar'},
 {u'name': u"Safdarjung's Tomb"},
 {u'name': u'Hazar Ustun (Hall of Thousand Columns)'},
 {u'name': u'Ruins of Palatial Section of Jahanpanah'},
 {u'name': u'Kotla Firoz Shah'},
 {u'name': u'Meena Bazar'},
 {u

Can missing name be handled???

In [45]:
agg = db.delhi.aggregate([  
        {"$match":{"cuisine":"chinese"}},
        {"$project":{"name":1,"_id":0}}
    ])

In [46]:
pprint.pprint(list(agg))

[{u'name': u"Berco's Garden"},
 {},
 {u'name': u'Royal China'},
 {u'name': u'China Garden'},
 {u'name': u'Hao Shi Nian Nian'}]


In [None]:
db.collection_names

In [None]:
db.test

In [None]:
db.pa.find_one()

In [None]:
db.pa.count()

In [None]:
db.pa.find_one({"created.user":'Zverik'})