# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
from xml.etree import ElementTree as ET
document = ET.parse( 'C:\data_wrangling_xml/mondial_database.xml' )

In [6]:
from operator import itemgetter, attrgetter, methodcaller

Solution 1

In [144]:
data = []
for element in document.iterfind('country'):
    name = element.find('name').text
    if element.find('infant_mortality') is None:
        rate = None
    else:
        rate = float(element.find('infant_mortality').text)
    data.append((rate,name))
        
sorted(data, key=itemgetter(0))
    

[(None, 'Montenegro'),
 (None, 'Kosovo'),
 (None, 'Holy See'),
 (None, 'Ceuta'),
 (None, 'Melilla'),
 (None, 'Svalbard'),
 (None, 'Christmas Island'),
 (None, 'Cocos Islands'),
 (None, 'Curacao'),
 (None, 'Saint Martin'),
 (None, 'Saint Barthelemy'),
 (None, 'Niue'),
 (None, 'Norfolk Island'),
 (None, 'Pitcairn'),
 (None, 'Tokelau'),
 (None, 'Falkland Islands'),
 (1.81, 'Monaco'),
 (2.13, 'Japan'),
 (2.48, 'Norway'),
 (2.48, 'Bermuda'),
 (2.53, 'Singapore'),
 (2.6, 'Sweden'),
 (2.63, 'Czech Republic'),
 (2.73, 'Hong Kong'),
 (3.13, 'Macao'),
 (3.15, 'Iceland'),
 (3.31, 'France'),
 (3.31, 'Italy'),
 (3.33, 'Spain'),
 (3.36, 'Finland'),
 (3.4, 'Anguilla'),
 (3.46, 'Germany'),
 (3.47, 'Guernsey'),
 (3.59, 'Malta'),
 (3.64, 'Belarus'),
 (3.66, 'Netherlands'),
 (3.69, 'Andorra'),
 (3.73, 'Switzerland'),
 (3.74, 'Ireland'),
 (3.86, 'Jersey'),
 (3.93, 'South Korea'),
 (3.98, 'Israel'),
 (4.04, 'Slovenia'),
 (4.1, 'Denmark'),
 (4.16, 'Austria'),
 (4.17, 'Isle of Man'),
 (4.18, 'Belgium'),
 (4.

Solution 2

In [23]:
import unicodedata

def remove_diacritic(input):
    '''
    Accept a unicode string, and return a normal string (bytes in Python 3)
    without any diacritical marks.
    '''
    return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [26]:
cities = []
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        name = subelement.find('name').text  
        if subelement.find('population') is None:
            pop = 'No data found'
        else:
            pop = int(subelement.find('population').text)
    cities.append((pop,name))
        
sorted(cities, key=itemgetter(0), reverse=True) 

[('No data found', 'Focsani'),
 ('No data found', 'Soroe'),
 ('No data found', 'Gibraltar'),
 ('No data found', 'Saint Peter Port'),
 ('No data found', 'Saint Helier'),
 ('No data found', 'Douglas'),
 ('No data found', 'Longyearbyen'),
 ('No data found', 'Dehra Dun'),
 ('No data found', 'Putrajaya'),
 ('No data found', 'Macao'),
 ('No data found', 'Flying Fish Cove'),
 ('No data found', 'West Island'),
 ('No data found', 'Tabouk'),
 ('No data found', 'Naha'),
 ('No data found', 'The Valley'),
 ('No data found', 'Oranjestad'),
 ('No data found', 'Hamilton'),
 ('No data found', 'Road Town'),
 ('No data found', 'George Town'),
 ('No data found', 'Chichica'),
 ('No data found', u'San Jos\xe9 de las Lajas'),
 ('No data found', 'Nuuk'),
 ('No data found', 'Basse-Terre'),
 ('No data found', 'Puerto Carreno'),
 ('No data found', 'Grand Turk'),
 ('No data found', 'Pago Pago'),
 ('No data found', 'Avarua'),
 ('No data found', 'Papeete'),
 ('No data found', 'Agana'),
 ('No data found', 'Yaren'),


Solution 3

In [48]:
groups = []
for element in document.iterfind('country'):
    name = element.find('name').text  
    if element.find('ethnicgroup') is None:
        ethnic = "No data available"
    else:
        per = element.find('ethnicgroup').attrib
        ethnic = element.find('ethnicgroup').text
    groups.append((per,ethnic,name))
        
sorted(groups, key=itemgetter(0), reverse=True) 

[({'percentage': '99.7'}, 'Sotho', 'Lesotho'),
 ({'percentage': '99.7'}, 'No data available', 'Madagascar'),
 ({'percentage': '99.7'}, 'No data available', 'Malawi'),
 ({'percentage': '99.4'}, 'Japanese', 'Japan'),
 ({'percentage': '99.4'}, 'No data available', 'South Korea'),
 ({'percentage': '99.4'}, 'No data available', 'Maldives'),
 ({'percentage': '99.4'}, 'No data available', 'Oman'),
 ({'percentage': '99'}, 'African', 'Benin'),
 ({'percentage': '99'}, 'African', 'Nigeria'),
 ({'percentage': '99'}, 'African', 'Togo'),
 ({'percentage': '99'}, 'African', 'Gambia'),
 ({'percentage': '98'}, 'Bengali', 'Bangladesh'),
 ({'percentage': '97.7'}, 'Armenian', 'Armenia'),
 ({'percentage': '97'}, 'European', 'Argentina'),
 ({'percentage': '97'}, 'Berber Arab', 'Libya'),
 ({'percentage': '96'}, 'Polynesian', 'Tuvalu'),
 ({'percentage': '95'}, 'Albanian', 'Albania'),
 ({'percentage': '95'}, 'Chinese', 'Hong Kong'),
 ({'percentage': '95'}, 'Chinese', 'Macao'),
 ({'percentage': '95'}, 'African',