# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np
from collections import Counter

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the reduced sample file that the example cells below walk through.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [3]:
# Print the name of every child element of the document root (the countries).
# The parenthesised single-argument form of print behaves the same on
# Python 2 and Python 3, so the cell no longer depends on the 2.x statement.
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# Print each country followed by a comma-separated list of its cities.
for element in document_tree.iterfind('country'):
    # Element.iter() is the supported replacement for getiterator(), which
    # was deprecated and later removed in Python 3.9. It visits every <city>
    # descendant of the country, not only direct children.
    city_names = [city.find('name').text for city in element.iter('city')]
    # Joining avoids the trailing-separator slice, and a single print call
    # yields the same output line on Python 2 and Python 3.
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Parse the full Mondial database; the exercise cells below query this tree.
document = ET.parse('./data/mondial_database.xml')

# Question 1

In [6]:
def safe_mortality_find(country):
    """
    Given a country xml element, return the text of its first
    'infant_mortality' child (a string, not yet converted to a number).
    If the element has no such child, return NaN instead.
    """
    try:
        return country.find('infant_mortality').text
    except AttributeError:
        # find() returned None, i.e. no infant_mortality child exists.
        # np.nan is the canonical spelling; the np.NaN alias was removed
        # in NumPy 2.0 and would itself raise AttributeError here.
        return np.nan

# Pair each country's name with its infant mortality text (NaN when absent).
data = []
for country in document.findall('country'):
    data.append((country.find('name').text, safe_mortality_find(country)))

## Create a dataframe with the found data

In [7]:
# Tabulate the (country, rate) pairs and turn the rate strings into floats.
df = pd.DataFrame(data, columns=['country', 'infant_mortality'])
df['infant_mortality'] = df['infant_mortality'].astype(float)
# Ascending sort puts the lowest rates first; show the top ten.
df.sort_values('infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


# Question 2

In [8]:
def safe_population_find(element):
    """
    Return the text of the last 'population' child of the given element
    (a string, not yet converted to a number).
    The last child is treated as the most recent estimate; this assumes
    the document lists population entries in chronological order.
    Returns NaN if no population elements are found.
    """
    populations = element.findall('population')
    if not populations:
        # np.nan is the canonical spelling; the np.NaN alias was removed
        # in NumPy 2.0.
        return np.nan
    return populations[-1].text

# Collect (name, last listed population) for every <city> anywhere in the tree.
data = []
for city in document.findall('.//city'):
    data.append((city.find('name').text, safe_population_find(city)))

In [9]:
# Rows with a missing value (e.g. cities with no population element) are
# dropped before the population strings are converted to integers.
df = pd.DataFrame(data, columns=['City', 'Population']).dropna()
df['Population'] = df['Population'].astype(int)
# Descending sort puts the largest cities first; show the top ten.
df.sort_values('Population', ascending=False).head(10)

Unnamed: 0,City,Population
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


# Question 3

In [10]:
# Cycle through all countries, take each country's last listed population and
# use every ethnic group's percentage to estimate that group's head count.
# The estimates are summed per group name across all countries.

ctr = Counter()

for country in document.findall('country'):
    ethnic_groups = country.findall('ethnicgroup')
    if not ethnic_groups:
        continue

    # The population is the same for every group of a country, so look it up
    # once per country instead of once per group.
    latest_population = safe_population_find(country)
    if pd.isnull(latest_population):
        # No population element: int(NaN) would raise ValueError, and there
        # is nothing to apportion between the groups anyway.
        continue
    country_population = int(latest_population)

    for ethn_group in ethnic_groups:
        # A group without a percentage attribute contributes zero.
        share = float(ethn_group.get('percentage', default=0)) / 100
        ctr[ethn_group.text] += country_population * share

In [11]:
# Build a one-row frame with one column per ethnic group, then pivot it into
# long form so that each group becomes its own row.
wide = pd.DataFrame(ctr, index=[0])
long_form = wide.stack().reset_index()
# 'level_0' only holds the dummy row label 0 and carries no information.
df = long_form.drop('level_0', axis=1)
df.columns = ['ethnic_group', 'population']
df.sort_values('population', ascending=False).head(10)

Unnamed: 0,ethnic_group,population
113,Han Chinese,1245059000.0
120,Indo-Aryan,871815600.0
89,European,494872200.0
2,African,318325100.0
77,Dravidian,302713700.0
176,Mestizo,157734400.0
42,Bengali,146776900.0
217,Russian,131857000.0
128,Japanese,126534200.0
163,Malay,121993600.0


# Question 4

## A) Longest River

In [12]:
def safe_find(element, search):
    """
    Return the text of the first child of `element` that matches the tag
    or path `search` (as understood by Element.find).
    If nothing matches, return NaN instead of raising.
    """
    try:
        return element.find(search).text
    except AttributeError:
        # find() returned None, i.e. there is no matching child.
        # np.nan is the canonical spelling; the np.NaN alias was removed
        # in NumPy 2.0 and would itself raise AttributeError here.
        return np.nan

In [13]:
# A river can belong to several countries: its 'country' attribute holds
# space-separated codes, so keep them as a list.
data = []
for river in document.findall('river'):
    codes = river.get('country').split(' ')
    data.append((codes, safe_find(river, 'name'), safe_find(river, 'length')))

# Lookup table from a country's car_code to its full name.
country_codes = dict(
    (country.get('car_code'), country.find('name').text)
    for country in document.findall('country')
)


In [14]:
df = pd.DataFrame(data, columns=['country_countries', 'river', 'length'])
df['length'] = df['length'].astype(float)

# Swap every list of country codes for a comma-separated string of full names.
df['country_countries'] = df['country_countries'].apply(
    lambda codes: ", ".join(country_codes[code] for code in codes)
)
# Longest river first; keep only the top row.
df.sort_values('length', ascending=False).head(1)

Unnamed: 0,country_countries,river,length
174,"Colombia, Brazil, Peru",Amazonas,6448.0


## B) Largest Lake


In [15]:
# Gather (country codes, name, area) for every <lake> element in the document.
data = []
for lake in document.findall('.//lake'):
    codes = lake.get('country').split(' ')
    data.append((codes, safe_find(lake, 'name'), safe_find(lake, 'area')))

df = pd.DataFrame(data, columns=['country_countries', 'lake', 'area'])
df['area'] = df['area'].astype(float)

# Swap every list of country codes for a comma-separated string of full names.
df['country_countries'] = df['country_countries'].apply(
    lambda codes: ", ".join(country_codes[code] for code in codes)
)
# Largest lake first; keep only the top row.
df.sort_values('area', ascending=False).head(1)


Unnamed: 0,country_countries,lake,area
54,"Russia, Azerbaijan, Kazakhstan, Iran, Turkmeni...",Caspian Sea,386400.0


## C) Highest Airport

In [16]:
# Gather (country codes, name, elevation) for every top-level <airport> element.
data = []
for airport in document.findall('airport'):
    codes = airport.get('country').split(' ')
    data.append((codes, safe_find(airport, 'name'), safe_find(airport, 'elevation')))

df = pd.DataFrame(data, columns=['country_countries', 'airport', 'elevation'])
df['elevation'] = df['elevation'].astype(float)

# Swap every list of country codes for a comma-separated string of full names.
df['country_countries'] = df['country_countries'].apply(
    lambda codes: ", ".join(country_codes[code] for code in codes)
)
# Highest airport first; keep only the top row.
df.sort_values('elevation', ascending=False).head(1)

Unnamed: 0,country_countries,airport,elevation
80,Bolivia,El Alto Intl,4063.0
