# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [109]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [110]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [111]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [112]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [113]:
document = ET.parse( './data/mondial_database.xml' )

In [114]:
#10 countries with the lowest infant mortality rates
import pandas as pd
data = []
for element in document.iterfind('country'):
    country = element.find('name').text
    if element.find('infant_mortality') is not None:
        mortality = float(element.find('infant_mortality').text)
        data.append({'country': country, 'infant_mortality' : mortality}) 

df = pd.DataFrame(data)

df.sort('infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [115]:
#10 cities with the largest population
data = []
for element in document.iterfind('country/city'):
    city = element.find('name').text
    if element.find('population') is not None:
        population = float(element.find('population').text)
        data.append({'city': city, 'population' : population}) 

df = pd.DataFrame(data)
df

df.sort('population').tail(10)

Unnamed: 0,city,population
204,Taipei,2626138
153,Al Iskandariyah,2917000
84,Hanoi,3056146
205,New Taipei,3722082
166,Busan,3813814
87,Ho Chi Minh,3924435
75,Bangkok,5876000
154,Al Qahirah,6053000
123,Hong Kong,7055071
165,Seoul,10229262


In [196]:
#10 ethnic groups with the largest overall populations 
#(sum of best/latest estimates over all countries)
data2 = []
for element in document.iterfind('country'):
    #country = element.find('name').text
    #population = float(element.find('population').text)
    if element.find('ethnicgroup') is not None:
        country = element.find('name').text
        population = float(element.find('population').text)
        for ethnic in element.findall('ethnicgroup'):
            eth = ethnic.text
            eth_percent = float(ethnic.attrib['percentage'])
            data2.append({'country': country, 'population' : population,
                      'eth_percent': eth_percent, 'ethnic': eth})

df2 = pd.DataFrame(data2)
df2['eth_pop'] = df2['eth_percent'] * df2['population']
df2.groupby('ethnic')[['eth_pop']].sum().sort('eth_pop').tail(10)


Unnamed: 0_level_0,eth_pop
ethnic,Unnamed: 1_level_1
Mestizo,3554233000.0
English,4231499000.0
Dravidian,5959908000.0
German,6623219000.0
Japanese,8170627000.0
African,8632937000.0
Russian,9275844000.0
Indo-Aryan,17164540000.0
European,19286580000.0
Han Chinese,49755510000.0


In [228]:
#name and country of a) longest river, 
import pandas as pd
data = []
for element in document.iterfind('river'):
    river = element.find('name').text
    if element.find('length') is not None:
        length = element.find('length').text
        country = element.find('source').attrib['country']
    #if element.find('infant_mortality') is not None:
        #mortality = float(element.find('infant_mortality').text)
    data.append({'country': country, 'river' : river, 'length': length}) 

df = pd.DataFrame(data)
df
df.sort('length').tail(1)


Unnamed: 0,country,length,river
121,MNG,992,Selenge


In [232]:
#b) largest lake
data = []
for element in document.iterfind('lake'):
    lake = element.find('name').text
    if element.find('area') is not None and element.find('located') is not None:
        area = element.find('area').text
        country = element.find('located').attrib['country']
    #if element.find('infant_mortality') is not None:
        #mortality = float(element.find('infant_mortality').text)
    data.append({'country': country, 'lake' : lake, 'area': area}) 

df = pd.DataFrame(data)
#df
df.sort('area').tail(1)

Unnamed: 0,area,country,lake
122,981,USA,Fort Peck Lake


In [269]:
#c) airport at highest elevation
import numpy as np
data = []
for element in document.iterfind('airport'):    
    country = element.attrib['country']
    airport = element.find('name').text
    if element.find('elevation').text is not None:
        elevation = float(element.find('elevation').text)
    #print elevation
    #if element.find('area') is not None and element.find('located') is not None:
        #area = element.find('area').text
        #country = element.find('located').attrib['country']
    #if element.find('infant_mortality') is not None:
        #mortality = float(element.find('infant_mortality').text)
    data.append({'country': country, 'airport' : airport, 'elevation': elevation}) 

df = pd.DataFrame(data)
df.sort('elevation').tail(1)






Unnamed: 0,airport,country,elevation
80,El Alto Intl,BOL,4063
