# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

## Problem 1
### 10 countries with the lowest infant mortality rates

In [6]:
import pandas as pd

In [7]:
infmor = dict()
for element in document.iterfind('country'):
    for inf in list(element):
        if inf.tag == 'infant_mortality':
            infmor[element.find('name').text] = float(inf.text)
#pd.DataFrame({'date' : dict_dates.keys() , 'date_value' : dict_dates.values() })
df = pd.DataFrame({'country' : infmor.keys(), 'infant_mortality' : infmor.values()})

In [8]:
df.sort_values(by='infant_mortality', ascending = True).head(10)

Unnamed: 0,country,infant_mortality
34,Monaco,1.81
210,Japan,2.13
71,Norway,2.48
64,Bermuda,2.48
76,Singapore,2.53
106,Sweden,2.6
55,Czech Republic,2.63
143,Hong Kong,2.73
52,Macao,3.13
189,Iceland,3.15


## Problem 2
### 10 cities with the largest population

In [9]:
citypop = dict()
for element in document.iterfind('country/city'):
    for subele in list(element):
        #print subele.text
        if subele.tag == 'population':
            #print subele.text
            if subele.attrib['year'] == '2011':
                #print subele.text
                citypop[element.find('name').text] = int(subele.text)

In [10]:
df = pd.DataFrame({'city' : citypop.keys(), 'population' : citypop.values()})

In [11]:
df.sort_values(by='population', ascending = False).head(10)

Unnamed: 0,city,population
8,Beograd,1639121
14,Montevideo,1318755
56,Sofia,1270284
31,Yerevan,1060138
46,Kathmandu,1003285
20,Zagreb,686568
62,Kingston,662426
61,Rīga,658640
63,Vilnius,535631
40,Dublin,525383


## Problem 3
### 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [12]:
ethngp = dict()
for element in document.iterfind('country'):
    yrlst = []
    for inf in list(element):
        if inf.tag == 'population':
            yrlst.append(int(inf.attrib['year']))
    latest = max(yrlst)
    for inf in list(element):
        if inf.tag == 'population':
            if int(inf.attrib['year']) == latest:
                popu = float(element.find('population').text)
        if inf.tag == 'ethnicgroup':
            perc = float(inf.attrib['percentage'])
            ethngp[inf.text] = (popu * perc) / 100

In [13]:
df = pd.DataFrame({'ethnic_group' : ethngp.keys(), 'population' : ethngp.values()})

In [14]:
df['pop_sum'] = df.groupby('ethnic_group')['population'].transform(sum)
df.sort_values(['pop_sum'], ascending=[False]).head(10).drop('pop_sum', axis = 1)

Unnamed: 0,ethnic_group,population
93,Han Chinese,497555100.0
103,Indo-Aryan,171645400.0
269,Japanese,81706270.0
178,Dravidian,59599080.0
150,English,42314990.0
183,Bengali,28349230.0
265,Viet/Kinh,21381690.0
250,Mulatto,20780270.0
85,Eastern Hamitic,18777330.0
169,Mediterranean Nordic,18618090.0


## Problem 4
### name and country of a) longest river, b) largest lake and c) airport at highest elevation

### Longest river

In [15]:
rvlst = []
#rvlenlst = []
#rvconlst = []
for element in document.iterfind('river'):
    for subele in list(element):
        if subele.tag == 'name':
            rvnam = subele.text
        if subele.tag == 'length':
            rvlen = float(subele.text)
        if subele.tag == 'source':
            rvcon = subele.attrib['country']
    rvlst.append((rvnam, rvlen, rvcon))

In [16]:
rdf = pd.DataFrame(rvlst, columns=['Name', 'Length', 'C_Code']).sort_values(by='Length', ascending=False)

In [17]:
# Longest river
rdf.head(1)

Unnamed: 0,Name,Length,C_Code
174,Amazonas,6448,PE


### Largest lake

In [18]:
lklst = []
#rvlenlst = []
#rvconlst = []
for element in document.iterfind('lake'):
    for subele in list(element):
        if subele.tag == 'name':
            lknam = subele.text
        if subele.tag == 'area':
            lkarea = float(subele.text)
        if subele.tag == 'located':
            lkcon = subele.attrib['country']
    lklst.append((lknam, lkarea, lkcon))

In [19]:
ldf = pd.DataFrame(lklst, columns=['Name', 'Area', 'C_Code']).sort_values(by='Area', ascending=False)

In [20]:
# Largest lake
ldf.head(1)

Unnamed: 0,Name,Area,C_Code
54,Caspian Sea,386400,TM


### Airport at highest elevation

In [23]:
aplst = []
for element in document.iterfind('airport'):
    apcon = element.attrib['country']
    for subele in list(element):
        apele = 0.0
        if subele.tag == 'name':
            apnam = subele.text
        if subele.tag == 'elevation':
            if subele.text == None:
                apele == 0.0
            else:
                apele = float(subele.text)
            #print apele
        aplst.append((apnam, apele, apcon))

In [24]:
apdf = pd.DataFrame(aplst, columns=['Name', 'Elevation', 'C_Code']).sort_values(by='Elevation', ascending=False)

In [25]:
# Airport at highest elevation
apdf.head(1)

Unnamed: 0,Name,Elevation,C_Code
412,El Alto Intl,4063,BOL
