# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [7]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [8]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [9]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [10]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [14]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np
document = ET.parse( './data/mondial_database.xml' )

In [16]:
root = document.getroot()

In [21]:
country1 = []
infantmort = []
for element in document.iterfind('country'):
    country1.append(element.find('name').text)
    try:
        infantmort.append(float(element.find('infant_mortality').text))
    except AttributeError:
        infantmort.append('none')

In [37]:
problem1 = pd.DataFrame({'Country' : country1,
 'Infant_Mortality' : infantmort})
problem1.sort_values('Infant_Mortality').head(10)

Unnamed: 0,Country,Infant_Mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [38]:
#Problem 2

In [39]:
pop = []
city = []
for element in document.iterfind('country'):
    for subelement in element.findall('city'):
        city.append(subelement.find('name').text)
        try:
            pop.append(int(subelement.find("population[0]").text))
        except AttributeError:
            pop.append('none')


In [45]:
problem2 = pd.DataFrame({'City' : city,
 'Population' : pop})
problem2 = problem2[problem2['Population'] != 'none']
problem2.sort_values('Population',ascending=False).head(10)

Unnamed: 0,City,Population
176,Seoul,9708483
164,Al Qahirah,8471859
80,Bangkok,7506700
128,Hong Kong,7055071
92,Ho Chi Minh,5968384
212,Singapore,5076700
163,Al Iskandariyah,4123869
216,New Taipei,3939305
177,Busan,3403135
107,Pyongyang,3255288


In [47]:
#Problem 3
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [85]:
# Problem 3
df3 = pd.DataFrame(columns=('Population', 'Ethnic_Group', 'Percentage'))

i = 0

for element in document.iterfind('country'):
    country = element.find('name').text
    pop = int(element.find("population[0]").text)
    for subelement in element.findall('ethnicgroup'):
        eg = subelement.text
        egp = float(subelement.get('percentage')) // 1
        df3.loc[i] = [pop, eg, egp]
        i += 1

df3['EG_Pop'] = (df.Population * (df.Percentage / 100)) // 1
        
df3.sort_values('Ethnic_Group').head()


Unnamed: 0,Population,Ethnic_Group,Percentage,EG_Pop
609,34856813.0,Acholi,4.0,1394272.0
579,84320987.0,Afar,1.0,843209.0
563,834036.0,Afar,35.0,291912.0
598,1586624.0,African,99.0,1570757.0
408,31458.0,African,90.0,28312.0


In [86]:
df3 = df3.drop(['Population','Percentage'],1)

In [87]:
problem3 = df3.reset_index().groupby("Ethnic_Group").sum()
problem3 = problem3.drop(['index'],1)
problem3.sort_values('EG_Pop',ascending=False).head(10)

Unnamed: 0_level_0,EG_Pop
Ethnic_Group,Unnamed: 1_level_1
Han Chinese,1238255000.0
Indo-Aryan,871815600.0
European,490265700.0
African,314894800.0
Dravidian,302713700.0
Mestizo,157732200.0
Bengali,146776900.0
Russian,129961700.0
Japanese,126025000.0
Malay,120115300.0


In [None]:
#problem 4 
#name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [None]:
#problem 4a - longest river 

In [80]:
rivers = pd.DataFrame()
for element in document.iterfind('river'):
    try:
        rivers_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'length': float(element.find('length').text)}, index = range(1))
    except AttributeError:
        pass
    rivers = rivers.append(rivers_entry, ignore_index=True)

In [88]:
problem4a = rivers.sort_values('length',ascending=False).head(1)
problem4a

Unnamed: 0,country,length,name
174,CO BR PE,6448.0,Amazonas


In [None]:
#Problem 4b - Largest Lake

In [76]:
lakes = pd.DataFrame()
for element in document.iterfind('lake'):
    try:
        lakes_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'area': float(element.find('area').text)}, index = range(1))
    except AttributeError:
        pass
    lakes = lakes.append(lakes_entry, ignore_index=True)

lakes.sort_values(by='area', ascending =False).head(1)

Unnamed: 0,area,country,name
54,386400.0,R AZ KAZ IR TM,Caspian Sea


In [83]:
#Problem 4c - Highest Airport

In [84]:
airports = pd.DataFrame()
for element in document.iterfind('airport'):
    try:
        airports_entry = pd.DataFrame({'country': element.attrib['country'], 'name': element.find('name').text, 
                                 'elevation': float(element.find('elevation').text)}, index = range(1))
    except TypeError:
        pass
    airports = airports.append(airports_entry, ignore_index=True)

airports.sort_values(by='elevation', ascending =False).head(1)


Unnamed: 0,country,elevation,name
80,BOL,4063.0,El Alto Intl
