# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [17]:
document = ET.parse( './data/mondial_database.xml' )

In [13]:
import pandas as pd

#1. 10 countries with lowest infant mortality rates

def extract_name(country):
    """
    extracts the name (str) of a country
    """
    return country.find('name').text

def extract_infant_mortality(country):
    """
    extracts the infant mortality rate of a country (float).
    If it isn't specified, then return (None).
    """
    inf_mortality = country.find('infant_mortality')
    if inf_mortality is not None:
      return float(country.find('infant_mortality').text)
    else:
      return None

# for each country generate a dictionary containing the name and infant mortality rate
data_pts = ({"name": extract_name(country), "infant_mortality": extract_infant_mortality(country)} \
             for country in document.iterfind('country'))

# turn generated data into a data frame
df = pd.DataFrame(data_pts)

# remove invalid entries in the dataframe (these are countries without an infant mortlaity rate)
df = df.dropna()

# sort data frame by infant mortality rate and display 10 lowest
df.sort_values(by='infant_mortality', ascending=True).head(10)

Unnamed: 0,infant_mortality,name
38,1.81,Monaco
98,2.13,Japan
117,2.48,Bermuda
36,2.48,Norway
106,2.53,Singapore
37,2.6,Sweden
10,2.63,Czech Republic
78,2.73,Hong Kong
79,3.13,Macao
44,3.15,Iceland


In [35]:
#2. 10 cities with the largest population
# NOTE: to solve this, we assume that we only want to look at the LATEST population recording for each city.
#
df = pd.DataFrame()

def extract_latest_population(elm):
    max_pop = None
    max_year = None
    for population in elm.findall('population'):
        year = int(population.attrib['year'])
        pop = int(population.text)
        if pop > max_pop:
          max_pop = pop
          max_year = year
    return max_pop, max_year

for city in document.iterfind('.//city'):
    country_name = city.attrib['country']
    city_name = city.find('name').text
    pop, year = extract_latest_population(city)
    df = df.append({"city": city_name,
                    "country": country_name,
                    "year": year,
                    "population": pop}, ignore_index=True)

df = df.dropna()
df.sort_values(by='population').tail(10)

Unnamed: 0,city,country,population,year
1067,Shenzhen,CN,10358381,2010
1064,Guangzhou,CN,11071424,2010
1342,Tianjin,CN,11090314,2010
2810,São Paulo,BR,11152344,2010
1340,Beijing,CN,11716620,2010
479,Moskva,R,11979529,2013
1527,Mumbai,IND,12442373,2011
1582,Delhi,IND,12877470,2001
771,Istanbul,TR,13710512,2012
1341,Shanghai,CN,22315474,2010


In [44]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

df = pd.DataFrame()
for country in document.iterfind('country'):
    country_name = country.find('name').text
    # find latest population info
    pop, year = extract_latest_population(country)
    # find all ethnicgroups info
    for ethnic_group in country.iterfind('ethnicgroup'):
        percentage = float(ethnic_group.attrib['percentage']) / 100
        group_name = ethnic_group.text
        df = df.append({"country": country_name,
                        "population": pop,
                        "year": year,
                        "percentage": percentage,
                        "ethnicity": group_name}, ignore_index=True)
df = df.dropna()
df['ethnicity_population'] = df.population * df.percentage
df.groupby('ethnicity').ethnicity_population.sum().sort_values().reset_index().tail(10)

Unnamed: 0,ethnicity,ethnicity_population
270,Malay,121993600.0
271,Japanese,127289000.0
272,Russian,136866600.0
273,Bengali,146776900.0
274,Mestizo,157855300.0
275,Dravidian,302713700.0
276,African,318359700.0
277,European,494939500.0
278,Indo-Aryan,871815600.0
279,Han Chinese,1245059000.0


In [69]:
# 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

# NOTE: we will notice that river, lake, and airport elements all have similar structures.
# In particular, each of the elements will contain the country information, while their
# children elements contain the remaining (name, [length, area, elevantion]) info.

def get_resource_data_points(resource_type, target_feat):
  """
  finds all elements specified by $resource_type. For each of
  these elements, return its country, its name, and the value
  of a target feature as specified by the user (via $target_feat).
  """
  for resource in document.iterfind(".//" + resource_type):
    country = resource.attrib['country']
    name = resource.find('name').text
    feat_elm = resource.find(target_feat)
    feat = float(feat_elm.text) if feat_elm != None and feat_elm.text != None else None
    yield {"country": country, 
           "name": name, 
           target_feat: feat}

river_df = pd.DataFrame(pt for pt in get_resource_data_points("river", "length"))
lake_df = pd.DataFrame(pt for pt in get_resource_data_points("lake", "area"))
airport_df = pd.DataFrame(pt for pt in get_resource_data_points("airport", "elevation"))

print "longest river"
river_df[river_df.length == river_df.length.max()]

 longest river


Unnamed: 0,country,length,name
174,CO BR PE,6448,Amazonas


In [64]:
print "largest lake"
lake_df[lake_df.area == lake_df.area.max()]

largest lake


Unnamed: 0,area,country,name
54,386400,R AZ KAZ IR TM,Caspian Sea


In [70]:
print "highest airport"
airport_df[airport_df.elevation == airport_df.elevation.max()]

highest airport


Unnamed: 0,country,elevation,name
80,BOL,4063,El Alto Intl
