In [1]:
from xml.etree import ElementTree as ET

In [2]:
document_tree = ET.parse( './data/mondial_database.xml' )

In [3]:
# 10 countries with lowest infant mortality rates:

data = []                                                    # open empty list
for inmort in document_tree.iterfind('country'):             # start for-loop for search
    country = inmort.find('name')                            # find name of country
    mortality = inmort.find('infant_mortality')              # find infant mortality value
    if mortality is not None:                                # taking care of missing values
        data.append((country.text, float(mortality.text)))   # append text (country) and float (mortality) in list  
data.sort(key=lambda tup: tup[1], reverse=False)             # sort data according tuple 1 (mortality), ascending 
print "Lowest infant mortality rates:"
for c in data[:10]:                                          # for data in the Top Ten:
    print "*" ,c[0],":",c[1]                         # print star, the country name, the colon, and the mortality value

Lowest infant mortality rates:
* Monaco : 1.81
* Japan : 2.13
* Norway : 2.48
* Bermuda : 2.48
* Singapore : 2.53
* Sweden : 2.6
* Czech Republic : 2.63
* Hong Kong : 2.73
* Macao : 3.13
* Iceland : 3.15


In [4]:
# 10 cities with largest population:

from xml.etree import ElementTree as ET
document_tree = ET.parse( './data/mondial_database.xml' )

largest_cities = []

for city in document_tree.iter("city"):
    city_name = city.find('name').text
    # find latest pop value
    popvalue_latest = 0
    year_latest = 0
    for population in city.findall('population'):
        year = population.attrib.get('year',0) 
        popvalue = int(population.text)
        if year > year_latest:
            year_latest = year
            popvalue_latest = popvalue
    
    largest_cities.append((city_name, popvalue_latest))
    
largest_cities.sort(key=lambda tup: tup[1], reverse=True)   # sort according latest population value, descending
    
for city in largest_cities[:10]:
    print city[0], city[1]

Shanghai 22315474
Istanbul 13710512
Mumbai 12442373
Moskva 11979529
Beijing 11716620
São Paulo 11152344
Tianjin 11090314
Guangzhou 11071424
Delhi 11034555
Shenzhen 10358381


In [5]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries):

d_ethnic = {}                                           # open empty dictionary

for country in document_tree.findall('country'):
    # First: find latest estimates of population
    pop_latest = -1                                     # -1 (int) indicates that there is no value
    pop_latest_year = 0                                 # set variable to 0
    
    # Second: find best estimates of population
    pop_census = -1               
    pop_census_year = 0
    
    for pop in country.findall('population'):           # for-loop to search for population values
        year = int(pop.attrib.get("year", "0"))         # 'year' is an attribute    
                                                        # 0 as default value if there's no attribute 'year' 
        if year > pop_latest_year:                      # if this year is greater than the one you have seen so far:
            pop_latest = int(pop.text)                  # give me the integer of this variable
            pop_latest_year = year                      # remember this as the latest year so far
        
        if pop.attrib.get("measured", '') == 'census':  #if there's attribute called "measured' that equals 'cencus"
            
            if year > pop_census_year:                  # if the year there is bigger than the one you have seen so far:
                pop_census = int(pop.text)              # give me the integer of this variable
                pop_census_year = year                  # remember this as the latest year so far
    
    # Now: I have to sum up ethnic groups
    for egroup in country.findall('ethnicgroup'):       # for ethnic group applies:
        pct = float(egroup.attrib.get("percentage", "0")) / 100.0  # pct is the float of its attribute "percentage"
        
        absvalue_latest = pop_latest * pct              # the value of latest = pop_latest (int) * pct (float)
        absvalue_census = pop_census * pct              # the value of cencus = pop_latest (int) * pct (float)
        egroup_name = egroup.text                       # take the text of egroup for the regarding egroup
        
        if egroup_name not in data:                     # if there are missings in egroup:
            d_ethnic[egroup_name] = [0,0]               # take this list/array to store both values (latest und census) in one
        
        latest_census = d_ethnic[egroup_name]           # take the name of the egroup
        latest_census[0] += absvalue_latest             # take the value of the latest estimate (year 2011)
        latest_census[1] += absvalue_census             # take the value of the best estimate (census)
        d_ethnic[egroup_name] = latest_census           # remember tthis value

results = d_ethnic.items()                              # items-command because of dictionary 
                                                        # items are an array of (key, value) tuples

# BUT: I can't sort a dictionary
results.sort(key=lambda tup: tup[1][0], reverse=True)   # sort according latest population value, desecending
print "top population (latest)"                         # print this
for c in results[:10]:                                  # for the values in results (Top Ten) applies:
    print "* %s: %.0f" % (c[0], c[1][0])                # print star, string, colon, no decimal
print
    
results.sort(key=lambda tup: tup[1][1], reverse=True)   # sort according best population value, desecending
print "top population (census)"                         # print this
for c in results[:10]:                                  # for the values in results (Top Ten) applies:
    print "* %s: %.0f" % (c[0], c[1][1])                #% string: % float with ... decimals (here: 0)

top population (latest)
* Han Chinese: 1245058800
* Indo-Aryan: 871815583
* Dravidian: 302713744
* Bengali: 146776917
* Japanese: 126534212
* Eastern Hamitic: 82830377
* Mulatto: 78065896
* Viet/Kinh: 76078375
* English: 53592327
* Mediterranean Nordic: 46815916

top population (census)
* Han Chinese: 1225848240
* Indo-Aryan: 871815583
* Dravidian: 302713744
* Bengali: 146776917
* Japanese: 127289008
* Viet/Kinh: 73570876
* Mulatto: 73432087
* Eastern Hamitic: 72070051
* English: 52820301
* Mediterranean Nordic: 46815916


In [6]:
# name and country of a) longest river:

longest_river_length = 0
longest_river_name = None
longest_river_country = None

for river in document_tree.findall('river'):
    country_name = river.attrib.get("country","-")
    river_name = river.find('name').text
    
    # get river length (can be missing)
    river_length_node = river.find('length')
    river_length = -1
    if river_length_node is not None:
        river_length = float(river_length_node.text)
    
    # check if current river length is longer than what we have seen before
    if river_length > longest_river_length:
        longest_river_length = river_length
        longest_river_name = river_name
        longest_river_country = country_name

print "longest river:"
print longest_river_country, longest_river_name, longest_river_length

longest river:
CO BR PE Amazonas 6448.0


In [7]:
# name and country of b) largest lake:

largest_lake_area = 0
largest_lake_name = None
largest_lake_country = None

for lake in document_tree.findall('lake'):
    country_name = lake.attrib.get("country","-")
    lake_name = lake.find('name').text
    
    # get lake area (can be missing)
    lake_area_node = lake.find('area')
    lake_area = -1
    if lake_area_node is not None:
        if lake_area_node.text is not None:
            lake_area = float(lake_area_node.text)
    
    # check if current lake area is higher than what we have seen before
    if lake_area > largest_lake_area:
        largest_lake_area = lake_area
        largest_lake_name = lake_name
        largest_lake_country = country_name

print "largest lake:"
print largest_lake_country, largest_lake_name, largest_lake_area

largest lake:
R AZ KAZ IR TM Caspian Sea 386400.0


In [8]:
# name and country of c) airport at highest elevation:

highest_airport_elevation = 0
highest_airport_name = None
highest_airport_country = None

for airport in document_tree.findall('airport'):
    country_name = airport.attrib.get("country","-")
    airport_name = airport.find('name').text
    
    # get airport length (can be missing)
    airport_elevation_node = airport.find('elevation')
    airport_elevation = -1
    if airport_elevation_node is not None:
        if airport_elevation_node.text is not None:
            airport_elevation = float(airport_elevation_node.text)
    
    # check if current airport elevation is higher than what we have seen before
    if airport_elevation > highest_airport_elevation:
        highest_airport_elevation = airport_elevation
        highest_airport_name = airport_name
        highest_airport_country = country_name

print "highest airport:"
print highest_airport_country, highest_airport_name, highest_airport_elevation

highest airport:
BOL El Alto Intl 4063.0
