# Part 1. Parse XML as element tree 
In this method, we read the whole xml file into the memory as a tree

In [1]:
import xml.etree.cElementTree as ET
import pprint

In [2]:
tree = ET.parse("exampleResearchArticle.xml") ## This is document oriented xml file. This means the tag has specific name, and not many open tag.
root = tree.getroot() ## This is a element class

print root.tag


art
Children of root:
ui
ji
fm
bdy
bm


In [3]:
print "Children of root:"
for child in root:
    print child.tag  ## tag is an attribute

Children of root:
ui
ji
fm
bdy
bm


In [4]:
print "Children of root:"
for child in root:
    for grandchild in child:
        print grandchild.tag  ## tag is an attribute

Children of root:
dochead
bibl
history
cpyrt
kwdg
abs
sec
sec
sec
sec
sec
sec
sec
refgrp
sec


In [8]:
print "Tile:"
title = root.find("./fm/bibl/title/p") ## This is xpath expression. Find something mathches this expression
print type(title.text)
print title.text

Tile:
<type 'str'>
Standardization of the functional syndesmosis widening by dynamic U.S examination


In [9]:
title = root.find("./fm/bibl/title")
count = 0
for i in title:
    print i.text

Standardization of the functional syndesmosis widening by dynamic U.S examination


In [12]:
email = root.findall("./fm/bibl/aug/au/email")
for i in email:
    print i.text

omer@extremegate.com
mcarmont@hotmail.com
laver17@gmail.com
nyska@internet-zahav.net
kammarh@gmail.com
gideon.mann.md@gmail.com
barns.nz@gmail.com
eukots@gmail.com


In [18]:
## Print out all the information of all the authors, put each author's information in a dictionary
## Put all the authors in a list

author_list = []

author_element = root.findall("./fm/bibl/aug/au")
for author in author_element:
    author_dict = {}
    for i in author:
        if i.tag == 'insr' and i.tag not in author_dict:
            author_dict[i.tag]=[i.attrib]
        if i.tag == 'insr' and i.tag in author_dict:
            author_dict[i.tag].append(i.attrib)
        author_dict[i.tag] = i.text
    author_list.append(author_dict)
    
for i in author_list:
    pprint.pprint (i) 

AttributeError: 'NoneType' object has no attribute 'append'

In [15]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None
        }

        firstname = author.find('./fnm')
        lastname = author.find('./snm')
        email = author.find('./email')
        
        data['fnm'] = firstname.text
        data['snm'] = lastname.text
        data['email'] = email.text
 
        authors.append(data)

    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
    
    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]


test()

In [32]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        
        data['fnm'] = author.find('./fnm').text
        
        data['snm'] = author.find('./snm').text
        
        data['email'] = author.find('./email').text
        
        insr = author.findall('./insr')
        for i in insr:
            data['insr'].append(i.attrib['iid'])

        authors.append(data)

    return authors

In [33]:
pprint.pprint (get_authors(root))

[{'email': 'omer@extremegate.com',
  'fnm': 'Omer',
  'insr': ['I1'],
  'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com',
  'fnm': 'Mike',
  'insr': ['I2'],
  'snm': 'Carmont'},
 {'email': 'laver17@gmail.com',
  'fnm': 'Lior',
  'insr': ['I3', 'I4'],
  'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net',
  'fnm': 'Meir',
  'insr': ['I3'],
  'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com',
  'fnm': 'Hagay',
  'insr': ['I8'],
  'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com',
  'fnm': 'Gideon',
  'insr': ['I3', 'I5'],
  'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com',
  'fnm': 'Barnaby',
  'insr': ['I6'],
  'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'insr': ['I7'], 'snm': 'Kots'}]
