In [73]:
from lxml import etree

In [112]:
# Sample document used in the parsing and first XPath examples: a <data> root
# holding three <country> elements, each with <rank>, <year>, <gdppc> children
# and one or more empty <neighbor> elements carrying name/direction attributes.
xmlstring = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
"""

### Parsing 

In [113]:
# Parse XML held in a string; fromstring() hands back the root element.
# To read from a file instead: etree.parse('country_data.xml')
# (parse() returns an ElementTree; call .getroot() on it to get the root element).
root = etree.fromstring(xmlstring)

In [114]:
# Tag name and attribute mapping of the root element (<data> has no attributes).
root.tag, root.attrib

('data', {})

In [115]:
# Walk the direct children of the root and show each one's tag and attributes.
for country in root.iterchildren():
    print(country.tag, country.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [116]:
# Children are nested, and an element can be indexed like a sequence of its
# children: root[1] is the second <country> element.
root[1].attrib

{'name': 'Singapore'}

In [117]:
# Indexing chains: second <country>, then its first child element.
first_child = root[1][0]
first_child.tag, first_child.text

('rank', '4')

In [118]:
# find() returns only the first element matching the tag "country"
# (a single Element, not a list).
root.find('country')

<Element country at 0x1057a1b40>

In [119]:
# findall() returns a list with every element matching the tag "country".
root.findall('country')

[<Element country at 0x1057a1b40>,
 <Element country at 0x104aed140>,
 <Element country at 0x1058777c0>]

# XPath examples

There are two useful functions for querying the tree once the XML is parsed: `findall` and `xpath`.

In [82]:
# Re-parse the sample XML so `root` is the <data> element for the XPath examples below.
root = etree.fromstring(xmlstring)

In [83]:
# "." selects the current node, so called on the root this returns a
# one-item list containing the root <data> element itself (not its children).
root.findall(".")

[<Element data at 0x105877bc0>]

In [84]:
# xpath() accepts the same "." expression and likewise returns [<data>].
root.xpath(".")

[<Element data at 0x105877bc0>]

Both are very useful, but `xpath` is more powerful: it supports full XPath expressions, while `findall` only understands a limited subset.

In [85]:
# All <neighbor> elements that are children of a <country> child of the root:
# the path matches the country tag first and then the neighbor tag.
root.xpath("./country/neighbor")

[<Element neighbor at 0x1057cc700>,
 <Element neighbor at 0x105870300>,
 <Element neighbor at 0x105870440>,
 <Element neighbor at 0x105870880>,
 <Element neighbor at 0x105870800>]

In [86]:
# Collect the <neighbor> elements under each <country>, then show their attributes.
neighbors = root.xpath("./country/neighbor")
for neighbor in neighbors:
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [87]:
# Same result as the explicit "country" step: "*" stands in for any element
# name, so every child of the root is considered before matching <neighbor>.
wildcard_path = "./*/neighbor"
for neighbor in root.xpath(wildcard_path):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [88]:
# ".//" looks through all descendants of the current node, so <neighbor>
# elements are matched wherever they sit below the root.
descendant_neighbors = root.xpath(".//neighbor")
for neighbor in descendant_neighbors:
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [89]:
# Show the text content of every <year> element found below the root.
years = root.xpath(".//year")
for year in years:
    print(year.text)

2008
2011
2011


### Difference between "/" and "//"

In [90]:
# Second sample: one <country> with two nested <neighbor> elements, plus a
# <neighbor> placed directly under <data>, so "/" and "//" give different results.
xmlstring = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <neighbor name="Colombia" direction="E"/>
</data>
"""

In [91]:
# Parse the current `xmlstring`; `root` is rebound to the resulting root element.
root = etree.fromstring(xmlstring)

In [93]:
# Only <neighbor> elements nested directly inside a <country> are matched;
# the <neighbor> that is a direct child of <data> is not.
country_neighbors = root.xpath("./country/neighbor")
for neighbor in country_neighbors:
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}


In [94]:
# "*" accepts any element at the first step, but <neighbor> must still be
# exactly two levels below the root, so the top-level <neighbor> is not matched.
two_levels_down = "./*/neighbor"
for neighbor in root.xpath(two_levels_down):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}


In [95]:
# ".//" descends to any depth, so the <neighbor> directly under <data>
# is returned along with the two nested inside <country>.
all_neighbors = root.xpath(".//neighbor")
for neighbor in all_neighbors:
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


## Using @ to select by attribute

In [99]:
# [@name='Austria'] is a predicate: keep only <neighbor> elements whose
# "name" attribute equals 'Austria'. The matches come back as a list.
neigs = root.xpath(".//neighbor[@name='Austria']")

In [105]:
# Attributes of the first match (the document has a single Austria neighbor).
neigs[0].attrib

{'name': 'Austria', 'direction': 'E'}

In [124]:
# Third sample: the same <nodename> tag appears under two differently
# named parents (<node1> and <node2>).
xmlstring = """<?xml version="1.0"?>
<root>
  <node1>
    <nodename>Matched Value 1</nodename>
  </node1>
  <node2>
    <nodename>Matched Value 2</nodename>
  </node2>
</root>"""

In [125]:
# Parse the current `xmlstring`; `root` is rebound to the resulting root element.
root = etree.fromstring(xmlstring)

In [127]:
# A leading "//" matches <nodename> elements anywhere in the document,
# whatever their parent is (<node1> or <node2>) - two matches here.
root.xpath('//nodename')

[<Element nodename at 0x104ad5280>, <Element nodename at 0x10579b8c0>]

### More complex example

In [121]:
# Bookstore sample: three <book> elements, each with a <title> (carrying a
# "lang" attribute), an <author> and a <price>.
xmlstring = """<?xml version="1.0"?>
<bookstore>
    <book>
        <title lang="en">Introduction to XPath</title>
        <author>John Doe</author>
        <price>29.99</price>
    </book>
    <book>
        <title lang="fr">XPath et XML</title>
        <author>Marie Dupont</author>
        <price>34.95</price>
    </book>
    <book>
        <title lang="en">XML Programming</title>
        <author>David Smith</author>
        <price>19.99</price>
    </book>
</bookstore>"""

In [123]:
# Build the element tree for the bookstore document.
root = etree.fromstring(xmlstring)

# Keep only <book> elements whose <title> has lang="en", then step to each
# book's <author> child.
english_author_path = ".//book[title/@lang='en']/author"
authors = root.xpath(english_author_path)

# Show the name of every matching author, one per line.
for writer in authors:
    print(writer.text)

John Doe
David Smith
