https://lxml.de/tutorial.html

## lxml

In [106]:
from lxml import etree

In [107]:
# Create a standalone element; it serves as the root of the tree built below.
root = etree.Element("root")

In [108]:
# The element's name is exposed through its .tag property.
root.tag

'root'

In [109]:
# Elements behave like lists: append() adds a new child at the end.
root.append(etree.Element("child1"))

In [110]:
# The default repr shows only the tag and a memory address, not the XML content.
root

<Element root at 0x7f7b68a592c8>

In [111]:
# SubElement() creates a child and attaches it to the given parent in one call.
child2 = etree.SubElement(root, "child2")
child3 = etree.SubElement(root, "child3")

In [112]:
# tostring() returns bytes, and print() already applies str() to its argument,
# so what is shown is the bytes repr (with escaped "\n"). Passing
# encoding="unicode" to tostring() would yield real, line-broken text instead.
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'


In [118]:
# NOTE(review): Image is not used in any cell visible here — confirm before removing.
from IPython.display import Image, display
# display() of a bytes value falls back to its plain repr.
display(etree.tostring(root))

b'<root><child1/><child2/><child3/></root>'

In [114]:
import pprint as pp

In [116]:
# pprint of this short bytes value prints its repr on a single line.
pp.pprint(etree.tostring(root))

b'<root><child1/><child2/><child3/></root>'


In [15]:
# Children are addressable by index, like list items.
root[0].tag,root[1].tag

('child1', 'child2')

In [17]:
# len() of an element is the number of its direct children.
len(root)

3

In [18]:
# Iterating an element yields its direct children in document order.
for node in root:
    print(node.tag)

child1
child2
child3


In [20]:
# getparent() navigates from a child back up to the element that contains it.
root[0].getparent().tag

'root'

In [21]:
# Keyword arguments passed to Element() become XML attributes.
# Note that this rebinds `root` to a brand-new, childless element.
root = etree.Element("root", interesting="totally")
etree.tostring(root)

b'<root interesting="totally"/>'

In [22]:
# Attribute values are read with get(), much like a dict lookup.
print(root.get("interesting"))

totally


In [24]:
# Look up an attribute before and after creating it with set().
# NOTE(review): on a fresh kernel the first print shows "None", because "hello"
# has not been set yet. The recorded output shows "Huhu" twice, which suggests
# this cell was executed a second time after the attribute already existed.
print(root.get("hello"))
root.set("hello", "Huhu")
print(root.get("hello"))

Huhu
Huhu


In [25]:
# Serialize again to confirm that both attributes are now present.
etree.tostring(root)

b'<root interesting="totally" hello="Huhu"/>'

In [26]:
# items() yields (name, value) pairs; sorting them gives a stable display order.
for attr_name, attr_value in sorted(root.items()):
    print('%s = %r' % (attr_name, attr_value))

hello = 'Huhu'
interesting = 'totally'


In [27]:
# .attrib is dict-like; copying it into a plain dict takes a snapshot of the
# attributes at this point.
d = dict(root.attrib)
sorted(d.items())

[('hello', 'Huhu'), ('interesting', 'totally')]

In [28]:
# Text content of an element lives in its .text property.
root = etree.Element("root")
root.text = "TEXT"

# The print goes to stdout; the final expression is shown as the cell result.
print(root.text)
etree.tostring(root)

TEXT


b'<root>TEXT</root>'

In [29]:
# Start over with a new root and give it three children that carry text.
root = etree.Element("root")
etree.SubElement(root, "child").text = "Child 1"
etree.SubElement(root, "child").text = "Child 2"
etree.SubElement(root, "another").text = "Child 3"

# print() of bytes shows the repr, hence the escaped "\n" in the output.
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'


In [30]:
# iter() walks the whole subtree in document order, starting with root itself.
for node in root.iter():
    print("%s - %s" % (node.tag, node.text))

root - None
child - Child 1
child - Child 2
another - Child 3


In [31]:
# Passing a tag name restricts the traversal to elements with that tag.
for node in root.iter("child"):
    print("%s - %s" % (node.tag, node.text))

child - Child 1
child - Child 2


In [32]:
# Several tag names may be given at once. Results still arrive in document
# order, not in the order the tag names were passed.
for node in root.iter("another", "child"):
    print("%s - %s" % (node.tag, node.text))

child - Child 1
child - Child 2
another - Child 3


In [33]:
# Entity references and comments are nodes too and can be appended like elements.
root.append(etree.Entity("#234"))
root.append(etree.Comment("some comment"))

In [34]:
# The entity reference and the comment both appear in the serialized output.
etree.tostring(root)

b'<root><child>Child 1</child><child>Child 2</child><another>Child 3</another>&#234;<!--some comment--></root>'

In [36]:
# Ordinary elements have a string tag; the entity and comment nodes do not,
# so a str check on .tag separates the two kinds during traversal.
for node in root.iter():
    if isinstance(node.tag, str):
        print("%s - %s" % (node.tag, node.text))
    else:
        print("SPECIAL: %s - %s" % (node, node.text))

root - None
child - Child 1
child - Child 2
another - Child 3
SPECIAL: &#234; - &#234;
SPECIAL: <!--some comment--> - some comment


In [37]:
# Passing the Element factory as the tag limits the traversal to real
# elements, so the entity and comment nodes are skipped.
for node in root.iter(tag=etree.Element):
    print("%s - %s" % (node.tag, node.text))

root - None
child - Child 1
child - Child 2
another - Child 3


In [38]:
# XML() parses an XML string literal directly into an element.
root2 = etree.XML('<root><a><b/></a></root>')

In [39]:
# The parsed result is an ordinary Element, as the repr shows.
root2

<Element root at 0x7f7b6802e2c8>

In [40]:
# Serializing the parsed tree reproduces the input markup.
etree.tostring(root2)

b'<root><a><b/></a></root>'

In [42]:
# print() of bytes shows the repr, so the pretty-print newlines appear as "\n".
print(etree.tostring(root2, pretty_print=True))

b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'


In [43]:
# Parse a small HTML-like document whose <p> mixes text and a child element.
root = etree.XML(
    '<html><head/><body><p>Hello<br/>World</p></body></html>'
)

In [45]:
# Pretty-printing indents the structure but leaves the mixed-content <p> on one
# line, as the output shows.
etree.tostring(root, pretty_print=True)

b'<html>\n  <head/>\n  <body>\n    <p>Hello<br/>World</p>\n  </body>\n</html>\n'

In [46]:
# The internal DTD subset declares the entity "tasty"; the serialized result
# in the next cell shows it expanded to its replacement text.
root3 = etree.XML('''\
<?xml version="1.0"?>
<!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
<root>
  <a>&tasty;</a>
</root>
''')

In [47]:
# The &tasty; reference has been replaced by "parsnips" in the parsed tree.
etree.tostring(root3)

b'<root>\n  <a>parsnips</a>\n</root>'

In [48]:
# fromstring() parses XML held in a string and returns the root element.
some_xml_data = "<root>data</root>"
root = etree.fromstring(some_xml_data)

# The print goes to stdout; the final expression is shown as the cell result.
print(root.tag)
etree.tostring(root)

root


b'<root>data</root>'

In [49]:
# XML() gives the same result as fromstring() for this input.
root = etree.XML("<root>data</root>")
print(root.tag)

root


### The E-factory and namespaces

In [50]:
from lxml.builder import E


def CLASS(*args):
    """Return a ``{"class": ...}`` attribute dict with the given names space-joined.

    ``class`` is a reserved word in Python, so it cannot be written as a
    keyword argument; this helper produces the attribute as a dict instead.
    """
    return {"class": ' '.join(args)}


# Nested E.<tag>(...) calls build the tree. As the serialized output shows,
# string arguments become text, dict/keyword arguments become attributes, and
# element arguments become children.
html = page = (
    E.html(
        E.head(
            E.title("This is a sample document")
        ),
        E.body(
            E.h1("Hello!", CLASS("title")),
            E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
            E.p(
                "This is another paragraph, with a",
                "\n      ",
                E.a("link", href="http://www.python.org"),
                ".",
            ),
            E.p("Here are some reserved characters: <spam&egg>."),
            etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
        )
    )
)

In [51]:
# Reserved characters in text were escaped on serialization (&lt; &amp; &gt;).
# print() of bytes shows the repr, so newlines appear as "\n".
print(etree.tostring(page, pretty_print=True))

b'<html>\n  <head>\n    <title>This is a sample document</title>\n  </head>\n  <body>\n    <h1 class="title">Hello!</h1>\n    <p>This is a paragraph with <b>bold</b> text in it!</p>\n    <p>This is another paragraph, with a\n      <a href="http://www.python.org">link</a>.</p>\n    <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>\n    <p>And finally an embedded XHTML fragment.</p>\n  </body>\n</html>\n'


In [78]:
from lxml.builder import ElementMaker  # lxml only!

# Elements produced by this factory live in the given namespace and are
# serialized with the "p" prefix.
# NOTE: this rebinds E, replacing the plain factory imported in an earlier cell.
E = ElementMaker(
    namespace="http://my.de/fault/namespace",
    nsmap={'p': "http://my.de/fault/namespace"},
)

# Short aliases for the element constructors used below.
DOC, TITLE, SECTION, PAR = E.doc, E.title, E.section, E.par

my_doc = DOC(
    TITLE("The dog and the hog"),
    SECTION(
        TITLE("The dog"),
        PAR("Once upon a time, ..."),
        PAR("And then ..."),
    ),
    SECTION(
        TITLE("The hog"),
        PAR("Sooner or later ..."),
    ),
)

In [79]:
# Every element is serialized with the "p:" prefix declared on the root.
print(etree.tostring(my_doc, pretty_print=True))

b'<p:doc xmlns:p="http://my.de/fault/namespace">\n  <p:title>The dog and the hog</p:title>\n  <p:section>\n    <p:title>The dog</p:title>\n    <p:par>Once upon a time, ...</p:par>\n    <p:par>And then ...</p:par>\n  </p:section>\n  <p:section>\n    <p:title>The hog</p:title>\n    <p:par>Sooner or later ...</p:par>\n  </p:section>\n</p:doc>\n'


In [80]:
# Round-trip: serialize the built document and parse it back into a new tree.
# Note that this rebinds `root`.
root = etree.XML(etree.tostring(my_doc, pretty_print=True))

In [81]:
# The whitespace added by pretty_print is now part of the parsed tree's content.
etree.tostring(root)

b'<p:doc xmlns:p="http://my.de/fault/namespace">\n  <p:title>The dog and the hog</p:title>\n  <p:section>\n    <p:title>The dog</p:title>\n    <p:par>Once upon a time, ...</p:par>\n    <p:par>And then ...</p:par>\n  </p:section>\n  <p:section>\n    <p:title>The hog</p:title>\n    <p:par>Sooner or later ...</p:par>\n  </p:section>\n</p:doc>'

In [82]:
# nsmap maps namespace prefixes to their URIs for this element.
root.nsmap

{'p': 'http://my.de/fault/namespace'}

In [83]:
# Tags of namespaced elements are reported in "{uri}localname" form.
for node in root.iter('*'):
    print(node.tag)

{http://my.de/fault/namespace}doc
{http://my.de/fault/namespace}title
{http://my.de/fault/namespace}section
{http://my.de/fault/namespace}title
{http://my.de/fault/namespace}par
{http://my.de/fault/namespace}par
{http://my.de/fault/namespace}section
{http://my.de/fault/namespace}title
{http://my.de/fault/namespace}par


In [97]:
# iter() walks the whole document and yields every element with the given
# namespace-qualified tag, at any depth.
for node in root.iter('{http://my.de/fault/namespace}title'):
    print(node.tag)

{http://my.de/fault/namespace}title
{http://my.de/fault/namespace}title
{http://my.de/fault/namespace}title


In [96]:
# findall() returns a list. A bare tag path matches direct children of root
# only, hence the single result here.
[match.tag for match in root.findall("{http://my.de/fault/namespace}title")]

['{http://my.de/fault/namespace}title']

In [98]:
# find() returns a single element (the first match), not a list. Iterating that
# element walks its children, and this <title> has none, hence the empty list.
[ b.tag for b in root.find("{http://my.de/fault/namespace}title") ]

[]

In [99]:
# iterfind() yields the same matches as findall(), but as an iterator.
[match.tag for match in root.iterfind("{http://my.de/fault/namespace}title")]

['{http://my.de/fault/namespace}title']

In [103]:
# The ".//" prefix extends the search from direct children to all descendants.
[match.tag for match in root.iterfind(".//{http://my.de/fault/namespace}title")]

['{http://my.de/fault/namespace}title',
 '{http://my.de/fault/namespace}title',
 '{http://my.de/fault/namespace}title']

In [69]:
# A small tree without namespaces for the remaining find examples (rebinds `root`).
root = etree.XML("<root><a x='123'>aText<b/><c/><b/></a></root>")

In [72]:
# find() returns the first matching child element.
print(root.find("a").tag)

a


In [73]:
# ".//b" finds every <b> below root, however deeply nested.
[match.tag for match in root.iterfind(".//b")]

['b', 'b']

## Parse XML using BeautifulSoup

https://linuxhint.com/parse_xml_python_beautifulsoup/

In [1]:
# import bs4, lxml
from bs4 import BeautifulSoup as bs

In [2]:
# Sample document used for the BeautifulSoup examples.
# NOTE(review): the literal starts with a newline, so the <?xml ...?> declaration
# is not the very first thing in the text. Strict XML parsers may reject that —
# confirm before switching to one.
xml_doc = """
<?xml version="1.0" encoding="UTF-8"?>
<root testAttr="testValue">
The Tree
<children>
<child name="Jack">First</child>
<child name="Rose">Second</child>
<child name="Blue Ivy">
Third
<grandchildren>
<data>One</data>
<data>Two</data>
<unique>Twins</unique>
</grandchildren>
</child>
<child name="Jane">Fourth</child>
</children>
</root>
"""

In [3]:
# Write the sample document to disk so it can be parsed from a file.
# The encoding is given explicitly so the bytes on disk match the UTF-8 declared
# in the document's XML prolog, whatever the platform's default encoding is.
file_xml = "sample.xml"
with open(file_xml, "w", encoding="utf-8") as f:
    f.write(xml_doc)

In [4]:
# Shell escape to confirm the file was written; {file_xml} interpolates the
# Python variable. Relies on an `ls` command being available.
!ls -l {file_xml}

-rw-r--r-- 1 gong gong 338 Feb 15 11:40 sample.xml


In [5]:


# Read the XML file back (explicitly as UTF-8, matching its XML prolog) and parse it.
# NOTE(review): "lxml" selects lxml's HTML parser. That is why the prettified
# output wraps the document in <html><body> and shows the attribute as
# "testattr". Passing "xml" would give a true XML parse, but every recorded
# output after this cell would change.
with open(file_xml, encoding="utf-8") as f:
    bs_content = bs(f.read(), "lxml")

In [6]:
# The parse result is a BeautifulSoup object representing the whole document.
type(bs_content)

bs4.BeautifulSoup

In [14]:
# prettify() renders the tree with one tag or string per line, indented by depth.
# Note the added <html>/<body> wrappers and the lower-cased "testattr" in the output.
print(bs_content.prettify())

<?xml version="1.0" encoding="UTF-8"?>
<html>
 <body>
  <root testattr="testValue">
   The Tree
   <children>
    <child name="Jack">
     First
    </child>
    <child name="Rose">
     Second
    </child>
    <child name="Blue Ivy">
     Third
     <grandchildren>
      <data>
       One
      </data>
      <data>
       Two
      </data>
      <unique>
       Twins
      </unique>
     </grandchildren>
    </child>
    <child name="Jane">
     Fourth
    </child>
   </children>
  </root>
 </body>
</html>


In [7]:
# find() returns only the first matching element.
e = bs_content.find("data")
print(e)

<data>One</data>


In [8]:
# prettify() also works on a single tag, not just the whole document.
print(e.prettify())

<data>
 One
</data>



In [12]:
# find_all() returns every matching element, however deeply nested.
for tag in bs_content.find_all("data"):
    print(tag.prettify())

<data>
 One
</data>

<data>
 Two
</data>



In [13]:
# Each <child> is printed together with everything nested inside it.
for tag in bs_content.find_all("child"):
    print(tag.prettify())

<child name="Jack">
 First
</child>

<child name="Rose">
 Second
</child>

<child name="Blue Ivy">
 Third
 <grandchildren>
  <data>
   One
  </data>
  <data>
   Two
  </data>
  <unique>
   Twins
  </unique>
 </grandchildren>
</child>

<child name="Jane">
 Fourth
</child>



In [15]:
# The second argument filters on attribute values: only <child name="Jane"> matches.
for tag in bs_content.find_all("child", {"name": "Jane"}):
    print(tag.prettify())

<child name="Jane">
 Fourth
</child>



In [18]:
# Locate the third <child> by its name attribute; the navigation examples that
# follow (parent, children, siblings) all start from this tag.
third_child = bs_content.find("child", {"name": "Blue Ivy"})
print(third_child.prettify())

<child name="Blue Ivy">
 Third
 <grandchildren>
  <data>
   One
  </data>
  <data>
   Two
  </data>
  <unique>
   Twins
  </unique>
 </grandchildren>
</child>



In [19]:
# .parent is the enclosing tag, here <children>.
print(third_child.parent.prettify())

<children>
 <child name="Jack">
  First
 </child>
 <child name="Rose">
  Second
 </child>
 <child name="Blue Ivy">
  Third
  <grandchildren>
   <data>
    One
   </data>
   <data>
    Two
   </data>
   <unique>
    Twins
   </unique>
  </grandchildren>
 </child>
 <child name="Jane">
  Fourth
 </child>
</children>



In [20]:
# .children iterates over direct children only; text nodes (including
# whitespace-only ones) count as children alongside tags.
print(list(third_child.children))

['\nThird\n', <grandchildren>
<data>One</data>
<data>Two</data>
<unique>Twins</unique>
</grandchildren>, '\n']


In [21]:
# .previous_siblings walks backwards from this tag, nearest sibling first,
# and includes the whitespace text nodes between tags.
print(list(third_child.previous_siblings))

['\n', <child name="Rose">Second</child>, '\n', <child name="Jack">First</child>, '\n']


In [22]:
# .next_siblings walks forwards from this tag in document order.
print(list(third_child.next_siblings))

['\n', <child name="Jane">Fourth</child>, '\n']


The next cells cover:

- Tag attribute values
- Tag text
- Tag content

In [23]:
# get() returns the attribute value, or None when the attribute is absent ("age").
print(third_child.get("name"), ",", third_child.get("age"))

Blue Ivy , None


In [24]:
# .text concatenates all text inside the tag, including text of nested tags
# and the whitespace between them.
print(third_child.text)


Third

One
Two
Twins




In [25]:
# .strings yields each text fragment separately, whitespace-only ones included.
print(list(third_child.strings))

['\nThird\n', '\n', 'One', '\n', 'Two', '\n', 'Twins', '\n', '\n']


In [26]:
# .contents is the list of direct children (the same items .children iterates over).
third_child.contents

['\nThird\n', <grandchildren>
 <data>One</data>
 <data>Two</data>
 <unique>Twins</unique>
 </grandchildren>, '\n']