## Материалы

https://lxml.de/

## import

In [1]:
from lxml import etree

## Парсер

```
help(etree.XMLParser) =>

`ns_clean` - try to clean up redundant namespace declarations
`recover` - try hard to parse through broken XML
`remove_blank_text` - discard blank text nodes between tags, also known as ignorable whitespace
`remove_comments` - discard comments
`remove_pis` - discard processing instructions
`strip_cdata` - replace CDATA sections by normal text content (on by default)
`resolve_entities` - replace entities by their text value (on by default)
```

In [2]:
# Main parser: cleans up redundant namespace declarations, recovers from broken XML
# and discards blank text nodes between tags.
parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True)  # add remove_comments=True to also discard comments
# Same options, but blank text nodes between tags are kept.
parser_blank = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=False)

In [3]:
xml = '''\
<?xml version="1.0" encoding="UTF-8"?>
<a xmlns:abc="http://www.abc.ru" xmlns:unused="http://www.unused.ru" xmlns="http://www.default.ru">
    <!--Comment-->
    <abc:g>1</abc:g>
    <abc:g>2</abc:g>
    <abc:g>3</abc:g>
    <h>1</h>
    <xyz:s xmlns:xyz="http:/www.xyz.ru">100<br/>200</xyz:s>
    <ss>
        <!--Comment for ss-->
        100
    </ss>
</a>
'''



In [4]:
def parse(xml, parser):
    """Parse ``xml`` with ``parser`` and return the root element.

    ``etree.XML`` raises ``ValueError`` for a ``str`` that carries an encoding
    declaration (for example ``<?xml version="1.0" encoding="UTF-8"?>``):
    "Unicode strings with encoding declaration are not supported". In that case
    the text is encoded to UTF-8 bytes and parsed a second time.

    If parsing fails with ``etree.XMLSyntaxError``, the number of entries in
    ``parser.error_log`` is printed, followed by the message, line and column
    of every entry, and the exception is re-raised.
    """
    try:
        try:
            document_root = etree.XML(xml, parser)
        except ValueError:
            # str input with an encoding declaration: retry with bytes
            as_bytes = xml.encode(encoding='utf-8')
            document_root = etree.XML(as_bytes, parser)
    except etree.XMLSyntaxError:
        log = parser.error_log
        print(len(log))
        for entry in log:
            print(entry.message, entry.line, entry.column)
        raise
    return document_root

## Пробелы между тэгами

In [5]:
xml_blank = '''\
<?xml version="1.0" encoding="UTF-8"?>
<a xmlns="http://www.default.ru">
    <h> <div>123<b>bold</b>456</div></h>
</a>
'''

root_blank = parse(xml_blank, parser_blank)

h = root_blank[0]
assert h.text == ' '

## Inner xml

The two properties `.text` and `.tail` are enough to represent any text content in an XML document.

In [6]:
div = h[0]
assert etree.QName(div).localname == 'div'

In [7]:
div.text

'123'

In [8]:
b = div[0]
b.tag, b.text

('{http://www.default.ru}b', 'bold')

In [9]:
b.tail

'456'

In [10]:
etree.tostring(b)  # с "хвостом"

b'<b xmlns="http://www.default.ru">bold</b>456'

In [11]:
etree.tostring(b, with_tail=False)  # без "хвоста"

b'<b xmlns="http://www.default.ru">bold</b>'

In [12]:
etree.tostring(div, encoding='unicode')

'<div xmlns="http://www.default.ru">123<b>bold</b>456</div>'

In [13]:
etree.tostring(div, method='text', encoding='unicode')  # only the text, without any intermediate tags,

'123bold456'

In [14]:
# Strip the outer tag: the element's leading text plus every direct child.
# Only direct children are walked: `etree.tostring(child)` already serializes the child's
# whole subtree together with its tail, so iterating over all descendants would emit
# nested elements twice.
(div.text or '') + ''.join(etree.tostring(child, encoding='unicode') for child in div)

'123<b xmlns="http://www.default.ru">bold</b>456'

In [15]:
def convert_element_to_str(element: etree._Element, *, only_inner_xml: bool = False) -> str:
    """Serialize ``element`` to a unicode XML string.

    Args:
        element: The element to serialize.
        only_inner_xml: When true, leave out the element's own start and end tags and
            return only its content: the leading text followed by every direct child,
            each serialized together with its tail text.

    Returns:
        The serialized markup as ``str``.
    """
    if not only_inner_xml:
        return etree.tostring(element, encoding='unicode')
    # Walk direct children only: serializing a child already covers its whole subtree
    # and its tail, so iterating over all descendants would repeat nested elements.
    children_markup = ''.join(etree.tostring(child, encoding='unicode') for child in element)
    return (element.text or '') + children_markup

In [16]:
convert_element_to_str(div)

'<div xmlns="http://www.default.ru">123<b>bold</b>456</div>'

In [17]:
convert_element_to_str(div, only_inner_xml=True)

'123<b xmlns="http://www.default.ru">bold</b>456'

## Cleanup unused namespace declarations

In [18]:
root = parse(xml, parser)
root

<Element {http://www.default.ru}a at 0x7f90bc45d640>

In [19]:
s = etree.tostring(root, encoding='unicode')
s

'<a xmlns:abc="http://www.abc.ru" xmlns:unused="http://www.unused.ru" xmlns="http://www.default.ru"><!--Comment--><abc:g>1</abc:g><abc:g>2</abc:g><abc:g>3</abc:g><h>1</h><xyz:s xmlns:xyz="http:/www.xyz.ru">100<br/>200</xyz:s><ss><!--Comment for ss-->\n        100\n    </ss></a>'

In [20]:
assert 'www.unused.ru' in s  # присутствует!

In [21]:
etree.cleanup_namespaces(root)

In [22]:
s = etree.tostring(root, encoding='unicode')
s

'<a xmlns:abc="http://www.abc.ru" xmlns="http://www.default.ru"><!--Comment--><abc:g>1</abc:g><abc:g>2</abc:g><abc:g>3</abc:g><h>1</h><xyz:s xmlns:xyz="http:/www.xyz.ru">100<br/>200</xyz:s><ss><!--Comment for ss-->\n        100\n    </ss></a>'

In [23]:
assert 'unused' not in s  # отсутствует!

Повторим то же самое с деревом.

In [24]:
[i for i in dir(root) if 'tree' in i]

['getroottree']

In [25]:
root = parse(xml, parser)
tree = root.getroottree()
etree.cleanup_namespaces(tree)

In [26]:
s = etree.tostring(tree, encoding='unicode')
assert 'www.unused.ru' not in s  # тот же результат!

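Экспериментально выяснено: `etree.cleanup_namespaces(element)` оставляет контекст namespace-ов, унаследованный элементом `element` от предков,
поэтому `etree.tostring(element)` всё равно выводит эти объявления. Это происходит, даже если
в данном элементе и ниже они не используются. Чтобы избавиться от них, сначала сделайте независимую копию (`copy.deepcopy(element)`), затем вызовите `etree.cleanup_namespaces` для копии и сериализуйте уже её.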

## Итерация по дереву

Важная особенность!

```
root[0] = root[-1]  # this moves the element in lxml.etree!

# If you want to copy an element to a different position in `lxml.etree`,
# consider creating an independent deep copy using the copy module from Python's standard library:

from copy import deepcopy

element = etree.Element('neu')
element.append( deepcopy(root[1]))
```

In [27]:
root = parse(xml, parser)

`lxml.etree` allows you to look up the current namespaces defined for a node through the `.nsmap` property.

Note, however, that this includes **all prefixes** known in the context of an `Element`, not only those that it defines itself.

### Не видите? А дочерний элемент есть!

In [28]:
# Существует дочерний элемент у `s_node`!
s_node = root[5]
s_node.tag, s_node.text

('{http:/www.xyz.ru}s', '100')

In [29]:
assert len(s_node) == 1

In [30]:
child = s_node[0]
child, child.tag, child.text, child.tail

(<Element {http://www.default.ru}br at 0x7f90bc45e5c0>,
 '{http://www.default.ru}br',
 None,
 '200')

In [31]:
# Комментарий также добавляет дочерний элемент!
ss_node = root[6]
len(ss_node), type(ss_node[0]), ss_node[0].text

(1, lxml.etree._Comment, 'Comment for ss')

In [32]:
comment = ss_node[0]
comment.tag is etree.Comment, ss_node.tag

(True, '{http://www.default.ru}ss')

In [33]:
type(comment).__mro__

(lxml.etree._Comment,
 lxml.etree.__ContentOnlyElement,
 lxml.etree._Element,
 object)

In [34]:
type(ss_node).__mro__

(lxml.etree._Element, object)

In [35]:
etree.Element

<cyfunction Element at 0x7f90bc428110>

In [36]:
# Walk the tree and print, for every node, its tag, text and the namespace map in scope.
for element in root.iter():
    # Regular elements have a string tag; for a comment the tag is the `etree.Comment`
    # factory function instead (see the recorded output).
    print('---> Usual element ---' if isinstance(element.tag, str) else '---> Comment, PIS, Entity etc ---')
    print(
        '.tag=', element.tag,
        '.text=', element.text,
        '.nsmap=', element.nsmap
    )
    if isinstance(element.tag, str):
        # Split the tag into namespace URI and local name.
        tag = etree.QName(element)
        print(tag.text, tag.namespace, tag.localname)
        print('=>')
        # Invert prefix -> URI into URI -> prefix to rebuild the prefixed name.
        # The default namespace has the prefix None, so it is printed as `None:<name>`.
        nsmap_inv = {v: k for k, v in element.nsmap.items()}
        # NOTE(review): for an element without a namespace `tag.namespace` is None, which is
        # never a key of `nsmap_inv` (its keys are URIs), so this lookup would raise KeyError.
        # Every element of the document parsed here is namespaced.
        print(nsmap_inv[tag.namespace], ':', tag.localname, sep='')
    print('\n')

---> Usual element ---
.tag= {http://www.default.ru}a .text= None .nsmap= {'abc': 'http://www.abc.ru', 'unused': 'http://www.unused.ru', None: 'http://www.default.ru'}
{http://www.default.ru}a http://www.default.ru a
=>
None:a


---> Comment, PIS, Entity etc ---
.tag= <cyfunction Comment at 0x7f90bc4281e0> .text= Comment .nsmap= {}


---> Usual element ---
.tag= {http://www.abc.ru}g .text= 1 .nsmap= {'abc': 'http://www.abc.ru', 'unused': 'http://www.unused.ru', None: 'http://www.default.ru'}
{http://www.abc.ru}g http://www.abc.ru g
=>
abc:g


---> Usual element ---
.tag= {http://www.abc.ru}g .text= 2 .nsmap= {'abc': 'http://www.abc.ru', 'unused': 'http://www.unused.ru', None: 'http://www.default.ru'}
{http://www.abc.ru}g http://www.abc.ru g
=>
abc:g


---> Usual element ---
.tag= {http://www.abc.ru}g .text= 3 .nsmap= {'abc': 'http://www.abc.ru', 'unused': 'http://www.unused.ru', None: 'http://www.default.ru'}
{http://www.abc.ru}g http://www.abc.ru g
=>
abc:g


---> Usual element ---
.t

## Namespaces

In [37]:
xml_data = '''\
<h attr_h="attr_h">
    <a xmlns:aixm="http://www.aixm.ru" xmlns="http://www.default.ru">
        <aixm:b aixm:attr_b1="111" attr_b2="222">abc</aixm:b>
        <aixm:X></aixm:X>
        <aixm:Y></aixm:Y>
    </a>
</h>
'''
h = etree.fromstring(xml_data)

In [38]:
hn = etree.QName(h)
hn.localname, hn.namespace, hn.text, h.nsmap, h.attrib

('h', None, 'h', {}, {'attr_h': 'attr_h'})

In [39]:
a = h[0]
an = etree.QName(a)
an.localname, an.namespace, an.text, a.nsmap, a.attrib

('a',
 'http://www.default.ru',
 '{http://www.default.ru}a',
 {'aixm': 'http://www.aixm.ru', None: 'http://www.default.ru'},
 {})

In [40]:
b = a[0]
bn = etree.QName(b)
bn.localname, bn.namespace, bn.text, b.nsmap, b.attrib
# Заметьте, что у `attr_b2` нет default namespace!

('b',
 'http://www.aixm.ru',
 '{http://www.aixm.ru}b',
 {'aixm': 'http://www.aixm.ru', None: 'http://www.default.ru'},
 {'{http://www.aixm.ru}attr_b1': '111', 'attr_b2': '222'})

In [41]:
from typing import Optional, Union

class QualifiedName:
    """Namespace-aware name of an element or of one of its attributes.

    Construct it either from an element (``QualifiedName(element)``) or from an
    attribute name in ``{uri}local`` notation together with the element that owns
    the attribute (``QualifiedName(attr_name, attr_element=element)``).
    """

    # The `xml` prefix is bound to this URI by the XML Namespaces specification and needs
    # no declaration, so it is not guaranteed to appear in an element's `.nsmap`.
    _XML_NAMESPACE_URI = 'http://www.w3.org/XML/1998/namespace'

    def __init__(self, element_or_attr: Union[etree._Element, str], /, *, attr_element: Optional[etree._Element] = None) -> None:
        """Store the parsed name and the element that provides the namespace context.

        Args:
            element_or_attr: An element, or an attribute name as found in ``element.attrib``.
            attr_element: The element owning the attribute; required when
                ``element_or_attr`` is an attribute name, ignored otherwise.
        """
        self._is_attribute = isinstance(element_or_attr, str)
        if self._is_attribute:
            assert attr_element is not None, 'attr_element is required when an attribute name is given'
            self._element = attr_element
        else:
            self._element = element_or_attr
        self._qname = etree.QName(element_or_attr)

    @property
    def local_name(self) -> str:
        """Name without namespace information."""
        return self._qname.localname

    @property
    def namespace_uri(self) -> Optional[str]:
        """Namespace URI, or ``None`` when the name is not in any namespace."""
        return self._qname.namespace

    @property
    def full_name_with_uri(self) -> str:
        """Qualified name in James Clark notation, e.g. ``{http://www.w3.org/1999/xhtml}body``."""
        return self._qname.text

    @property
    def full_name_with_prefix(self) -> str:
        """Name as written in the document: ``prefix:local`` or just ``local``."""
        prefix = self.namespace_prefix
        if prefix is None:
            return self.local_name
        return f'{prefix}:{self.local_name}'

    @property
    def namespace_prefix(self) -> Optional[str]:
        """Prefix bound to the namespace of this name.

        Returns:
            ``None`` when the name is outside any namespace, or when an element is in
            the default namespace; otherwise the prefix.

        Raises:
            RuntimeError: If an attribute's namespace is not bound to any prefix in
                the owning element's context.
        """
        uri = self.namespace_uri
        # the name is outside any namespace
        if uri is None:
            return None

        if not self._is_attribute:
            # lxml records the prefix the element actually uses; this stays correct
            # even when the same URI is bound to several prefixes (None for the
            # default namespace).
            return self._element.prefix

        if uri == self._XML_NAMESPACE_URI:
            return 'xml'

        # `.nsmap` includes all prefixes known in the context of an Element,
        # not only those that it defines itself
        for prefix, candidate_uri in self._element.nsmap.items():
            # The default namespace never applies to attributes: a namespaced
            # attribute always carries an explicit prefix.
            if prefix is not None and candidate_uri == uri:
                return prefix

        # should not happen: a namespaced attribute always has a prefix in scope
        raise RuntimeError('Ошибка получения namespace prefix.')

In [42]:
print(QualifiedName(b).full_name_with_prefix)

aixm:b


In [43]:
for e in (h, a, b):
    en = QualifiedName(e)
    print(en.local_name, en.namespace_uri, en.namespace_prefix, en.full_name_with_uri, en.full_name_with_prefix)

h None None h h
a http://www.default.ru None {http://www.default.ru}a a
b http://www.aixm.ru aixm {http://www.aixm.ru}b aixm:b


In [44]:
assert ['h', 'a', 'aixm:b'] == [QualifiedName(e).full_name_with_prefix for e in (h, a, b)]

In [45]:
b.attrib

{'{http://www.aixm.ru}attr_b1': '111', 'attr_b2': '222'}

In [46]:
for attr in b.attrib:
    print(QualifiedName(attr, attr_element=b).full_name_with_prefix)

aixm:attr_b1
attr_b2


## br

In [47]:
xml = '<a>foo<br/>bar</a>'  # <br/> == <br />
root = etree.fromstring(xml)
root

<Element a at 0x7f90bc45cac0>

In [48]:
root.text, root[0], root[0].text, root[0].tail

('foo', <Element br at 0x7f90bc44fb40>, None, 'bar')

In [49]:
s = etree.tostring(root, method='html', encoding='unicode')

In [50]:
assert '<br>' in s

In [51]:
etree.tostring(root)

b'<a>foo<br/>bar</a>'

In [52]:
xml_br = '<a><div>foo<br>bar</div></a>'
# root = etree.fromstring(xml)  # -> XMLSyntaxError: Opening and ending tag mismatch: br line 1 and div

In [53]:
root = parse(xml_br, parser)  # XMLSyntaxError НЕ возникнет, если `recover=True`

In [54]:
root

<Element a at 0x7f90c054ecc0>

In [55]:
root[0].text, root[0][0], root[0][0].text, root[0][0].tail

('foo', <Element br at 0x7f90bc464f00>, 'bar', None)

In [56]:
for e in root.iter():
    print(e.tag, e.text)

a None
div foo
br bar


In [57]:
etree.tostring(root)  # repairing

b'<a><div>foo<br>bar</br></div></a>'

In [58]:

# Используем HTML парсер - OK
tree = etree.HTML(xml_br)

In [59]:
etree.tostring(tree)

b'<html><body><a><div>foo<br/>bar</div></a></body></html>'

In [60]:
# NOTE(review): newer lxml releases ship the cleaner as the separate `lxml_html_clean`
# package — confirm this import still works with the installed lxml version.
from lxml.html.clean import Cleaner

# Remove the <br> tags; the surrounding text is kept (see the output)
cleaner = Cleaner(remove_tags=['br'])
s = cleaner.clean_html(xml_br)
s

'<a><div>foobar</div></a>'

In [61]:
assert 'foobar' in s