# XPath

In [1]:
from lxml import etree
# Sample HTML fragment for the XPath examples.
# The final <li> is left unclosed on purpose.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''

# Parsing with etree.HTML produces the element tree that XPath queries run on.
html = etree.HTML(text)

# Serialize the tree back to bytes and print it as text.
# The printed markup shows that the parser repaired the input: the dangling
# <li> gets its closing tag, and <html>/<body> wrappers are added.
result = etree.tostring(html)
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


## 所有节点

In [2]:
# '//*' matches every element in the document, at any depth.
# xpath() returns a list of Element objects; each repr shows the tag name
# (html, body, div, ul, li, a, ...), so the whole tree appears in the list.
html = etree.HTML(text)
every_node_query = '//*'
all_nodes = html.xpath(every_node_query)
print(all_nodes)

[<Element html at 0x1fe4866b148>, <Element body at 0x1fe4a2f8388>, <Element div at 0x1fe4a2f8c48>, <Element ul at 0x1fe4a2f85c8>, <Element li at 0x1fe4a425208>, <Element a at 0x1fe48690f48>, <Element li at 0x1fe4a3de388>, <Element a at 0x1fe4a3de348>, <Element li at 0x1fe4a3de608>, <Element a at 0x1fe48690ec8>, <Element li at 0x1fe4a3d9808>, <Element a at 0x1fe4a3d9648>, <Element li at 0x1fe4a3d98c8>, <Element a at 0x1fe4a3d9b08>]


匹配时也可以指定节点名称。例如，要获取所有li节点，可以使用`//li`：

In [4]:
# Naming a tag after '//' restricts the match: every <li> at any depth.
li_query = '//li'
result = html.xpath(li_query)
print(result)

[<Element li at 0x1fe4a425208>, <Element li at 0x1fe4a3de388>, <Element li at 0x1fe4a3de608>, <Element li at 0x1fe4a3d9808>, <Element li at 0x1fe4a3d98c8>]


In [5]:
# The query result is a list, so a single element can be picked by index.
first_li = result[0]
print(first_li)

<Element li at 0x1fe4a425208>


## 子节点

In [7]:
# Re-parse the sample and print the repaired markup for reference.
html = etree.HTML(text)
result = etree.tostring(html)
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


In [8]:
# '/' steps to direct children: every <a> that is an immediate child of an <li>.
child_a_query = '//li/a'
result = html.xpath(child_a_query)
print(result)

[<Element a at 0x1fe4a48b708>, <Element a at 0x1fe4a2f8188>, <Element a at 0x1fe4a4acc48>, <Element a at 0x1fe4a4acb88>, <Element a at 0x1fe4a4acb48>]


In [9]:
# '//' reaches descendants at any depth: every <a> anywhere beneath a <ul>.
descendant_a_query = '//ul//a'
result = html.xpath(descendant_a_query)
print(result)

[<Element a at 0x1fe4a48b708>, <Element a at 0x1fe4a2f8188>, <Element a at 0x1fe4a4acc48>, <Element a at 0x1fe4a4acb88>, <Element a at 0x1fe4a4acb48>]


In [10]:
# '//ul/a' matches nothing: '/' only reaches direct children, and the <ul>
# here has only <li> children -- the <a> elements sit one level deeper.
direct_a_under_ul_query = '//ul/a'
result = html.xpath(direct_a_under_ul_query)
print(result)


[]


## 父节点

In [11]:
# A parent node is reached with '..'.
# The query that follows selects the <a> whose href is link4.html, steps up
# to its parent, and then reads that parent's class attribute.
# First, re-parse the sample and print the markup for reference.
html = etree.HTML(text)
result = etree.tostring(html)
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


In [14]:
# Find the <a> with href="link4.html", go up one level with '..',
# then read the class attribute of that parent element.
parent_class_query = '//a[@href="link4.html"]/../@class'
result = html.xpath(parent_class_query)
print(result)
# The attribute value comes back wrapped in a list.
print(type(result))

['item-1']
<class 'list'>


## 属性匹配

In [13]:
# A predicate [@attr="value"] keeps only elements whose attribute equals the value.
item0_query = '//li[@class="item-0"]'
result = html.xpath(item0_query)
print(result)

[<Element li at 0x1fe4a4b0e08>, <Element li at 0x1fe4a4ac788>]


##  文本获取

In [15]:
# text() selects text nodes. With a single '/', only text that is a direct
# child of the matched <li> elements is returned.
direct_text_query = '//li[@class="item-0"]/text()'
result = html.xpath(direct_text_query)
print(result)

['\n     ']


XPath中text()前面是/，而此处/的含义是选取直接子节点。li的直接子节点都是a节点，文本都在a节点内部，所以`//li[@class="item-0"]/text()`取不到这些文本。这里匹配到的唯一结果，是被自动修正的最后一个li节点内部的换行符和缩进：补全的尾标签`</li>`位于下一行，因此a节点之后的换行和空白成了li的直接文本子节点。

如果想获取li节点内部的文本，就有两种方式，一种是先选取a节点再获取文本，另一种就是使用//

In [16]:

# Option 1: step into the <a> child first, then take its text.
a_text_query = '//li[@class="item-0"]/a/text()'
result = html.xpath(a_text_query)
print(result)

['first item', 'fifth item']


In [17]:
# Option 2: '//text()' collects text at any depth below the matched <li>,
# so the whitespace left inside the repaired <li> is included as well.
all_text_query = '//li[@class="item-0"]//text()'
result = html.xpath(all_text_query)
print(result)

['first item', 'fifth item', '\n     ']


## 属性获取

In [18]:
# '@href' as the final step returns attribute values instead of elements.
href_query = '//li/a/@href'
result = html.xpath(href_query)
print(result)

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


## 属性多值匹配

In [20]:
# Print the currently parsed document for reference.
result = etree.tostring(html)
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


In [21]:
# In the markup printed just above, no <li> has class exactly "li"
# (the classes are item-0, item-1 and item-inactive), so this is empty.
exact_class_query = '//li[@class="li"]/a/text()'
result = html.xpath(exact_class_query)
print(result)

[]


下面的例子中，HTML文本里li节点的class属性有两个值li和li-first。此时如果还想用之前的属性匹配（`@class="li"`）来获取，就无法匹配了，运行结果如下：

In [22]:

from lxml import etree
# An <li> whose class attribute holds two space-separated values.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)

# '=' compares the whole attribute string, and "li li-first" is not equal
# to "li", so the exact-match predicate selects nothing.
exact_class_query = '//li[@class="li"]/a/text()'
result = html.xpath(exact_class_query)
print(result)

[]


In [23]:
from lxml import etree
# An <li> whose class attribute holds two space-separated values.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)

# contains(@class, "li") is a plain substring test: it would also match
# unrelated classes such as "list", or an element whose only class is
# "li-first". To match "li" as a complete class token, pad the
# whitespace-normalized attribute with spaces and search for " li ".
class_token_query = (
    '//li[contains(concat(" ", normalize-space(@class), " "), " li ")]/a/text()'
)
result = html.xpath(class_token_query)
print(result)

['first item']


## 多属性匹配

In [24]:
from lxml import etree
# An <li> with a multi-valued class attribute plus a name attribute.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)

# Several conditions are combined inside one predicate with 'and'.
# The class test matches "li" as a complete token rather than as a substring
# (a bare contains(@class, "li") would also accept e.g. class="list"):
# the whitespace-normalized attribute is padded with spaces and searched
# for " li ".
multi_attr_query = (
    '//li[contains(concat(" ", normalize-space(@class), " "), " li ")'
    ' and @name="item"]/a/text()'
)
result = html.xpath(multi_attr_query)
print(result)

['first item']


##  按序选择

In [25]:
# Back to the five-item list for the positional examples.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)

# XPath positions start at 1, not 0: li[1] is the first <li>.
first_li_text_query = '//li[1]/a/text()'
result = html.xpath(first_li_text_query)
print(result)

['first item']


In [26]:
# last() evaluates to the position of the final sibling: the last <li>.
last_li_text_query = '//li[last()]/a/text()'
result = html.xpath(last_li_text_query)
print(result)

['fifth item']


In [27]:
# position()<3 keeps the <li> elements at positions 1 and 2.
leading_li_text_query = '//li[position()<3]/a/text()'
result = html.xpath(leading_li_text_query)
print(result)

['first item', 'second item']


In [28]:
# last()-2 counts back from the end: with five items this is the third <li>.
third_from_last_query = '//li[last()-2]/a/text()'
result = html.xpath(third_from_last_query)
print(result)

['third item']


## 节点轴选择

调用了ancestor轴，可以获取所有祖先节点。其后需要跟两个冒号，然后是节点的选择器，这里我们直接使用*，表示匹配所有节点，因此返回结果是第一个li节点的所有祖先节点，包括html、body、div和ul。

In [40]:
# Same list as before, except that the first link wraps its text in a <span>
# so the descendant axis has something to find.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)

# Print the parsed markup for reference.
result = etree.tostring(html)
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


In [32]:
# ancestor::* collects every ancestor of the first <li>: html, body, div, ul.
ancestors_query = '//li[1]/ancestor::*'
result = html.xpath(ancestors_query)
print(result)

[<Element html at 0x1fe4a4bafc8>, <Element body at 0x1fe4a4ba708>, <Element div at 0x1fe4a4b0c48>, <Element ul at 0x1fe4a4b01c8>]


In [34]:
# A node test after '::' narrows the axis: only <div> ancestors are kept.
div_ancestor_query = '//li[1]/ancestor::div'
result = html.xpath(div_ancestor_query)
print(result)

[<Element div at 0x1fe4a4b0c48>]


In [35]:
# attribute::* returns the values of all attributes on the first <li>.
attributes_query = '//li[1]/attribute::*'
result = html.xpath(attributes_query)
print(result)

['item-0']


In [37]:
# child:: selects direct children; the predicate keeps only the <a> whose
# href is link1.html.
child_link_query = '//li[1]/child::a[@href="link1.html"]'
result = html.xpath(child_link_query)
print(result)

[<Element a at 0x1fe48655188>]


In [38]:
# descendant:: searches at any depth below the first <li>; here it finds the
# <span> nested inside the <a>.
span_descendant_query = '//li[1]/descendant::span'
result = html.xpath(span_descendant_query)
print(result)

[<Element span at 0x1fe4a48ba08>]


In [39]:
# following:: selects the elements that come after the first <li> in
# document order; [2] keeps only the second of them.
second_following_query = '//li[1]/following::*[2]'
result = html.xpath(second_following_query)
print(result)

[<Element a at 0x1fe4a486988>]
