In [1]:
from lxml import etree
# Sample HTML snippet: a <div> wrapping a <ul> of five <li> items, each holding one link.
# The last <li> is left without a closing </li> tag.
# NOTE(review): no visible cell reads this value before `text` is reassigned further
# down; the cells that follow parse '../data/test.html' instead — presumably that file
# holds the same markup; confirm.
text = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class ="item-0"><a href="links html">fifth item</a>
</ul>
</div>
"""

In [2]:
"""Select every node in the document, then only the <li> nodes."""

# Parse the sample page with the HTML parser.
html = etree.parse('../data/test.html', etree.HTMLParser())

# '//*' matches all elements; '//li' matches every <li> at any depth.
for query in ('//*', '//li'):
    result = html.xpath(query)
    print(result)

# xpath() hands back a list, so one element is reached by index.
print(result[0])

[<Element html at 0x7f0054e02200>, <Element body at 0x7f0054e68700>, <Element div at 0x7f0054ec1940>, <Element ul at 0x7f0054ec1680>, <Element li at 0x7f0054ec1980>, <Element a at 0x7f0054ec19c0>, <Element li at 0x7f0054ec1a40>, <Element a at 0x7f0054ec1a80>, <Element li at 0x7f0054ec1ac0>, <Element a at 0x7f0054ec1a00>, <Element li at 0x7f0054ec1b00>, <Element a at 0x7f0054ec1b40>, <Element li at 0x7f0054ec1b80>, <Element a at 0x7f0054ec1bc0>]
[<Element li at 0x7f0054ec1980>, <Element li at 0x7f0054ec1a40>, <Element li at 0x7f0054ec1ac0>, <Element li at 0x7f0054ec1b00>, <Element li at 0x7f0054ec1b80>]
<Element li at 0x7f0054ec1980>


In [3]:
"""Select direct child nodes."""

# A single '/' steps to immediate children: every <a> directly under an <li>.
query = '//li/a'
result = html.xpath(query)
print(result)

[<Element a at 0x7f0054ec2240>, <Element a at 0x7f0054ec23c0>, <Element a at 0x7f0054ec2400>, <Element a at 0x7f0054ec2440>, <Element a at 0x7f0054ec2480>]


In [4]:
"""Select the parent node of a matched element."""

# '..' steps up from the <a> whose href is link4.html to its parent, whose
# class attribute is then read. 'parent::*' may be written in place of '..'.
query = '//a[@href="link4.html"]/../@class'
result = html.xpath(query)
print(result)

['item-1']


In [5]:
"""Filter nodes by attribute value."""

# The [@class="item-0"] predicate keeps only <li> nodes whose class equals item-0.
query = '//li[@class="item-0"]'
result = html.xpath(query)
print(result)

[<Element li at 0x7f0054ec27c0>, <Element li at 0x7f0054ec3440>]


In [6]:
"""Extract text content."""

text_queries = (
    # '//text()' gathers the text of every descendant node, so whitespace such
    # as line breaks shows up in the result.
    '//li[@class="item-0"]//text()',
    # '/a/text()' reads only the text inside the <a> child, which keeps the
    # result tidy.
    '//li[@class="item-0"]/a/text()',
)
for query in text_queries:
    result = html.xpath(query)
    print(result)

['first item', 'fifth item', '\r\n    ']
['first item', 'fifth item']


In [7]:
"""Read attribute values."""

# A trailing '/@href' yields the href attribute of each matched <a>.
query = '//li/a/@href'
result = html.xpath(query)
print(result)

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'links html']


In [8]:
"""Match an attribute that holds several values."""

text = """
<li class="li li-first"><a href="link.html">first item</a></li>
"""
html = etree.HTML(text)

class_queries = (
    # Exact comparison finds nothing: the class value is "li li-first", not "li".
    '//li[@class="li"]/a/text()',
    # contains(@attribute, value) succeeds because "li" occurs within the value.
    '//li[contains(@class, "li")]/a/text()',
)
for query in class_queries:
    result = html.xpath(query)
    print(result)

[]
['first item']


In [9]:
"""Match on several attributes at once."""

text = """
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
"""
html = etree.HTML(text)

# 'and' joins the two conditions; note that every attribute name needs its '@' prefix.
query = '//li[contains(@class, "li") and @name="item"]/a/text()'
result = html.xpath(query)
print(result)

['first item']


In [10]:
"""Select nodes by position."""

html = etree.parse('../data/test.html', etree.HTMLParser())

position_queries = (
    '//li[1]/a/text()',             # the first <li>
    '//li[last()]/a/text()',        # the last <li>
    '//li[position()<3]/a/text()',  # <li> nodes whose position is below 3
    '//li[last()-2]/a/text()',      # third from the end: written last()-2, not -3
)
for query in position_queries:
    result = html.xpath(query)
    print(result)

['first item']
['fifth item']
['first item', 'second item']
['third item']


In [11]:
"""Select nodes along XPath axes."""

html = etree.parse('../data/test.html', etree.HTMLParser())

# Every step below is written as axis::node-test, starting from the first <li>.
axis_queries = (
    # ancestor axis; '*' as the node test matches every ancestor.
    '//li[1]/ancestor::*',
    # Same axis, narrowed to <div> ancestors by the node test.
    '//li[1]/ancestor::div',
    # attribute axis; '*' returns the value of every attribute on the node.
    '//li[1]/attribute::*',
    # child axis reaches direct children only; the node test follows '::'.
    '//li[1]/child::a[@href="link1.html"]',
    # descendant axis covers all descendants, restricted here to <span>.
    '//li[1]/descendant::span',
    # following axis covers every node after the current one; [2] keeps the second.
    '//li[1]/following::*[2]',
    # following-sibling axis: all later nodes on the same level.
    '//li[1]/following-sibling::*',
)
for query in axis_queries:
    result = html.xpath(query)
    print(result)

[<Element html at 0x7f00315df000>, <Element body at 0x7f00315df1c0>, <Element div at 0x7f00315df280>, <Element ul at 0x7f00315df2c0>]
[<Element div at 0x7f00315df280>]
['item-0']
[<Element a at 0x7f00315df3c0>]
[]
[<Element a at 0x7f00315dc9c0>]
[<Element li at 0x7f00315df2c0>, <Element li at 0x7f00315ddc00>, <Element li at 0x7f00315df380>, <Element li at 0x7f00315df300>]
