![](.2_images/bdd63d40.png)

In [1]:
from lxml import etree

text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
# etree.HTML() parses the markup and repairs it on the way: the last <li> above is
# never closed, yet the serialized tree below contains the missing </li> as well as
# the <html>/<body> wrappers.
html = etree.HTML(text)
# tostring() hands back bytes, so decode them into a str before printing.
serialized = etree.tostring(html)
print(serialized.decode('utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


In [2]:
# A file can be parsed directly too; the serialized output then also carries a
# <!DOCTYPE> declaration.
html = etree.parse('./test.html', etree.HTMLParser())
serialized = etree.tostring(html)
print(serialized.decode('utf-8'))

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>&#13;
    <ul>&#13;
         <li class="item-0"><a href="link1.html">first item</a></li>&#13;
         <li class="item-1"><a href="link2.html">second item</a></li>&#13;
         <li class="item-inactive"><a href="link3.html">third item</a></li>&#13;
         <li class="item-1"><a href="link4.html">fourth item</a></li>&#13;
         <li class="item-0"><a href="link5.html">fifth item</a>&#13;
     </li></ul>&#13;
 </div></body></html>


In [3]:
# Parse the file and query the resulting tree with XPath.
html = etree.parse('./test.html', etree.HTMLParser())
# '//li' selects every <li> element, whatever its depth in the document.
li_nodes = html.xpath('//li')
# The query returns a list of Element objects: show the list, then its first entry.
print(li_nodes)
print(li_nodes[0])

[<Element li at 0x2a7c23f2408>, <Element li at 0x2a7c23f2308>, <Element li at 0x2a7c23f23c8>, <Element li at 0x2a7c23f2448>, <Element li at 0x2a7c23f2488>]
<Element li at 0x2a7c23f2408>


In [5]:
# A single '/' steps to direct children: select each <a> that is a child of an <li>.
li_anchors = html.xpath('//li/a')
print(li_anchors)

[<Element a at 0x2a7c23fab08>, <Element a at 0x2a7c23faac8>, <Element a at 0x2a7c2405448>, <Element a at 0x2a7c2405108>, <Element a at 0x2a7c2405048>]


In [7]:
# '//' between two steps matches descendants at any depth: every <a> anywhere below a <ul>.
ul_anchors = html.xpath('//ul//a')
print(ul_anchors)

[<Element a at 0x2a7c23fab08>, <Element a at 0x2a7c23faac8>, <Element a at 0x2a7c2405448>, <Element a at 0x2a7c2405108>, <Element a at 0x2a7c2405048>]


In [9]:
# With a single '/' the query comes back empty: '/a' only inspects the direct
# children of <ul>, and no <a> is a direct child of a <ul>.
ul_child_anchors = html.xpath('//ul/a')
print(ul_child_anchors)

[]


In [10]:
# '..' steps from a node up to its parent: locate the <a> whose href is "link4.html",
# move to its parent element and read that element's class attribute.
parent_class = html.xpath('//a[@href="link4.html"]/../@class')
print(parent_class)

['item-1']


In [11]:
# The explicit parent axis, 'parent::*', reaches the parent just like the '..' shorthand.
parent_class = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(parent_class)

['item-1']


In [12]:
# An attribute predicate filters by value: keep only the <li> elements whose class is "item-0".
item0_nodes = html.xpath('//li[@class="item-0"]')
print(item0_nodes)

[<Element li at 0x2a7c2427308>, <Element li at 0x2a7c2427348>]


In [13]:
# text() collects the text nodes that sit directly under the selected elements.
# The visible link text lives inside the nested <a>, not directly in the <li>, so it
# is not returned; the only match is the whitespace that follows the last <a>.
li_own_text = html.xpath('//li[@class="item-0"]/text()')
print(li_own_text)

['\r\n     ']


In [14]:
# Stepping into the <a> first reaches the link text itself.
link_text = html.xpath('//li[@class="item-0"]/a/text()')
print(link_text)

['first item', 'fifth item']


In [15]:
# '//text()' gathers the text nodes of all descendants, so the result holds the link
# texts and also the whitespace-only node directly inside the last <li>.
all_text = html.xpath('//li[@class="item-0"]//text()')
print(all_text)

['first item', 'fifth item', '\r\n     ']


In [16]:
# '@name' reads an attribute instead of text: collect the href of each <a> inside an <li>.
hrefs = html.xpath('//li/a/@href')
print(hrefs)

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


In [17]:
# An exact comparison on an attribute stops working once the attribute holds several values.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
# The class attribute is "li li-first" (two values), which is not equal to the
# single string "li", so the query matches nothing.
exact_match = html.xpath('//li[@class="li"]/a/text()')
print(exact_match)

[]


In [18]:
# contains() copes with multi-valued attributes. A bare contains(@class, "li") is
# only a substring test and would also accept classes such as "list" or "outline",
# so pad both the whitespace-normalized attribute and the wanted name with spaces:
# that way only a whole class token equal to "li" matches.
result = html.xpath('//li[contains(concat(" ", normalize-space(@class), " "), " li ")]/a/text()')
print(result)

['first item']


In [19]:
# A node can also be pinned down by several attributes at once: join the
# conditions inside one predicate with `and`.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
# The class test pads the whitespace-normalized attribute and the wanted name with
# spaces so that only the whole token "li" matches; a bare contains(@class, "li")
# is a substring test and would also accept classes such as "list".
result = html.xpath('//li[contains(concat(" ", normalize-space(@class), " "), " li ") and @name="item"]/a/text()')
print(result)

['first item']


XPath 还有很多运算符，下面是这些运算符的简单介绍：

![](.2_images/9d4da00d.png)

In [20]:
# Selecting nodes by their position.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
# First <li> -- note that XPath positions start at 1, not 0.
first_text = html.xpath('//li[1]/a/text()')
print(first_text)
# Last <li>: last() evaluates to the position of the final node.
last_text = html.xpath('//li[last()]/a/text()')
print(last_text)
# First two <li> elements: positions below 3.
leading_texts = html.xpath('//li[position()<3]/a/text()')
print(leading_texts)
# Third <li> counted from the end: two positions before the last one.
third_from_end_text = html.xpath('//li[last()-2]/a/text()')
print(third_from_end_text)

['first item']
['fifth item']
['first item', 'second item']
['third item']


In [21]:
# 下面演示一下节点轴选择
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
# 获取所有的祖先节点
result = html.xpath('//li[1]/ancestor::*')
print(result)
# 获取某一个祖先节点
result = html.xpath('//li[1]/ancestor::div')
print(result)
# 获取节点所有的属性
result = html.xpath('//li[1]/attribute::*')
print(result)
# 获取特定的字节点
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
# 获取特定的子孙节点
result = html.xpath('//li[1]/descendant::span')
print(result)
# 获取当前节点之后的所有节点
result = html.xpath('//li[1]/following::*[2]')
print(result)
# 获取当前节点之后的同级节点
result = html.xpath('//li[1]/following-sibling::*')
print(result)

[<Element html at 0x2a7c241eb08>, <Element body at 0x2a7c2415788>, <Element div at 0x2a7c2415348>, <Element ul at 0x2a7c2415248>]
[<Element div at 0x2a7c2415348>]
['item-0']
[<Element a at 0x2a7c2415608>]
[<Element span at 0x2a7c2415348>]
[<Element a at 0x2a7c2415e08>]
[<Element li at 0x2a7c2415248>, <Element li at 0x2a7c24159c8>, <Element li at 0x2a7c2415788>, <Element li at 0x2a7c2415f48>]
