In [1]:
import requests
from lxml import etree

In [2]:
# Sample markup shared by the following cells: a <div> wrapping a five-item list.
# The last <li> is deliberately left without a closing tag.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
# Parse the string with lxml's HTML parser and keep the resulting tree in `html`.
html = etree.HTML(text)

In [3]:
# Serialize the parsed tree back to markup. tostring() returns bytes, so decode
# to str for display. The output shows the <html>/<body> wrapper and the closing
# </li> that the parser added.
etree.tostring(html).decode('utf-8')

'<html><body><div>\n<ul>\n<li class="item-0"><a href="link1.html">first item</a></li>\n<li class="item-1"><a href="link2.html">second item</a></li>\n<li class="item-inactive"><a href="link3.html">third item</a></li>\n<li class="item-1"><a href="link4.html">fourth item</a></li>\n<li class="item-0"><a href="link5.html">fifth item</a>\n</li></ul>\n</div>\n</body></html>'

In [4]:
# Re-bind `html` by parsing the file 'text.html' with an explicit HTMLParser
# (etree.parse returns an ElementTree object).
# NOTE(review): 'text.html' is not created anywhere in the visible notebook;
# presumably it holds the same markup as `text` -- confirm it exists before Run All.
html=etree.parse('text.html',etree.HTMLParser())

In [5]:
# Serialize the tree parsed from the file; this time the output also carries
# a DOCTYPE declaration in front of <html>.
etree.tostring(html).decode('utf-8')

'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">\n<html><body><div>\n<ul>\n<li class="item-0"><a href="link1.html">first item</a></li>\n<li class="item-1"><a href="link2.html">second item</a></li>\n<li class="item-inactive"><a href="link3.html">third item</a></li>\n<li class="item-1"><a href="link4.html">fourth item</a></li>\n<li class="item-0"><a href="link5.html">fifth item</a>\n</li></ul>\n</div></body></html>'

## 所有节点

In [6]:
# '//' searches the whole document and '*' matches any element name,
# so this returns every element in the tree.
html.xpath('//*')

[<Element html at 0x7f7ac4f98bc8>,
 <Element body at 0x7f7ad40e5ac8>,
 <Element div at 0x7f7ac4f98b88>,
 <Element ul at 0x7f7ac4d05708>,
 <Element li at 0x7f7ac4d05808>,
 <Element a at 0x7f7ac4d05788>,
 <Element li at 0x7f7ac4d05948>,
 <Element a at 0x7f7ac4d05988>,
 <Element li at 0x7f7ac4d059c8>,
 <Element a at 0x7f7ac4d057c8>,
 <Element li at 0x7f7ac4d05a08>,
 <Element a at 0x7f7ac4d05a48>,
 <Element li at 0x7f7ac4d05a88>,
 <Element a at 0x7f7ac4d05ac8>]

In [7]:
# Every <li> element, wherever it sits in the document.
html.xpath('//li')

[<Element li at 0x7f7ac4d05808>,
 <Element li at 0x7f7ac4d05948>,
 <Element li at 0x7f7ac4d059c8>,
 <Element li at 0x7f7ac4d05a08>,
 <Element li at 0x7f7ac4d05a88>]

## 子节点

In [8]:
# A single '/' steps to direct children: <a> elements that are children of an <li>.
html.xpath('//li/a')

[<Element a at 0x7f7ac4d05788>,
 <Element a at 0x7f7ac4d05988>,
 <Element a at 0x7f7ac4d057c8>,
 <Element a at 0x7f7ac4d05a48>,
 <Element a at 0x7f7ac4d05ac8>]

In [9]:
# '//' after <ul> steps to descendants at any depth: every <a> below a <ul>.
html.xpath('//ul//a')

[<Element a at 0x7f7ac4d05788>,
 <Element a at 0x7f7ac4d05988>,
 <Element a at 0x7f7ac4d057c8>,
 <Element a at 0x7f7ac4d05a48>,
 <Element a at 0x7f7ac4d05ac8>]

In [10]:
# '/' only looks at direct children; <ul> has no <a> as a direct child
# (they are nested inside <li>), so the result is empty.
html.xpath('//ul/a')

[]

## 父节点

In [11]:
# Find the <a> whose href is link4.html, step up to its parent with '..',
# then read that parent's class attribute.
html.xpath('//a[@href="link4.html"]/../@class')

['item-1']

In [12]:
# Same lookup written with the explicit parent:: axis instead of '..'.
html.xpath('//a[@href="link4.html"]/parent::*/@class')

['item-1']

## 属性匹配

In [13]:
# Predicate on an attribute: only <li> elements whose class is exactly "item-0".
html.xpath('//li[@class="item-0"]')

[<Element li at 0x7f7ac4d05808>, <Element li at 0x7f7ac4d05a88>]

## 文本匹配

In [14]:
# text() after a single '/' returns only text nodes that are direct children of
# the <li>. The item labels live inside <a>, so the only hit is the newline that
# ended up inside the last (auto-closed) <li>.
html.xpath('//li[@class="item-0"]/text()')

['\n']

In [15]:
# Step into the <a> child first, then take its text: yields the item labels.
html.xpath('//li[@class="item-0"]/a/text()')

['first item', 'fifth item']

In [16]:
# '//text()' collects text nodes at any depth below the matched <li> elements,
# so both the <a> labels and the stray newline are returned.
html.xpath('//li[@class="item-0"]//text()')

['first item', 'fifth item', '\n']

## 属性获取

In [17]:
# '@href' as the final step returns the attribute values (strings), not elements.
html.xpath('//li/a/@href')

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']

## 属性多值匹配

In [18]:
# A single <li> whose class attribute holds two space-separated values.
text = '''
    <li class="li li-first"><a href="link.html">first item</a></li>
'''
# Replace the previous tree: `html` now refers to this one-item snippet.
html = etree.HTML(text)

In [19]:
# '=' compares against the whole attribute value; the class is "li li-first",
# not "li", so nothing matches.
html.xpath('//li[@class="li"]/a/text()')

[]

In [20]:
# contains() tests whether the attribute value includes the given substring,
# which handles multi-valued class attributes.
# Caveat: this is a plain substring test, so it would also match any class
# value that merely contains the letters "li".
html.xpath('//li[contains(@class,"li")]/a/text()')

['first item']

## 多属性匹配 

In [21]:
# Same one-item snippet, now with an additional name attribute on the <li>.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
# Re-parse so that `html` reflects the new markup.
html = etree.HTML(text)

In [22]:
# Combine two conditions in one predicate with 'and': the class must contain
# "li" and the name attribute must equal "item".
html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')

['first item']

## 按序选择

In [23]:
# Restore the five-item list: `html` was re-bound to the one-item snippets in
# the preceding cells, and the positional examples need several <li> siblings.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
# Parse the markup again and keep the tree in `html`.
html = etree.HTML(text)

In [24]:
# Positional predicates are 1-based: li[1] is the first <li> among its siblings.
html.xpath('//li[1]/a/text()')

['first item']

In [25]:
# last() evaluates to the position of the final sibling, i.e. the last <li>.
html.xpath('//li[last()]/a/text()')

['fifth item']

In [26]:
# position()<3 keeps the <li> elements at positions 1 and 2.
html.xpath('//li[position()<3]/a/text()')

['first item', 'second item']

In [27]:
# last()-2 counts back from the end: with five items this is position 3.
html.xpath('//li[last()-2]/a/text()')

['third item']

## 节点轴选择

In [37]:
# Five-item list again, but the first link now wraps its label in a <span>,
# giving the first <li> a deeper descendant for the axis examples.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
# Parse the markup and keep the tree in `html`.
html = etree.HTML(text)

In [44]:
# ancestor::* -- every ancestor element of the first <li> (html, body, div, ul).
html.xpath('//li[1]/ancestor::*')

[<Element html at 0x7f7ac4d11bc8>,
 <Element body at 0x7f7ac4d11e48>,
 <Element div at 0x7f7ac4d16a08>,
 <Element ul at 0x7f7ac4d16908>]

In [39]:
# ancestor::div -- restrict the ancestor axis to <div> elements only.
html.xpath('//li[1]/ancestor::div')

[<Element div at 0x7f7ac4d16a08>]

In [45]:
# attribute::* -- the values of all attributes on the first <li>.
html.xpath('//li[1]/attribute::*')

['item-0']

In [41]:
# child::a[...] -- direct <a> children of the first <li>, filtered by href.
html.xpath('//li[1]/child::a[@href="link1.html"]')

[<Element a at 0x7f7ac4d16288>]

In [55]:
# descendant::span -- reach every <span> nested at any depth below the first <li>.
# The trailing /text() extracts the span's text, which is what the recorded
# output (['first item']) shows; without it the call returns Element objects.
html.xpath('//li[1]/descendant::span/text()')

['first item']

In [46]:
# following::* covers all elements that come after the first <li> in document
# order; the [2] predicate then keeps only the second of them (here the <a>
# inside the second <li>), not the whole set.
html.xpath('//li[1]/following::*[2]')

[<Element a at 0x7f7ac4d16548>]

In [54]:
# following-sibling::* -- all later siblings of the first <li>, i.e. the other four <li>.
html.xpath('//li[1]/following-sibling::*')

[<Element li at 0x7f7ac4d23348>,
 <Element li at 0x7f7ac4d23608>,
 <Element li at 0x7f7ac4d23288>,
 <Element li at 0x7f7ac4d23648>]