# 解析 xpath的基本使用

将 lxml 库安装到 Python 解释器的环境中

`pip install lxml -i https://pypi.douban.com/simple`

In [1]:
from lxml import etree

# xpath解析
# (1) 本地文件                                                    etree.parse
# (2) 服务器响应的数据 response.read().decode('utf-8') *********   etree.HTML()

In [3]:
# Parse a local HTML file into an element tree that can be queried with XPath.
html_path = '020_解析_xpath的基本使用.html'
tree = etree.parse(html_path)
print(tree)

# Troubleshooting. If this cell raises something like:
#   XMLSyntaxError: Opening and ending tag mismatch: meta line 5 and head, line 7, column 8
# the file is not well-formed XML. etree.parse() with its default parser
# follows the XML rules strictly, so an unclosed <meta> tag is an error.
# Fix the file by self-closing the tag:
#   <meta charset="UTF-8">  ->  <meta charset="UTF-8" />
# Alternative (see the lxml parsing docs): pass etree.HTMLParser() as the
# second argument of etree.parse() to use the lenient HTML parser instead.

<lxml.etree._ElementTree object at 0x7ff5cc2365f0>


In [None]:
"""
tree.xpath('xpath路径')
    1. 路径查询
        //：查找所有子孙节点，不考虑层级关系
        /：查找直接子节点
    2. 谓词查询
        //div[@id]
        //div[@id="maincontent"]
    3.  属性查询
        //@class
    4. 模糊查询
        //div[contains(@id, 'header')]
        //div[starts-with(@id, 'he')]
    5. 内容查询
       //div/h1/text()
    6. 逻辑运算
       //div[@id="head" and @class="s_down"]
       //title | //price 
"""

In [31]:

# Select every <li> that is a direct child of a <ul>, anywhere in the document.
li_list = tree.xpath('//ul/li')

# Show the matched elements, then how many were found.
print(li_list, len(li_list), sep='\n')

[<Element li at 0x7f586574e820>, <Element li at 0x7f5864297140>, <Element li at 0x7f5864297f50>, <Element li at 0x7f58642978c0>, <Element li at 0x7f58642975a0>, <Element li at 0x7f5864297870>, <Element li at 0x7f5864297550>, <Element li at 0x7f5865753410>]
8


In [32]:
# Predicate [@id]: keep only the <li> elements that carry an id attribute.
li_list = tree.xpath('//ul/li[@id]')

# Show the matched elements, then how many were found.
print(li_list, len(li_list), sep='\n')

[<Element li at 0x7f586574e820>, <Element li at 0x7f5864297140>]
2


In [33]:
# text() returns the text content of the matched tags instead of the elements,
# so li_list holds strings here.
li_list = tree.xpath('//ul/li[@id]/text()')

# Show the extracted texts, then how many were found.
print(li_list, len(li_list), sep='\n')

['北京', '上海']
2


In [34]:
# Select the <li> whose id equals "l1" and read its text.
# Mind the quoting: the XPath string value uses double quotes because the
# surrounding Python literal uses single quotes.
li_list = tree.xpath('//ul/li[@id="l1"]/text()')

# Show the extracted texts, then how many were found.
print(li_list, len(li_list), sep='\n')

['北京']
1


In [35]:
# @class at the end of the path returns the value of the class attribute
# of the <li> whose id is "l1".
li_list = tree.xpath('//ul/li[@id="l1"]/@class')

# Show the attribute values, then how many were found.
print(li_list, len(li_list), sep='\n')

['c1']
1


In [36]:
# Fuzzy match with contains(): <li> elements whose id contains the letter "l".
li_list = tree.xpath('//ul/li[contains(@id, "l")]/text()')

# Show the extracted texts, then how many were found.
print(li_list, len(li_list), sep='\n')

['北京', '上海']
2


In [37]:
# Fuzzy match with starts-with(): <li> elements whose id begins with "l".
li_list = tree.xpath('//ul/li[starts-with(@id, "l")]/text()')

# Show the extracted texts, then how many were found.
print(li_list, len(li_list), sep='\n')

['北京', '上海']
2


In [None]:
# Logical operators inside a predicate: `and` / `or` on attributes.

# `and`: the <li> whose id is "l1" and whose class is "c1".
li_list = tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()')
print(li_list, len(li_list), sep='\n')

# `or`: every <li> whose id is either "l1" or "c3".
# NOTE(review): "c3" looks like a class-style value being compared against id;
# confirm against the HTML file that an element with id="c3" is intended.
li_list = tree.xpath('//ul/li[@id="l1" or @id="c3"]/text()')
print(li_list, len(li_list), sep='\n')

['北京']
1
['北京', '广州']
2


In [52]:
# Union operator `|`: merge the results of two independent path expressions,
# here the text of the <li> with id "l1" and the text of the <li> with id "l2".
li_list = tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="l2"]/text()')

# Show the extracted texts, then how many were found.
print(li_list, len(li_list), sep='\n')

['北京', '上海']
2


In [None]:
"""
补充用法
tree.xpath("string(.)")
    . 表示当前节点。
    string(.) 将当前节点及其所有子节点的文本内容连接成一个字符串。
"""
# Collect the text of a node together with the text of all its descendants.
# First find the <ul> elements, then evaluate string(.) relative to the first
# one; "." is that <ul> itself.
ul_nodes = tree.xpath('//ul')
ul = ul_nodes[0].xpath('string(.)')
print(ul)


        北京
        上海
        广州
        深圳
    
