### xpath解析
- 通用性比较强
- 环境的安装: pip install lxml
- 解析原理:
    - 1. 实例化一个etree对象，且将解析的页面源码加载到该对象中
    - 2. 使用该对象中的xpath方法结合着xpath表达式进行标签定位和数据解析提取
- etree对象的实例化:
    - 本地加载: 
        tree = etree.parse("filePath")
    - 网络加载: 
        tree = etree.HTML(page_text)

#### 常用的xpath表达式: 基于标签的层级实现定位,返回的永远是一个列表
- /: 放在表达式最左侧表示从根标签开始做层级定位；放在标签之间表示一个层级（直接子标签）
- //: 放在表达式最左侧表示从任意位置定位标签；放在标签之间表示多个层级（任意后代标签）
- 属性定位: tag[@attrName="attrValue"]
- 索引定位: //div[@class="tang"]/ul/li[5] 注意索引值是从1开始
- 取文本:
    - 取直系文本内容: /text()
    - 取所有文本内容: //text()
- 取属性: /@attrName

In [1]:
from lxml import etree

In [2]:
# Load the local HTML file into an lxml ElementTree. The bare `tree`
# expression on the next line only makes the notebook display the object.
# NOTE(review): etree.parse() uses lxml's XML parser unless a parser is
# passed explicitly, so this presumably relies on test_page.html being
# well-formed markup -- confirm, or pass an etree.HTMLParser() instead.
tree = etree.parse('./test_page.html')
tree

<lxml.etree._ElementTree at 0x17035a6f6c8>

In [6]:
# '//' searches from any position: select every <div> in the document.
# xpath() returns the matches as a list of Element objects.
tree.xpath('//div')

[<Element div at 0x17035aed988>,
 <Element div at 0x17035b305c8>,
 <Element div at 0x17035b30608>]

In [5]:
# Attribute filter: keep only the <div> elements whose class attribute
# equals "song". The result is still a list, even for a single match.
tree.xpath('//div[@class="song"]')

[<Element div at 0x17035b305c8>]

In [7]:
# A single '/' steps down one level: <img> elements that are direct
# children of the class="song" div.
tree.xpath('//div[@class="song"]/img')

[<Element img at 0x17035b30748>]

In [9]:
# '/@src' selects the attribute value instead of the element; [0] takes the
# first item out of the list that xpath() returns.
tree.xpath('//div[@class="song"]/img/@src')[0]

'http://www.baidu.com/meinv.jpg'

In [17]:
# Task 1: get the text "杜小月" located under the class="tang" div.
# XPath positions are 1-based, so li[6] is the sixth <li>; /text() returns
# the direct text of the <b> element.
tree.xpath('//div[@class="tang"]/ul/li[6]/b/text()')[0]

'杜小月'

In [24]:
# Task 2: get the passage containing "总为浮云能", i.e. the direct text of
# the second <a> that is a direct child of the class="song" div.
# NOTE(review): the output recorded below this cell is a list of <a>
# elements, which does not match this expression (it yields a string) --
# the saved output looks stale; re-run the cell to confirm.
tree.xpath('//div[@class="song"]/a[2]/text()')[0]

[<Element a at 0x17035b30bc8>,
 <Element a at 0x17035b30f88>,
 <Element a at 0x17035b30fc8>]

In [27]:
# Task 3: extract the URL "http://www.haha.com" by reading the href
# attribute of the <a> element whose id is "feng".
tree.xpath('//a[@id="feng"]/@href')[0]

'http://www.haha.com'

In [30]:
# 案例1： 爬取58二手房上面的房源信息
import requests
from lxml import etree

url = "https://sz.58.com/ershoufang/?PGTID=0d100000-0000-43a0-55c0-fbb7aead05c1&ClickID=2"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
}
# Seconds to wait for each HTTP response. Without a timeout, requests can
# block forever on a stalled server.
REQUEST_TIMEOUT = 10

# Request the second-hand housing list page and keep its HTML source.
page_text = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text

# Parse the list page; every <li> under the "house-list-wrap" <ul> is one listing.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

all_data_list = []
for li in li_list:
    # xpath() always returns a list; an <li> without the expected
    # div[2]/h2/a structure yields an empty one, so check before indexing
    # instead of letting [0] raise IndexError and abort the whole crawl.
    title_nodes = li.xpath('./div[2]/h2/a/text()')
    href_nodes = li.xpath('./div[2]/h2/a/@href')
    if not title_nodes or not href_nodes:
        continue

    title = title_nodes[0]
    detail_url = href_nodes[0]
    # All text nodes under the third <div>, kept as the raw list.
    price = li.xpath('./div[3]//text()')

    # Request the detail page of this listing and parse its HTML.
    detail_page_text = requests.get(
        url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
    ).text
    detail_tree = etree.HTML(detail_page_text)
    detail_content = detail_tree.xpath('//div[@id="generalSituation"]/div//text()')

    all_data_list.append({
        "title": title,
        "price": price,
        "detail_content": detail_content,
    })

print(all_data_list)

# 获取到的价格信息有乱码，解决乱码的方式参见我的博客（顺便加个关注）： https://www.cnblogs.com/tiger666/articles/11414949.html

[{'title': '国展苑 业主已经下定其他新房 低于市场价急售此房\xa0', 'price': ['\n                            ', '142', '万', '\n                            ', '28389元/㎡', '\n                        '], 'detail_content': ['\n                ', '\n                                            ', '\n                            ', '房屋总价', '\n                            ', '\n                                    龤餼龒万(单价龒鸺麣鸺齤元/㎡)\n                                ', '\n                        ', '\n                                                                ', '\n                            ', '房屋户型', '\n                            ', '3室2厅1卫', '\n                        ', '\n                                                                ', '\n                            ', '房本面积', '\n                            ', '50.02㎡', '\n                        ', '\n                                                                ', '\n                            ', '房屋朝向', '\n                            ', '南北', '\n        

In [35]:
# 案例2: 解析出所有城市名称https://www.aqistudy.cn/historydata/
import requests
from lxml import etree

url = "https://www.aqistudy.cn/historydata/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
    )
}

# Download the page source with requests and hand it to lxml.
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)

# Two locations inside the class="bottom" div hold link texts:
#   - <li> items directly under the <ul>
#   - <li> items inside the second <div> under the <ul>
# The XPath union operator '|' collects both in a single query.
direct_li_xpath = '//div[@class="bottom"]/ul/li/a/text()'
nested_li_xpath = '//div[@class="bottom"]/ul/div[2]/li/a/text()'
city_list = tree.xpath(" | ".join((direct_li_xpath, nested_li_xpath)))

print(city_list)

['北京', '上海', '广州', '深圳', '杭州', '天津', '成都', '南京', '西安', '武汉', '阿坝州', '安康', '阿克苏地区', '阿里地区', '阿拉善盟', '阿勒泰地区', '安庆', '安顺', '鞍山', '克孜勒苏州', '安阳', '蚌埠', '白城', '保定', '北海', '宝鸡', '北京', '毕节', '博州', '白山', '百色', '保山', '白沙', '包头', '保亭', '本溪', '巴彦淖尔', '白银', '巴中', '滨州', '亳州', '长春', '昌都', '常德', '成都', '承德', '赤峰', '昌吉州', '五家渠', '昌江', '澄迈', '重庆', '长沙', '常熟', '楚雄州', '朝阳', '沧州', '长治', '常州', '潮州', '郴州', '池州', '崇左', '滁州', '定安', '丹东', '东方', '东莞', '德宏州', '大理州', '大连', '大庆', '大同', '定西', '大兴安岭地区', '德阳', '东营', '黔南州', '达州', '德州', '儋州', '鄂尔多斯', '恩施州', '鄂州', '防城港', '佛山', '抚顺', '阜新', '阜阳', '富阳', '抚州', '福州', '广安', '贵港', '桂林', '果洛州', '甘南州', '固原', '广元', '贵阳', '甘孜州', '赣州', '广州', '淮安', '海北州', '鹤壁', '淮北', '河池', '海东地区', '邯郸', '哈尔滨', '合肥', '鹤岗', '黄冈', '黑河', '红河州', '怀化', '呼和浩特', '海口', '呼伦贝尔', '葫芦岛', '哈密地区', '海门', '海南州', '淮南', '黄南州', '衡水', '黄山', '黄石', '和田地区', '海西州', '河源', '衡阳', '汉中', '杭州', '菏泽', '贺州', '湖州', '惠州', '吉安', '金昌', '晋城', '景德镇', '金华', '西双版纳州', '九江', '吉林', '即墨', '江门', '荆门', '佳木斯', '济南', '济宁', '胶南', '酒泉', '句容', '湘西州', '

In [38]:
# 案例3(处理中文乱码)：解析图片数据：http://pic.netbian.com/4kqiche/
import requests
from lxml import etree

url = "http://pic.netbian.com/4kqiche/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
}

# A timeout keeps the script from hanging forever on a stalled server.
response = requests.get(url=url, headers=headers, timeout=10)
# The page bytes are GBK. The earlier per-title workaround
# (encode('ISO-8859-1').decode('gbk')) only worked while requests decoded
# the body as ISO-8859-1, and would raise UnicodeEncodeError otherwise.
# Declaring the encoding once, before reading .text, decodes the whole
# page correctly no matter what requests guessed.
response.encoding = "gbk"
page_text = response.text

tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="clearfix"]/li')

for li in li_list:
    # xpath() returns a list; skip an <li> that lacks the expected
    # <a><b>title</b></a> structure instead of failing with IndexError.
    title_nodes = li.xpath('./a/b/text()')
    href_nodes = li.xpath('./a/@href')
    if not title_nodes or not href_nodes:
        continue

    title = title_nodes[0]
    # The href is used as a site-relative path, so prefix the site root.
    detail_url = "http://pic.netbian.com" + href_nodes[0]

    print(title, detail_url)

《Karma SC1 Vision Con http://pic.netbian.com/tupian/24859.html
2019 Ford GT MK II 福特 http://pic.netbian.com/tupian/24849.html
福特ford gt mk ii 4k跑 http://pic.netbian.com/tupian/24831.html
法拉利ferrari 488 pist http://pic.netbian.com/tupian/24830.html
奥迪audi r8 lms gt2 赛 http://pic.netbian.com/tupian/24829.html
2019年法拉利Portofino跑 http://pic.netbian.com/tupian/23939.html
2019年劳斯莱斯幽灵黑徽 http://pic.netbian.com/tupian/23824.html
2019 McLaren Senna GTR http://pic.netbian.com/tupian/23823.html
2019年劳斯莱斯幽灵黑徽 http://pic.netbian.com/tupian/23822.html
迈凯伦McLaren 600LT Sp http://pic.netbian.com/tupian/23695.html
白色劳斯莱斯5k图片 http://pic.netbian.com/tupian/23654.html
《迈凯伦720S GT3》4k壁 http://pic.netbian.com/tupian/23426.html
兰博基尼Lamborghini Ur http://pic.netbian.com/tupian/23017.html
保时捷Porsche 911 Carr http://pic.netbian.com/tupian/23016.html
兰博基尼LP580橙色跑车4 http://pic.netbian.com/tupian/22673.html
奔驰银箭Mercedes-Benz  http://pic.netbian.com/tupian/22109.html
劳斯莱斯幻影Rolls-Royc http://pic.netbian.com/tupian/22