# HTML解析入门及准备URL生成连续技
![for humans](https://requests-html.kennethreitz.org/_static/requests-html-logo.png#thumbnail)

*  本周主要内容：批量抓取页面基础及技巧
*  上周主要内容：HTML解析（parse HTML）及准备URL生成连续技
*  20春_Web数据挖掘_week04
*  电子讲义设计者：廖汉腾, 许智超
<br/>
<br/>

-----
## 复习

复习：上周内容，实践

* 猎聘PC版 liepin.com 取工作URL参数的牛肉
* 如何生成一连串新URL以进一步爬取数据


-----
## 本周内容及学习目标

本周内容聚焦在

<mark> 如何有系统的把更多页数据(相同结构)作系统性爬取 </mark>

为此，我们需要学习

* 翻页：参数字典的拆解
  * xpath
  * 建构参数模板
  * 建构参数字典
* 翻页：系统性迭代
  * robots.txt
  * 频率及时间
* 翻页：数据备份与整合
  * 储存备份
  * 数据整合
  
### 目标
1. 使用 requests-html 爬取并存取网页文字档，查找[requests-html 中文文档](https://cncert.github.io/requests-html-doc-cn/#/)
2. 熟悉 [xpath 语法](https://www.w3cschool.cn/xpath/xpath-syntax.html)、[xpath 节点](https://www.w3cschool.cn/xpath/xpath-nodes.html)
3. 使用 [xpath cheatsheet](https://devhints.io/xpath)
  * 在 Chrome Inspector 使用
  * 在 requests-html (Python) 使用
4. 简易使用 [pd.DataFrame](https://www.pypandas.cn/doc/getting_started/dsintro.html#dataframe)
5. 参数字典的拆解与迭代
6. 翻页数据备份与整合

In [1]:
%%html
<style>
/* 本电子讲义使用之CSS */
div.code_cell {
    background-color: #e5f1fe;
}
div.cell.selected {
    background-color: #effee2;
    font-size: 2rem;
    line-height: 2.4rem;
}
div.cell.selected .rendered_html table {
    font-size: 2rem !important;
    line-height: 2.4rem !important;
}
.rendered_html pre code {
    background-color: #C4E4ff;   
    padding: 2px 25px;
}
.rendered_html pre {
    background-color: #99c9ff;
}
div.code_cell .CodeMirror {
    font-size: 2rem !important;
    line-height: 2.4rem !important;
}
.rendered_html img, .rendered_html svg {
    max-width: 60%;
    height: auto;
    float: right;
}

.rendered_html img[src*="#full"], .rendered_html svg[src*="#full"] {
    max-width: 100%;
    height: auto;
    float: none;
}

.rendered_html img[src*="#thumbnail"], .rendered_html svg[src*="#thumbnail"] {
    max-width: 15%;
    height: auto;
}

/* Gradient transparent - color - transparent */
hr {
    border: 0;
    border-bottom: 1px dashed #ccc;
}
.emoticon{
    font-size: 5rem;
    line-height: 4.4rem;
    text-align: center;
    vertical-align: middle;
}
.bg-split_apply_comine {
    width: 500px;     
    height: 300px;
    background: url('02_split-apply-comine_500x300.png') -10px -10px;
    float: right;
}
.bg-comine {
    width: 175px;
    height: 150px;
    background: url('02_split-apply-comine_500x300.png') -280px -80px;
    float: right;
}
.bg-apply {
    width: 155px;
    height: 225px;
    background: url('02_split-apply-comine_500x300.png') -160px -30px;
    float: right;
}
.bg-split {
    width: 205px;
    height: 225px;
    background: url('02_split-apply-comine_500x300.png') -10px -30px;
    float: right;
}
.break {
                   page-break-after: right; 
                   width:700px;
                   clear:both;
}
</style>

In [2]:
# 基本模块
import pandas as pd
from requests_html import HTMLSession

## 0. 上周整合代码

In [3]:
# 上周C-1B-5 建构 参数模板  
参数_compTag_用户体验 = {'中国500强': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['155'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}, '2018互联网300强': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['182'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}, '制造业500强': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['186'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}, 'AI创新成长50强 ': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['189'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}, '独角兽': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['130'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 
'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}, '上市公司': {'init': ['-1'], 'headckid': ['58d828c357a8cb19'], 'flushckid': ['1'], 'fromSearchBtn': ['2'], 'keyword': ['用户体验'], 'compTag': ['156'], 'ckid': ['58d828c357a8cb19'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['6aa779111c1b4ca77cff3648d9dee049'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['6aa779111c1b4ca77cff3648d9dee049']}}

# Last week's C-1: smoke-test one request with the '中国500强' parameter set
url = "https://www.liepin.com/zhaopin/"
session = HTMLSession()
payload = 参数_compTag_用户体验['中国500强']
r = session.get( url, params = payload)

# r.url  (uncomment to inspect the URL that was actually requested)

# Last week's C-2: simplified version of A-1 — fetch and parse a single page
session = HTMLSession()

def requests_liepin(url, params):
    """Fetch one liepin.com search-result page and parse its job list.

    Parameters
    ----------
    url : str
        Address of the search page to request.
    params : dict
        Query-string parameters handed to ``session.get``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<li>`` under ``ul.sojob-list``. Columns are the keys of
        ``dict_xpaths`` (text columns first, then link columns). A field that
        cannot be found in an item is stored as an empty string.

    Notes
    -----
    Uses the notebook-global ``session`` (an ``HTMLSession``).
    """
    # Bug fix: the original passed the notebook-global `payload` here, so the
    # `params` argument was silently ignored.
    r = session.get(url, params=params)

    # Select the job items first; every later XPath is evaluated per item.
    主要元素 = r.html.xpath('//ul[@class="sojob-list"]/li')

    # XPath dictionary: key = name of the field to extract, value = XPath.
    # The outer key selects how the matched node is turned into a value.
    dict_xpaths = {
        'text': {
            'edu':      '//div[contains(@class,"job-info")]/p/span[@class="edu"]',
            '经验':      '//div[contains(@class,"job-info")]/p/span[@class="edu"]/following-sibling::span',
            '薪水':    '//div[contains(@class,"job-info")]/p/span[@class="text-warning"]',
            '时间':    '//div[contains(@class,"job-info")]/p/time/@title',
            '职称':    '//div[contains(@class,"job-info")]/h3/a',
            '公司地点': '//div[contains(@class,"job-info")]/p/a',
            '公司名称': '//div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a',
        },
        'text_content': {
        },
        'href': {
            '链结':    '//div[contains(@class,"job-info")]/h3/a',
            '公司URL': '//div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a',
        }
    }

    def get_e_text_content(_xpath_):
        """Return lxml ``text_content()`` of the first match per item ('' if none)."""
        暂存结果 = []
        for e in 主要元素:
            matches = e.xpath(_xpath_)
            # An item without a match used to raise IndexError.
            暂存结果.append(matches[0].lxml.text_content() if matches else "")
        return 暂存结果

    def get_e_text(_xpath_):
        """Return the stripped text of all matches per item, concatenated."""
        # Attribute XPaths (e.g. .../@title) yield plain strings, element
        # XPaths yield Element objects carrying `.text`.
        return [
            "".join(
                x.strip() if isinstance(x, str) else x.text.strip()
                for x in e.xpath(_xpath_)
            )
            for e in 主要元素
        ]

    def get_e_href(_xpath_):
        """Return one absolute link of the first match per item ('' if none)."""
        暂存结果 = []
        for e in 主要元素:
            # Evaluate the XPath once; guard against items where nothing matches.
            first = e.xpath(_xpath_, first=True)
            links = list(first.absolute_links) if first is not None else []
            暂存结果.append(links[0] if links else "")
        return 暂存结果

    # Build the columns in a fixed order: text_content, text, href.
    数据字典 = {k: get_e_text_content(v) for k, v in dict_xpaths['text_content'].items()}
    数据字典.update({k: get_e_text(v) for k, v in dict_xpaths['text'].items()})
    数据字典.update({k: get_e_href(v) for k, v in dict_xpaths['href'].items()})

    数据 = pd.DataFrame(数据字典)
    #数据.to_excel("20春_Web数据挖掘_week03_liepin.xlsx", sheet_name="搜查结果")
    return 数据


# Last week's C-3: crawl one result page per "popular company type"
url = "https://www.liepin.com/zhaopin/"

list_df = list()
for k,v in 参数_compTag_用户体验.items():
    # Bind the parameter set to the notebook-global name `payload` before passing it on
    payload = v
    df = requests_liepin( url, params = payload)
    df = df.assign (热门公司类型 = k)    
    list_df.append(df)

# Stack the per-type frames into one table
df_all = pd.concat(list_df)
df_all

# Last week's C-4: export to Excel
df_all.to_excel("20春_Web数据挖掘_week03_liepin_各热门公司类型.xlsx", sheet_name="搜查结果")

# Last week's C-5: basic pandas skills

# Number of distinct values per column
print (df_all.nunique())
# Distinct values of the 'edu' column
df_all[['edu']].drop_duplicates()

# Count postings per (company, education requirement), largest first
df_all.groupby(['公司名称','edu']).agg({"职称":"count"}).sort_values(by='职称', ascending=False)

edu         7
经验         11
薪水         92
时间         21
职称        187
公司地点       62
公司名称       62
链结        196
公司URL      63
热门公司类型      6
dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,职称
公司名称,edu,Unnamed: 2_level_1
天能电池集团股份有限公司,统招本科,18
华为,本科及以上,14
恒大集团,硕士及以上,10
奥比中光,硕士及以上,9
天能电池集团股份有限公司,本科及以上,9
...,...,...
广日股份,本科及以上,1
敏实集团,大专及以上,1
新城控股集团住宅开发事业部,本科及以上,1
明略科技集团,统招本科,1



-----

## 本周实践目标
<mark> 如何有系统的把更多页数据(相同结构)作系统性爬取 </mark>[猎聘PC版](https://www.liepin.com/zhaopin/)
* 翻页：参数字典的拆解
  * xpath解析翻页a/@href
  * 建构参数模板
  * 建构参数字典
* 翻页：系统性迭代
  * robots.txt
  * 频率及时间
* 翻页：数据备份与整合
  * 储存备份
  * 数据整合

# 翻页：参数字典的拆解
## xpath解析翻页a/@href

In [4]:
# A-0: fetch a single search-result page (keyword=PRD)
url = "https://www.liepin.com/zhaopin/?keyword=PRD"
session = HTMLSession()
r = session.get( url )

In [5]:
# A-1: parse the pager links (a/@href) with XPath
# First attempt: every <a> in the pager — disabled/current entries carry javascript hrefs
xpath_翻页a = '//div[@class="pagerbar"]/a'
# Refined: keep only the anchors whose href starts with "/zhaopin"
xpath_翻页a = '//div[@class="pagerbar"]/a[starts-with(@href,"/zhaopin")]'
print (r.html.xpath(xpath_翻页a)) # Element objects

# href of every pager link, in document order
href_列表 = [a.xpath('//@href')[0] for a in r.html.xpath(xpath_翻页a)]
#print (href_列表)

# Visible label of every pager link, in the same order
文字_列表 = [a.text for a in r.html.xpath(xpath_翻页a)]
#print (文字_列表)

# Map label -> href; when a label repeats, the later link wins
href_字典 = dict(zip(文字_列表, href_列表))
print (href_字典)

[<Element 'a' href='/zhaopin/?init=-1&headckid=260c913c0571747b&fromSearchBtn=2&keyword=PRD&ckid=260c913c0571747b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=917a54aacc3a11183fcafabd322be908&d_curPage=0&d_pageSize=40&d_headId=917a54aacc3a11183fcafabd322be908&curPage=1'>, <Element 'a' href='/zhaopin/?init=-1&headckid=260c913c0571747b&fromSearchBtn=2&keyword=PRD&ckid=260c913c0571747b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=917a54aacc3a11183fcafabd322be908&d_curPage=0&d_pageSize=40&d_headId=917a54aacc3a11183fcafabd322be908&curPage=2'>, <Element 'a' href='/zhaopin/?init=-1&headckid=260c913c0571747b&fromSearchBtn=2&keyword=PRD&ckid=260c913c0571747b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=917a54aacc3a11183fcafabd322be908&d_curPage=0&d_pageSize=40&d_headId=917a54aacc3a11183fcafabd322be908&curPage=3'>, <Element 'a' href='/zhaopin/?init

### 观察：
此网页是否给出开始、步进、及结束的信息，以方便我们完成迭代设置

* 老问题 URL太长，用上周的URL+query参数解析与pandas数据框找到异同之处
* 老问题 怎么系统化地生成URL？用上周的URL+query参数解析与pandas数据框找到异同之处的时候，顺便构建参数字典，至少让以下参数可调
  * 搜索关键词：上周keyword
  * 页码在哪？
* 实践挑战：如何把上周代码模块化为我们所用？

-----

## 建构参数模板

```python

# 上周B-1 使用 urllib.parse 解析
from urllib.parse import urlparse, parse_qs


# Last week's B-2: load the parsed URLs into a pd.DataFrame and compare distinct-value counts with nunique()
import pandas as pd
df = pd.DataFrame([ urlparse(x) for x in 公司数据选择器链结.values()])
print(df.nunique())

# Last week's B-3: parse the query string one level further
#df_qs = pd.DataFrame([ parse_qs(x) for x in df['query'] ])
df_qs = pd.DataFrame([{k:v[0] for k,v in parse_qs(x).items()} for x in df['query'] ])
# Fix: report the distinct counts of the parsed query parameters (df_qs);
# the original printed df.nunique() a second time.
print(df_qs.nunique())

# 上周B-4 建构 参数模板 及 字典_compTag
def parse_url_qs_for_compTag(url):
    """Return the query string of *url* as a dict mapping name -> list of values."""
    return parse_qs(urlparse(url).query)

# parse_url_qs_for_compTag(list(公司数据选择器链结.values())[0])['compTag']
# Use the parsed query of the first link as the parameter template
参数模板 = parse_url_qs_for_compTag(list(公司数据选择器链结.values())[0])
print(参数模板)
# [ parse_url_qs_for_compTag(x)['compTag'] for x in 公司数据选择器链结.values()]
# First 'compTag' value of every link
[ parse_url_qs_for_compTag(x)['compTag'][0] for x in 公司数据选择器链结.values()]

# Map each key of 公司数据选择器链结 to the first 'compTag' value of its link
字典_compTag = { k:parse_url_qs_for_compTag(v)['compTag'][0] for k,v in 公司数据选择器链结.items()}
print (字典_compTag)

# B-5: parameter-template generator
def 参数模板生成(compTag, keyword):
    """Return a copy of 参数模板 with 'compTag' and 'keyword' set to the given values."""
    return {**参数模板, 'compTag': compTag, 'keyword': keyword}

# One parameter set per entry of 字典_compTag, all for the keyword '用户体验'
参数_compTag_用户体验 = dict(
    (类型, 参数模板生成(compTag=[标签], keyword=['用户体验']))
    for 类型, 标签 in 字典_compTag.items()
)
print(参数_compTag_用户体验)

```

In [6]:
# A-2 建构参数模板：找到关键参数及参数结构

# 需要模组库
from urllib.parse import urlparse, parse_qs
import pandas as pd
from IPython.display import display, HTML

# Overall goal: turn href_列表 into a parameter dictionary

# Split every pager URL into its six components, one row per URL
df = pd.DataFrame(list(map(urlparse, href_列表)))
# Flatten each query string to {name: first value}, one row per URL
df_qs = pd.DataFrame([
    {name: values[0] for name, values in parse_qs(query).items()}
    for query in df['query']
])

# Show each table followed by its per-column distinct-value counts
display(df)
print(df.nunique())
display(df_qs)
print(df_qs.nunique())

df_qs.curPage
# Add an integer version of curPage so min()/max() compare numerically
df_qs = df_qs.assign(curPage_int=df_qs.curPage.astype(int))

Unnamed: 0,scheme,netloc,path,params,query,fragment
0,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,
1,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,
2,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,
3,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,
4,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,
5,,,/zhaopin/,,init=-1&headckid=260c913c0571747b&fromSearchBt...,


scheme      1
netloc      1
path        1
params      1
query       5
fragment    1
dtype: int64


Unnamed: 0,init,headckid,fromSearchBtn,keyword,ckid,siTag,d_sfrom,d_ckId,d_curPage,d_pageSize,d_headId,curPage
0,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,1
1,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,2
2,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,3
3,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,4
4,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,1
5,-1,260c913c0571747b,2,PRD,260c913c0571747b°radeFlag=0,1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw,search_unknown,917a54aacc3a11183fcafabd322be908,0,40,917a54aacc3a11183fcafabd322be908,9


init             1
headckid         1
fromSearchBtn    1
keyword          1
ckid             1
siTag            1
d_sfrom          1
d_ckId           1
d_curPage        1
d_pageSize       1
d_headId         1
curPage          5
dtype: int64


### 观察：
* query
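* curPage 共有 5 个相异值，最大值为 9；本页（curPage=0）没有出现在翻页链结里，需要自己补上
* 注意：ckid 的值里出现了 `°radeFlag=0`，看起来是原始链结中的 `&degradeFlag=0` 被 HTML 解析器把 `&deg` 当成实体 `°` 所致（待确认）；直接沿用此参数模板会把这个错误值一并送出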

-----

## 建构参数模板：curPage


In [7]:
# A-2 建构参数模板：找到关键参数及参数结构

def parse_url_qs_for_curPage(url):
    """Return the query string of *url* as a dict mapping name -> list of values."""
    return parse_qs(urlparse(url).query)

# Take the first pager link as the parameter template
参数模板 = parse_url_qs_for_curPage(href_列表[0])
print (参数模板)

# Label -> href mapping of the pager links, for comparison
print (href_字典)

{'init': ['-1'], 'headckid': ['260c913c0571747b'], 'fromSearchBtn': ['2'], 'keyword': ['PRD'], 'ckid': ['260c913c0571747b°radeFlag=0'], 'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['917a54aacc3a11183fcafabd322be908'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['917a54aacc3a11183fcafabd322be908'], 'curPage': ['1']}
{'2': '/zhaopin/?init=-1&headckid=260c913c0571747b&fromSearchBtn=2&keyword=PRD&ckid=260c913c0571747b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=917a54aacc3a11183fcafabd322be908&d_curPage=0&d_pageSize=40&d_headId=917a54aacc3a11183fcafabd322be908&curPage=1', '3': '/zhaopin/?init=-1&headckid=260c913c0571747b&fromSearchBtn=2&keyword=PRD&ckid=260c913c0571747b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=917a54aacc3a11183fcafabd322be908&d_curPage=0&d_pageSize=40&d_headId=917a54aacc3a11183fcafabd322be908&curPage=2'

In [8]:
# A-3: parameter-template generator for keyword and curPage
def 参数模板生成(keyword, curPage):
    """Return a copy of 参数模板 with 'curPage' and 'keyword' set to the given values."""
    return {**参数模板, 'curPage': curPage, 'keyword': keyword}

# First attempt: one parameter set per pager link found on the current page
参数_keyword_用户体验_curPage = dict(
    (标签, 参数模板生成(curPage=[标签], keyword=['用户体验']))
    for 标签 in href_字典
)

# print(参数_keyword_用户体验_curPage) # only covers the links present on this page: no extrapolation up to &curPage=9, and the current page is missing

print (df_qs.curPage_int.min()) # smallest value is only 1
print (df_qs.curPage_int.max()) # largest value is only 9

# It should run from 0 (current page) ... 9 (largest value)

参数_keyword_用户体验_curPage = dict(
    (i, 参数模板生成(curPage=[i], keyword=['用户体验']))
    for i in range(0, df_qs.curPage_int.max()+1)
)
参数_keyword_用户体验_curPage

1
9


{0: {'init': ['-1'],
  'headckid': ['260c913c0571747b'],
  'fromSearchBtn': ['2'],
  'keyword': ['用户体验'],
  'ckid': ['260c913c0571747b°radeFlag=0'],
  'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'],
  'd_sfrom': ['search_unknown'],
  'd_ckId': ['917a54aacc3a11183fcafabd322be908'],
  'd_curPage': ['0'],
  'd_pageSize': ['40'],
  'd_headId': ['917a54aacc3a11183fcafabd322be908'],
  'curPage': [0]},
 1: {'init': ['-1'],
  'headckid': ['260c913c0571747b'],
  'fromSearchBtn': ['2'],
  'keyword': ['用户体验'],
  'ckid': ['260c913c0571747b°radeFlag=0'],
  'siTag': ['1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw'],
  'd_sfrom': ['search_unknown'],
  'd_ckId': ['917a54aacc3a11183fcafabd322be908'],
  'd_curPage': ['0'],
  'd_pageSize': ['40'],
  'd_headId': ['917a54aacc3a11183fcafabd322be908'],
  'curPage': [1]},
 2: {'init': ['-1'],
  'headckid': ['260c913c0571747b'],
  'fromSearchBtn': ['2'],
  'keyword': ['用户体验'],
  'ckid': ['260c913c0571747b°radeFlag=0'],
  'siTag': ['1B2M2Y8AsgTpgAmY

# 翻页：系统性迭代

## 爬亦有道
* robots.txt 站长/网站拥有者给搜索引擎的"道"
* 频率及时间
  * 不要爬太快
  * 尽量像"人"一样礼貌
  * time.sleep
  
```python

# Last week's C-3: crawl one result page per "popular company type"
url = "https://www.liepin.com/zhaopin/"

list_df = list()
for k,v in 参数_compTag_用户体验.items():
    # Bind the parameter set to the notebook-global name `payload` before passing it on
    payload = v
    df = requests_liepin( url, params = payload)
    df = df.assign (热门公司类型 = k)    
    list_df.append(df)

# Stack the per-type frames into one table
df_all = pd.concat(list_df)
df_all
```

In [9]:
# B-1 上周C-2  简化 上上周A-1   单一页面爬+解析
session = HTMLSession()

def requests_liepin(url, params):
    """Fetch one liepin.com search-result page and parse its job list.

    Parameters
    ----------
    url : str
        Address of the search page to request.
    params : dict
        Query-string parameters handed to ``session.get``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<li>`` under ``ul.sojob-list``. Columns are the keys of
        ``dict_xpaths`` (text columns first, then link columns). A field that
        cannot be found in an item is stored as an empty string.

    Notes
    -----
    Uses the notebook-global ``session`` (an ``HTMLSession``).
    """
    # Bug fix: the original passed the notebook-global `payload` here, so the
    # `params` argument was silently ignored.
    r = session.get(url, params=params)

    # Select the job items first; every later XPath is evaluated per item.
    主要元素 = r.html.xpath('//ul[@class="sojob-list"]/li')

    # XPath dictionary: key = name of the field to extract, value = XPath.
    # The outer key selects how the matched node is turned into a value.
    dict_xpaths = {
        'text': {
            'edu':      '//div[contains(@class,"job-info")]/p/span[@class="edu"]',
            '经验':      '//div[contains(@class,"job-info")]/p/span[@class="edu"]/following-sibling::span',
            '薪水':    '//div[contains(@class,"job-info")]/p/span[@class="text-warning"]',
            '时间':    '//div[contains(@class,"job-info")]/p/time/@title',
            '职称':    '//div[contains(@class,"job-info")]/h3/a',
            '公司地点': '//div[contains(@class,"job-info")]/p/a',
            '公司名称': '//div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a',
        },
        'text_content': {
        },
        'href': {
            '链结':    '//div[contains(@class,"job-info")]/h3/a',
            '公司URL': '//div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a',
        }
    }

    def get_e_text_content(_xpath_):
        """Return lxml ``text_content()`` of the first match per item ('' if none)."""
        暂存结果 = []
        for e in 主要元素:
            matches = e.xpath(_xpath_)
            # An item without a match used to raise IndexError.
            暂存结果.append(matches[0].lxml.text_content() if matches else "")
        return 暂存结果

    def get_e_text(_xpath_):
        """Return the stripped text of all matches per item, concatenated."""
        # Attribute XPaths (e.g. .../@title) yield plain strings, element
        # XPaths yield Element objects carrying `.text`.
        return [
            "".join(
                x.strip() if isinstance(x, str) else x.text.strip()
                for x in e.xpath(_xpath_)
            )
            for e in 主要元素
        ]

    def get_e_href(_xpath_):
        """Return one absolute link of the first match per item ('' if none)."""
        暂存结果 = []
        for e in 主要元素:
            # Evaluate the XPath once; guard against items where nothing matches.
            first = e.xpath(_xpath_, first=True)
            links = list(first.absolute_links) if first is not None else []
            暂存结果.append(links[0] if links else "")
        return 暂存结果

    # Build the columns in a fixed order: text_content, text, href.
    数据字典 = {k: get_e_text_content(v) for k, v in dict_xpaths['text_content'].items()}
    数据字典.update({k: get_e_text(v) for k, v in dict_xpaths['text'].items()})
    数据字典.update({k: get_e_href(v) for k, v in dict_xpaths['href'].items()})

    数据 = pd.DataFrame(数据字典)
    #数据.to_excel("20春_Web数据挖掘_week03_liepin.xlsx", sheet_name="搜查结果")
    return 数据


## 爬亦有道- 不要爬太快
time.sleep

In [84]:
%%time
# Pause for 3 s plus a random extra of up to 4 s; %%time reports how long the cell took
time.sleep(3+4*random())

Wall time: 4.6 s


In [64]:
# B-2 多个页面，但放慢脚步 time.sleep
%%time

import time
from random import random

url = "https://www.liepin.com/zhaopin/"

list_df = list()
for k,v in 参数_keyword_用户体验_curPage.items():
    payload = v
    df = requests_liepin( url, params = payload)
    time.sleep(3+4*random())  #放慢脚步 3-7秒, 平均约5秒
    df = df.assign (curPage = k)  # 区分  curPage
    list_df.append(df)

df_all = pd.concat(list_df).reset_index()
df_all.index.name = '序'

# 上周C-4   输出
df_all.to_excel("20春_Web数据挖掘_week04_liepin_翻页.xlsx",\
                sheet_name="用户体验")

# 预估时间: 5秒*10 =50
# 预估数量: 40*10 =400

Unnamed: 0,edu,经验,薪水,时间,职称,公司地点,公司名称,链结,公司URL,curPage
0,本科及以上,7年以上,面议,2020年04月04日,营销副总（项目）,佛山,中国电建地产集团华南区域总部,https://www.liepin.com/job/1927229275.shtml,https://www.liepin.com/company/9624513/,0
1,本科及以上,7年以上,面议,2020年04月04日,营销副总（项目）,广州-石井,中国电建地产集团华南区域总部,https://www.liepin.com/job/1927229273.shtml,https://www.liepin.com/company/9624513/,0
2,本科及以上,7年以上,面议,2020年04月04日,营销副总（项目）,深圳,中国电建地产集团华南区域总部,https://www.liepin.com/job/1927229271.shtml,https://www.liepin.com/company/9624513/,0
3,本科及以上,1年以上,10-12k·13薪,2020年04月04日,c++开发工程师,北京,北京赛普泰克技术有限公司,https://www.liepin.com/job/1927229255.shtml,https://www.liepin.com/company/7884122/,0
4,统招本科,6年以上,15-25k·16薪,2020年04月04日,干部管理经理,深圳,华星光电,https://www.liepin.com/job/1927228767.shtml,https://www.liepin.com/company/671058/,0
...,...,...,...,...,...,...,...,...,...,...
35,大专及以上,2年以上,6-10k·12薪,2020年04月03日,人力资源专员/招聘,宁波-鄞州区,宁波渠成集团有限公司,https://www.liepin.com/job/1925226785.shtml,https://www.liepin.com/company/8096424/,9
36,统招本科,经验不限,8-15k·14薪,2020年04月03日,卫星电源控制器研发工程师,苏州-虎丘区,苏州馥昶空间技术有限公司,https://www.liepin.com/job/1925224865.shtml,https://www.liepin.com/company/9665557/,9
37,本科及以上,3年以上,面议,2020年04月03日,销售部销售经理,上海,上海大丁自动化科技有限公司,https://www.liepin.com/job/1925196603.shtml,https://www.liepin.com/company/10058443/,9
38,大专及以上,2年以上,5-8k·12薪,2020年04月03日,ie工程师,中山,中山市皇鼎五金制品有限公司,https://www.liepin.com/job/1925097067.shtml,https://www.liepin.com/company/9190284/,9


In [None]:
## 多个页面+多个关键词
time.sleep

In [86]:
%%time
# B-3 多个页面+多个关键词
import time
from random import random

url = "https://www.liepin.com/zhaopin/"
xpath_翻页a = '//div[@class="pagerbar"]/a[starts-with(@href,"/zhaopin")]'

keywords = ['用户体验','UX']
list_df = list()

## 第一页试探有多长的页面
for key in keywords:
    payload = 参数模板生成(keyword=[key], curPage=['0'])
    df = requests_liepin( url, params = payload)
    href_列表 = [x.xpath('//@href')[0] for x in r.html.xpath(xpath_翻页a)]
    df = pd.DataFrame([ urlparse(x) for x in href_列表])
    df_qs = pd.DataFrame([{k:v[0] for k,v in parse_qs(x).items()} for x in df['query'] ])
    df_qs = df_qs.assign (curPage_int=df_qs.curPage.astype(int)) # 变成整数
    长度 = df_qs.curPage_int.max()+1
    参数_keyword_X_curPage = { 
        i:参数模板生成(curPage = [i], \
                      keyword = [key]) \
        for i in range(0,长度)\
        }
    #print (参数_keyword_X_curPage)
    print (key,长度)
    
    for k,v in 参数_keyword_X_curPage.items():
        payload = v
        df = requests_liepin( url, params = payload)
        time.sleep(3+4*random())  #放慢脚步 3-7秒, 平均约5秒
        df = df.assign (keyword = key)  # 区分  keyword    
        df = df.assign (curPage = k)  # 区分  curPage    
        list_df.append(df)
        
df_all = pd.concat(list_df).reset_index()
df_all.index.name = '序'

df_all.to_excel("20春_Web数据挖掘_week04_liepin_翻页.xlsx",\
                sheet_name="_".join(keywords))
# 预估时间: 2*5秒*10 =100
# 预估数量: 2*40*10 =800

用户体验 10
UX 10
Wall time: 1min 54s


# 翻页：数据备份与整合
多个页面+多个关键词执行时，若怕中断最好把每一页的df内容备份做中继

In [11]:
%%time
# C-1 多个页面+多个关键词
import time
from random import random

url = "https://www.liepin.com/zhaopin/"
xpath_翻页a = '//div[@class="pagerbar"]/a[starts-with(@href,"/zhaopin")]'

keywords = ['用户体验','UX','产品需求','PRD']
list_df = list()

## 第一页试探有多长的页面
for key in keywords:
    payload = 参数模板生成(keyword=[key], curPage=['0'])
    df = requests_liepin( url, params = payload)
    href_列表 = [x.xpath('//@href')[0] for x in r.html.xpath(xpath_翻页a)]
    df = pd.DataFrame([ urlparse(x) for x in href_列表])
    df_qs = pd.DataFrame([{k:v[0] for k,v in parse_qs(x).items()} for x in df['query'] ])
    df_qs = df_qs.assign (curPage_int=df_qs.curPage.astype(int)) # 变成整数
    长度 = df_qs.curPage_int.max()+1
    参数_keyword_X_curPage = { 
        i:参数模板生成(curPage = [i], \
                      keyword = [key]) \
        for i in range(0,长度)\
        }
    #print (参数_keyword_X_curPage)
    print (key,长度)
    
    for k,v in 参数_keyword_X_curPage.items():
        payload = v
        df = requests_liepin( url, params = payload)
        time.sleep(3+4*random())  #放慢脚步 3-7秒, 平均约5秒
        ## 备份
        df.to_csv("20春_Web数据挖掘_week04_liepin_{key}_{k}.tsv"\
                  .format(key=key, k=k), sep="\t", encoding="utf8")
        
        df = df.assign (keyword = key)  # 区分  keyword    
        df = df.assign (curPage = k)  # 区分  curPage    
        list_df.append(df)
        
df_all = pd.concat(list_df).reset_index()
df_all.index.name = '序'

df_all.to_excel("20春_Web数据挖掘_week04_liepin_翻页_4.xlsx",\
                sheet_name="_".join(keywords))
# 预估时间: 4*5秒*10 =200
# 预估数量: 4*40*10 =1600

用户体验 10
UX 10
产品需求 10
PRD 10
Wall time: 3min 54s


# 本周练习

* 开始试验各类参数的调整
