-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
44 lines (40 loc) · 1.04 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq
class Crawler(object):
    """Download a single page and extract two attributes from it.

    The page is fetched with ``requests`` and queried with ``pyquery``. The
    extracted values are the ``href`` of the ``#bgLink`` element and the
    ``title`` of the ``#sh_cp`` element.
    """

    def get_page_html(self, url, timeout=10):
        """Fetch *url* and return the response body as text.

        :param url: Address to fetch. A falsy value skips the request.
        :param timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Without a timeout a stalled server would block
            the crawler indefinitely.
        :return: The response text when the status code is 200; ``None``
            for any other status code or when *url* is falsy.
        :raises requests.exceptions.RequestException: Propagated unchanged
            on connection failure or when *timeout* expires.
        """
        if not url:
            return None

        response = requests.get(url, timeout=timeout)
        print('正在爬取{},状态码为{}'.format(url, response.status_code))
        if response.status_code == 200:
            return response.text
        return None

    def parse(self, page_url, html):
        """Extract the link and title from a page's HTML.

        :param page_url: URL the HTML was fetched from; used as the base
            for resolving the extracted ``href``.
        :param html: Page source. A falsy value skips parsing.
        :return: ``{'url': ..., 'url_info': ...}`` when *html* is truthy,
            otherwise ``None``. ``'url'`` is ``None`` when the page has no
            ``#bgLink`` href.
        """
        if not html:
            return None

        doc = pq(html)
        href = doc('#bgLink').attr('href')
        url_info = doc('#sh_cp').attr('title')
        data = {
            # urljoin resolves relative and absolute hrefs correctly and
            # avoids doubled slashes; a missing href must not be
            # concatenated (None + str raises TypeError).
            'url': urljoin(page_url, href) if href else None,
            'url_info': url_info,
        }
        print(data)
        return data

    def main(self, url):
        """Crawler entry point: fetch *url* and parse the result.

        :param url: Address of the page to crawl.
        :return: Whatever :meth:`parse` returns - the extracted dict, or
            ``None`` when the page could not be fetched.
        """
        html = self.get_page_html(url)
        return self.parse(url, html)