main.py
import re
from collections import deque
from urllib.parse import urljoin

import requests

# Compile case-insensitively, and with DOTALL for the patterns whose match may
# span multiple lines. (Flags must live here: findall() has no flags argument.)
LI_A_PATTERN = re.compile(r'<li class="item">.*?</li>', re.I | re.S)
A_TEXT_PATTERN = re.compile(r'<a\s+[^>]*?>(.*?)</a>', re.I | re.S)
A_HREF_PATTERN = re.compile(r'<a\s+[^>]*?href="(.*?)"\s*[^>]*?>', re.I)


def decode_page(page_bytes, charsets):
    """Decode the page bytes using the given charsets."""
    for charset in charsets:
        try:
            return page_bytes.decode(charset)
        except UnicodeDecodeError:
            pass
    # Falls through to None if no charset works.


def get_matched_parts(content_string, pattern):
    """Extract every substring of the string that matches the regex."""
    # Note: pattern.findall(s, re.I) would pass re.I as the *start position*,
    # not as flags -- flags belong in re.compile() above.
    return pattern.findall(content_string) if content_string else []


def get_matched_part(content_string, pattern, group_no=1):
    """Extract the first substring of the string that matches the regex."""
    match = pattern.search(content_string)
    if match:
        return match.group(group_no)


def get_page_html(seed_url, *, charsets=('utf-8', )):
    """Fetch the HTML source of a page."""
    resp = requests.get(seed_url)
    if resp.status_code == 200:
        return decode_page(resp.content, charsets)
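# Note: a more robust fetcher would pass a timeout (e.g.
# requests.get(url, timeout=10)) and catch requests.RequestException;
# both are omitted here to keep the example minimal.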


def repair_incorrect_href(current_url, href):
    """Normalize an href attribute scraped from the page."""
    if href.startswith('//'):
        # Protocol-relative URL: borrow a scheme.
        href = urljoin('http://', href)
    elif href.startswith('/'):
        # Root-relative URL: resolve against the current page.
        href = urljoin(current_url, href)
    return href if href.startswith('http') else ''
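# Illustrative examples (assumed inputs, nothing fetched):
#   repair_incorrect_href('https://sports.sohu.com/s/nba', '/a/123')
#       -> 'https://sports.sohu.com/a/123'
#   repair_incorrect_href('https://sports.sohu.com/s/nba', '//img.sohu.com/x.png')
#       -> 'http://img.sohu.com/x.png'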


def start_crawl(seed_url, pattern, *, max_depth=-1):
    """Crawl pages breadth-first, starting from the seed URL."""
    new_urls, visited_urls = deque(), set()
    new_urls.append((seed_url, 0))
    while new_urls:
        current_url, depth = new_urls.popleft()
        visited_urls.add(current_url)
        # With max_depth=-1 the depth check never triggers, so the crawl
        # is unbounded.
        if depth != max_depth:
            page_html = get_page_html(current_url, charsets=('utf-8', 'gbk'))
            contents = get_matched_parts(page_html, pattern)
            for content in contents:
                text = get_matched_part(content, A_TEXT_PATTERN)
                href = get_matched_part(content, A_HREF_PATTERN)
                if href:
                    href = repair_incorrect_href(current_url, href)
                    print(text, href)
                    if href and href not in visited_urls:
                        new_urls.append((href, depth + 1))


def main():
    """Main function."""
    start_crawl(
        seed_url='https://sports.sohu.com/s/nba',
        pattern=LI_A_PATTERN,
        max_depth=2
    )


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    main()
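
A quick, network-free way to sanity-check the regex helpers; the sample markup below is an assumption for illustration, not actual sohu.com HTML:

sample = '<li class="item"><a href="/a/123">NBA headline</a></li>'
item = get_matched_parts(sample, LI_A_PATTERN)[0]
print(get_matched_part(item, A_TEXT_PATTERN))  # NBA headline
print(get_matched_part(item, A_HREF_PATTERN))  # /a/123
print(repair_incorrect_href('https://sports.sohu.com/s/nba', '/a/123'))
# https://sports.sohu.com/a/123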