BaseModule.py

# -*- coding: utf-8 -*-
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from config import FOLLOW_GENTLEMANS_AGREEMENT


class SearchEngineTokenizer:
    def __init__(self):
        raise NotImplementedError('Subclass must implement this method.')

    def tokenize_disordered_with_word_count_and_parse_search(self, input_str_list: list[str]) -> dict[str, str | int]:
        """
        Tokenizes the input list of strings and returns a dictionary where the keys are word IDs or phrase IDs,
        and the values are their corresponding frequencies.
        The dictionary also includes a special key 'ARTICLE_ID', whose value is the concatenation of the IDs
        of the ten most frequent words, joined by equals signs.

        :param input_str_list: Input list of strings; each element is a sentence
        :return: Dictionary of tokenization results
        """
        raise NotImplementedError('Subclass must implement this method.')
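
    # A hypothetical return shape for the method above, assuming string word IDs
    # (the actual ID format is defined by the subclass):
    #   {'w12': 3, 'w47': 2, ..., 'ARTICLE_ID': 'w12=w47=...'}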

    def tokenize_in_order(self, input_str: str) -> dict[str, list[str]]:
        """
        Splits the input string by spaces and converts each word into its corresponding word ID.
        Returns a dictionary where the key is the fixed string 'TOKENS',
        and the value is the list of word IDs in their original order.

        :param input_str: Input string
        :return: Dictionary of tokenization results
        """
        raise NotImplementedError('Subclass must implement this method.')
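

# A minimal illustrative subclass, not part of the original module: it assigns
# word IDs lazily from an in-memory vocabulary and splits on whitespace. A real
# implementation would use a proper segmenter and a persistent ID store.
class ToyTokenizer(SearchEngineTokenizer):
    def __init__(self):
        self.vocab: dict[str, str] = {}  # word -> word ID

    def _word_id(self, word: str) -> str:
        # Assign IDs in first-seen order: w0, w1, w2, ...
        return self.vocab.setdefault(word, f"w{len(self.vocab)}")

    def tokenize_disordered_with_word_count_and_parse_search(self, input_str_list: list[str]) -> dict[str, str | int]:
        counts: dict[str, int] = {}
        for sentence in input_str_list:
            for word in sentence.split():
                word_id = self._word_id(word)
                counts[word_id] = counts.get(word_id, 0) + 1
        # IDs of the ten most frequent words, joined by equals signs.
        top_ten = sorted(counts, key=counts.get, reverse=True)[:10]
        result: dict[str, str | int] = dict(counts)
        result['ARTICLE_ID'] = '='.join(top_ten)
        return result

    def tokenize_in_order(self, input_str: str) -> dict[str, list[str]]:
        return {'TOKENS': [self._word_id(w) for w in input_str.split()]}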


class AIModule:
    def __init__(self):
        raise NotImplementedError('Subclass must implement this method.')

    def get_response(self, input_data: object) -> str:
        raise NotImplementedError('Subclass must implement this method.')
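

# A minimal illustrative AIModule subclass (an echo stub, useful only for
# wiring tests; a real subclass would call an actual model):
class EchoAI(AIModule):
    def __init__(self):
        pass

    def get_response(self, input_data: object) -> str:
        return str(input_data)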


class Crawler:
    def __init__(self, start_url: str, max_depth: int, max_sublinks: int):
        """
        Initialize the crawler with a starting URL, a maximum crawling depth, and a maximum number of sublinks.

        :param start_url: Starting URL
        :param max_depth: Maximum crawling depth
        :param max_sublinks: Maximum number of sublinks
        """
        # When operating in China, you must comply with government regulations on web crawlers.
        # Legal regulation of web crawlers: https://www.cac.gov.cn/2019-06/16/c_1124630015.htm
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_sublinks = max_sublinks
        # Cache of parsed robots.txt ("gentleman's agreement") files, keyed by root URL
        self.robots_dict = {}

    def crawl(self) -> dict[str, dict]:
        """
        Main crawling logic: traverse pages and extract information.

        :return: Dictionary of crawling results; keys are URLs and values are dictionaries
                 containing title, content, icon, main image, and sublinks
        """
        result = {}
        visited_urls = set()
        # Breadth-first queue of (url, depth) pairs, seeded with the start URL
        queue = [(self.start_url, 0)]
        print(f"Starting crawling from: {self.start_url}")
        while queue:
            url, depth = queue.pop(0)
            if depth >= self.max_depth:
                print(f"Skipping URL: {url} due to reaching maximum depth: {self.max_depth}")
                continue
            if len(result) >= self.max_sublinks:
                print(f"Skipping URL: {url} due to reaching maximum sublinks: {self.max_sublinks}")
                continue
            if url in visited_urls:
                print(f"Skipping URL: {url} as it has already been visited")
                continue
            # Respect robots.txt (the "gentleman's agreement") by default.
            if FOLLOW_GENTLEMANS_AGREEMENT:
                root_url = self.get_root_url(url)
                if root_url not in self.robots_dict:
                    robots_txt_url = f"{root_url}/robots.txt"
                    print(f'Parsing robots.txt from {robots_txt_url}...')
                    self.robots_dict[root_url] = self.parse_robots_txt(robots_txt_url)
                    print('-' * 10, 'Robots.txt', '-' * 10)
                    print('Robots.txt result:', self.robots_dict[root_url])
                    print('-' * 10, 'Robots.txt', '-' * 10)
                else:
                    print("This website's robots.txt file has already been parsed.")
                if not self.is_allowed_by_robots_txt(url, self.robots_dict[root_url]):
                    print(f"Skipping URL: {url} as it is not allowed by robots.txt")
                    continue
            print('-' * 10, f"Processing URL #{len(result) + 1}: {url} at depth: {depth}", '-' * 10)
            visited_urls.add(url)
            print(f"Fetching content for URL: {url}")
            content = self.fetch_content(url)
            print(f"Extracting information for URL: {url}")
            title = self.extract_title(content)
            text = self.extract_text(content)
            icon = self.extract_icon(content)
            main_image = self.extract_main_image(content)
            result[url] = {
                'title': title,
                'content': text,
                'icon': icon,
                'main_image': main_image,
                'sublinks': []
            }
            print(f"Extracting links for URL: {url}")
            links = self.extract_links(content)
            for link in links:
                normalized_link = self.normalize_url(link, url)
                if self.is_valid_url(normalized_link):
                    print(f"Adding valid sublink: {normalized_link} for URL: {url}")
                    queue.append((normalized_link, depth + 1))
                    result[url]['sublinks'].append(normalized_link)
                else:
                    print(f"Skipping invalid sublink: {link} for URL: {url}")
        print("Crawling completed")
        return result
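
    # A hypothetical shape of the returned dictionary (illustrative values only):
    #   {'https://example.com': {'title': 'Example', 'content': '...',
    #    'icon': '/favicon.ico', 'main_image': '...', 'sublinks': ['...']}}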

    def extract_title(self, content: str) -> str:
        """
        Extract the title from the webpage content.

        :param content: Webpage content
        :return: Title of the webpage
        """
        raise NotImplementedError('Subclass must implement this method.')

    def fetch_content(self, url: str) -> str:
        """
        Fetch the content of a webpage given its URL.

        :param url: URL of the webpage to fetch
        :return: Content of the webpage
        """
        raise NotImplementedError('Subclass must implement this method.')

    def extract_text(self, content: str) -> str:
        """
        Extract the main text content from the webpage content.

        :param content: Webpage content
        :return: Main text content of the webpage
        """
        raise NotImplementedError('Subclass must implement this method.')

    def extract_icon(self, content: str) -> str:
        """
        Extract the URL of the icon from the webpage content.

        :param content: Webpage content
        :return: URL of the icon
        """
        raise NotImplementedError('Subclass must implement this method.')

    def extract_main_image(self, content: str) -> str:
        """
        Extract the URL of the main image from the webpage content.

        :param content: Webpage content
        :return: URL of the main image
        """
        raise NotImplementedError('Subclass must implement this method.')

    def extract_links(self, content: str) -> list[str]:
        """
        Extract the URLs of links from the webpage content.

        :param content: Webpage content
        :return: List of link URLs
        """
        raise NotImplementedError('Subclass must implement this method.')

    def is_valid_url(self, url: str) -> bool:
        """
        Check whether a URL is valid.

        :param url: URL to check
        :return: Whether the URL is valid
        """
        raise NotImplementedError('Subclass must implement this method.')

    def normalize_url(self, url: str, base_url: str) -> str:
        """
        Normalize a URL by converting a relative URL to an absolute URL.

        :param url: URL to normalize
        :param base_url: Base URL for resolving relative URLs
        :return: Normalized URL
        """
        raise NotImplementedError('Subclass must implement this method.')

    # Robots.txt helpers
    @staticmethod
    def get_root_url(url: str) -> str:
        """
        Get the root URL (scheme and network location) of a URL.

        :param url: URL
        :return: Root URL
        """
        parsed_url = urlparse(url)
        root_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return root_url
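
    # Example: get_root_url("https://example.com/a/b?q=1") -> "https://example.com"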

    @staticmethod
    def parse_robots_txt(robots_txt_url: str) -> RobotFileParser:
        """
        Parse the robots.txt file.

        :param robots_txt_url: URL of the robots.txt file
        :return: Parsed RobotFileParser object
        """
        rp = RobotFileParser()
        rp.set_url(robots_txt_url)
        rp.read()  # fetches the file over the network
        return rp

    @staticmethod
    def is_allowed_by_robots_txt(url: str, rp: RobotFileParser) -> bool:
        """
        Check whether a URL may be crawled according to robots.txt.

        :param url: URL to check
        :param rp: Parsed RobotFileParser object
        :return: Whether crawling is allowed
        """
        # "*" checks the rules that apply to any user agent.
        return rp.can_fetch("*", url)
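

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original interface): a minimal subclass
# that fills in the abstract hooks with `requests` and `BeautifulSoup`. Both
# are assumed third-party dependencies, and the selectors below are common
# conventions, not guarantees; a production crawler would add retries,
# encoding handling, and politeness delays.
from urllib.parse import urljoin

import requests                 # assumed dependency
from bs4 import BeautifulSoup   # assumed dependency


class SimpleCrawler(Crawler):
    def fetch_content(self, url: str) -> str:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text

    def extract_title(self, content: str) -> str:
        soup = BeautifulSoup(content, 'html.parser')
        return soup.title.get_text(strip=True) if soup.title else ''

    def extract_text(self, content: str) -> str:
        soup = BeautifulSoup(content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)

    def extract_icon(self, content: str) -> str:
        # rel='icon' also matches rel="shortcut icon" (rel is multi-valued).
        soup = BeautifulSoup(content, 'html.parser')
        link = soup.find('link', rel='icon')
        return link.get('href', '') if link else ''

    def extract_main_image(self, content: str) -> str:
        # Open Graph main image: a common but not universal convention.
        soup = BeautifulSoup(content, 'html.parser')
        og = soup.find('meta', property='og:image')
        return og.get('content', '') if og else ''

    def extract_links(self, content: str) -> list[str]:
        soup = BeautifulSoup(content, 'html.parser')
        return [a['href'] for a in soup.find_all('a', href=True)]

    def is_valid_url(self, url: str) -> bool:
        parsed = urlparse(url)
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

    def normalize_url(self, url: str, base_url: str) -> str:
        return urljoin(base_url, url)


# Example usage (network access assumed; the URL is a placeholder):
#   crawler = SimpleCrawler("https://example.com", max_depth=2, max_sublinks=20)
#   pages = crawler.crawl()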