Commit

Modify the web crawler filtering mechanism, extract table contents to web crawler result
yym68686 committed Dec 13, 2023
1 parent 8711e5a commit 990362f
Showing 3 changed files with 35 additions and 11 deletions.
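In short, the commit makes Web_crawler pull <table> text out of the page first and prepend it to the body text, so table data is no longer lost when newlines are stripped. It also shortens the request timeout from 5s to 3s and returns a sentinel "@Trash@" message for missing or blocked pages unless the call comes from search (isSearch=True). A small, self-contained illustration of the table-extraction step (the sample HTML and variable names here are made up for demonstration, not taken from the repository):

from bs4 import BeautifulSoup

html = "<html><body><p>Price list:</p><table><tr><td>ecs.g6.large</td><td>1.05</td></tr></table><p>Updated 2023.</p></body></html>"
soup = BeautifulSoup(html, "lxml")

# Collect table text first, then drop the tables so their text is not duplicated in the body.
table_contents = ""
for table in soup.find_all("table"):
    table_contents += table.get_text()
    table.decompose()

body = "".join(soup.find("body").get_text().split("\n"))
result = table_contents + body  # table text is prepended to the remaining body text
print(result)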
25 changes: 19 additions & 6 deletions test/test_Web_crawler.py
@@ -63,30 +63,41 @@
# print('\033[0m')
# return result

def Web_crawler(url: str) -> str:
def Web_crawler(url: str, isSearch=False) -> str:
"""返回链接网址url正文内容,必须是合法的网址"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
result = ''
try:
requests.packages.urllib3.disable_warnings()
response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
if response.status_code == 404:
print("Page not found:", url)
return ""
return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
content_length = int(response.headers.get('Content-Length', 0))
if content_length > 5000000:
print("Skipping large file:", url)
return result
soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')

table_contents = ""
tables = soup.find_all('table')
for table in tables:
table_contents += table.get_text()
table.decompose()
body = "".join(soup.find('body').get_text().split('\n'))
result = body
result = table_contents + body
if result == '' and not isSearch:
result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
if result.count("\"") > 1000:
result = ""
except Exception as e:
print('\033[31m')
print("error url", url)
print("error", e)
print('\033[0m')
print("url content", result + "\n\n")
return result

# def Web_crawler(url: str) -> str:
@@ -121,14 +132,16 @@ def Web_crawler(url: str) -> str:
# TODO 没办法访问
# for url in ['https://s.weibo.com/top/summary?cate=realtimehot']:
# for url in ['https://tophub.today/n/KqndgxeLl9']:
for url in ['https://support.apple.com/zh-cn/HT213931']:
# for url in ['https://support.apple.com/zh-cn/HT213931']:
for url in ['https://developer.aliyun.com/article/721836']:
# for url in ['https://cn.aliyun.com/page-source/price/detail/machinelearning_price']:
# for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
# for url in ['https://zhidao.baidu.com/question/317577832.html']:
# for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
# for url in ['https://zhuanlan.zhihu.com/p/646786536', 'https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD', 'https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
print(Web_crawler(url))
Web_crawler(url)
print('-----------------------------')
end_time = time.time()
run_time = end_time - start_time
20 changes: 15 additions & 5 deletions utils/agent.py
@@ -264,15 +264,15 @@ def join(self):
super().join()
return self._return

def Web_crawler(url: str) -> str:
def Web_crawler(url: str, isSearch=False) -> str:
"""返回链接网址url正文内容,必须是合法的网址"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
result = ''
try:
requests.packages.urllib3.disable_warnings()
response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
if response.status_code == 404:
print("Page not found:", url)
return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
@@ -281,10 +281,18 @@ def Web_crawler(url: str) -> str:
print("Skipping large file:", url)
return result
soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')

table_contents = ""
tables = soup.find_all('table')
for table in tables:
table_contents += table.get_text()
table.decompose()
body = "".join(soup.find('body').get_text().split('\n'))
result = body
if result == '':
result = table_contents + body
if result == '' and not isSearch:
result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
if result.count("\"") > 1000:
result = ""
except Exception as e:
print('\033[31m')
print("error url", url)
@@ -451,7 +459,7 @@ def get_search_results(prompt: str, context_max_tokens: int):

threads = []
for url in url_set_list:
url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,))
url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,True,))
url_search_thread.start()
threads.append(url_search_thread)

@@ -494,6 +502,8 @@ def get_search_results(prompt: str, context_max_tokens: int):
# # 搜索

# for i in search_web_and_summary("今天的微博热搜有哪些?"):
# for i in search_web_and_summary("阿里云24核96G的云主机价格是多少"):
# for i in search_web_and_summary("话说葬送的芙莉莲动漫是半年番还是季番?完结没?"):
# for i in search_web_and_summary("周海媚事件进展"):
# for i in search_web_and_summary("macos 13.6 有什么新功能"):
# for i in search_web_and_summary("用python写个网络爬虫给我"):
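For context, get_search_results crawls each URL on its own thread through the ThreadWithReturnValue helper, whose join() hands back the target function's return value (the join() body is visible in the diff context above). A minimal sketch of such a helper, written as an assumption about the rest of the class rather than the repository's exact code:

import threading

class ThreadWithReturnValue(threading.Thread):
    """A Thread whose join() returns whatever the target function returned."""

    def __init__(self, target=None, args=(), kwargs=None):
        super().__init__(target=target, args=args, kwargs=kwargs or {})
        self._return = None

    def run(self):
        # Run the target and capture its return value instead of discarding it.
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self):
        super().join()
        return self._return

def fetch(url, is_search=False):
    # Stand-in for Web_crawler, so the sketch runs on its own.
    return f"fetched {url} (is_search={is_search})"

t = ThreadWithReturnValue(target=fetch, args=("https://example.com", True))
t.start()
print(t.join())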
1 change: 1 addition & 0 deletions utils/chatgpt2api.py
@@ -617,6 +617,7 @@ def ask_stream(
if self.conversation[convo_id][-1 - index]["role"] == "user":
self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
prompt = self.conversation[convo_id][-1 - index]["content"]
prompt = " ".join([prompt, json.loads(full_response)["prompt"]])
print("\n\nprompt", prompt)
break
# prompt = self.conversation[convo_id][-1]["content"]
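The chatgpt2api.py change broadens the search query: instead of searching with the user's message alone, the model-suggested query carried in the function-call payload (full_response) is appended to it. A tiny illustration of that merge, with a made-up payload shape based only on the line shown:

import json

user_prompt = "search: macos 13.6 new features"          # last user message in the conversation
full_response = '{"prompt": "macOS 13.6 release notes"}'  # assumed shape of the accumulated function-call arguments

prompt = user_prompt.replace("search: ", "")
prompt = " ".join([prompt, json.loads(full_response)["prompt"]])
print(prompt)  # macos 13.6 new features macOS 13.6 release notes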
