Commit

Modify the web crawler filtering mechanism, extract table contents to web crawler result
yym68686 committed Dec 13, 2023
1 parent 8711e5a commit 990362f
Showing 3 changed files with 35 additions and 11 deletions.
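In short, the commit makes Web_crawler pull <table> text out of the page first and prepend it to the body text, so table data is no longer lost when newlines are stripped. It also shortens the request timeout from 5s to 3s and returns a sentinel "@Trash@" message for missing or blocked pages unless the call comes from search (isSearch=True). A small, self-contained illustration of the table-extraction step (the sample HTML and variable names here are made up for demonstration, not taken from the repository):

from bs4 import BeautifulSoup

html = "<html><body><p>Price list:</p><table><tr><td>ecs.g6.large</td><td>1.05</td></tr></table><p>Updated 2023.</p></body></html>"
soup = BeautifulSoup(html, "lxml")

# Collect table text first, then drop the tables so their text is not duplicated in the body.
table_contents = ""
for table in soup.find_all("table"):
    table_contents += table.get_text()
    table.decompose()

body = "".join(soup.find("body").get_text().split("\n"))
result = table_contents + body  # table text is prepended to the remaining body text
print(result)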
25 changes: 19 additions & 6 deletions test/test_Web_crawler.py
@@ -63,30 +63,41 @@
# print('\033[0m')
# return result

def Web_crawler(url: str) -> str:
def Web_crawler(url: str, isSearch=False) -> str:
"""返回链接网址url正文内容,必须是合法的网址"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
result = ''
try:
requests.packages.urllib3.disable_warnings()
response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
if response.status_code == 404:
print("Page not found:", url)
return ""
return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
content_length = int(response.headers.get('Content-Length', 0))
if content_length > 5000000:
print("Skipping large file:", url)
return result
soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')

table_contents = ""
tables = soup.find_all('table')
for table in tables:
table_contents += table.get_text()
table.decompose()
body = "".join(soup.find('body').get_text().split('\n'))
result = body
result = table_contents + body
if result == '' and not isSearch:
result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
if result.count("\"") > 1000:
result = ""
except Exception as e:
print('\033[31m')
print("error url", url)
print("error", e)
print('\033[0m')
print("url content", result + "\n\n")
return result

# def Web_crawler(url: str) -> str:
@@ -121,14 +132,16 @@ def Web_crawler(url: str) -> str:
# TODO 没办法访问
# for url in ['https://s.weibo.com/top/summary?cate=realtimehot']:
# for url in ['https://tophub.today/n/KqndgxeLl9']:
for url in ['https://support.apple.com/zh-cn/HT213931']:
# for url in ['https://support.apple.com/zh-cn/HT213931']:
for url in ['https://developer.aliyun.com/article/721836']:
# for url in ['https://cn.aliyun.com/page-source/price/detail/machinelearning_price']:
# for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
# for url in ['https://zhidao.baidu.com/question/317577832.html']:
# for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
# for url in ['https://zhuanlan.zhihu.com/p/646786536', 'https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD', 'https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
print(Web_crawler(url))
Web_crawler(url)
print('-----------------------------')
end_time = time.time()
run_time = end_time - start_time
20 changes: 15 additions & 5 deletions utils/agent.py
@@ -264,15 +264,15 @@ def join(self):
super().join()
return self._return

def Web_crawler(url: str) -> str:
def Web_crawler(url: str, isSearch=False) -> str:
"""返回链接网址url正文内容,必须是合法的网址"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
result = ''
try:
requests.packages.urllib3.disable_warnings()
response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
if response.status_code == 404:
print("Page not found:", url)
return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
@@ -281,10 +281,18 @@ def Web_crawler(url: str) -> str:
print("Skipping large file:", url)
return result
soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')

table_contents = ""
tables = soup.find_all('table')
for table in tables:
table_contents += table.get_text()
table.decompose()
body = "".join(soup.find('body').get_text().split('\n'))
result = body
if result == '':
result = table_contents + body
if result == '' and not isSearch:
result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
if result.count("\"") > 1000:
result = ""
except Exception as e:
print('\033[31m')
print("error url", url)
@@ -451,7 +459,7 @@ def get_search_results(prompt: str, context_max_tokens: int):

threads = []
for url in url_set_list:
url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,))
url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,True,))
url_search_thread.start()
threads.append(url_search_thread)

@@ -494,6 +502,8 @@ def get_search_results(prompt: str, context_max_tokens: int):
# # 搜索

# for i in search_web_and_summary("今天的微博热搜有哪些?"):
# for i in search_web_and_summary("阿里云24核96G的云主机价格是多少"):
# for i in search_web_and_summary("话说葬送的芙莉莲动漫是半年番还是季番?完结没?"):
# for i in search_web_and_summary("周海媚事件进展"):
# for i in search_web_and_summary("macos 13.6 有什么新功能"):
# for i in search_web_and_summary("用python写个网络爬虫给我"):
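For context, get_search_results crawls each URL on its own thread through the ThreadWithReturnValue helper, whose join() hands back the target function's return value (the join() body is visible in the diff context above). A minimal sketch of such a helper, written as an assumption about the rest of the class rather than the repository's exact code:

import threading

class ThreadWithReturnValue(threading.Thread):
    """A Thread whose join() returns whatever the target function returned."""

    def __init__(self, target=None, args=(), kwargs=None):
        super().__init__(target=target, args=args, kwargs=kwargs or {})
        self._return = None

    def run(self):
        # Run the target and capture its return value instead of discarding it.
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self):
        super().join()
        return self._return

def fetch(url, is_search=False):
    # Stand-in for Web_crawler, so the sketch runs on its own.
    return f"fetched {url} (is_search={is_search})"

t = ThreadWithReturnValue(target=fetch, args=("https://example.com", True))
t.start()
print(t.join())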
1 change: 1 addition & 0 deletions utils/chatgpt2api.py
@@ -617,6 +617,7 @@ def ask_stream(
if self.conversation[convo_id][-1 - index]["role"] == "user":
self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
prompt = self.conversation[convo_id][-1 - index]["content"]
prompt = " ".join([prompt, json.loads(full_response)["prompt"]])
print("\n\nprompt", prompt)
break
# prompt = self.conversation[convo_id][-1]["content"]
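The chatgpt2api.py change broadens the search query: instead of searching with the user's message alone, the model-suggested query carried in the function-call payload (full_response) is appended to it. A tiny illustration of that merge, with a made-up payload shape based only on the line shown:

import json

user_prompt = "search: macos 13.6 new features"          # last user message in the conversation
full_response = '{"prompt": "macOS 13.6 release notes"}'  # assumed shape of the accumulated function-call arguments

prompt = user_prompt.replace("search: ", "")
prompt = " ".join([prompt, json.loads(full_response)["prompt"]])
print(prompt)  # macos 13.6 new features macOS 13.6 release notes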
