# 可以更新最近上傳的內容

In [1]:
import os
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import logging
from tqdm import tqdm
import hashlib
import re
import urllib3
from typing import Optional, Dict, Any, List


class ArticleScraper:
    def __init__(self, scan_mode="recent", data_file="articles.xlsx"):
        """Configure the scraper and prepare its working environment.

        Args:
            scan_mode: Label for the update mode; it is stored on the
                instance and written to the log.
            data_file: Path of the Excel workbook that records the
                articles already scraped.
        """
        # Static configuration: site endpoints, authors to keep, tuning knobs.
        performance = {
            'BATCH_SIZE': 50,
            'MAX_WORKERS': 4,
            'MAX_RETRIES': 5,
            'RETRY_DELAY': 3,
            'REQUEST_INTERVAL': 1.5
        }
        self.SETTINGS = {
            'BASE_URL': "https://real-estate.get.com.tw/Columns/",
            'TARGET_AUTHORS': ["曾榮耀", "許文昌", "蘇偉強"],
            'JOURNAL_PARAMS': {"no": "1282", "pno": "51121"},
            'PERFORMANCE': performance,
        }

        # Endpoints and filters derived from the settings.
        self.base_url = self.SETTINGS['BASE_URL']
        self.detail_url = self.base_url + "detail.aspx"
        self.journal_url = self.base_url + "journal.aspx"
        self.target_authors = self.SETTINGS['TARGET_AUTHORS']
        self.journal_params = self.SETTINGS['JOURNAL_PARAMS']
        self.scan_mode = scan_mode
        self.data_file = Path(data_file)

        # Update window: the 30 days ending at construction time.
        self.end_date = datetime.now()
        self.start_date = self.end_date - timedelta(days=30)

        # Performance settings copied onto the instance for direct access.
        self.batch_size = performance['BATCH_SIZE']
        self.max_workers = performance['MAX_WORKERS']
        self.max_retries = performance['MAX_RETRIES']
        self.retry_delay = performance['RETRY_DELAY']
        self.request_interval = performance['REQUEST_INTERVAL']

        # Environment setup. Directories come first because the logger
        # writes its file into logs_dir.
        self.setup_directories()
        self.setup_session()
        self.setup_logger()
        self.processed_articles = set()
        self.load_processed_articles()
        self.last_request_time = 0

        # Record the effective configuration in the log.
        self.logger.info(f"初始化更新模式: {scan_mode}")
        self.logger.info(f"更新時間範圍: {self.start_date.date()} 到 {self.end_date.date()}")


    def setup_directories(self):
        """建立必要的目錄結構"""
        self.base_dir = Path("real_estate_articles")
        self.articles_dir = self.base_dir / "articles"
        self.images_dir = self.articles_dir / "images"
        self.logs_dir = self.base_dir / "logs"

        for directory in [self.base_dir, self.articles_dir, self.images_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # 創建預設的失敗圖片
        self.failed_image_path = self.images_dir / "image_download_failed.png"
        if not self.failed_image_path.exists():
            try:
                from PIL import Image, ImageDraw
                img = Image.new('RGB', (400, 100), color='white')
                d = ImageDraw.Draw(img)
                d.text((10, 40), "Image Download Failed", fill='black')
                img.save(self.failed_image_path)
            except Exception:
                self.failed_image_path.touch()

   

    def setup_session(self):
        """Create the shared ``requests`` session with browser-like headers."""
        # Suppress urllib3's InsecureRequestWarning for the whole process.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        browser_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        session = requests.Session()
        session.headers.update(browser_headers)
        self.session = session



    def setup_logger(self):
        """設定日誌系統"""
        self.logger = logging.getLogger('ArticleScraper')
        # 清除現有的 handlers
        self.logger.handlers = []
        self.logger.setLevel(logging.INFO)

        log_file = self.logs_dir / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        handlers = [
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        for handler in handlers:
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)


    def load_processed_articles(self):
        """載入已處理的文章"""
        if self.data_file.exists():
            df = pd.read_excel(self.data_file)
            if '文章編號' in df.columns:
                self.processed_articles = set(df['文章編號'].astype(str))

    def wait_between_requests(self):
        """控制請求間隔"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time
        if elapsed < self.request_interval:
            time.sleep(self.request_interval - elapsed)
        self.last_request_time = current_time

    def get_max_page_number(self) -> int:
        """獲取期刊最大頁數"""
        try:
            self.wait_between_requests()
            response = self.session.get(
                self.journal_url,
                params=self.journal_params,
                timeout=30,
                verify=False
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # 找到分頁元素
            pagination = soup.select('.pagination a')
            if pagination:
                page_numbers = []
                for a in pagination:
                    try:
                        page_numbers.append(int(a.text.strip()))
                    except ValueError:
                        continue
                return max(page_numbers) if page_numbers else 1
            return 1

        except Exception as e:
            self.logger.error(f"獲取最大頁數失敗: {str(e)}")
            return 30  # 預設較大的頁數以確保不遺漏

    def get_article_urls_from_journal(self, page_no: int) -> List[int]:
        """從期刊頁面獲取文章編號列表"""
        try:
            params = self.journal_params.copy()
            params['page_no'] = page_no

            self.wait_between_requests()
            response = self.session.get(
                self.journal_url,
                params=params,
                timeout=30,
                verify=False
            )
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 尋找文章列表
            articles = []
            for link in soup.select('a[href*="detail.aspx?no="]'):
                try:
                    href = link.get('href', '')
                    if match := re.search(r'no=(\d+)', href):
                        article_no = int(match.group(1))
                        # 檢查發布日期（如果有的話）
                        date_element = link.find_next('td', class_='date')
                        if date_element:
                            date_str = date_element.text.strip()
                            try:
                                pub_date = datetime.strptime(date_str, '%Y-%m-%d')
                                if pub_date < self.start_date:
                                    continue
                            except ValueError:
                                pass
                        
                        if str(article_no) not in self.processed_articles:
                            articles.append(article_no)
                except ValueError:
                    continue

            if articles:
                self.logger.info(f"第 {page_no} 頁找到 {len(articles)} 篇新文章")
            
            return articles

        except Exception as e:
            self.logger.error(f"獲取第 {page_no} 頁文章列表失敗: {str(e)}")
            return []



    def fetch_article(self, article_no: int) -> Optional[Dict]:
        """抓取單篇文章"""
        if str(article_no) in self.processed_articles:
            self.logger.debug(f"文章 {article_no} 已處理過，跳過")
            return None

        for retry in range(self.max_retries):
            try:
                self.wait_between_requests()
                
                url = f"{self.detail_url}?no={article_no}"
                self.logger.info(f"開始請求文章 URL: {url}")  # 改為 INFO 級別
                
                # 使用 session 而不是直接使用 requests
                response = self.session.get(
                    url,
                    timeout=30,
                    verify=False
                )
                
                # 記錄響應狀態
                self.logger.info(f"文章 {article_no} 請求狀態碼: {response.status_code}")
                
                if response.status_code == 404:
                    self.logger.error(f"文章 {article_no} 不存在 (404)")
                    return None
                
                response.raise_for_status()
                response.encoding = 'utf-8'
                
                # 記錄響應內容長度
                content_length = len(response.text)
                self.logger.info(f"文章 {article_no} 響應內容長度: {content_length}")
                
                if content_length < 100:  # 假設正常文章至少有100字符
                    self.logger.error(f"文章 {article_no} 響應內容過短，可能是無效響應")
                    return None
                
                # 保存原始響應以供調試
                debug_file = self.logs_dir / f"article_{article_no}_response.html"
                with open(debug_file, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                self.logger.info(f"已保存文章 {article_no} 的原始響應到: {debug_file}")
                
                # 解析HTML
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # 檢查頁面是否包含預期的元素
                if not soup.select('.columnsDetail_tableRow'):
                    self.logger.error(f"文章 {article_no} 頁面結構不符合預期")
                    return None
                
                # 使用 parse_article 方法解析內容
                article_data = self.parse_article(soup, article_no)
                
                if article_data and self.validate_article(article_data):
                    self.logger.info(f"文章 {article_no} 解析成功")
                    return article_data
                else:
                    self.logger.error(f"文章 {article_no} 解析失敗或不符合條件")
                    return None

            except requests.exceptions.RequestException as e:
                self.logger.error(f"抓取文章 {article_no} 失敗 (嘗試 {retry + 1}/{self.max_retries}): {str(e)}")
                if retry < self.max_retries - 1:
                    wait_time = self.retry_delay * (retry + 1)
                    self.logger.info(f"等待 {wait_time} 秒後重試...")
                    time.sleep(wait_time)
                continue
            except Exception as e:
                self.logger.error(f"處理文章 {article_no} 時發生未預期錯誤: {str(e)}")
                import traceback
                self.logger.error(f"錯誤堆疊: {traceback.format_exc()}")
                if retry < self.max_retries - 1:
                    wait_time = self.retry_delay * (retry + 1)
                    self.logger.info(f"等待 {wait_time} 秒後重試...")
                    time.sleep(wait_time)
                continue

        return None



                
                
    def check_specific_article(self, article_no: int):
        """檢查特定文章"""
        self.logger.info(f"開始檢查特定文章: {article_no}")
        try:
            result = self.fetch_article(article_no)
            if result:
                self.logger.info(f"文章 {article_no} 抓取成功，開始保存")
                self.save_article(result)
                self.create_index()
                self.logger.info(f"成功抓取並保存文章 {article_no}")
            else:
                self.logger.error(f"無法抓取文章 {article_no}")
        except Exception as e:
            self.logger.error(f"檢查文章 {article_no} 時發生錯誤: {str(e)}")




    def parse_article(self, soup: BeautifulSoup, article_no: int) -> Optional[Dict]:
        """解析文章內容"""
        try:
            article_info = {}
            
            # 記錄找到的欄位數量
            found_fields = 0
            
            # 解析文章基本信息
            rows = soup.select('.columnsDetail_tableRow')
            self.logger.info(f"文章 {article_no} 找到 {len(rows)} 個資料列")
            
            for row in rows:
                th = row.select_one('.columnsDetail_tableth')
                td = row.select_one('.columnsDetail_tabletd')
                
                if th and td:
                    key = th.text.strip()
                    value = td.text.strip()
                    
                    self.logger.debug(f"文章 {article_no} 欄位: {key} = {value[:50]}...")
                    
                    if key == '篇名':
                        article_info['標題'] = value
                        found_fields += 1
                    elif key == '作者':
                        article_info['作者'] = value
                        found_fields += 1
                    elif key == '日期':
                        article_info['日期'] = value
                        found_fields += 1
                    elif key == '內文':
                        article_info['內文HTML'] = str(td)
                        found_fields += 1

            self.logger.info(f"文章 {article_no} 成功解析 {found_fields} 個欄位")

            # 驗證必要欄位
            missing_fields = []
            for field in ['標題', '作者', '日期']:
                if field not in article_info:
                    missing_fields.append(field)
            
            if missing_fields:
                self.logger.error(f"文章 {article_no} 缺少必要欄位: {', '.join(missing_fields)}")
                return None

            # 驗證作者
            if not any(target in article_info['作者'] for target in self.target_authors):
                self.logger.info(f"文章 {article_no} 作者不符合目標: {article_info['作者']}")
                return None

            # 處理文章內容
            content = self.process_content(article_info.get('內文HTML', ''), article_no)
            
            if not content:
                self.logger.error(f"文章 {article_no} 內文處理後為空")
                return None
            
            # 構建完整的文章數據
            article_data = {
                '文章編號': article_no,
                '標題': article_info['標題'],
                '作者': article_info['作者'],
                '日期': article_info['日期'],
                '內文': content,
                'URL': f"{self.detail_url}?no={article_no}",
                '爬取時間': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            return article_data

        except Exception as e:
            self.logger.error(f"解析文章 {article_no} 失敗: {str(e)}")
            import traceback
            self.logger.error(f"錯誤堆疊: {traceback.format_exc()}")
            return None



    def download_image(self, img_url: str, article_no: int) -> Optional[str]:
        """下載圖片並返回本地檔名"""
        try:
            if not img_url.startswith(('http://', 'https://')):
                img_url = urllib.parse.urljoin(self.base_url, img_url)

            # 生成唯一的檔名
            url_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            file_ext = os.path.splitext(img_url)[1] or '.jpg'
            local_filename = f"{article_no}_{url_hash}{file_ext}"
            local_path = self.images_dir / local_filename

            # 如果圖片已存在就直接返回
            if local_path.exists():
                return local_filename

            # 下載圖片
            self.wait_between_requests()
            response = self.session.get(img_url, timeout=30, verify=False)
            response.raise_for_status()

            # 保存圖片
            with open(local_path, 'wb') as f:
                f.write(response.content)

            return local_filename

        except Exception as e:
            self.logger.error(f"下載圖片失敗 ({img_url}): {str(e)}")
            return None

    def process_content(self, html_content: str, article_no: int) -> str:
        """處理文章內容，包括下載圖片和清理HTML"""
        if not html_content:
            return ""

        soup = BeautifulSoup(html_content, 'html.parser')
        image_references = []  # 用於存儲圖片引用

        # 處理圖片
        for index, img in enumerate(soup.find_all('img'), 1):
            img_url = img.get('src', '')
            if img_url:
                local_filename = self.download_image(img_url, article_no)
                if local_filename:
                    # 保存圖片引用
                    image_references.append(
                        f"\n![圖片{index}](./images/{local_filename})\n")
                    # 在原文中插入圖片標記
                    img.replace_with(f"[圖片{index}]")
                else:
                    img.replace_with("[圖片下載失敗]")

        # 清理HTML並格式化
        content = self._format_content(soup)

        # 在文章末尾添加所有圖片
        if image_references:
            content += "\n\n## 文章圖片\n"
            content += "".join(image_references)

        return content

    def _format_content(self, soup: BeautifulSoup) -> str:
        """Flatten the article soup into plain text with Markdown-like markers.

        The soup is modified in place: tags outside the allow-list are
        unwrapped, paragraphs are whitespace-normalised, headings get a
        ``#`` prefix matching their level and list items get a number or
        bullet. The text is then extracted and every non-empty line is
        separated from the next by one blank line.

        Args:
            soup: Parsed HTML of the article body (mutated by this call).

        Returns:
            The formatted text, stripped of leading/trailing whitespace.
        """
        # HTML tags that are kept; everything else is unwrapped below.
        # NOTE(review): 'br' is kept but nothing below turns it into a
        # newline — confirm that line breaks survive get_text().
        allowed_tags = {'p', 'br', 'h1', 'h2', 'h3',
                        'h4', 'h5', 'h6', 'ul', 'ol', 'li'}

        # Remove disallowed tags but keep their text
        for tag in soup.find_all():
            if tag.name not in allowed_tags:
                tag.unwrap()

        # Make sure paragraphs are followed by a line break
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            if text:  # only non-empty paragraphs are rewritten
                # Collapse all internal whitespace runs to single spaces.
                p.string = ' '.join(text.split())
                p.append('\n\n')

        # Headings
        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            level = int(h.name[1])  # heading level from the tag name
            prefix = '#' * level + ' '  # one '#' per level
            h.string = f'\n{prefix}{h.get_text().strip()}\n'

        # Lists
        for li in soup.find_all('li'):
            indent = '  '
            if li.parent.name == 'ol':
                # Ordered list: number by position among sibling <li> tags
                index = len(li.find_previous_siblings('li')) + 1
                li.insert(0, f'{indent}{index}. ')
            else:
                # Any other parent: prefix with a bullet
                li.insert(0, f'{indent}• ')
            li.append('\n')

        # Extract the text of the rewritten soup
        content = soup.get_text()

        # Clean up the extracted text
        content = re.sub(r'\n{3,}', '\n\n', content)  # collapse runs of blank lines
        content = re.sub(r'[ \t]+', ' ', content)     # normalise spaces and tabs
        content = re.sub(r' *\n *', '\n', content)    # trim spaces around line breaks

        # Drop empty lines and join the rest with one blank line between them
        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
        formatted_content = '\n\n'.join(paragraphs)

        return formatted_content.strip()

    def validate_article(self, article_data: Dict) -> bool:
        """Return True when title, author, date and body are all present and non-empty.

        Args:
            article_data: Article record to check.
        """
        for field in ('標題', '作者', '日期', '內文'):
            if field not in article_data or not article_data[field]:
                return False
        return True

    def save_article(self, article_data: Dict) -> None:
        """儲存文章"""
        try:
            # 更新 Excel 資料
            new_df = pd.DataFrame([article_data])
            if self.data_file.exists():
                df = pd.read_excel(self.data_file)
                df = pd.concat([df, new_df]).drop_duplicates(subset=['文章編號'])
            else:
                df = new_df
            df.to_excel(self.data_file, index=False)

            # 建立 Markdown 文件
            article_no = article_data['文章編號']
            title = re.sub(r'[<>:"/\\|?*]', '', article_data['標題'])[:100]

            markdown_content = f"""# {article_data['標題']}

## 文章資訊
- 文章編號：{article_no}
- 作者：{article_data['作者']}
- 發布日期：{article_data['日期']}
- 爬取時間：{article_data['爬取時間']}
- 原文連結：[閱讀原文]({article_data['URL']})

## 內文
{article_data['內文']}

---
*注：本文圖片存放於 ./images/ 目錄下*
"""

            # 儲存 Markdown 文件
            file_path = self.articles_dir / f"{article_no}_{title}.md"
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            # 更新已處理集合
            self.processed_articles.add(str(article_no))

        except Exception as e:
            self.logger.error(f"儲存文章失敗: {str(e)}")

    def create_index(self):
        """創建文章索引，整合原有目錄內容"""
        try:
            # 讀取 Excel 檔案獲取所有文章資訊
            if not self.data_file.exists():
                return

            df = pd.read_excel(self.data_file)

            # 讀取現有的目錄文件（如果存在）
            index_path = self.base_dir / "README.md"
            existing_content = []
            existing_articles = set()

            if index_path.exists():
                with open(index_path, 'r', encoding='utf-8') as f:
                    existing_content = f.read().splitlines()
                    # 提取現有目錄中的文章編號
                    for line in existing_content:
                        if match := re.search(r'/(\d+)_[^/]+\.md', line):
                            existing_articles.add(match.group(1))

            # 只處理新文章
            df['文章編號'] = df['文章編號'].astype(str)
            new_articles = df[~df['文章編號'].isin(existing_articles)]

            if new_articles.empty and existing_content:
                self.logger.info("沒有新文章需要添加到目錄")
                return

            # 如果有現有內容，保留開頭的通用部分（如標題）
            index_content = []
            for line in existing_content:
                if line.startswith('## '):
                    break
                index_content.append(line)

            if not index_content:  # 如果是全新的目錄
                index_content = ["# 文章目錄", ""]

            # 合併現有文章和新文章
            all_articles = pd.concat([
                df[df['文章編號'].isin(existing_articles)],
                new_articles
            ]).drop_duplicates(subset=['文章編號'])

            # 按作者和日期排序
            all_articles['日期'] = pd.to_datetime(all_articles['日期'])
            all_articles = all_articles.sort_values(
                ['作者', '日期'], ascending=[True, False])

            # 按作者分組
            for author in sorted(all_articles['作者'].unique()):
                index_content.append(f"## {author}")
                author_articles = all_articles[all_articles['作者'] == author]

                # 按年份分組
                for year in sorted(author_articles['日期'].dt.year.unique(), reverse=True):
                    index_content.append(f"\n### {year}年")
                    year_articles = author_articles[author_articles['日期'].dt.year == year]

                    # 生成文章列表
                    for _, article in year_articles.iterrows():
                        title = re.sub(r'[<>:"/\\|?*]', '',
                                       article['標題'])[:100]
                        file_name = f"{article['文章編號']}_{title}.md"
                        date_str = article['日期'].strftime('%Y-%m-%d')
                        index_content.append(
                            f"- {date_str} [{article['標題']
                                             }](./articles/{file_name})"
                        )

                index_content.append("")  # 作者之間加入空行

            # 保存更新後的目錄文件
            with open(index_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(index_content))

            self.logger.info(f"已成功更新文章目錄，新增 {len(new_articles)} 篇文章")

        except Exception as e:
            self.logger.error(f"更新目錄失敗: {str(e)}")

    def run(self):
        """Scan the first journal pages and scrape every new article found.

        At most five journal pages are scanned, and scanning stops at the
        first page that yields no new article. The found articles are
        fetched on a thread pool and saved in submission order. The
        summary is logged, and the index rebuilt if anything was saved,
        even when an exception escapes.
        """
        self.logger.info(f"開始檢查文章更新 (檢查範圍: {self.start_date.date()} 到 {self.end_date.date()})")

        success_count = 0
        fail_count = 0

        try:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                max_page = self.get_max_page_number()
                self.logger.info(f"檢測到期刊總頁數: {max_page}")

                # Only the first five pages are checked, for efficiency.
                last_page = min(5, max_page)
                article_numbers = set()
                for page_no in range(1, last_page + 1):
                    self.logger.info(f"正在檢查第 {page_no} 頁的文章列表")
                    page_articles = self.get_article_urls_from_journal(page_no)
                    article_numbers.update(page_articles)

                    if not page_articles:
                        self.logger.info(f"第 {page_no} 頁沒有找到新文章，停止檢查")
                        break

                self.logger.info(f"找到 {len(article_numbers)} 篇待檢查文章")

                # Queue a fetch for every article not processed yet.
                futures = [
                    executor.submit(self.fetch_article, article_no)
                    for article_no in article_numbers
                    if str(article_no) not in self.processed_articles
                ]

                # Collect results in submission order and save the successes.
                with tqdm(total=len(futures), desc="更新文章") as pbar:
                    for future in futures:
                        try:
                            result = future.result()
                            if result:
                                self.save_article(result)
                                success_count += 1
                            else:
                                fail_count += 1
                        except Exception as e:
                            self.logger.error(f"處理文章更新失敗: {str(e)}")
                            fail_count += 1
                        finally:
                            pbar.update(1)

        finally:
            self.logger.info(f"更新完成：成功 {success_count} 篇，失敗 {fail_count} 篇")
            if success_count > 0:
                self.create_index()
            self.logger.info("更新程序結束")


if __name__ == "__main__":
    # Build the scraper; the constructor logs the "recent" mode label
    # and the 30-day update window.
    scraper = ArticleScraper(scan_mode="recent")

    # # Check one specific article instead
    # specific_article = 913375
    # scraper.check_specific_article(specific_article)

    # Run the regular update
    scraper.run()

2025-03-08 10:08:47,550 - INFO - 初始化更新模式: recent
2025-03-08 10:08:47,558 - INFO - 更新時間範圍: 2025-02-06 到 2025-03-08
2025-03-08 10:08:47,559 - INFO - 開始檢查文章更新 (檢查範圍: 2025-02-06 到 2025-03-08)
2025-03-08 10:08:48,390 - INFO - 檢測到期刊總頁數: 1
2025-03-08 10:08:48,392 - INFO - 正在檢查第 1 頁的文章列表
2025-03-08 10:08:49,129 - INFO - 第 1 頁找到 4 篇新文章
2025-03-08 10:08:49,129 - INFO - 找到 3 篇待檢查文章
更新文章:   0%|          | 0/3 [00:00<?, ?it/s]2025-03-08 10:08:49,902 - INFO - 開始請求文章 URL: https://real-estate.get.com.tw/Columns/detail.aspx?no=913445
2025-03-08 10:08:49,902 - INFO - 開始請求文章 URL: https://real-estate.get.com.tw/Columns/detail.aspx?no=913460
2025-03-08 10:08:49,903 - INFO - 開始請求文章 URL: https://real-estate.get.com.tw/Columns/detail.aspx?no=913517
2025-03-08 10:08:49,933 - INFO - 文章 913445 請求狀態碼: 200
2025-03-08 10:08:49,936 - INFO - 文章 913445 響應內容長度: 30337
2025-03-08 10:08:50,021 - INFO - 已保存文章 913445 的原始響應到: real_estate_articles\logs\article_913445_response.html
2025-03-08 10:08:50,067 - INFO - 文章 913445 找到