In [None]:
import os
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import logging
from tqdm import tqdm
import hashlib
import re
import urllib3


class ArticleScraper:
    def __init__(self, scan_mode="all", data_file="articles.xlsx"):
        """初始化爬蟲設定"""
        # 基本設定
        self.base_url = "https://real-estate.get.com.tw/Columns/detail.aspx?no="
        self.target_authors = ["曾榮耀", "許文昌", "蘇偉強"]
        self.scan_mode = scan_mode
        self.data_file = Path(data_file)

        # 時間範圍設定
        self.end_date = datetime.now()
        self.start_date = self.end_date - timedelta(days=9*365)

        # 效能設定
        self.batch_size = 200
        self.max_workers = 16
        self.max_retries = 3
        self.retry_delay = 5

        # 文章編號範圍
        self.start_no = 409119
        self.max_no = 915000

        # 初始化
        self.setup_directories()
        self.setup_session()
        self.setup_logger()
        self.processed_articles = set()
        self.load_processed_articles()

    def setup_directories(self):
        """建立必要的目錄結構"""
        self.base_dir = Path("real_estate_articles")
        self.articles_dir = self.base_dir / "articles"
        self.images_dir = self.articles_dir / "images"
        self.logs_dir = self.base_dir / "logs"

        for directory in [self.base_dir, self.articles_dir, self.images_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # 創建預設的失敗圖片
        self.failed_image_path = self.images_dir / "image_download_failed.png"
        if not self.failed_image_path.exists():
            try:
                from PIL import Image, ImageDraw
                img = Image.new('RGB', (400, 100), color='white')
                d = ImageDraw.Draw(img)
                d.text((10, 40), "Image Download Failed", fill='black')
                img.save(self.failed_image_path)
            except Exception:
                self.failed_image_path.touch()

    def setup_session(self):
        """Create the shared ``requests`` session with browser-like headers."""
        # The requests made by this class pass verify=False, which would emit
        # an InsecureRequestWarning per call; silence that warning globally.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        session = requests.Session()
        session.headers.update(browser_headers)
        self.session = session

    def setup_logger(self):
        """設定日誌系統"""
        self.logger = logging.getLogger('ArticleScraper')
        self.logger.setLevel(logging.INFO)

        log_file = self.logs_dir / \
            f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        handlers = [
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]

        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        for handler in handlers:
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def load_processed_articles(self):
        """載入已處理的文章"""
        if self.data_file.exists():
            df = pd.read_excel(self.data_file)
            if '文章編號' in df.columns:
                self.processed_articles = set(df['文章編號'].astype(str))

    def download_image(self, img_url: str, article_no: int) -> str:
        """下載圖片並返回本地檔名"""
        for retry in range(self.max_retries):
            try:
                if not img_url.startswith('http'):
                    img_url = urllib.parse.urljoin(self.base_url, img_url)

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
                    'Referer': self.base_url,
                    'Connection': 'keep-alive',
                    'Cache-Control': 'no-cache',
                    'Pragma': 'no-cache'
                }

                response = self.session.get(
                    img_url,
                    stream=True,
                    timeout=30,
                    headers=headers,
                    verify=False
                )
                response.raise_for_status()

                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    raise ValueError(f"非圖片內容類型: {content_type}")

                img_data = response.content
                img_hash = hashlib.md5(img_data).hexdigest()
                img_ext = os.path.splitext(urllib.parse.urlparse(img_url).path)[
                    1] or '.jpg'
                local_filename = f"{article_no}_{img_hash}{img_ext}"
                local_path = self.images_dir / local_filename

                if not local_path.exists():
                    with open(local_path, 'wb') as f:
                        f.write(img_data)

                return local_filename

            except Exception as e:
                self.logger.error(
                    f"下載圖片失敗 (嘗試 {retry + 1}/{self.max_retries}) {img_url}: {str(e)}")
                if retry < self.max_retries - 1:
                    time.sleep(self.retry_delay * (retry + 1))
                    continue

        return "image_download_failed.png"

    def fetch_article(self, article_no):
        """抓取單篇文章"""
        if str(article_no) in self.processed_articles:
            return None

        for retry in range(self.max_retries):
            try:
                url = f"{self.base_url}{article_no}"
                response = self.session.get(url, timeout=30, verify=False)

                if response.status_code == 404:
                    return None

                response.raise_for_status()
                response.encoding = 'utf-8'

                soup = BeautifulSoup(response.text, 'html.parser')
                article_data = self.parse_article(soup, article_no, url)

                if article_data and self.validate_article(article_data):
                    return article_data

            except requests.exceptions.RequestException as e:
                self.logger.error(f"抓取文章 {article_no} 失敗 (嘗試 {
                                  retry + 1}/{self.max_retries}): {str(e)}")
                if retry < self.max_retries - 1:
                    time.sleep(self.retry_delay * (retry + 1))
                    continue
            except Exception as e:
                self.logger.error(f"處理文章 {article_no} 時發生未預期錯誤: {str(e)}")
                break

        return None

    def parse_article(self, soup, article_no, url):
        """解析文章內容"""
        try:
            article_info = {}
            for row in soup.select('.columnsDetail_tableRow'):
                th = row.select_one('.columnsDetail_tableth')
                td = row.select_one('.columnsDetail_tabletd')
                if th and td:
                    key = th.text.strip()
                    value = td.text.strip()
                    article_info[key] = value

                    if key == '內文':
                        article_info['內文HTML'] = td

            if not article_info:
                return None

            if not any(author in article_info.get('作者', '') for author in self.target_authors):
                return None

            try:
                article_date = datetime.strptime(
                    article_info.get('日期', ''), '%Y/%m/%d')
                if not self.start_date <= article_date <= self.end_date:
                    return None
            except ValueError:
                return None

            return {
                '文章編號': article_no,
                '標題': article_info.get('篇名', ''),
                '作者': article_info.get('作者', ''),
                '日期': article_info.get('日期', ''),
                '內文': self.process_content(article_info.get('內文HTML'), article_no),
                'URL': url,
                '爬取時間': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

        except Exception as e:
            self.logger.error(f"解析文章 {article_no} 失敗: {str(e)}")
            return None

    def process_content(self, content_html, article_no):
        """處理文章內容"""
        if not content_html:
            return ""

        content = []
        try:
            for element in content_html.descendants:
                if isinstance(element, str):
                    text = element.strip()
                    if text:
                        content.append(text)
                elif element.name == 'img':
                    try:
                        img_src = element.get('src')
                        if img_src:
                            local_img = self.download_image(
                                img_src, article_no)
                            if local_img:
                                content.append(
                                    f"\n![圖片](./images/{local_img})\n")
                    except Exception as e:
                        self.logger.error(f"處理圖片元素失敗: {str(e)}")
                        continue

            return '\n'.join(filter(None, content))
        except Exception as e:
            self.logger.error(f"處理文章 {article_no} 內容失敗: {str(e)}")
            return ""

    def validate_article(self, article_data):
        """驗證文章資料完整性"""
        required_fields = ['標題', '作者', '日期', '內文']
        return all(field in article_data and article_data[field] for field in required_fields)

    def save_article(self, article_data):
        """儲存文章"""
        try:
            # 更新 Excel 資料
            new_df = pd.DataFrame([article_data])
            if self.data_file.exists():
                df = pd.read_excel(self.data_file)
                df = pd.concat([df, new_df]).drop_duplicates(subset=['文章編號'])
            else:
                df = new_df
            df.to_excel(self.data_file, index=False)

            # 建立 Markdown 文件
            article_no = article_data['文章編號']
            title = re.sub(r'[<>:"/\\|?*]', '', article_data['標題'])[:100]

            markdown_content = f"""# {article_data['標題']}

## 文章資訊
- 文章編號：{article_no}
- 作者：{article_data['作者']}
- 發布日期：{article_data['日期']}
- 爬取時間：{article_data['爬取時間']}
- 原文連結：[閱讀原文]({article_data['URL']})

## 內文
{article_data['內文']}

---
*注：本文圖片存放於 ./images/ 目錄下*
"""

            # 儲存 Markdown 文件
            file_path = self.articles_dir / f"{article_no}_{title}.md"
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            # 更新已處理集合
            self.processed_articles.add(str(article_no))

        except Exception as e:
            self.logger.error(f"儲存文章失敗: {str(e)}")

    def create_index(self):
        """建立索引頁面"""
        try:
            # 讀取文章資料
            if not self.data_file.exists():
                self.logger.error("找不到文章資料檔案")
                return

            df = pd.read_excel(self.data_file)

            # 將日期轉換為datetime格式並排序
            df['日期'] = pd.to_datetime(df['日期'])
            df = df.sort_values('日期', ascending=False)

            # 依年份分組
            years = df['日期'].dt.year.unique()

            # 建立索引內容
            content = ["# 地政專欄文章索引\n"]

            # 添加統計資訊
            content.append("## 文章統計\n")
            content.append(f"- 總文章數：{len(df)}篇")
            content.append(f"- 收錄年份：{min(years)}年 - {max(years)}年")
            content.append(
                f"- 最後更新：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

            # 作者統計
            author_stats = df['作者'].value_counts()
            content.append("### 作者文章統計")
            for author, count in author_stats.items():
                content.append(f"- {author}: {count}篇")
            content.append("")

            content.append("## 目錄\n")

            # 建立年份快速連結
            content.append("### 年份快速導覽")
            for year in sorted(years, reverse=True):
                content.append(f"- [{year}年](#year-{year})")
            content.append("\n---\n")

            # 依年份建立文章列表
            for year in sorted(years, reverse=True):
                content.append(f"## {year}年 {{#year-{year}}}\n")
                year_df = df[df['日期'].dt.year == year]

                # 依月份分組
                for month in sorted(year_df['日期'].dt.month.unique(), reverse=True):
                    content.append(f"### {month}月\n")
                    month_df = year_df[year_df['日期'].dt.month == month]

                    # 列出當月文章
                    for _, article in month_df.iterrows():
                        title = re.sub(r'[<>:"/\\|?*]', '',
                                       article['標題'])[:100]
                        article_link = f"{article['文章編號']}_{title}.md"
                        date_str = article['日期'].strftime('%Y-%m-%d')
                        content.append(
                            f"- {date_str} [{article['標題']}](./{article_link}) - {article['作者']}")
                    content.append("")

            # 寫入索引文件
            index_path = self.articles_dir / "index.md"
            with open(index_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(content))

            self.logger.info("索引頁面建立完成")

        except Exception as e:
            self.logger.error(f"建立索引頁面時發生錯誤: {str(e)}")

    def run(self):
        """執行爬蟲"""
        self.logger.info(f"開始執行爬蟲 (模式: {self.scan_mode})")

        try:
            latest_no = int(max(self.processed_articles)
                            ) if self.processed_articles else self.start_no

            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = []

                if self.scan_mode in ["all", "new"]:
                    # 掃描新文章
                    for article_no in range(latest_no + 1, self.max_no + 1, self.batch_size):
                        batch = range(article_no, min(
                            article_no + self.batch_size, self.max_no + 1))
                        futures.extend(
                            [executor.submit(self.fetch_article, no) for no in batch])

                if self.scan_mode in ["all", "old"]:
                    # 掃描舊文章
                    for article_no in range(latest_no - 1, self.start_no - 1, -self.batch_size):
                        batch = range(article_no, max(
                            article_no - self.batch_size, self.start_no - 1), -1)
                        futures.extend(
                            [executor.submit(self.fetch_article, no) for no in batch])

                # 處理結果
                with tqdm(total=len(futures), desc="處理文章") as pbar:
                    for future in futures:
                        try:
                            result = future.result()
                            if result:
                                self.save_article(result)
                            pbar.update(1)
                        except Exception as e:
                            self.logger.error(f"處理文章結果失敗: {str(e)}")
                            pbar.update(1)

        except KeyboardInterrupt:
            self.logger.info("程式被使用者中斷")
        except Exception as e:
            self.logger.error(f"執行過程中發生錯誤: {str(e)}")
        finally:
            self.create_index()  # 建立索引頁面
            self.logger.info("程式結束執行")


if __name__ == "__main__":
    # Script entry point: build a scraper in "new" mode and run it
    ArticleScraper(scan_mode="new").run()

2025-02-02 14:26:11,989 - INFO - 開始執行爬蟲 (模式: new)
處理文章:  95%|█████████▍| 468807/493484 [42:10<04:48, 85.62it/s]   