# 可以抓取舊文章和新文章


In [1]:
import os
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import logging
from tqdm import tqdm
import hashlib
import re
import urllib3
from typing import Optional, Dict, Any, List

class ArticleScraper:
    def __init__(self, scan_mode="all", data_file="articles.xlsx"):
        """Configure the scraper and prepare directories, session and logger.

        Args:
            scan_mode: Scan strategy. With ``"all"`` the ``run`` method also
                sweeps every configured article-number range.
            data_file: Path of the Excel workbook holding scraped articles.
        """
        # Tuning knobs for batching, threading, retries and request pacing.
        performance = {
            'BATCH_SIZE': 50,
            'MAX_WORKERS': 4,
            'MAX_RETRIES': 5,
            'RETRY_DELAY': 3,
            'REQUEST_INTERVAL': 1.5
        }
        # Article-number ranges swept in full-scan mode (newer ids first).
        article_ranges = [
            {"start": 900000, "end": 915000, "description": "新年份範圍"},
            {"start": 409187, "end": 421516, "description": "早期年份範圍"}
        ]
        self.SETTINGS = {
            'BASE_URL': "https://real-estate.get.com.tw/Columns/",
            'TARGET_AUTHORS': ["曾榮耀", "許文昌", "蘇偉強"],
            'JOURNAL_PARAMS': {"no": "1282", "pno": "51121"},
            'PERFORMANCE': performance,
            'ARTICLE_RANGES': article_ranges
        }
        settings = self.SETTINGS

        # Endpoints and basic options.
        self.base_url = settings['BASE_URL']
        self.detail_url = f"{self.base_url}detail.aspx"
        self.journal_url = f"{self.base_url}journal.aspx"
        self.target_authors = settings['TARGET_AUTHORS']
        self.scan_mode = scan_mode
        self.data_file = Path(data_file)

        # Query parameters sent to the journal listing page.
        self.journal_params = settings['JOURNAL_PARAMS']

        # Time window: from nine (365-day) years ago until now.
        now = datetime.now()
        self.end_date = now
        self.start_date = now - timedelta(days=9*365)

        # Performance settings exposed as plain attributes.
        self.batch_size = performance['BATCH_SIZE']
        self.max_workers = performance['MAX_WORKERS']
        self.max_retries = performance['MAX_RETRIES']
        self.retry_delay = performance['RETRY_DELAY']
        self.request_interval = performance['REQUEST_INTERVAL']

        # Article-number ranges.
        self.article_ranges = settings['ARTICLE_RANGES']

        # Remaining components; the logger needs the directories to exist.
        self.setup_directories()
        self.setup_session()
        self.setup_logger()
        self.processed_articles = set()
        self.load_processed_articles()
        self.last_request_time = 0

        # Record the configured ranges in the log.
        for configured in self.article_ranges:
            self.logger.info(
                f"設定文章範圍: {configured['description']} - "
                f"從 {configured['start']} 到 {configured['end']}"
            )



    def setup_directories(self):
        """建立必要的目錄結構"""
        self.base_dir = Path("real_estate_articles")
        self.articles_dir = self.base_dir / "articles"
        self.images_dir = self.articles_dir / "images"
        self.logs_dir = self.base_dir / "logs"

        for directory in [self.base_dir, self.articles_dir, self.images_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # 創建預設的失敗圖片
        self.failed_image_path = self.images_dir / "image_download_failed.png"
        if not self.failed_image_path.exists():
            try:
                from PIL import Image, ImageDraw
                img = Image.new('RGB', (400, 100), color='white')
                d = ImageDraw.Draw(img)
                d.text((10, 40), "Image Download Failed", fill='black')
                img.save(self.failed_image_path)
            except Exception:
                self.failed_image_path.touch()

    def setup_session(self):
        """Create the shared HTTP session with browser-like default headers."""
        # Requests made by this class pass verify=False, so the resulting
        # InsecureRequestWarning is silenced up front.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        session = requests.Session()
        session.headers.update(browser_headers)
        self.session = session

    def setup_logger(self):
        """設定日誌系統"""
        self.logger = logging.getLogger('ArticleScraper')
        self.logger.setLevel(logging.INFO)

        log_file = self.logs_dir / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        handlers = [
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        for handler in handlers:
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def load_processed_articles(self):
        """載入已處理的文章"""
        if self.data_file.exists():
            df = pd.read_excel(self.data_file)
            if '文章編號' in df.columns:
                self.processed_articles = set(df['文章編號'].astype(str))

    def wait_between_requests(self):
        """控制請求間隔"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time
        if elapsed < self.request_interval:
            time.sleep(self.request_interval - elapsed)
        self.last_request_time = current_time

    def get_max_page_number(self) -> int:
        """獲取期刊最大頁數"""
        try:
            self.wait_between_requests()
            response = self.session.get(
                self.journal_url,
                params=self.journal_params,
                timeout=30,
                verify=False
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 找到分頁元素
            pagination = soup.select('.pagination a')
            if pagination:
                page_numbers = []
                for a in pagination:
                    try:
                        page_numbers.append(int(a.text.strip()))
                    except ValueError:
                        continue
                return max(page_numbers) if page_numbers else 1
            return 1
            
        except Exception as e:
            self.logger.error(f"獲取最大頁數失敗: {str(e)}")
            return 30  # 預設較大的頁數以確保不遺漏

    def get_article_urls_from_journal(self, page_no: int) -> List[int]:
        """從期刊頁面獲取文章編號列表"""
        try:
            params = self.journal_params.copy()
            params['page_no'] = page_no
            
            self.wait_between_requests()
            response = self.session.get(
                self.journal_url,
                params=params,
                timeout=30,
                verify=False
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            articles = []
            for link in soup.select('a[href*="detail.aspx?no="]'):
                href = link.get('href', '')
                if 'no=' in href:
                    article_no = href.split('no=')[-1]
                    try:
                        article_no = int(article_no)
                        if str(article_no) not in self.processed_articles:
                            articles.append(article_no)
                    except ValueError:
                        continue
                        
            return articles
            
        except Exception as e:
            self.logger.error(f"獲取第 {page_no} 頁文章列表失敗: {str(e)}")
            return []

    def fetch_article(self, article_no: int) -> Optional[Dict]:
        """抓取單篇文章"""
        if str(article_no) in self.processed_articles:
            return None

        for retry in range(self.max_retries):
            try:
                params = {'no': article_no}
                self.wait_between_requests()

                self.logger.info(f"正在抓取文章 {article_no}")
                response = self.session.get(
                    self.detail_url,
                    params=params,
                    timeout=30,
                    verify=False
                )

                self.logger.info(f"文章 {article_no} 回應狀態碼: {
                                 response.status_code}")

                if response.status_code == 404:
                    self.logger.info(f"文章 {article_no} 不存在")
                    return None

                response.raise_for_status()
                response.encoding = 'utf-8'

                soup = BeautifulSoup(response.text, 'html.parser')
                article_data = self.parse_article(soup, article_no)

                if article_data and self.validate_article(article_data):
                    self.logger.info(f"成功解析文章 {article_no}")
                    return article_data
                else:
                    self.logger.info(f"文章 {article_no} 不符合條件或解析失敗")
                    return None

            except requests.exceptions.RequestException as e:
                self.logger.error(f"抓取文章 {article_no} 失敗 (嘗試 {
                                  retry + 1}/{self.max_retries}): {str(e)}")
                if hasattr(e, 'response') and hasattr(e.response, 'text'):
                    self.logger.error(f"錯誤回應內容: {e.response.text[:200]}")
                if retry < self.max_retries - 1:
                    time.sleep(self.retry_delay * (retry + 1))
                    continue
            except Exception as e:
                self.logger.error(f"處理文章 {article_no} 時發生未預期錯誤: {str(e)}")
                break

        return None

    def parse_article(self, soup: BeautifulSoup, article_no: int) -> Optional[Dict]:
        """解析文章內容"""
        try:
            # 先只解析作者欄位,如果不符合就直接返回
            author = None
            article_info = {}

            for row in soup.select('.columnsDetail_tableRow'):
                th = row.select_one('.columnsDetail_tableth')
                td = row.select_one('.columnsDetail_tabletd')
                if th and td:
                    key = th.text.strip()
                    value = td.text.strip()
                    article_info[key] = value

                    if key == '內文':
                        article_info['內文HTML'] = str(td)
                    elif key == '作者':
                        author = value

            # 如果不是目標作者就直接返回
            if not author or not any(target in author for target in self.target_authors):
                return None

            # 處理文章內容和圖片
            content = self.process_content(
                article_info.get('內文HTML', ''), article_no)

            return {
                '文章編號': article_no,
                '標題': article_info.get('篇名', ''),
                '作者': author,
                '日期': article_info.get('日期', ''),
                '內文': content,
                'URL': f"{self.detail_url}?no={article_no}",
                '爬取時間': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

        except Exception as e:
            self.logger.error(f"解析文章 {article_no} 失敗: {str(e)}")
            return None
        
    def download_image(self, img_url: str, article_no: int) -> Optional[str]:
        """下載圖片並返回本地檔名"""
        try:
            if not img_url.startswith(('http://', 'https://')):
                img_url = urllib.parse.urljoin(self.base_url, img_url)
            
            # 生成唯一的檔名
            url_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            file_ext = os.path.splitext(img_url)[1] or '.jpg'
            local_filename = f"{article_no}_{url_hash}{file_ext}"
            local_path = self.images_dir / local_filename
            
            # 如果圖片已存在就直接返回
            if local_path.exists():
                return local_filename
                
            # 下載圖片
            self.wait_between_requests()
            response = self.session.get(img_url, timeout=30, verify=False)
            response.raise_for_status()
            
            # 保存圖片
            with open(local_path, 'wb') as f:
                f.write(response.content)
                
            return local_filename
            
        except Exception as e:
            self.logger.error(f"下載圖片失敗 ({img_url}): {str(e)}")
            return None

    def process_content(self, content_html: str, article_no: int) -> str:
        """處理文章內容"""
        if not content_html:
            return ""

        content = []
        soup = BeautifulSoup(content_html, 'html.parser')

        try:
            for element in soup.descendants:
                if isinstance(element, str):
                    text = element.strip()
                    if text:
                        content.append(text)
                elif element.name == 'img':
                    try:
                        img_src = element.get('src')
                        if img_src:
                            local_img = self.download_image(
                                img_src, article_no)
                            if local_img:
                                content.append(
                                    f"\n![圖片](./images/{local_img})\n")
                    except Exception as e:
                        self.logger.error(f"處理圖片元素失敗: {str(e)}")
                        continue

            return '\n'.join(filter(None, content))
        except Exception as e:
            self.logger.error(f"處理文章 {article_no} 內容失敗: {str(e)}")
            return ""


    def validate_article(self, article_data: Dict) -> bool:
        """Return True when title, author, date and body are all non-empty."""
        for field in ('標題', '作者', '日期', '內文'):
            if field not in article_data or not article_data[field]:
                return False
        return True

    def save_article(self, article_data: Dict) -> None:
        """儲存文章"""
        try:
            # 更新 Excel 資料
            new_df = pd.DataFrame([article_data])
            if self.data_file.exists():
                df = pd.read_excel(self.data_file)
                df = pd.concat([df, new_df]).drop_duplicates(subset=['文章編號'])
            else:
                df = new_df
            df.to_excel(self.data_file, index=False)

            # 建立 Markdown 文件
            article_no = article_data['文章編號']
            title = re.sub(r'[<>:"/\\|?*]', '', article_data['標題'])[:100]

            markdown_content = f"""# {article_data['標題']}

## 文章資訊
- 文章編號：{article_no}
- 作者：{article_data['作者']}
- 發布日期：{article_data['日期']}
- 爬取時間：{article_data['爬取時間']}
- 原文連結：[閱讀原文]({article_data['URL']})

## 內文
{article_data['內文']}

---
*注：本文圖片存放於 ./images/ 目錄下*
"""

            # 儲存 Markdown 文件
            file_path = self.articles_dir / f"{article_no}_{title}.md"
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            # 更新已處理集合
            self.processed_articles.add(str(article_no))

        except Exception as e:
            self.logger.error(f"儲存文章失敗: {str(e)}")
            
            
    def create_index(self):
        """創建文章索引，整合原有目錄內容"""
        try:
            # 讀取 Excel 檔案獲取所有文章資訊
            if not self.data_file.exists():
                return
                
            df = pd.read_excel(self.data_file)
            
            # 讀取現有的目錄文件（如果存在）
            index_path = self.base_dir / "README.md"
            existing_content = []
            existing_articles = set()
            
            if index_path.exists():
                with open(index_path, 'r', encoding='utf-8') as f:
                    existing_content = f.read().splitlines()
                    # 提取現有目錄中的文章編號
                    for line in existing_content:
                        if match := re.search(r'/(\d+)_[^/]+\.md', line):
                            existing_articles.add(match.group(1))
            
            # 只處理新文章
            df['文章編號'] = df['文章編號'].astype(str)
            new_articles = df[~df['文章編號'].isin(existing_articles)]
            
            if new_articles.empty and existing_content:
                self.logger.info("沒有新文章需要添加到目錄")
                return
                
            # 如果有現有內容，保留開頭的通用部分（如標題）
            index_content = []
            for line in existing_content:
                if line.startswith('## '):
                    break
                index_content.append(line)
            
            if not index_content:  # 如果是全新的目錄
                index_content = ["# 文章目錄", ""]
            
            # 合併現有文章和新文章
            all_articles = pd.concat([
                df[df['文章編號'].isin(existing_articles)],
                new_articles
            ]).drop_duplicates(subset=['文章編號'])
            
            # 按作者和日期排序
            all_articles['日期'] = pd.to_datetime(all_articles['日期'])
            all_articles = all_articles.sort_values(['作者', '日期'], ascending=[True, False])
            
            # 按作者分組
            for author in sorted(all_articles['作者'].unique()):
                index_content.append(f"## {author}")
                author_articles = all_articles[all_articles['作者'] == author]
                
                # 按年份分組
                for year in sorted(author_articles['日期'].dt.year.unique(), reverse=True):
                    index_content.append(f"\n### {year}年")
                    year_articles = author_articles[author_articles['日期'].dt.year == year]
                    
                    # 生成文章列表
                    for _, article in year_articles.iterrows():
                        title = re.sub(r'[<>:"/\\|?*]', '', article['標題'])[:100]
                        file_name = f"{article['文章編號']}_{title}.md"
                        date_str = article['日期'].strftime('%Y-%m-%d')
                        index_content.append(
                            f"- {date_str} [{article['標題']}](./articles/{file_name})"
                        )
                
                index_content.append("")  # 作者之間加入空行
            
            # 保存更新後的目錄文件
            with open(index_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(index_content))
                
            self.logger.info(f"已成功更新文章目錄，新增 {len(new_articles)} 篇文章")
            
        except Exception as e:
            self.logger.error(f"更新目錄失敗: {str(e)}")


    def run(self):
        """執行爬蟲"""
        self.logger.info(f"開始執行爬蟲 (模式: {self.scan_mode})")

        success_count = 0
        fail_count = 0

        try:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = []

                # 從期刊頁面獲取文章
                max_page = self.get_max_page_number()
                self.logger.info(f"檢測到期刊總頁數: {max_page}")

                article_numbers = set()

                # 收集所有文章編號
                for page_no in range(1, max_page + 1):
                    self.logger.info(f"正在獲取第 {page_no} 頁的文章列表")
                    page_articles = self.get_article_urls_from_journal(page_no)
                    article_numbers.update(page_articles)

                    if not page_articles:
                        self.logger.warning(f"第 {page_no} 頁沒有找到新文章，可能已到達最後一頁")
                        break

                self.logger.info(f"從期刊頁面共找到 {len(article_numbers)} 篇新文章")

                # 處理找到的文章
                for article_no in article_numbers:
                    if str(article_no) not in self.processed_articles:
                        futures.append(executor.submit(self.fetch_article, article_no))

                # 如果是全量模式，掃描所有編號範圍
                if self.scan_mode == "all":
                    for range_info in self.article_ranges:
                        self.logger.info(f"開始處理範圍 {range_info['start']} 到 {range_info['end']}")
                        for article_no in range(range_info['start'], range_info['end'] + 1, self.batch_size):
                            batch = range(article_no, min(
                                article_no + self.batch_size, range_info['end'] + 1))
                            futures.extend([
                                executor.submit(self.fetch_article, no)
                                for no in batch
                                if str(no) not in self.processed_articles
                            ])

                # 處理結果
                with tqdm(total=len(futures), desc="處理文章") as pbar:
                    for future in futures:
                        try:
                            result = future.result()
                            if result:
                                self.save_article(result)
                                success_count += 1
                            else:
                                fail_count += 1
                            pbar.update(1)
                        except Exception as e:
                            self.logger.error(f"處理文章結果失敗: {str(e)}")
                            fail_count += 1
                            pbar.update(1)

        finally:
            self.logger.info(f"成功處理文章數: {success_count}")
            self.logger.info(f"失敗處理文章數: {fail_count}")
            self.create_index()
            self.logger.info("程式結束執行")



if __name__ == "__main__":
    # Create a scraper in full-scan ("all") mode and run it.
    scraper = ArticleScraper(scan_mode="all")
    scraper.run()

2025-02-02 17:08:51,936 - INFO - 設定文章範圍: 新年份範圍 - 從 900000 到 915000
2025-02-02 17:08:51,936 - INFO - 設定文章範圍: 早期年份範圍 - 從 409187 到 421516
2025-02-02 17:08:51,936 - INFO - 開始執行爬蟲 (模式: all)
2025-02-02 17:08:52,455 - INFO - 檢測到期刊總頁數: 1
2025-02-02 17:08:52,456 - INFO - 正在獲取第 1 頁的文章列表
2025-02-02 17:08:53,501 - INFO - 從期刊頁面共找到 0 篇新文章
2025-02-02 17:08:53,502 - INFO - 開始處理範圍 900000 到 915000
2025-02-02 17:08:53,857 - INFO - 開始處理範圍 409187 到 421516
2025-02-02 17:08:53,968 - INFO - 正在抓取文章 900002
2025-02-02 17:08:53,968 - INFO - 正在抓取文章 900001
2025-02-02 17:08:53,983 - INFO - 正在抓取文章 900003
2025-02-02 17:08:53,991 - INFO - 正在抓取文章 900000
2025-02-02 17:08:54,034 - INFO - 文章 900002 回應狀態碼: 200
2025-02-02 17:08:54,169 - INFO - 文章 900003 回應狀態碼: 200
2025-02-02 17:08:54,177 - INFO - 文章 900002 不符合條件或解析失敗
2025-02-02 17:08:54,202 - INFO - 文章 900003 不符合條件或解析失敗
處理文章:   0%|          | 0/27148 [00:00<?, ?it/s]2025-02-02 17:08:54,623 - INFO - 文章 900001 回應狀態碼: 200
2025-02-02 17:08:54,624 - INFO - 文章 900001 不符合條件或解析失敗
20