# 可以更新最近上傳的內容

In [1]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import logging
from tqdm import tqdm
import hashlib
import re
import urllib3
from typing import Optional, Dict, Any, List


class ArticleScraper:
    def __init__(self, scan_mode="recent", data_file="articles.xlsx"):
        """初始化爬蟲設定
        
        Args:
            scan_mode (str): 掃描模式 ("recent", "all", "range")
            data_file (str): 儲存資料的Excel檔案名稱
        """
        self.SETTINGS = {
            'BASE_URL': "https://real-estate.get.com.tw/Columns/",
            'TARGET_AUTHORS': ["曾榮耀", "許文昌", "蘇偉強"],
            'JOURNAL_PARAMS': {
                "no": "1282",
                "pno": "51121"
            },
            'PERFORMANCE': {
                'RETRY_DELAY': 3,
                'REQUEST_INTERVAL': 1.5,
                'MAX_RETRIES': 5,
                'TIMEOUTS': {
                    'CONNECT': 10,
                    'READ': 30,
                    'TOTAL': 40
                }
            },
            'SCAN_MODES': {
                'recent': {
                    'days': 30,
                    'batch_size': 50,
                    'max_workers': 4
                },
                'all': {
                    'batch_size': 100,
                    'max_workers': 8,
                    'article_ranges': [
                        {
                            "start": 900000,
                            "end": 915000,
                            "description": "新年份範圍"
                        },
                        {
                            "start": 409187,
                            "end": 421516,
                            "description": "早期年份範圍"
                        }
                    ]
                },
                'range': {
                    'batch_size': 75,
                    'max_workers': 6
                }
            }
        }

        # 初始化基本屬性
        self.scan_mode = scan_mode
        if scan_mode not in self.SETTINGS['SCAN_MODES']:
            raise ValueError(f"不支援的掃描模式: {scan_mode}")

        self.mode_settings = self.SETTINGS['SCAN_MODES'][scan_mode]
        self.data_file = Path(data_file)

        # 初始化URL和參數
        self.detail_url = f"{self.SETTINGS['BASE_URL']}detail.aspx"
        self.journal_url = f"{self.SETTINGS['BASE_URL']}journal.aspx"
        self.journal_params = self.SETTINGS['JOURNAL_PARAMS']
        self.target_authors = self.SETTINGS['TARGET_AUTHORS']

        # 初始化效能參數
        self.request_interval = self.SETTINGS['PERFORMANCE']['REQUEST_INTERVAL']
        self.retry_delay = self.SETTINGS['PERFORMANCE']['RETRY_DELAY']
        self.max_retries = self.SETTINGS['PERFORMANCE']['MAX_RETRIES']
        self.timeouts = self.SETTINGS['PERFORMANCE']['TIMEOUTS']

        # 設定路徑
        self.base_dir = Path('data')
        self.images_dir = self.base_dir / 'images'
        self.logs_dir = self.base_dir / 'logs'
        self.failed_image_path = self.base_dir / 'failed.jpg'

        # 建立必要的目錄
        self._create_directories()

        # 設定 logger
        self._setup_logger()

        # 初始化 session
        self.session = self._setup_session()

        # 載入已處理的文章
        self.processed_articles = self._load_processed_articles()

        # 根據掃描模式設定參數
        self._setup_scan_mode()

        # 設定最後請求時間
        self.last_request_time = 0

    def _create_directories(self):
        """創建必要的目錄結構"""
        for directory in [self.base_dir, self.images_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # 創建預設的失敗圖片
        if not self.failed_image_path.exists():
            try:
                from PIL import Image, ImageDraw
                img = Image.new('RGB', (400, 100), color='white')
                d = ImageDraw.Draw(img)
                d.text((10, 40), "Image Download Failed", fill='black')
                img.save(self.failed_image_path)
            except Exception as e:
                self.logger.error(f"創建失敗圖片時發生錯誤: {str(e)}")
                self.failed_image_path.touch()

    def _setup_logger(self):
        """設定日誌系統"""
        self.logger = logging.getLogger('ArticleScraper')
        self.logger.setLevel(logging.INFO)

        # 創建日誌檔案
        log_file = self.logs_dir / \
            f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

        # 設定處理器
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        console_handler = logging.StreamHandler()

        # 設定格式
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)

        # 添加處理器
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)

    def _setup_session(self):
        """Build the shared ``requests`` session used for every HTTP call.

        Silences urllib3's InsecureRequestWarning (the other methods pass
        ``verify=False``) and installs browser-like default headers.

        Returns:
            requests.Session: The configured session.
        """
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        http_session = requests.Session()
        http_session.headers.update(browser_headers)
        return http_session

    def _load_processed_articles(self):
        """載入已處理的文章編號集合"""
        processed = set()
        if self.data_file.exists():
            try:
                df = pd.read_excel(self.data_file)
                if '文章編號' in df.columns:
                    processed = set(df['文章編號'].astype(str))
                self.logger.info(f"已載入 {len(processed)} 篇已處理文章")
            except Exception as e:
                self.logger.error(f"載入已處理文章時發生錯誤: {str(e)}")
        return processed

    def _setup_scan_mode(self):
        """根據掃描模式設定相關參數"""
        self.batch_size = self.mode_settings['batch_size']
        self.max_workers = self.mode_settings['max_workers']

        now = datetime.now()
        if self.scan_mode == 'recent':
            self.start_date = now - timedelta(days=self.mode_settings['days'])
            self.end_date = now
        elif self.scan_mode == 'range':
            # 需要外部設定日期範圍
            self.start_date = None
            self.end_date = None
        else:  # 'all' mode
            self.start_date = datetime(2016, 1, 1)
            self.end_date = now

        self.logger.info(f"掃描模式: {self.scan_mode}")
        if self.start_date and self.end_date:
            self.logger.info(f"日期範圍: {self.start_date.date()} 到 {
                             self.end_date.date()}")

    def set_date_range(self, start_date: datetime, end_date: datetime):
        """設定掃描的日期範圍"""
        if self.scan_mode != 'range':
            raise ValueError("只有在 'range' 模式下才能設定日期範圍")
        self.start_date = start_date
        self.end_date = end_date
        self.logger.info(f"設定日期範圍: {start_date.date()} 到 {end_date.date()}")

    def wait_between_requests(self):
        """控制請求間隔"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time
        if elapsed < self.request_interval:
            time.sleep(self.request_interval - elapsed)
        self.last_request_time = time.time()
        
    def get_max_page_number(self) -> int:
        """Return the highest page number shown in the journal's pagination.

        The journal page is requested up to ``max_retries`` times, with a
        pause that grows with each failed attempt.

        Returns:
            int: The largest numeric pagination label; 1 when the page has
            no pagination links or none of them is numeric; 30 when every
            request attempt failed.
        """
        attempts = self.max_retries

        for attempt in range(1, attempts + 1):
            try:
                self.wait_between_requests()
                response = self.session.get(
                    self.journal_url,
                    params=self.journal_params,
                    timeout=self.timeouts['TOTAL'],
                    verify=False
                )
                response.raise_for_status()

                # Pagination links of the journal page.
                page_links = BeautifulSoup(
                    response.text, 'html.parser').select('.pagination a')
                if not page_links:
                    return 1

                numeric_labels = []
                for page_link in page_links:
                    try:
                        numeric_labels.append(int(page_link.text.strip()))
                    except ValueError:
                        # Non-numeric labels are ignored.
                        continue

                highest = max(numeric_labels) if numeric_labels else 1
                self.logger.info(f"找到最大頁數: {highest}")
                return highest

            except requests.exceptions.RequestException as e:
                self.logger.error(
                    f"獲取最大頁數失敗 (嘗試 {attempt}/{self.max_retries}): {str(e)}")
                if attempt < attempts:
                    # Back off longer after each failure.
                    time.sleep(self.retry_delay * attempt)

        self.logger.warning("無法獲取最大頁數，使用預設值 30")
        return 30

    def get_article_urls_from_journal(self, page_no: int) -> List[int]:
        """Collect the unprocessed article numbers linked from a journal page.

        Args:
            page_no: Journal page to request (sent as ``page_no``).

        Returns:
            List[int]: Article numbers found on the page that are not in
            ``processed_articles``, each listed once in page order. An empty
            list is returned when the request or parsing fails.
        """
        try:
            params = self.journal_params.copy()
            params['page_no'] = page_no

            self.wait_between_requests()
            response = self.session.get(
                self.journal_url,
                params=params,
                timeout=self.timeouts['TOTAL'],
                verify=False
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            articles = []
            seen = set()
            for link in soup.select('a[href*="detail.aspx?no="]'):
                href = link.get('href', '')

                # Read the `no` parameter from the parsed query string.
                # Splitting on the text 'no=' would also match inside other
                # parameter names such as 'pno=' and return the wrong value.
                query = urllib.parse.urlsplit(href).query
                values = urllib.parse.parse_qs(query).get('no')
                if not values:
                    continue

                try:
                    article_no = int(values[0])
                except ValueError:
                    continue

                # A page may link the same article more than once; keep the
                # first occurrence only so it is not fetched twice.
                if article_no in seen:
                    continue
                seen.add(article_no)

                if str(article_no) not in self.processed_articles:
                    articles.append(article_no)

            self.logger.info(f"第 {page_no} 頁找到 {len(articles)} 篇未處理文章")
            return articles

        except Exception as e:
            self.logger.error(f"獲取第 {page_no} 頁文章列表失敗: {str(e)}")
            return []

    def fetch_article(self, article_no: int) -> Optional[Dict]:
        """抓取單篇文章"""
        if str(article_no) in self.processed_articles:
            self.logger.debug(f"文章 {article_no} 已處理過，跳過")
            return None

        for retry in range(self.max_retries):
            try:
                params = {'no': article_no}
                self.wait_between_requests()

                response = self.session.get(
                    self.detail_url,
                    params=params,
                    timeout=self.timeouts['TOTAL'],
                    verify=False
                )

                if response.status_code == 404:
                    self.logger.info(f"文章 {article_no} 不存在")
                    return None

                response.raise_for_status()
                response.encoding = 'utf-8'

                soup = BeautifulSoup(response.text, 'html.parser')
                article_data = self.parse_article(soup, article_no)

                if article_data and self.validate_article(article_data):
                    self.logger.info(f"成功抓取文章 {article_no}")
                    return article_data
                else:
                    self.logger.info(f"文章 {article_no} 不符合條件或解析失敗")
                    return None

            except requests.exceptions.RequestException as e:
                self.logger.error(f"抓取文章 {article_no} 失敗 (嘗試 {
                                  retry + 1}/{self.max_retries}): {str(e)}")
                if retry < self.max_retries - 1:
                    time.sleep(self.retry_delay * (retry + 1))
                    continue
            except Exception as e:
                self.logger.error(f"處理文章 {article_no} 時發生未預期錯誤: {str(e)}")
                break

        return None

    def parse_article(self, soup: BeautifulSoup, article_no: int) -> Optional[Dict]:
        """解析文章內容"""
        try:
            article_info = {}
            author = None

            # 解析文章基本信息
            for row in soup.select('.columnsDetail_tableRow'):
                th = row.select_one('.columnsDetail_tableth')
                td = row.select_one('.columnsDetail_tabletd')
                if th and td:
                    key = th.text.strip()
                    value = td.text.strip()
                    article_info[key] = value

                    if key == '內文':
                        article_info['內文HTML'] = str(td)
                    elif key == '作者':
                        author = value

            # 驗證作者
            if not author or not any(target in author for target in self.target_authors):
                self.logger.debug(f"文章 {article_no} 作者 {author} 不在目標列表中")
                return None

            # 處理文章內容和圖片
            content = self.process_content(
                article_info.get('內文HTML', ''), article_no)

            article_data = {
                '文章編號': article_no,
                '標題': article_info.get('篇名', ''),
                '作者': author,
                '日期': article_info.get('日期', ''),
                '內文': content,
                'URL': f"{self.detail_url}?no={article_no}",
                '爬取時間': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            return article_data

        except Exception as e:
            self.logger.error(f"解析文章 {article_no} 失敗: {str(e)}")
            return None

    def process_content(self, html_content: str, article_no: int) -> str:
        """處理文章內容，包括下載圖片和清理HTML"""
        if not html_content:
            return ""

        soup = BeautifulSoup(html_content, 'html.parser')

        # 處理圖片
        for img in soup.find_all('img'):
            img_url = img.get('src', '')
            if img_url:
                local_path = self.download_image(img_url, article_no)
                if local_path:
                    img['src'] = str(local_path)
                else:
                    img['src'] = str(self.failed_image_path)
                    img['alt'] = '圖片下載失敗'

        # 清理HTML並格式化
        return self._format_content(soup)

    def _format_content(self, soup: BeautifulSoup) -> str:
        """Flatten an article body to plain text with simple text markers.

        The tree is modified in place, in this order: tags outside the
        allow-list are unwrapped, a "[圖片]" marker is inserted after every
        image, paragraph text is whitespace-normalised, headings get a
        '#'-style prefix and list items get a bullet or a running number.
        The text is then extracted and cleaned so that the non-empty lines
        are separated by one blank line.

        Args:
            soup: Parsed article body; it is mutated by this method.

        Returns:
            str: The formatted text, stripped of leading/trailing whitespace.
        """
        # HTML tags that are kept; every other tag is unwrapped below.
        # NOTE(review): 'br' is kept here, but no later step turns a <br>
        # into a newline — confirm line breaks are not lost in the output.
        allowed_tags = {'p', 'br', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li'}
        
        # Remove disallowed tags but keep their contents.
        for tag in soup.find_all():
            if tag.name not in allowed_tags:
                tag.unwrap()
        
        # Mark where each image was.
        for img in soup.find_all('img'):
            img.insert_after('\n[圖片]\n')  # marker placed right after the image
        
        # Make sure paragraphs are separated by line breaks.
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            if text:  # only non-empty paragraphs are rewritten
                # Assigning .string replaces all children of the paragraph
                # with the whitespace-normalised text.
                p.string = ' '.join(text.split())
                p.append('\n\n')
        
        # Headings become '#'-prefixed lines.
        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            level = int(h.name[1])  # heading level taken from the tag name
            prefix = '#' * level + ' '  # one '#' per level
            h.string = f'\n{prefix}{h.get_text().strip()}\n'
        
        # List items get a bullet or a running number.
        for li in soup.find_all('li'):
            indent = '  '
            if li.parent.name == 'ol':
                # Ordered list: number = position among the <li> siblings.
                index = len(li.find_previous_siblings('li')) + 1
                li.insert(0, f'{indent}{index}. ')
            else:
                # Unordered list: bullet point.
                li.insert(0, f'{indent}• ')
            li.append('\n')
        
        # Extract the text of the modified tree.
        content = soup.get_text()
        
        # Clean up the extracted text.
        content = re.sub(r'\n{3,}', '\n\n', content)  # collapse runs of blank lines
        content = re.sub(r'[ \t]+', ' ', content)     # normalise spaces and tabs
        content = re.sub(r' *\n *', '\n', content)    # trim spaces around line breaks
        
        # Drop empty lines, strip each remaining line, and join them with
        # one blank line in between.
        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
        formatted_content = '\n\n'.join(paragraphs)
        
        return formatted_content.strip()


    
    def download_image(self, img_url: str, article_no: int) -> Optional[Path]:
        """下載圖片並返回本地路徑"""
        try:
            # 確保是完整的URL
            if not img_url.startswith(('http://', 'https://')):
                img_url = urllib.parse.urljoin(
                    self.SETTINGS['BASE_URL'], img_url)

            # 生成圖片檔名
            url_hash = hashlib.md5(img_url.encode()).hexdigest()
            img_ext = img_url.split('.')[-1].lower()
            if img_ext not in ['jpg', 'jpeg', 'png', 'gif']:
                img_ext = 'jpg'

            img_filename = f"{article_no}_{url_hash[:8]}.{img_ext}"
            img_path = self.images_dir / img_filename

            # 如果圖片已存在，直接返回路徑
            if img_path.exists():
                return img_path

            # 下載圖片
            self.wait_between_requests()
            response = self.session.get(
                img_url,
                timeout=self.timeouts['TOTAL'],
                verify=False,
                stream=True
            )
            response.raise_for_status()

            # 驗證內容類型
            content_type = response.headers.get('content-type', '')
            if not content_type.startswith('image/'):
                raise ValueError(f"非圖片內容類型: {content_type}")

            # 保存圖片
            with open(img_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            self.logger.debug(f"成功下載圖片: {img_filename}")
            return img_path

        except Exception as e:
            self.logger.error(f"下載圖片失敗 ({img_url}): {str(e)}")
            return None

    def validate_article(self, article_data: Dict) -> bool:
        """驗證文章數據是否完整且符合條件"""
        required_fields = ['文章編號', '標題', '作者', '日期', '內文']

        # 檢查必要欄位
        if not all(field in article_data for field in required_fields):
            return False

        # 檢查日期格式和範圍
        try:
            article_date = datetime.strptime(article_data['日期'], "%Y-%m-%d")
            if self.start_date and article_date < self.start_date:
                return False
            if self.end_date and article_date > self.end_date:
                return False
        except ValueError:
            return False

        # 檢查內容長度
        if len(article_data['內文']) < 100:  # 最小內容長度要求
            return False

        return True

    def save_articles(self, articles: List[Dict]):
        """保存文章數據到Excel檔案"""
        try:
            # 讀取現有數據
            if self.data_file.exists():
                df_existing = pd.read_excel(self.data_file)
                # 移除重複的文章
                new_articles_df = pd.DataFrame(articles)
                df = pd.concat([df_existing, new_articles_df],
                               ignore_index=True)
                df = df.drop_duplicates(subset=['文章編號'], keep='last')
            else:
                df = pd.DataFrame(articles)

            # 排序並保存
            df = df.sort_values('日期', ascending=False)
            df.to_excel(self.data_file, index=False)
            self.logger.info(f"成功保存 {len(articles)} 篇文章到 {self.data_file}")

        except Exception as e:
            self.logger.error(f"保存文章數據失敗: {str(e)}")
            # 備份數據
            backup_file = self.base_dir / \
                f"backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
            pd.DataFrame(articles).to_excel(backup_file, index=False)
            self.logger.info(f"已創建備份文件: {backup_file}")

    def process_batch(self, article_numbers: List[int]) -> List[Dict]:
        """Fetch a batch of articles concurrently with a thread pool.

        Args:
            article_numbers: Article numbers to fetch.

        Returns:
            List[Dict]: The successfully fetched articles, in submission
            order. Articles whose fetch returned nothing or raised are
            left out.
        """
        fetched = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = [
                pool.submit(self.fetch_article, number)
                for number in article_numbers
            ]

            # Results are collected in submission order; the bar advances as
            # each one is awaited.
            for task in tqdm(pending, desc="處理文章", unit="篇"):
                try:
                    outcome = task.result()
                except Exception as e:
                    self.logger.error(f"處理文章批次時發生錯誤: {str(e)}")
                    continue
                if outcome:
                    fetched.append(outcome)

        return fetched

    def run(self):
        """主要運行方法"""
        self.logger.info("開始爬取文章...")
        total_articles = []

        try:
            if self.scan_mode == 'all':
                # 處理所有文章範圍
                for range_info in self.mode_settings['article_ranges']:
                    start = range_info['start']
                    end = range_info['end']
                    desc = range_info['description']

                    self.logger.info(f"處理{desc}: {start} 到 {end}")
                    article_numbers = list(range(start, end + 1))

                    # 分批處理
                    for i in range(0, len(article_numbers), self.batch_size):
                        batch = article_numbers[i:i + self.batch_size]
                        articles = self.process_batch(batch)
                        if articles:
                            total_articles.extend(articles)
                            self.save_articles(articles)  # 定期保存

            else:  # recent 或 range 模式
                max_page = self.get_max_page_number()

                for page in range(1, max_page + 1):
                    article_numbers = self.get_article_urls_from_journal(page)
                    if not article_numbers:
                        continue

                    articles = self.process_batch(article_numbers)
                    if articles:
                        total_articles.extend(articles)
                        self.save_articles(articles)  # 定期保存

            # 最終保存
            if total_articles:
                self.save_articles(total_articles)

            self.logger.info(f"爬蟲完成，共處理 {len(total_articles)} 篇文章")

        except KeyboardInterrupt:
            self.logger.info("收到中斷信號，正在保存已處理的文章...")
            if total_articles:
                self.save_articles(total_articles)
            self.logger.info("程序已安全停止")

        except Exception as e:
            self.logger.error(f"執行過程中發生錯誤: {str(e)}")
            if total_articles:
                self.save_articles(total_articles)
            raise


if __name__ == "__main__":
    # Usage example: scan recently published articles into articles.xlsx.
    scraper = ArticleScraper(data_file="articles.xlsx", scan_mode="recent")

    # In "range" mode a date window has to be set before running:
    # scraper.set_date_range(
    #     start_date=datetime(2023, 1, 1),
    #     end_date=datetime(2023, 12, 31)
    # )

    scraper.run()

2025-02-02 19:07:28,386 - INFO - 已載入 811 篇已處理文章
2025-02-02 19:07:28,386 - INFO - 掃描模式: recent
2025-02-02 19:07:28,386 - INFO - 日期範圍: 2025-01-03 到 2025-02-02
2025-02-02 19:07:28,386 - INFO - 開始爬取文章...
2025-02-02 19:07:29,928 - INFO - 第 1 頁找到 0 篇未處理文章
2025-02-02 19:07:29,928 - INFO - 爬蟲完成，共處理 0 篇文章
