# 可以更新最近上傳的內容

In [None]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import logging
from tqdm import tqdm
import hashlib
import re
import urllib3
import os
from PIL import Image, ImageDraw
from typing import Optional, Dict, Any, List


class ArticleScraper:
    def __init__(self, scan_mode="recent", data_file="articles.xlsx"):
        """初始化爬蟲設定
        
        Args:
            scan_mode (str): 掃描模式 ("recent", "all", "range")
            data_file (str): 儲存資料的Excel檔案名稱
        """
        self.SETTINGS = {
            'BASE_URL': "https://real-estate.get.com.tw/Columns/",
            'TARGET_AUTHORS': ["曾榮耀", "許文昌", "蘇偉強"],
            'JOURNAL_PARAMS': {
                "no": "1282",
                "pno": "51121"
            },
            'PERFORMANCE': {
                'RETRY_DELAY': 3,
                'REQUEST_INTERVAL': 1.5,
                'MAX_RETRIES': 5,
                'TIMEOUTS': {
                    'CONNECT': 10,
                    'READ': 30,
                    'TOTAL': 40
                }
            },
            'SCAN_MODES': {
                'recent': {
                    'days': 30,
                    'batch_size': 50,
                    'max_workers': 4
                },
                'all': {
                    'batch_size': 100,
                    'max_workers': 8,
                    'article_ranges': [
                        {
                            "start": 900000,
                            "end": 915000,
                            "description": "新年份範圍"
                        },
                        {
                            "start": 409187,
                            "end": 421516,
                            "description": "早期年份範圍"
                        }
                    ]
                },
                'range': {
                    'batch_size': 75,
                    'max_workers': 6
                }
            }
        }

        # 初始化基本屬性
        self.scan_mode = scan_mode
        if scan_mode not in self.SETTINGS['SCAN_MODES']:
            raise ValueError(f"不支援的掃描模式: {scan_mode}")

        self.mode_settings = self.SETTINGS['SCAN_MODES'][scan_mode]
        self.data_file = Path(data_file)

        # 初始化URL和參數
        self.detail_url = f"{self.SETTINGS['BASE_URL']}detail.aspx"
        self.journal_url = f"{self.SETTINGS['BASE_URL']}journal.aspx"
        self.journal_params = self.SETTINGS['JOURNAL_PARAMS'].copy()
        self.target_authors = self.SETTINGS['TARGET_AUTHORS']

        # 初始化效能參數
        self.request_interval = self.SETTINGS['PERFORMANCE']['REQUEST_INTERVAL']
        self.retry_delay = self.SETTINGS['PERFORMANCE']['RETRY_DELAY']
        self.max_retries = self.SETTINGS['PERFORMANCE']['MAX_RETRIES']
        self.timeouts = self.SETTINGS['PERFORMANCE']['TIMEOUTS'].copy()

        # 設定路徑
        self.base_dir = Path('data')
        self.images_dir = self.base_dir / 'images'
        self.logs_dir = self.base_dir / 'logs'
        self.failed_image_path = self.base_dir / 'failed.jpg'

        # 建立必要的目錄
        self._create_directories()

        # 設定 logger
        self._setup_logger()

        # 初始化 session
        self.session = self._setup_session()

        # 載入已處理的文章
        self.processed_articles = self._load_processed_articles()

        # 根據掃描模式設定參數
        self._setup_scan_mode()

        # 設定最後請求時間
        self.last_request_time = 0
        

    def _create_directories(self):
        """創建必要的目錄結構"""
        for directory in [self.base_dir, self.images_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # 創建預設的失敗圖片
        if not self.failed_image_path.exists():
            try:
                img = Image.new('RGB', (400, 100), color='white')
                d = ImageDraw.Draw(img)
                d.text((10, 40), "Image Download Failed", fill='black')
                img.save(str(self.failed_image_path))  # 轉換為字符串路徑
            except Exception as e:
                self.logger.error(f"創建失敗圖片時發生錯誤: {str(e)}")
                self.failed_image_path.touch()


    def _setup_logger(self):
        """設定日誌系統"""
        self.logger = logging.getLogger('ArticleScraper')
        self.logger.setLevel(logging.INFO)

        # 確保處理器不會重複添加
        if not self.logger.handlers:
            # 創建日誌檔案
            log_file = self.logs_dir / \
                f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

            # 設定處理器
            file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
            console_handler = logging.StreamHandler()

            # 設定格式
            formatter = logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)
            console_handler.setFormatter(formatter)

            # 添加處理器
            self.logger.addHandler(file_handler)
            self.logger.addHandler(console_handler)


    def _setup_session(self):
        """Build and return the ``requests`` session used for all requests.

        urllib3's ``InsecureRequestWarning`` is silenced first, then the
        session is given a fixed set of browser-like default headers.

        Returns:
            The configured ``requests.Session``.
        """
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        http = requests.Session()
        http.headers.update(browser_headers)
        return http


    def _load_processed_articles(self):
        """載入已處理的文章編號集合"""
        processed = set()
        if self.data_file.exists():
            try:
                df = pd.read_excel(str(self.data_file))
                if '文章編號' in df.columns:
                    processed = set(df['文章編號'].astype(str))
                self.logger.info(f"已載入 {len(processed)} 篇已處理文章")
            except Exception as e:
                self.logger.error(f"載入已處理文章時發生錯誤: {str(e)}")
        return processed


    def _setup_scan_mode(self):
        """根據掃描模式設定相關參數"""
        self.batch_size = self.mode_settings['batch_size']
        self.max_workers = self.mode_settings['max_workers']

        now = datetime.now()
        if self.scan_mode == 'recent':
            self.start_date = now - timedelta(days=self.mode_settings['days'])
            self.end_date = now
        elif self.scan_mode == 'range':
            # 需要外部設定日期範圍
            self.start_date = None
            self.end_date = None
        else:  # 'all' mode
            self.start_date = datetime(2016, 1, 1)
            self.end_date = now

        self.logger.info(f"掃描模式: {self.scan_mode}")
        if self.start_date and self.end_date:
            self.logger.info(f"日期範圍: {self.start_date.date()} 到 {
                            self.end_date.date()}")


    def wait_between_requests(self):
        """控制請求間隔"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time
        if elapsed < self.request_interval:
            time.sleep(self.request_interval - elapsed)
        self.last_request_time = time.time()


    def set_date_range(self, start_date: datetime, end_date: datetime):
        """設定掃描的日期範圍"""
        if self.scan_mode != 'range':
            raise ValueError("只有在 'range' 模式下才能設定日期範圍")
        self.start_date = start_date
        self.end_date = end_date
        self.logger.info(f"設定日期範圍: {start_date.date()} 到 {end_date.date()}")
        
        
    def make_request(self, url: str, params: Optional[Dict] = None, method: str = 'get',
                    retry_count: int = 0) -> Optional[requests.Response]:
        """發送 HTTP 請求並處理重試邏輯
        
        Args:
            url: 請求URL
            params: URL參數
            method: HTTP方法 ('get' 或 'post')
            retry_count: 當前重試次數
        
        Returns:
            Response對象或None（如果請求失敗）
        """
        if retry_count >= self.max_retries:
            self.logger.error(f"達到最大重試次數 ({self.max_retries})")
            return None

        self.wait_between_requests()

        try:
            if method.lower() == 'post':
                response = self.session.post(
                    url,
                    params=params,
                    verify=False,
                    timeout=(self.timeouts['CONNECT'], self.timeouts['READ'])
                )
            else:
                response = self.session.get(
                    url,
                    params=params,
                    verify=False,
                    timeout=(self.timeouts['CONNECT'], self.timeouts['READ'])
                )

            response.raise_for_status()
            return response

        except requests.RequestException as e:
            self.logger.warning(
                f"請求失敗 (重試 {retry_count + 1}/{self.max_retries}): {str(e)}")
            time.sleep(self.retry_delay * (retry_count + 1))  # 指數退避
            return self.make_request(url, params, method, retry_count + 1)


    def parse_article_list(self, html_content: str) -> List[Dict[str, Any]]:
        """解析文章列表頁面
        
        Args:
            html_content: HTML內容
        
        Returns:
            文章資訊列表
        """
        articles = []
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            article_elements = soup.select('div.article-item')  # 根據實際HTML結構調整

            for element in article_elements:
                try:
                    article_info = {
                        '文章編號': element.get('data-article-id', '').strip(),
                        '標題': element.select_one('.title').text.strip(),
                        '作者': element.select_one('.author').text.strip(),
                        '發布日期': element.select_one('.date').text.strip(),
                        '摘要': element.select_one('.summary').text.strip()
                    }

                    # 檢查必要欄位
                    if not all(article_info.values()):
                        continue

                    # 轉換日期格式
                    try:
                        date_str = article_info['發布日期']
                        article_info['發布日期'] = datetime.strptime(
                            date_str, '%Y-%m-%d')
                    except ValueError:
                        self.logger.warning(f"日期格式錯誤: {date_str}")
                        continue

                    articles.append(article_info)

                except Exception as e:
                    self.logger.warning(f"解析文章元素時發生錯誤: {str(e)}")
                    continue

        except Exception as e:
            self.logger.error(f"解析文章列表時發生錯誤: {str(e)}")

        return articles


    def parse_article_detail(self, html_content: str, article_no: str) -> Dict[str, Any]:
        """解析文章詳細內容頁面
        
        Args:
            html_content: HTML內容
            article_no: 文章編號
        
        Returns:
            文章詳細資訊字典
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            article_data = {
                '文章編號': article_no,
                '標題': '',
                '作者': '',
                '發布日期': '',
                '內文': '',
                '圖片連結': []
            }

            # 解析標題
            title_element = soup.select_one('h1.article-title')
            if title_element:
                article_data['標題'] = title_element.text.strip()

            # 解析作者
            author_element = soup.select_one('div.author-info')
            if author_element:
                article_data['作者'] = author_element.text.strip()

            # 解析發布日期
            date_element = soup.select_one('div.publish-date')
            if date_element:
                date_str = date_element.text.strip()
                try:
                    article_data['發布日期'] = datetime.strptime(date_str, '%Y-%m-%d')
                except ValueError:
                    self.logger.warning(f"文章 {article_no} 日期格式錯誤: {date_str}")

            # 解析內文
            content_element = soup.select_one('div.article-content')
            if content_element:
                # 移除不需要的元素
                for element in content_element.select('script, style, iframe'):
                    element.decompose()

                article_data['內文'] = content_element.get_text(strip=True)

            # 解析圖片連結
            image_elements = soup.select('div.article-content img')
            for img in image_elements:
                src = img.get('src', '')
                if src:
                    if not src.startswith(('http://', 'https://')):
                        src = urllib.parse.urljoin(self.SETTINGS['BASE_URL'], src)
                    article_data['圖片連結'].append(src)

            return article_data

        except Exception as e:
            self.logger.error(f"解析文章 {article_no} 詳細內容時發生錯誤: {str(e)}")
            return None


    def download_image(self, url: str, save_path: Path) -> bool:
        """下載並保存圖片
        
        Args:
            url: 圖片URL
            save_path: 保存路徑
        
        Returns:
            是否成功下載
        """
        try:
            response = self.make_request(url)
            if not response:
                return False

            # 檢查內容類型
            content_type = response.headers.get('content-type', '')
            if not content_type.startswith('image/'):
                self.logger.warning(f"非圖片內容類型: {content_type}")
                return False

            # 保存圖片
            with open(save_path, 'wb') as f:
                f.write(response.content)

            return True

        except Exception as e:
            self.logger.error(f"下載圖片失敗 ({url}): {str(e)}")
            return False
        

    def save_article_markdown(self, article_data: Dict) -> None:
        """將文章保存為Markdown格式
        
        Args:
            article_data: 文章資料字典
        """
        try:
            article_no = article_data['文章編號']
            # 清理標題中的非法字符
            title = re.sub(r'[<>:"/\\|?*]', '', article_data['標題'])[:50]

            # 建立文章目錄
            article_dir = self.base_dir / article_no
            article_dir.mkdir(parents=True, exist_ok=True)

            # 準備Markdown內容
            markdown_content = [
                f"# {article_data['標題']}",
                "",
                f"作者：{article_data['作者']}",
                f"發布日期：{article_data['發布日期'].strftime('%Y-%m-%d')}",
                f"文章編號：{article_no}",
                "",
                "## 內文",
                "",
                article_data['內文'],
                "",
                "## 圖片",
                ""
            ]

            # 處理圖片
            for i, img_url in enumerate(article_data['圖片連結'], 1):
                img_filename = f"image_{i}.jpg"
                img_path = article_dir / img_filename

                if self.download_image(img_url, img_path):
                    markdown_content.append(f"![圖片{i}]({img_filename})")
                else:
                    markdown_content.append(f"![下載失敗的圖片{i}](failed_image.jpg)")
                markdown_content.append("")

            # 保存Markdown文件
            markdown_path = article_dir / f"{title}.md"
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(markdown_content))

            self.logger.info(f"已保存文章 {article_no} 的Markdown文件")

        except Exception as e:
            self.logger.error(f"保存文章 {article_no} Markdown時發生錯誤: {str(e)}")


    def save_to_excel(self, articles: List[Dict]) -> None:
        """將文章資料保存到Excel
        
        Args:
            articles: 文章資料列表
        """
        try:
            # 準備DataFrame
            df = pd.DataFrame(articles)

            # 如果文件已存在，讀取並合併
            if self.data_file.exists():
                existing_df = pd.read_excel(str(self.data_file))
                # 使用文章編號作為索引合併
                df = pd.concat([existing_df, df]).drop_duplicates(
                    subset=['文章編號'], keep='last')

            # 排序並保存
            df = df.sort_values('發布日期', ascending=False)
            df.to_excel(str(self.data_file), index=False)
            self.logger.info(f"已更新Excel文件，共 {len(df)} 篇文章")

        except Exception as e:
            self.logger.error(f"保存Excel文件時發生錯誤: {str(e)}")


    def process_article(self, article_info: Dict) -> Optional[Dict]:
        """處理單篇文章
        
        Args:
            article_info: 文章基本資訊
        
        Returns:
            完整的文章資料或None（如果處理失敗）
        """
        article_no = article_info['文章編號']

        try:
            # 檢查是否已處理
            if article_no in self.processed_articles:
                self.logger.debug(f"文章 {article_no} 已處理，跳過")
                return None

            # 獲取文章詳細頁面
            url = f"{self.SETTINGS['BASE_URL']}article/{article_no}"
            response = self.make_request(url)
            if not response:
                return None

            # 解析文章詳細內容
            article_data = self.parse_article_detail(response.text, article_no)
            if not article_data:
                return None

            # 保存Markdown
            self.save_article_markdown(article_data)

            return article_data

        except Exception as e:
            self.logger.error(f"處理文章 {article_no} 時發生錯誤: {str(e)}")
            return None


    def scan_articles(self) -> None:
        """主要掃描邏輯"""
        try:
            self.logger.info("開始掃描文章...")
            processed_count = 0

            # 根據掃描模式設定文章範圍
            if self.scan_mode == 'all':
                for range_info in self.mode_settings['article_ranges']:
                    self.logger.info(f"掃描範圍: {range_info['description']}")
                    article_numbers = range(range_info['start'], range_info['end'])
                    self._process_article_batch(article_numbers)

            elif self.scan_mode in ['recent', 'range']:
                # 獲取文章列表
                page = 1
                while True:
                    url = f"{self.SETTINGS['BASE_URL']}list"
                    response = self.make_request(url, params={'page': page})
                    if not response:
                        break

                    articles = self.parse_article_list(response.text)
                    if not articles:
                        break

                    # 檢查日期範圍
                    articles = [
                        a for a in articles
                        if self.start_date <= a['發布日期'] <= self.end_date
                    ]

                    if not articles:
                        break

                    # 處理文章批次
                    self._process_article_batch(articles)
                    processed_count += len(articles)
                    page += 1

            self.logger.info(f"掃描完成，共處理 {processed_count} 篇文章")

        except Exception as e:
            self.logger.error(f"掃描過程中發生錯誤: {str(e)}")


    def _process_article_batch(self, articles) -> None:
        """Process a batch of articles on a thread pool and store the results.

        Every item is submitted to ``process_article()`` on a pool of
        ``max_workers`` threads. Results are collected in submission order
        behind a progress bar; falsy results are dropped and a failing task
        is logged. Non-empty results are passed to ``save_to_excel()``.

        Args:
            articles: Iterable of items accepted by ``process_article()``.
        """
        collected = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = [pool.submit(self.process_article, item) for item in articles]

            for task in tqdm(pending, desc="處理文章"):
                try:
                    outcome = task.result()
                    if outcome:
                        collected.append(outcome)
                except Exception as e:
                    self.logger.error(f"處理文章批次時發生錯誤: {str(e)}")

        if collected:
            self.save_to_excel(collected)
        

def main(argv: Optional[List[str]] = None):
    """Command-line entry point of the article scraper.

    Parses the options, builds an ``ArticleScraper`` for the chosen mode,
    applies the requested log level and - in 'range' mode - the requested
    date window, then runs the scan. Problems are reported with ``print``
    and end the function early; nothing is raised to the caller.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Leave as
            ``None`` for normal command-line use.
    """
    import argparse
    from datetime import datetime

    # Command-line options.
    parser = argparse.ArgumentParser(description='文章爬蟲工具')
    parser.add_argument(
        '--mode',
        choices=['recent', 'all', 'range'],
        default='recent',
        help='掃描模式：recent(最近30天), all(全部), range(指定範圍)'
    )
    parser.add_argument(
        '--start-date',
        type=str,
        help='開始日期 (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--end-date',
        type=str,
        help='結束日期 (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='articles.xlsx',
        help='輸出Excel檔案名稱'
    )
    parser.add_argument(
        '--log-level',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default='INFO',
        help='日誌級別'
    )

    args = parser.parse_args(argv)

    # Only 'range' mode takes its window from the command line; the other
    # modes compute their own window inside the scraper.
    date_range = None
    if args.mode == 'range':
        if not (args.start_date and args.end_date):
            print("Error: range模式需要指定開始和結束日期")
            return
        try:
            start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
            end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
        except ValueError:
            print("Error: 日期格式錯誤，請使用YYYY-MM-DD格式")
            return
        if start_date > end_date:
            print("Error: 開始日期不能晚於結束日期")
            return
        date_range = (start_date, end_date)

    try:
        scraper = ArticleScraper(
            scan_mode=args.mode,
            data_file=args.output
        )

        # Apply the requested log level.
        scraper.logger.setLevel(getattr(logging, args.log_level))

        # Hand the parsed window to the scraper; without this the scan in
        # 'range' mode has no dates to filter on.
        if date_range is not None:
            scraper.set_date_range(*date_range)

        scraper.scan_articles()

    except Exception as e:
        print(f"執行過程中發生錯誤: {str(e)}")

# Run the command-line entry point only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
    
    