# 最新

# In [None]:
import json
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import os
from datetime import datetime


class JasperRealEstateScraper:
    """Scraper for the article listing of the Jasper real-estate website.

    The scraper walks the paginated post listing starting at ``base_url``,
    builds one dict per article (preview fields plus the full article text),
    accumulates them in ``self.results`` and can dump that list as JSON into
    ``output_dir``.
    """

    # Section titles that mark trailing boilerplate ("further reading" /
    # "free consultation"); article text is cut off where one of them starts.
    _STOP_MARKERS = ("．延伸閱讀", "．免費諮詢", "延伸閱讀", "免費諮詢")

    # Tags that are rendered when walking the article body.
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _LIST_TAGS = ('ul', 'ol')
    _CONTENT_TAGS = _HEADING_TAGS + ('p',) + _LIST_TAGS + ('li',)

    def __init__(self, base_url="https://www.jasper-realestate.com/posts/", output_dir="scraped_data", timeout=15):
        """
        Initialise the scraper.

        Args:
            base_url: URL of the first listing page.
            output_dir: Directory that receives the JSON output files; it is
                created if it does not exist yet.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.output_dir = output_dir
        self.timeout = timeout
        self.results = []
        self.current_page = 1

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

    def random_sleep(self, min_seconds=2, max_seconds=5):
        """
        Sleep for a random duration so requests are not sent too quickly.

        Args:
            min_seconds: Lower bound of the pause, in seconds.
            max_seconds: Upper bound of the pause, in seconds.
        """
        sleep_time = random.uniform(min_seconds, max_seconds)
        print(f"休息 {sleep_time:.2f} 秒...")
        time.sleep(sleep_time)

    def get_page(self, url):
        """
        Download a page and parse it.

        Args:
            url: URL of the page to fetch.

        Returns:
            A BeautifulSoup object, or None when the request fails (network
            error, timeout, or an HTTP error status).
        """
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat 4xx/5xx as failures instead of parsing the error page as content.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"獲取頁面時出錯: {url}, 錯誤: {str(e)}")
            return None

        response.encoding = 'utf-8'  # force UTF-8 so the Chinese text decodes correctly
        return BeautifulSoup(response.text, 'html.parser')

    def extract_article_preview(self, article):
        """
        Extract the preview fields of one article from the listing page.

        Args:
            article: BeautifulSoup element of a single article preview.

        Returns:
            Dict with the keys ``title``, ``link``, ``excerpt``,
            ``legal_basis``, ``date``, ``thumbnail_url`` and ``badge``.
            Every field that cannot be found is set to ``'N/A'``.
        """
        # Title and link share the same anchor inside the title heading.
        title_elem = article.find('h3', class_='elementor-post__title')
        anchor = title_elem.find('a') if title_elem is not None else None
        if anchor is not None:
            title = anchor.text.strip()
            link = anchor.get('href', 'N/A')
        else:
            title, link = 'N/A', 'N/A'

        # The excerpt's first paragraph is the summary, the second one is
        # stored as the legal basis.
        excerpt, legal_basis = 'N/A', 'N/A'
        excerpt_elem = article.find('div', class_='elementor-post__excerpt')
        if excerpt_elem is not None:
            paragraphs = excerpt_elem.find_all('p')
            if paragraphs:
                excerpt = paragraphs[0].text.strip()
            if len(paragraphs) > 1:
                legal_basis = paragraphs[1].text.strip()

        date_elem = article.find('span', class_='elementor-post-date')
        date = date_elem.text.strip() if date_elem is not None else 'N/A'

        # Thumbnail: the wrapper div may exist without an <img> inside it.
        thumbnail_url = 'N/A'
        thumbnail_elem = article.find(
            'div', class_='elementor-post__thumbnail')
        img = thumbnail_elem.find('img') if thumbnail_elem is not None else None
        if img is not None:
            lazy_src = img.get('data-lazy-src')
            # Prefer the lazy-load attribute and fall back to src.
            thumbnail_url = lazy_src or img.get('src') or 'N/A'
            # An inline SVG in src is only a placeholder, not the real image.
            if 'data:image/svg+xml' in thumbnail_url:
                thumbnail_url = lazy_src or 'N/A'

        badge_elem = article.find('div', class_='elementor-post__badge')
        badge = badge_elem.text.strip() if badge_elem is not None else 'N/A'

        return {
            'title': title,
            'link': link,
            'excerpt': excerpt,
            'legal_basis': legal_basis,
            'date': date,
            'thumbnail_url': thumbnail_url,
            'badge': badge
        }

    def _cut_at_stop_marker(self, text):
        """Return ``text`` truncated where the first stop marker begins."""
        for marker in self._STOP_MARKERS:
            if marker in text:
                text = text.split(marker, 1)[0].strip()
        return text

    @staticmethod
    def _is_inside_list(elem, root):
        """Tell whether ``elem`` has a <ul>/<ol> ancestor below ``root``."""
        for ancestor in elem.parents:
            if ancestor is root:
                return False
            if ancestor.name in ('ul', 'ol'):
                return True
        return False

    def _render_structured_content(self, container):
        """Render headings, paragraphs and lists of ``container`` as Markdown-like text."""
        parts = []
        for elem in container.find_all(list(self._CONTENT_TAGS)):
            text = elem.get_text().strip()

            # Everything from the "further reading" / "free consultation"
            # heading onwards is boilerplate: stop here.
            if text in self._STOP_MARKERS:
                break

            # Elements nested in a list were already emitted with that list's
            # items; rendering them again would duplicate their text.
            if self._is_inside_list(elem, container):
                continue

            if elem.name in self._HEADING_TAGS:
                parts.append(f"\n\n## {text}\n\n")
            elif elem.name == 'p':
                if text:  # skip empty paragraphs
                    parts.append(f"{text}\n\n")
            elif elem.name in self._LIST_TAGS:
                # NOTE: items of nested lists are listed too, and a parent
                # item's text still includes the text of its nested items.
                parts.extend(
                    f"- {li.get_text().strip()}\n" for li in elem.find_all('li'))
                parts.append("\n")
            else:
                # A stray <li> that is not part of any list.
                parts.append(f"- {text}\n")

        # Collapse runs of blank lines left over from the joins above.
        return re.sub(r'\n{3,}', '\n\n', "".join(parts)).strip()

    def extract_full_content(self, article_url):
        """
        Fetch an article page and extract its body text.

        Args:
            article_url: URL of the article page, or ``'N/A'``.

        Returns:
            The article text. When the text cannot be obtained, a
            human-readable message describing the failure is returned
            instead; this method does not raise.
        """
        if article_url == 'N/A':
            return "無法獲取完整內容，鏈接不可用"

        try:
            print(f"正在抓取文章內容: {article_url}")
            article_soup = self.get_page(article_url)
            if article_soup is None:
                return "無法獲取文章頁面"

            content_section = article_soup.find(
                'div', class_='elementor-widget-theme-post-content')

            if content_section is not None:
                content_container = content_section.find(
                    'div', class_='elementor-widget-container')
                if content_container is not None:
                    return self._render_structured_content(content_container)

                # No inner container: fall back to the section's plain text.
                return self._cut_at_stop_marker(
                    content_section.get_text(separator="\n").strip())

            # Main content section missing: take the longest text among other
            # containers that may hold the article body.
            possible_content_areas = article_soup.find_all(
                'div', class_=['elementor-widget-container', 'entry-content', 'post-content'])
            if not possible_content_areas:
                return "無法獲取文章內容"

            longest_content = max(
                (area.get_text(separator="\n").strip()
                 for area in possible_content_areas),
                key=len)
            return self._cut_at_stop_marker(longest_content)

        except Exception as e:
            # Best effort: one broken article must not abort the whole crawl.
            return f"獲取內容時出錯: {str(e)}"

    @staticmethod
    def _parse_page_number(text):
        """Return the page number of a pagination label, or None.

        A non-numeric prefix before the digits is tolerated (some pagination
        markup puts hidden screen-reader text in front of the number); a
        label with anything after the digits is not treated as a page number.
        """
        match = re.fullmatch(r'\D*(\d+)', text.strip())
        return int(match.group(1)) if match else None

    def has_next_page(self, soup):
        """
        Check whether the listing has a page after the current one.

        Uses the same detection as ``get_next_page_url`` so both methods
        always agree.

        Args:
            soup: BeautifulSoup object of the current listing page.

        Returns:
            True when a next-page URL can be determined, otherwise False.
        """
        return self.get_next_page_url(soup) is not None

    def get_next_page_url(self, soup):
        """
        Determine the URL of the next listing page.

        Args:
            soup: BeautifulSoup object of the current listing page.

        Returns:
            The URL of the next page, or None when there is none.
        """
        pagination = soup.find('nav', class_='elementor-pagination')
        if pagination is None:
            return None

        # Prefer an explicit "next" link.
        next_page_link = pagination.find(
            'a', string=['下一頁', '»']) or pagination.find('a', class_='next')
        if next_page_link is not None:
            href = next_page_link.get('href')
            if href:
                return href

        # Otherwise follow the numbered link with the lowest page number
        # above the current page.
        candidates = []
        for link in pagination.find_all('a', class_='page-numbers'):
            number = self._parse_page_number(link.text)
            href = link.get('href')
            if number is not None and number > self.current_page and href:
                candidates.append((number, href))

        if candidates:
            return min(candidates, key=lambda item: item[0])[1]
        return None

    def scrape_all_articles(self):
        """
        Crawl every listing page and collect all articles.

        For each article the preview is extracted, the full text is fetched,
        and the resulting dict is appended to ``self.results``. After every
        fifth article the results are written to a partial JSON file so an
        interrupted run does not lose everything.

        Returns:
            ``self.results``, the list of article dicts.
        """
        current_url = self.base_url
        # The crawl always starts at the first page, also on a repeated call.
        self.current_page = 1
        total_articles = 0

        while current_url:
            print(f"\n正在爬取第 {self.current_page} 頁: {current_url}")
            soup = self.get_page(current_url)

            if soup is None:
                print(f"無法獲取頁面: {current_url}")
                break

            articles = soup.find_all('article', class_='elementor-post')

            if not articles:
                print(f"在頁面上找不到文章: {current_url}")
                break

            print(f"找到 {len(articles)} 篇文章")

            for index, article in enumerate(articles):
                print(f"正在處理第 {self.current_page} 頁的第 {index+1} 篇文章...")

                article_info = self.extract_article_preview(article)

                # Pause before requesting the article page.
                self.random_sleep()

                article_info['full_content'] = self.extract_full_content(
                    article_info['link'])
                self.results.append(article_info)

                print(f"已抓取文章: {article_info['title']}")
                total_articles += 1

                # Checkpoint every 5 articles to limit data loss on interruption.
                if total_articles % 5 == 0:
                    self.save_results(
                        f"jasper_articles_partial_{total_articles}.json")

                # Pause again before moving on to the next article.
                self.random_sleep()

            next_page_url = self.get_next_page_url(soup)
            if not next_page_url:
                print("沒有更多頁面")
                break

            current_url = next_page_url
            self.current_page += 1
            # Longer pause between listing pages to reduce the risk of detection.
            self.random_sleep(5, 10)

        print(f"\n爬取完成，共獲取 {len(self.results)} 篇文章")
        return self.results

    def save_results(self, filename=None):
        """
        Write ``self.results`` to a JSON file inside ``output_dir``.

        Args:
            filename: Name of the output file; when None a timestamped name
                is generated.

        Returns:
            The path of the file that was written.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"jasper_articles_{timestamp}.json"

        filepath = os.path.join(self.output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Chinese text readable in the file.
            json.dump(self.results, f, ensure_ascii=False, indent=4)

        print(f"結果已保存至 {filepath}")
        return filepath


# Run the scraper when this file is executed as a script.
if __name__ == "__main__":
    scraper = JasperRealEstateScraper()
    try:
        scraper.scrape_all_articles()
    finally:
        # Write the final file even when the crawl is interrupted or fails,
        # so articles collected since the last partial save are not lost.
        scraper.save_results()