In [1]:
import asyncio
import csv
import json
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import time

# 读取Excel文件中的景点数据
def read_sights_from_excel(file_path):
    df = pd.read_excel(file_path)
    sights = []
    for _, row in df.iterrows():
        sights.append({
            'name': row['name'],
            'url': row['url'],
            'rating': row['评分（0-5分）']
        })
    return sights

async def fetch_reviews_for_sight(page, sight):
    print(f"[INFO] 开始爬取景点: {sight['name']}")
    
    # 导航到景点页面
    await page.goto(sight['url'], wait_until="domcontentloaded", timeout=60000)
    await asyncio.sleep(3)
    
    # 滚动页面确保评论区域加载
    await page.evaluate("window.scrollTo(0, 800)")
    await asyncio.sleep(2)
    
    all_reviews = []
    
    try:
        # 查找并点击"时间排序"
        print(f"[INFO] 正在查找时间排序按钮...")
        
        # 尝试多种可能的选择器来找到排序按钮
        sort_selectors = [
            "div[class*='sort']",
            "span:has-text('排序')",
            "button:has-text('排序')",
            "div:has-text('排序')",
            ".commentSort"
        ]
        
        sort_clicked = False
        for selector in sort_selectors:
            try:
                sort_elements = await page.query_selector_all(selector)
                for element in sort_elements:
                    element_text = await element.text_content()
                    if element_text and '排序' in element_text:
                        await element.click()
                        await asyncio.sleep(2)
                        
                        # 点击"时间排序"
                        time_sort_selectors = [
                            "text='时间排序'",
                            "text='按时间'",
                            "li:has-text('时间')",
                            "span:has-text('时间')"
                        ]
                        
                        for time_selector in time_sort_selectors:
                            try:
                                time_element = await page.query_selector(time_selector)
                                if time_element:
                                    await time_element.click()
                                    await asyncio.sleep(3)
                                    sort_clicked = True
                                    print(f"[INFO] 已点击时间排序")
                                    break
                            except:
                                continue
                        break
            except:
                continue
        
        if not sort_clicked:
            print(f"[WARNING] 未找到时间排序按钮，继续爬取默认排序的评论")
        
        # 开始爬取评论
        page_num = 1
        max_pages = 50  # 防止无限循环
        
        while page_num <= max_pages:
            print(f"[INFO] 正在爬取第 {page_num} 页评论...")
            
            # 获取当前页面HTML
            html = await page.content()
            soup = BeautifulSoup(html, 'lxml')
            
            # 解析评论
            page_reviews = parse_reviews_from_html(soup, sight)
            
            if not page_reviews:
                print(f"[INFO] 第 {page_num} 页未找到评论，可能已到末尾")
                break
                
            all_reviews.extend(page_reviews)
            print(f"[INFO] 第 {page_num} 页找到 {len(page_reviews)} 条评论")
            
            # 尝试点击下一页
            next_found = False
            next_selectors = [
                "a:has-text('下一页')",
                "button:has-text('下一页')",
                "span:has-text('下一页')",
                ".next-page",
                "a[class*='next']",
                "button[class*='next']"
            ]
            
            for selector in next_selectors:
                try:
                    next_btn = await page.query_selector(selector)
                    if next_btn:
                        # 检查按钮是否可点击
                        is_disabled = await next_btn.get_attribute("disabled")
                        if not is_disabled:
                            await next_btn.click()
                            await asyncio.sleep(3)
                            next_found = True
                            break
                except:
                    continue
            
            if not next_found:
                print(f"[INFO] 未找到下一页按钮，评论爬取完成")
                break
                
            page_num += 1
            
    except Exception as e:
        print(f"[ERROR] 爬取景点 {sight['name']} 时出错: {str(e)}")
    
    print(f"[INFO] 景点 {sight['name']} 爬取完成，共获取 {len(all_reviews)} 条评论")
    return all_reviews

def parse_reviews_from_html(soup, sight):
    reviews = []
    
    # 多种可能的评论选择器
    review_selectors = [
        "div[class*='comment']",
        "div[class*='review']",
        "li[class*='comment']",
        "div[class*='comment-item']",
        ".commentList"
    ]
    
    for selector in review_selectors:
        review_blocks = soup.find_all('div', class_=re.compile(selector.replace('.', '')))
        if review_blocks:
            print(f"[DEBUG] 使用选择器 {selector} 找到 {len(review_blocks)} 个评论块")
            break
    
    if not review_blocks:
        # 尝试更通用的选择器
        review_blocks = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for block in review_blocks:
        try:
            # 提取评论内容
            content = ""
            
            # 多种可能的内容选择器
            content_selectors = [
                "span",
                "p",
                "div[class*='content']",
                "div[class*='text']"
            ]
            
            for content_selector in content_selectors:
                content_elements = block.select(content_selector)
                for element in content_elements:
                    text = element.get_text(strip=True)
                    if len(text) > 10:  # 确保是真正的评论内容
                        content = text
                        break
                if content:
                    break
            
            # 如果还没找到内容，获取整个块的文本
            if not content:
                content = block.get_text(strip=True)
            
            # 清理内容
            content = re.sub(r'\s+', ' ', content).strip()
            
            # 过滤掉太短或无意义的内容
            if len(content) >= 5 and not any(word in content for word in ['暂无点评', '暂无评论', '加载中']):
                reviews.append({
                    'sight_name': sight['name'],
                    'rating': sight['rating'],
                    'review_content': content
                })
                
        except Exception as e:
            continue
    
    return reviews

async def main():
    # 读取景点数据
    sights = read_sights_from_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
    print(f"[INFO] 共读取到 {len(sights)} 个景点")
    
    all_reviews = []
    
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # 设为False便于调试
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={'width': 1920, 'height': 1080}
        )
        
        page = await context.new_page()
        
        # 为每个景点爬取评论
        for i, sight in enumerate(sights, 1):
            print(f"\n[进度] 正在处理第 {i}/{len(sights)} 个景点: {sight['name']}")
            
            reviews = await fetch_reviews_for_sight(page, sight)
            all_reviews.extend(reviews)
            
            # 每个景点之间休息一下
            await asyncio.sleep(2)
        
        await browser.close()
    
    # 保存结果
    if all_reviews:
        # 保存为CSV
        csv_file = "ctrip_sight_reviews.csv"
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            fieldnames = ['sight_name', 'rating', 'review_content']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_reviews)
        
        # 保存为JSON
        json_file = "ctrip_sight_reviews.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, ensure_ascii=False, indent=2)
        
        print(f"\n[DONE] 爬取完成！共获取 {len(all_reviews)} 条评论")
        print(f"结果已保存至: {csv_file} 和 {json_file}")
        
        # 统计每个景点的评论数量
        from collections import Counter
        sight_counts = Counter([review['sight_name'] for review in all_reviews])
        print("\n各景点评论数量统计:")
        for sight, count in sight_counts.most_common():
            print(f"  {sight}: {count}条")
    else:
        print("[ERROR] 未获取到任何评论数据")

if __name__ == "__main__":
    asyncio.run(main())

ModuleNotFoundError: No module named 'playwright'

In [2]:
import pandas as pd

# 读取Excel文件
df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")

# 查看数据结构
print("数据形状:", df.shape)
print("列名:", df.columns.tolist())
print("\n前5行数据:")
print(df.head())

# 转换为字典列表格式
sights = []
for _, row in df.iterrows():
    sights.append({
        'name': row['name'],
        'url': row['url'],
        'rating': row['评分（0-5分）'],
        'page': row['page'],
        'free': row['free'],
        'category': row['分类'],
        'reviews_count': row['点评数'],
        'city': row['所在城市']
    })

print(f"成功读取 {len(sights)} 个景点")

数据形状: (80, 8)
列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']

前5行数据:
   page    name                                                url  free  \
0     1      外滩  https://you.ctrip.com/sight/shanghai2/736.html...  True   
1     6  湖南省博物馆  https://you.ctrip.com/sight/changsha148/8981.h...  True   
2    15  湖北省博物馆  https://you.ctrip.com/sight/wuhan145/8977.html...  True   
3    18     陆家嘴  https://you.ctrip.com/sight/shanghai2/1815444....  True   
4    27   成都博物馆  https://you.ctrip.com/sight/chengdu104/2006697...  True   

      分类  评分（0-5分）       点评数 所在城市  
0  遛娃宝藏地       4.8  15.8万条点评  上海   
1    博物馆       4.8   1.4万条点评   长沙  
2    博物馆       4.9   6565条点评   武汉  
3     其他       4.8   3687条点评   上海  
4    博物馆       4.8   5231条点评   成都  
成功读取 80 个景点


In [3]:
import asyncio
import csv
import json
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import os

def read_sights_from_excel(file_path):
    """
    从Excel文件读取景点数据
    """
    try:
        # 读取Excel文件
        df = pd.read_excel(file_path)
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 行数据")
        print(f"[INFO] 列名: {df.columns.tolist()}")
        
        sights = []
        for index, row in df.iterrows():
            # 处理点评数字段（去除"条点评"文字）
            reviews_str = str(row['点评数']).replace('条点评', '').strip()
            if '万' in reviews_str:
                reviews_count = int(float(reviews_str.replace('万', '')) * 10000)
            else:
                try:
                    reviews_count = int(float(reviews_str))
                except:
                    reviews_count = 0
            
            sight_data = {
                'name': row['name'],
                'url': row['url'],
                'rating': row['评分（0-5分）'],
                'page': row['page'],
                'free': row['free'],
                'category': row['分类'],
                'reviews_count': reviews_count,
                'city': row['所在城市']
            }
            sights.append(sight_data)
            
        print(f"[INFO] 成功解析 {len(sights)} 个景点")
        return sights
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {str(e)}")
        return []

async def fetch_reviews_for_sight(page, sight):
    """
    爬取单个景点的评论
    """
    print(f"[INFO] 开始爬取景点: {sight['name']}")
    
    try:
        # 导航到景点页面
        await page.goto(sight['url'], wait_until="domcontentloaded", timeout=60000)
        await asyncio.sleep(3)
        
        # 滚动页面加载更多内容
        await page.evaluate("window.scrollTo(0, 1000)")
        await asyncio.sleep(2)
        
        all_reviews = []
        page_num = 1
        max_pages = 20  # 限制最大页数防止无限循环
        
        while page_num <= max_pages:
            print(f"[INFO] {sight['name']} - 正在爬取第 {page_num} 页评论...")
            
            # 获取页面HTML
            html = await page.content()
            page_reviews = parse_reviews_from_html(html, sight)
            
            if page_reviews:
                all_reviews.extend(page_reviews)
                print(f"[INFO] 第 {page_num} 页找到 {len(page_reviews)} 条评论")
            else:
                print(f"[INFO] 第 {page_num} 页未找到评论")
                break
            
            # 尝试翻页
            if not await try_click_next_page(page):
                break
                
            page_num += 1
            await asyncio.sleep(2)
            
    except Exception as e:
        print(f"[ERROR] 爬取景点 {sight['name']} 时出错: {str(e)}")
    
    print(f"[INFO] 景点 {sight['name']} 爬取完成，共获取 {len(all_reviews)} 条评论")
    return all_reviews

def parse_reviews_from_html(html, sight):
    """
    从HTML中解析评论内容
    """
    soup = BeautifulSoup(html, 'lxml')
    reviews = []
    
    # 多种可能的评论选择器
    comment_selectors = [
        'div[class*="comment"]',
        'div[class*="review"]', 
        'li[class*="comment"]',
        'div[class*="Comment"]',
        '.comment-item',
        '.review-item'
    ]
    
    for selector in comment_selectors:
        review_blocks = soup.select(selector)
        for block in review_blocks:
            try:
                # 获取评论文本
                text = block.get_text(strip=True)
                # 清理文本
                text = re.sub(r'\s+', ' ', text)
                
                # 过滤有效评论
                if (len(text) > 10 and 
                    '暂无点评' not in text and 
                    '加载中' not in text and
                    '条点评' not in text):
                    reviews.append({
                        'sight_name': sight['name'],
                        'rating': sight['rating'],
                        'review_content': text,
                        'city': sight['city'],
                        'category': sight['category']
                    })
            except:
                continue
        
        if reviews:  # 如果找到评论就停止尝试其他选择器
            break
    
    return reviews

async def try_click_next_page(page):
    """
    尝试点击下一页按钮
    """
    next_selectors = [
        "a:has-text('下一页')",
        "button:has-text('下一页')", 
        ".next-page",
        "a[class*='next']",
        "button[class*='next']"
    ]
    
    for selector in next_selectors:
        try:
            next_btn = await page.query_selector(selector)
            if next_btn:
                is_disabled = await next_btn.get_attribute("disabled")
                is_visible = await next_btn.is_visible()
                
                if not is_disabled and is_visible:
                    await next_btn.click()
                    await asyncio.sleep(3)
                    return True
        except:
            continue
    
    return False

async def main():
    """
    主函数
    """
    # 读取景点数据
    excel_file = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"
    
    # 检查文件是否存在
    if not os.path.exists(excel_file):
        print(f"[ERROR] 文件 {excel_file} 不存在!")
        print("请确保:")
        print("1. Excel文件在当前目录下")
        print("2. 文件名正确: '4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx'")
        return
    
    sights = read_sights_from_excel(excel_file)
    
    if not sights:
        print("[ERROR] 未读取到景点数据，程序退出")
        return
    
    all_reviews = []
    
    # 启动浏览器
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # 设为False便于调试
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            )
        )
        
        page = await context.new_page()
        
        # 爬取每个景点的评论
        for i, sight in enumerate(sights, 1):
            print(f"\n[进度] ({i}/{len(sights)}) 正在处理: {sight['name']}")
            
            reviews = await fetch_reviews_for_sight(page, sight)
            all_reviews.extend(reviews)
            
            # 休息一下
            await asyncio.sleep(2)
        
        await browser.close()
    
    # 保存结果
    if all_reviews:
        # 保存为CSV
        csv_file = "ctrip_sight_reviews.csv"
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            fieldnames = ['sight_name', 'rating', 'review_content', 'city', 'category']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_reviews)
        
        print(f"\n[DONE] 爬取完成！共获取 {len(all_reviews)} 条评论")
        print(f"结果已保存至: {csv_file}")
        
        # 统计信息
        from collections import Counter
        sight_counts = Counter([review['sight_name'] for review in all_reviews])
        print("\n各景点评论数量统计:")
        for sight, count in sight_counts.most_common():
            print(f"  {sight}: {count}条")
    else:
        print("[ERROR] 未获取到任何评论数据")

if __name__ == "__main__":
    asyncio.run(main())

ModuleNotFoundError: No module named 'playwright'

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import json
import re
import os
import time
from urllib.parse import urljoin

def read_sights_from_excel(file_path):
    """
    从Excel文件读取景点数据
    """
    try:
        df = pd.read_excel(file_path)
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 行数据")
        print(f"[INFO] 列名: {df.columns.tolist()}")
        
        sights = []
        for index, row in df.iterrows():
            # 处理点评数字段
            reviews_str = str(row['点评数']).replace('条点评', '').strip()
            if '万' in reviews_str:
                reviews_count = int(float(reviews_str.replace('万', '')) * 10000)
            else:
                try:
                    reviews_count = int(float(reviews_str))
                except:
                    reviews_count = 0
            
            sight_data = {
                'name': row['name'],
                'url': row['url'],
                'rating': row['评分（0-5分）'],
                'page': row['page'],
                'free': row['free'],
                'category': row['分类'],
                'reviews_count': reviews_count,
                'city': row['所在城市']
            }
            sights.append(sight_data)
            
        print(f"[INFO] 成功解析 {len(sights)} 个景点")
        return sights
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {str(e)}")
        return []

def fetch_reviews_for_sight(sight):
    """
    爬取单个景点的评论（使用requests）
    """
    print(f"[INFO] 开始爬取景点: {sight['name']}")
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    
    all_reviews = []
    
    try:
        # 发送请求
        response = requests.get(sight['url'], headers=headers, timeout=10)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            reviews = parse_reviews_from_html(soup, sight)
            all_reviews.extend(reviews)
            print(f"[INFO] {sight['name']} - 找到 {len(reviews)} 条评论")
        else:
            print(f"[WARNING] {sight['name']} - 请求失败，状态码: {response.status_code}")
            
    except Exception as e:
        print(f"[ERROR] 爬取景点 {sight['name']} 时出错: {str(e)}")
    
    return all_reviews

def parse_reviews_from_html(soup, sight):
    """
    从HTML中解析评论内容
    """
    reviews = []
    
    # 多种可能的评论选择器
    comment_selectors = [
        'div[class*="comment"]',
        'div[class*="review"]', 
        'li[class*="comment"]',
        'div[class*="Comment"]',
        '.comment-item',
        '.review-item',
        '.commentList',
        '.review-list'
    ]
    
    # 首先尝试找到评论区块
    for selector in comment_selectors:
        review_blocks = soup.select(selector)
        if review_blocks:
            print(f"[DEBUG] 使用选择器 {selector} 找到 {len(review_blocks)} 个评论块")
            
        for block in review_blocks:
            try:
                # 获取评论文本
                text = block.get_text(strip=True)
                # 清理文本
                text = re.sub(r'\s+', ' ', text)
                
                # 过滤有效评论（长度适中，不包含特定无效文本）
                if (len(text) >= 10 and 
                    len(text) <= 1000 and
                    '暂无点评' not in text and 
                    '加载中' not in text and
                    '条点评' not in text and
                    '发表点评' not in text and
                    '我要点评' not in text):
                    
                    reviews.append({
                        'sight_name': sight['name'],
                        'rating': sight['rating'],
                        'review_content': text,
                        'city': sight['city'],
                        'category': sight['category'],
                        'url': sight['url']
                    })
                    
            except Exception as e:
                continue
        
        if reviews:  # 如果找到评论就停止尝试其他选择器
            break
    
    # 如果上面的选择器没找到，尝试更通用的方法
    if not reviews:
        print(f"[DEBUG] 尝试通用文本匹配...")
        # 查找包含评论关键词的文本
        text_elements = soup.find_all(text=True)
        comment_keywords = ['很好', '不错', '推荐', '体验', '值得', '漂亮', '美丽', '好玩']
        
        for element in text_elements:
            text = element.strip()
            if (len(text) >= 20 and 
                len(text) <= 500 and
                any(keyword in text for keyword in comment_keywords) and
                '条点评' not in text):
                
                reviews.append({
                    'sight_name': sight['name'],
                    'rating': sight['rating'],
                    'review_content': text,
                    'city': sight['city'],
                    'category': sight['category'],
                    'url': sight['url']
                })
    
    return reviews

def main():
    """
    主函数
    """
    # 读取景点数据
    excel_file = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"
    
    # 检查文件是否存在
    if not os.path.exists(excel_file):
        print(f"[ERROR] 文件 {excel_file} 不存在!")
        print("当前目录文件列表:")
        for file in os.listdir('.'):
            if file.endswith('.xlsx'):
                print(f"  - {file}")
        return
    
    sights = read_sights_from_excel(excel_file)
    
    if not sights:
        print("[ERROR] 未读取到景点数据，程序退出")
        return
    
    all_reviews = []
    
    # 爬取每个景点的评论
    for i, sight in enumerate(sights, 1):
        print(f"\n[进度] ({i}/{len(sights)}) 正在处理: {sight['name']}")
        
        reviews = fetch_reviews_for_sight(sight)
        all_reviews.extend(reviews)
        
        # 休息一下，避免请求过快
        time.sleep(2)
        
        # 每10个景点保存一次进度
        if i % 10 == 0:
            print(f"[INFO] 已处理 {i} 个景点，当前总计 {len(all_reviews)} 条评论")
    
    # 保存结果
    if all_reviews:
        # 保存为CSV
        csv_file = "ctrip_sight_reviews.csv"
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            fieldnames = ['sight_name', 'rating', 'review_content', 'city', 'category', 'url']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_reviews)
        
        # 保存为JSON
        json_file = "ctrip_sight_reviews.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, ensure_ascii=False, indent=2)
        
        print(f"\n[DONE] 爬取完成！共获取 {len(all_reviews)} 条评论")
        print(f"结果已保存至: {csv_file} 和 {json_file}")
        
        # 统计信息
        from collections import Counter
        sight_counts = Counter([review['sight_name'] for review in all_reviews])
        print("\n各景点评论数量统计:")
        for sight, count in sight_counts.most_common():
            print(f"  {sight}: {count}条")
    else:
        print("[ERROR] 未获取到任何评论数据")

if __name__ == "__main__":
    main()

[INFO] 成功读取Excel文件，共 80 行数据
[INFO] 列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']
[INFO] 成功解析 80 个景点

[进度] (1/80) 正在处理: 外滩
[INFO] 开始爬取景点: 外滩
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 外滩 - 找到 40 条评论

[进度] (2/80) 正在处理: 湖南省博物馆
[INFO] 开始爬取景点: 湖南省博物馆
[DEBUG] 使用选择器 div[class*="comment"] 找到 84 个评论块
[INFO] 湖南省博物馆 - 找到 40 条评论

[进度] (3/80) 正在处理: 湖北省博物馆
[INFO] 开始爬取景点: 湖北省博物馆
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 湖北省博物馆 - 找到 40 条评论

[进度] (4/80) 正在处理: 陆家嘴
[INFO] 开始爬取景点: 陆家嘴
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 陆家嘴 - 找到 40 条评论

[进度] (5/80) 正在处理: 成都博物馆
[INFO] 开始爬取景点: 成都博物馆
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 成都博物馆 - 找到 40 条评论

[进度] (6/80) 正在处理: 山西博物院
[INFO] 开始爬取景点: 山西博物院
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 山西博物院 - 找到 40 条评论

[进度] (7/80) 正在处理: 河南博物院
[INFO] 开始爬取景点: 河南博物院
[DEBUG] 使用选择器 div[class*="comment"] 找到 87 个评论块
[INFO] 河南博物院 - 找到 40 条评论

[进度] (8/80) 正在处理: 呼伦贝尔大草原
[INFO] 开始爬取景点: 呼伦贝尔大草原
[DEBUG] 使用

In [6]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return None

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

# 使用示例
def fetch_and_parse_comments(url, sight_name):
    """获取网页并解析评论"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            comments = parse_comments_from_html(response.text, sight_name)
            return comments
        else:
            print(f"请求失败，状态码: {response.status_code}")
            return []
            
    except Exception as e:
        print(f"请求出错: {e}")
        return []

# 主函数
def main():
    # 读取景点数据
    df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
    
    all_comments = []
    
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        
        print(f"正在爬取: {sight_name}")
        comments = fetch_and_parse_comments(url, sight_name)
        all_comments.extend(comments)
        
        print(f"找到 {len(comments)} 条评论")
        
        # 避免请求过快
        import time
        time.sleep(2)
    
    # 保存结果
    if all_comments:
        df_comments = pd.DataFrame(all_comments)
        df_comments.to_csv('ctrip_comments_detailed.csv', index=False, encoding='utf-8-sig')
        print(f"保存完成！共 {len(all_comments)} 条评论")
        
        # 显示统计信息
        print("\n各景点评论数量:")
        comment_counts = df_comments['sight_name'].value_counts()
        print(comment_counts)
    else:
        print("未找到任何评论")

if __name__ == "__main__":
    main()

正在爬取: 外滩
找到 10 条评论
正在爬取: 湖南省博物馆
找到 10 条评论
正在爬取: 湖北省博物馆
找到 10 条评论
正在爬取: 陆家嘴
找到 10 条评论
正在爬取: 成都博物馆
找到 10 条评论
正在爬取: 山西博物院
找到 10 条评论
正在爬取: 河南博物院
找到 10 条评论
正在爬取: 呼伦贝尔大草原
找到 8 条评论
正在爬取: 新疆维吾尔自治区博物馆
找到 10 条评论
正在爬取: 开封博物馆
找到 10 条评论
正在爬取: 陕西考古博物馆
找到 10 条评论
正在爬取: 迪士尼小镇
找到 10 条评论
正在爬取: 山东博物馆
找到 10 条评论
正在爬取: 七彩云南古滇度假区
找到 10 条评论
正在爬取: 文殊院
找到 10 条评论
正在爬取: 辽宁省博物馆
找到 10 条评论
正在爬取: 中央大街
找到 10 条评论
正在爬取: 长江三峡
找到 10 条评论
正在爬取: 海南省博物馆
找到 10 条评论
正在爬取: 李自健美术馆
找到 10 条评论
正在爬取: 喀什古城
找到 10 条评论
正在爬取: 珠穆朗玛峰国家公园
找到 10 条评论
正在爬取: 三娘湾
找到 10 条评论
正在爬取: 海河
找到 10 条评论
正在爬取: 八廓街
找到 10 条评论
正在爬取: 沙溪古镇
找到 10 条评论
正在爬取: 太原北齐壁画博物馆
找到 10 条评论
正在爬取: 晋祠胜境
找到 10 条评论
正在爬取: 天定山旅游度假小镇
找到 10 条评论
正在爬取: 宁明花山岩画景区
找到 10 条评论
正在爬取: 北海老街
找到 10 条评论
正在爬取: 陕西历史博物馆秦汉馆
找到 10 条评论
正在爬取: 沈阳博物馆
找到 5 条评论
正在爬取: 洱海生态廊道
找到 10 条评论
正在爬取: 宜昌三峡旅游度假区
找到 10 条评论
正在爬取: 太舞滑雪小镇
找到 10 条评论
正在爬取: 独库公路
找到 10 条评论
正在爬取: 腾格里沙漠
找到 9 条评论
正在爬取: 鲤鱼山公园
找到 10 条评论
正在爬取: 全富岛
找到 10 条评论
正在爬取: 西九文化区
找到 10 条评论
正在爬取: 帕米尔高原
找到 10 条评论
正在爬取: 日月潭
找到 10 条评论
正在爬取: 河北博物院
找到 10 条评论
正在爬取: 敦煌書局
找到 9

In [9]:
import asyncio
import os
import csv
import json
import re
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# 常量定义
BASE_URL = "https://you.ctrip.com/sight/china110000"
DUMP_DIR = "html_dumps"
OUT_CSV = "ctrip_sights_latest.csv"
OUT_JSON = "ctrip_sights_latest.json"
COMMENTS_CSV = "ctrip_sights_comments.csv"
COMMENTS_JSON = "ctrip_sights_comments.json"

async def fetch_page(page, url, page_no):
    """获取页面HTML内容"""
    print(f"[INFO] Loading page {page_no}: {url}")
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        await asyncio.sleep(6)
        html = await page.content()
        
        # 保存HTML用于调试
        os.makedirs(DUMP_DIR, exist_ok=True)
        with open(f"{DUMP_DIR}/page_{page_no}.html", "w", encoding="utf-8") as f:
            f.write(html)
        return html
    except Exception as e:
        print(f"[ERROR] Failed to fetch page {page_no}: {e}")
        return None

def parse_html(html, page_no):
    """解析景点列表页面"""
    soup = BeautifulSoup(html, "lxml")
    items = []
    
    # 新版：每个景点卡片在 div.sightItemCard_box__*
    for div in soup.find_all("div", class_=re.compile("^sightItemCard_box__")):
        name_tag = div.find("a", href=True)
        if not name_tag:
            continue
            
        name = name_tag.get("title") or name_tag.text.strip()
        href = name_tag["href"]
        if not href.startswith("http"):
            href = "https://you.ctrip.com" + href

        # 提取其他信息
        block = div.get_text(" ", strip=True)
        rating = None
        reviews = None
        price = None
        free = "免费" in block
        
        # 提取评分
        m = re.search(r"([0-9.]+)分", block)
        if m:
            rating = float(m.group(1))

        # 提取评论数
        m2 = re.search(r"([0-9.]+)条点评", block)
        if m2:
            try:
                reviews = int(float(m2.group(1)))
            except:
                pass

        # 提取价格
        m3 = re.search(r"¥\s*([0-9.]+)", block)
        if m3:
            price = float(m3.group(1))

        items.append({
            "page": page_no,
            "name": name,
            "url": href,
            "rating": rating,
            "reviews": reviews,
            "price": price,
            "free": free,
            "raw": block[:150]
        })

    print(f"[INFO] Page {page_no}: {len(items)} items found")
    return items

async def fetch_comments_for_sight(page, sight):
    """爬取单个景点的评论"""
    print(f"[INFO] Fetching comments for: {sight['name']}")
    
    try:
        await page.goto(sight['url'], wait_until="domcontentloaded", timeout=60000)
        await asyncio.sleep(4)
        
        # 滚动页面加载评论
        await page.evaluate("window.scrollTo(0, 800)")
        await asyncio.sleep(2)
        
        all_comments = []
        page_num = 1
        max_comment_pages = 10  # 限制评论页数
        
        while page_num <= max_comment_pages:
            print(f"[INFO] {sight['name']} - Comment page {page_num}")
            
            html = await page.content()
            comments = parse_comments_from_html(html, sight)
            
            if comments:
                all_comments.extend(comments)
                print(f"[INFO] Found {len(comments)} comments on page {page_num}")
            else:
                print(f"[INFO] No comments found on page {page_num}, stopping")
                break
            
            # 尝试翻页
            if not await try_click_next_comment_page(page):
                break
                
            page_num += 1
            await asyncio.sleep(2)
            
        return all_comments
        
    except Exception as e:
        print(f"[ERROR] Failed to fetch comments for {sight['name']}: {e}")
        return []

def parse_comments_from_html(html, sight):
    """解析评论内容"""
    soup = BeautifulSoup(html, "lxml")
    comments = []
    
    # 多种评论选择器
    comment_selectors = [
        'div[class*="comment"]',
        'div[class*="review"]',
        'li[class*="comment"]',
        '.comment-item',
        '.review-item'
    ]
    
    for selector in comment_selectors:
        comment_blocks = soup.select(selector)
        for block in comment_blocks:
            try:
                # 提取用户名
                user_name = extract_user_info(block)
                
                # 提取评论内容
                comment_content = extract_comment_content(block)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight['name'],
                        'sight_url': sight['url'],
                        'sight_rating': sight['rating'],
                        'user_name': user_name,
                        'comment_content': comment_content,
                        'page': sight['page']
                    })
            except Exception as e:
                continue
                
        if comments:
            break
            
    return comments

def extract_user_info(comment_element):
    """提取用户信息"""
    user_selectors = [
        '.userName',
        '.user-name',
        '.username',
        '[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.commentDetail',
        '.comment-detail',
        '.content',
        '.comment-content',
        '.detail',
        'p'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 10:
                return text
    
    # 备选方案：获取整个元素的文本
    full_text = comment_element.get_text(strip=True)
    if len(full_text) >= 20:
        return full_text
        
    return None

async def try_click_next_comment_page(page):
    """尝试点击评论下一页"""
    next_selectors = [
        "a:has-text('下一页')",
        "button:has-text('下一页')",
        ".next-page",
        "a[class*='next']"
    ]
    
    for selector in next_selectors:
        try:
            next_btn = await page.query_selector(selector)
            if next_btn:
                is_disabled = await next_btn.get_attribute("disabled")
                is_visible = await next_btn.is_visible()
                
                if not is_disabled and is_visible:
                    await next_btn.click()
                    await asyncio.sleep(3)
                    return True
        except:
            continue
    return False

async def main():
    """主函数"""
    all_items = []
    all_comments = []
    
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ))
        page = await context.new_page()

        print("=== 开始爬取景点列表 ===")
        # 爬取1-301页景点列表
        for pno in range(1, 302):  # 1-301页
            url = BASE_URL if pno == 1 else f"{BASE_URL}/s0-p{pno}.html"
            html = await fetch_page(page, url, pno)
            
            if html:
                items = parse_html(html, pno)
                all_items.extend(items)
                print(f"[PROGRESS] Page {pno}: {len(items)} items, Total: {len(all_items)}")
            else:
                print(f"[WARNING] Page {pno} failed, skipping")
            
            await asyncio.sleep(2)  # 页面间延迟

        # 保存景点列表数据
        if all_items:
            with open(OUT_CSV, "w", newline="", encoding="utf-8-sig") as f:
                w = csv.DictWriter(f, fieldnames=all_items[0].keys())
                w.writeheader()
                w.writerows(all_items)
            with open(OUT_JSON, "w", encoding="utf-8") as f:
                json.dump(all_items, f, ensure_ascii=False, indent=2)
            print(f"[SUCCESS] Saved {len(all_items)} sights → {OUT_CSV}")
        else:
            print("[ERROR] No sight items parsed")
            return

        print("\n=== 开始爬取景点评论 ===")
        # 爬取每个景点的评论
        total_sights = len(all_items)
        for i, sight in enumerate(all_items, 1):
            print(f"[PROGRESS] ({i}/{total_sights}) Fetching comments for: {sight['name']}")
            
            comments = await fetch_comments_for_sight(page, sight)
            all_comments.extend(comments)
            
            print(f"[INFO] Found {len(comments)} comments for {sight['name']}")
            
            # 每个景点间延迟
            await asyncio.sleep(1)
            
            # 每10个景点保存一次进度
            if i % 10 == 0:
                print(f"[CHECKPOINT] Processed {i} sights, total comments: {len(all_comments)}")

        await browser.close()

    # 保存评论数据
    if all_comments:
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_url', 'sight_rating', 'user_name', 'comment_content', 'page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        with open(COMMENTS_JSON, "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
        
        print(f"\n[SUCCESS] Complete!")
        print(f"Total sights: {len(all_items)}")
        print(f"Total comments: {len(all_comments)}")
        print(f"Sights saved: {OUT_CSV}")
        print(f"Comments saved: {COMMENTS_CSV}")
        
        # 统计信息
        from collections import Counter
        comment_counts = Counter([comment['sight_name'] for comment in all_comments])
        print("\nComments per sight (top 10):")
        for sight, count in comment_counts.most_common(10):
            print(f"  {sight}: {count} comments")
    else:
        print("[WARNING] No comments collected")

if __name__ == "__main__":
    asyncio.run(main())

ModuleNotFoundError: No module named 'playwright'

In [11]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import os
import csv
import json

# 常量定义
BASE_URL = "https://you.ctrip.com/sight/china110000"
DUMP_DIR = "html_dumps"
OUT_CSV = "ctrip_sights_latest.csv"
OUT_JSON = "ctrip_sights_latest.json"
COMMENTS_CSV = "ctrip_comments_detailed.csv"

def fetch_page(url, page_no):
    """获取页面HTML内容"""
    print(f"[INFO] Loading page {page_no}: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            # 保存HTML用于调试
            os.makedirs(DUMP_DIR, exist_ok=True)
            with open(f"{DUMP_DIR}/page_{page_no}.html", "w", encoding="utf-8") as f:
                f.write(response.text)
            return response.text
        else:
            print(f"[ERROR] Page {page_no} failed with status: {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] Failed to fetch page {page_no}: {e}")
        return None

def parse_html(html, page_no):
    """解析景点列表页面"""
    if not html:
        return []
        
    soup = BeautifulSoup(html, "lxml")
    items = []
    
    # 查找景点卡片
    for div in soup.find_all("div", class_=re.compile("^sightItemCard_box__")):
        name_tag = div.find("a", href=True)
        if not name_tag:
            continue
            
        name = name_tag.get("title") or name_tag.text.strip()
        href = name_tag["href"]
        if not href.startswith("http"):
            href = "https://you.ctrip.com" + href

        # 提取其他信息
        block = div.get_text(" ", strip=True)
        rating = None
        reviews = None
        price = None
        free = "免费" in block
        
        # 提取评分
        m = re.search(r"([0-9.]+)分", block)
        if m:
            rating = float(m.group(1))

        # 提取评论数
        m2 = re.search(r"([0-9.]+)条点评", block)
        if m2:
            try:
                reviews = int(float(m2.group(1)))
            except:
                pass

        # 提取价格
        m3 = re.search(r"¥\s*([0-9.]+)", block)
        if m3:
            price = float(m3.group(1))

        items.append({
            "page": page_no,
            "name": name,
            "url": href,
            "rating": rating,
            "reviews": reviews,
            "price": price,
            "free": free,
            "raw": block[:150]
        })

    print(f"[INFO] Page {page_no}: {len(items)} items found")
    return items

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return None

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_and_parse_comments(url, sight_name):
    """获取网页并解析评论"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            comments = parse_comments_from_html(response.text, sight_name)
            return comments
        else:
            print(f"请求失败，状态码: {response.status_code}")
            return []
            
    except Exception as e:
        print(f"请求出错: {e}")
        return []

def main():
    """主函数"""
    all_sights = []  # 存储所有景点信息
    all_comments = []  # 存储所有评论信息
    
    print("=== 开始爬取景点列表（1-301页）===")
    
    # 第一阶段：爬取1-301页的景点列表
    for pno in range(1, 302):  # 1-301页
        # 构造URL（第一页格式不同）
        url = BASE_URL if pno == 1 else f"{BASE_URL}/s0-p{pno}.html"
        
        # 获取页面HTML
        html = fetch_page(url, pno)
        
        # 解析页面获取景点信息
        items = parse_html(html, pno)
        all_sights.extend(items)
        
        print(f"[进度] 第{pno}页: 找到{len(items)}个景点，总计{len(all_sights)}个景点")
        
        # 页面间延迟
        time.sleep(2)
    
    # 保存景点列表数据
    if all_sights:
        with open(OUT_CSV, "w", newline="", encoding="utf-8-sig") as f:
            w = csv.DictWriter(f, fieldnames=all_sights[0].keys())
            w.writeheader()
            w.writerows(all_sights)
        with open(OUT_JSON, "w", encoding="utf-8") as f:
            json.dump(all_sights, f, ensure_ascii=False, indent=2)
        print(f"[SUCCESS] 保存{len(all_sights)}个景点信息到 {OUT_CSV}")
    else:
        print("[ERROR] 未找到任何景点信息")
        return
    
    print("\n=== 开始爬取景点评论 ===")
    
    # 第二阶段：为每个景点爬取评论
    total_sights = len(all_sights)
    for i, sight in enumerate(all_sights, 1):
        sight_name = sight['name']
        url = sight['url']
        
        print(f"[进度] ({i}/{total_sights}) 正在爬取: {sight_name}")
        
        # 获取并解析评论
        comments = fetch_and_parse_comments(url, sight_name)
        all_comments.extend(comments)
        
        print(f"找到 {len(comments)} 条评论")
        
        # 避免请求过快
        time.sleep(2)
        
        # 每10个景点显示一次进度
        if i % 10 == 0:
            print(f"[检查点] 已处理 {i} 个景点，总计 {len(all_comments)} 条评论")
    
    # 保存评论数据
    if all_comments:
        df_comments = pd.DataFrame(all_comments)
        df_comments.to_csv(COMMENTS_CSV, index=False, encoding='utf-8-sig')
        print(f"[SUCCESS] 保存完成！共 {len(all_comments)} 条评论")
        
        # 显示统计信息
        print("\n各景点评论数量:")
        comment_counts = df_comments['sight_name'].value_counts()
        print(comment_counts)
        
        # 保存评论的JSON文件
        with open("ctrip_comments_detailed.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n=== 爬取完成 ===")
        print(f"景点总数: {len(all_sights)}")
        print(f"评论总数: {len(all_comments)}")
        print(f"景点文件: {OUT_CSV}")
        print(f"评论文件: {COMMENTS_CSV}")
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    main()

=== 开始爬取景点列表（1-301页）===
[INFO] Loading page 1: https://you.ctrip.com/sight/china110000
[ERROR] Page 1 failed with status: 404
[进度] 第1页: 找到0个景点，总计0个景点


KeyboardInterrupt: 

In [4]:
# 安装核心库
pip install pandas beautifulsoup4 lxml openpyxl

# 安装 playwright
pip install playwright

# 安装浏览器（这步很重要！）
playwright install

SyntaxError: invalid syntax (4127123322.py, line 2)

In [13]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import urllib.parse

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return None

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_and_parse_comments(url, sight_name):
    """获取网页并解析评论（支持多页爬取）"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    }
    
    all_comments = []
    page_num = 1
    max_pages = 50  # 最大爬取页数，防止无限循环
    
    while page_num <= max_pages:
        try:
            # 构造分页URL（不同页面的URL格式可能不同）
            if page_num == 1:
                comment_url = url
            else:
                # 尝试不同的分页URL格式
                comment_url = construct_comment_page_url(url, page_num)
            
            print(f"  正在爬取第{page_num}页评论: {comment_url}")
            
            response = requests.get(comment_url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            
            if response.status_code != 200:
                print(f"  第{page_num}页请求失败，状态码: {response.status_code}")
                break
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(response.text, sight_name)
            
            if not page_comments:
                print(f"  第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            print(f"  第{page_num}页找到 {len(page_comments)} 条评论")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(response.text):
                print(f"  没有更多评论页面，停止爬取")
                break
            
            page_num += 1
            time.sleep(1)  # 页面间延迟
            
        except Exception as e:
            print(f"  第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"
    
    # 方法2：使用不同的分页模式（如果方法1不行可以尝试这个）
    # parsed_url = urllib.parse.urlparse(base_url)
    # path_parts = parsed_url.path.split('/')
    # if len(path_parts) > 1:
    #     new_path = '/'.join(path_parts[:-1]) + f'/s0-t3-p{page_num}.html'
    #     return f"{parsed_url.scheme}://{parsed_url.netloc}{new_path}"
    # return base_url

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    # 检查是否有页码指示器
    page_indicators = soup.find_all(class_=re.compile(r'page|pager|pagination'))
    if page_indicators:
        return True
    
    return False

# 主函数
def main():
    # 读取景点数据
    df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
    
    all_comments = []
    total_sights = len(df)
    
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        
        print(f"[{index+1}/{total_sights}] 正在爬取: {sight_name}")
        
        # 爬取多页评论
        comments = fetch_and_parse_comments(url, sight_name)
        all_comments.extend(comments)
        
        print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
        
        # 避免请求过快
        time.sleep(2)
        
        # 每5个景点保存一次进度（防止数据丢失）
        if (index + 1) % 5 == 0:
            print(f"[进度保存] 已处理 {index+1} 个景点，当前总计 {len(all_comments)} 条评论")
            # 可以在这里添加临时保存逻辑
    
    # 保存结果
    if all_comments:
        df_comments = pd.DataFrame(all_comments)
        df_comments.to_csv('ctrip_comments_detailed.csv', index=False, encoding='utf-8-sig')
        print(f"保存完成！共 {len(all_comments)} 条评论")
        
        # 显示统计信息
        print("\n各景点评论数量:")
        comment_counts = df_comments['sight_name'].value_counts()
        for sight, count in comment_counts.items():
            print(f"  {sight}: {count}条")
            
        # 保存详细统计
        print(f"\n=== 爬取统计 ===")
        print(f"景点总数: {total_sights}")
        print(f"评论总数: {len(all_comments)}")
        print(f"平均每个景点: {len(all_comments)/total_sights:.1f}条评论")
        
    else:
        print("未找到任何评论")

if __name__ == "__main__":
    main()

[1/80] 正在爬取: 外滩
  正在爬取第1页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  第1页找到 10 条评论
  正在爬取第2页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p2
  第2页找到 10 条评论
  正在爬取第3页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p3
  第3页找到 10 条评论
  正在爬取第4页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p4
  第4页找到 10 条评论
  正在爬取第5页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p5
  第5页找到 10 条评论
  正在爬取第6页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p6
  第6页找到 10 条评论
  正在爬取第7页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p7
  第7页找到 10 条评论
  正在爬取第8页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p8
  第8页找到 10 条评论
  正在爬取第9页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p9
  第9页找到 10 条评论
  正在爬取第10页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p10
  第10页找到 10 条评论
  正在爬取第11页评论: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&

KeyboardInterrupt: 

In [14]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json

# 常量定义
BASE_URL = "https://you.ctrip.com/sight/china110000"
SIGHTS_CSV = "ctrip_all_sights.csv"
COMMENTS_CSV = "ctrip_all_comments.csv"

def fetch_page(url, page_no):
    """获取页面HTML内容"""
    print(f"[INFO] 正在获取第 {page_no} 页: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            return response.text
        else:
            print(f"[ERROR] 第 {page_no} 页请求失败，状态码: {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] 获取第 {page_no} 页失败: {e}")
        return None

def parse_sights_from_html(html, page_no):
    """解析景点列表页面"""
    if not html:
        return []
        
    soup = BeautifulSoup(html, "lxml")
    items = []
    
    # 查找景点卡片
    for div in soup.find_all("div", class_=re.compile("^sightItemCard_box__")):
        name_tag = div.find("a", href=True)
        if not name_tag:
            continue
            
        name = name_tag.get("title") or name_tag.text.strip()
        href = name_tag["href"]
        if not href.startswith("http"):
            href = "https://you.ctrip.com" + href

        # 提取其他信息
        block = div.get_text(" ", strip=True)
        rating = None
        reviews = None
        price = None
        free = "免费" in block
        
        # 提取评分
        m = re.search(r"([0-9.]+)分", block)
        if m:
            rating = float(m.group(1))

        # 提取评论数
        m2 = re.search(r"([0-9.]+)条点评", block)
        if m2:
            try:
                reviews = int(float(m2.group(1)))
            except:
                pass

        # 提取价格
        m3 = re.search(r"¥\s*([0-9.]+)", block)
        if m3:
            price = float(m3.group(1))

        items.append({
            "page": page_no,
            "name": name,
            "url": href,
            "rating": rating,
            "reviews": reviews,
            "price": price,
            "free": free
        })

    print(f"[INFO] 第 {page_no} 页: 找到 {len(items)} 个景点")
    return items

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_comments_for_sight(url, sight_name):
    """获取单个景点的评论"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.encoding = 'utf-8'
        
        if response.status_code == 200:
            comments = parse_comments_from_html(response.text, sight_name)
            return comments
        else:
            print(f"  评论请求失败，状态码: {response.status_code}")
            return []
            
    except Exception as e:
        print(f"  评论请求出错: {e}")
        return []

# 主函数
def main():
    """主函数"""
    all_sights = []  # 存储所有景点信息
    all_comments = []  # 存储所有评论信息
    
    print("=== 开始爬取1-301页景点列表 ===")
    
    # 第一阶段：爬取1-301页的景点列表
    for pno in range(1, 302):  # 1-301页
        # 构造URL（第一页格式不同）
        url = BASE_URL if pno == 1 else f"{BASE_URL}/s0-p{pno}.html"
        
        # 获取页面HTML
        html = fetch_page(url, pno)
        
        # 解析页面获取景点信息
        items = parse_sights_from_html(html, pno)
        all_sights.extend(items)
        
        print(f"[进度] 第{pno}页: 找到{len(items)}个景点，总计{len(all_sights)}个景点")
        
        # 页面间延迟
        time.sleep(2)
    
    # 保存景点列表数据
    if all_sights:
        with open(SIGHTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            w = csv.DictWriter(f, fieldnames=all_sights[0].keys())
            w.writeheader()
            w.writerows(all_sights)
        
        # 保存JSON格式
        with open("ctrip_all_sights.json", "w", encoding="utf-8") as f:
            json.dump(all_sights, f, ensure_ascii=False, indent=2)
            
        print(f"[SUCCESS] 保存{len(all_sights)}个景点信息到 {SIGHTS_CSV}")
    else:
        print("[ERROR] 未找到任何景点信息")
        return
    
    print("\n=== 开始爬取景点评论 ===")
    
    # 第二阶段：为每个景点爬取评论（每个景点只爬取一次）
    total_sights = len(all_sights)
    for i, sight in enumerate(all_sights, 1):
        sight_name = sight['name']
        url = sight['url']
        
        print(f"[{i}/{total_sights}] 正在爬取: {sight_name}")
        
        # 获取并解析评论（每个景点只爬取第一页评论）
        comments = fetch_comments_for_sight(url, sight_name)
        all_comments.extend(comments)
        
        print(f"  找到 {len(comments)} 条评论")
        
        # 避免请求过快
        time.sleep(2)
        
        # 每10个景点显示一次进度
        if i % 10 == 0:
            print(f"[检查点] 已处理 {i} 个景点，总计 {len(all_comments)} 条评论")
    
    # 保存评论数据
    if all_comments:
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'user_name', 'comment_content']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        
        # 保存JSON格式
        with open("ctrip_all_comments.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"[SUCCESS] 保存完成！共 {len(all_comments)} 条评论")
        
        # 显示统计信息
        print("\n=== 爬取统计 ===")
        print(f"景点总数: {len(all_sights)}")
        print(f"评论总数: {len(all_comments)}")
        print(f"平均每个景点: {len(all_comments)/len(all_sights):.1f}条评论")
        
        # 各景点评论数量统计
        from collections import Counter
        comment_counts = Counter([comment['sight_name'] for comment in all_comments])
        print(f"\n评论数量最多的前10个景点:")
        for sight, count in comment_counts.most_common(10):
            print(f"  {sight}: {count}条")
            
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    main()

=== 开始爬取1-301页景点列表 ===
[INFO] 正在获取第 1 页: https://you.ctrip.com/sight/china110000
[ERROR] 第 1 页请求失败，状态码: 404
[进度] 第1页: 找到0个景点，总计0个景点
[INFO] 正在获取第 2 页: https://you.ctrip.com/sight/china110000/s0-p2.html
[INFO] 第 2 页: 找到 10 个景点
[进度] 第2页: 找到10个景点，总计10个景点
[INFO] 正在获取第 3 页: https://you.ctrip.com/sight/china110000/s0-p3.html
[INFO] 第 3 页: 找到 10 个景点
[进度] 第3页: 找到10个景点，总计20个景点
[INFO] 正在获取第 4 页: https://you.ctrip.com/sight/china110000/s0-p4.html
[INFO] 第 4 页: 找到 10 个景点
[进度] 第4页: 找到10个景点，总计30个景点
[INFO] 正在获取第 5 页: https://you.ctrip.com/sight/china110000/s0-p5.html
[INFO] 第 5 页: 找到 10 个景点
[进度] 第5页: 找到10个景点，总计40个景点
[INFO] 正在获取第 6 页: https://you.ctrip.com/sight/china110000/s0-p6.html
[INFO] 第 6 页: 找到 10 个景点
[进度] 第6页: 找到10个景点，总计50个景点
[INFO] 正在获取第 7 页: https://you.ctrip.com/sight/china110000/s0-p7.html
[INFO] 第 7 页: 找到 10 个景点
[进度] 第7页: 找到10个景点，总计60个景点
[INFO] 正在获取第 8 页: https://you.ctrip.com/sight/china110000/s0-p8.html
[INFO] 第 8 页: 找到 10 个景点
[进度] 第8页: 找到10个景点，总计70个景点
[INFO] 正在获取第 9 页: https://you.ctrip

KeyboardInterrupt: 

In [15]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
import urllib.parse

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"
    
    # 方法2：使用不同的分页模式
    # parsed_url = urllib.parse.urlparse(base_url)
    # path_parts = parsed_url.path.split('/')
    # if len(path_parts) > 1:
    #     new_path = '/'.join(path_parts[:-1]) + f'/s0-t3-p{page_num}.html'
    #     return f"{parsed_url.scheme}://{parsed_url.netloc}{new_path}"
    # return base_url

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    # 检查是否有页码指示器
    page_indicators = soup.find_all(class_=re.compile(r'page|pager|pagination'))
    if page_indicators:
        return True
    
    return False

def fetch_all_comments_for_sight(url, sight_name, max_pages=301):
    """获取单个景点的所有评论页面（1-301页）"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    }
    
    all_comments = []
    page_num = 1
    
    print(f"  开始爬取 {sight_name} 的评论页面...")
    
    while page_num <= max_pages:
        try:
            # 构造分页URL
            if page_num == 1:
                comment_url = url
            else:
                comment_url = construct_comment_page_url(url, page_num)
            
            print(f"    正在爬取第{page_num}页: {comment_url}")
            
            response = requests.get(comment_url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            
            if response.status_code != 200:
                print(f"    第{page_num}页请求失败，状态码: {response.status_code}")
                break
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(response.text, sight_name)
            
            if not page_comments:
                print(f"    第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(response.text) or page_num >= max_pages:
                print(f"    没有更多评论页面或已达到最大页数{max_pages}，停止爬取")
                break
            
            page_num += 1
            time.sleep(1)  # 页面间延迟
            
        except Exception as e:
            print(f"    第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

# 主函数
def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        # 显示前几个景点信息用于确认
        print("\n前5个景点信息:")
        for i, row in df.head().iterrows():
            print(f"  {row['name']} - {row['url']}")
            
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        print("请确保文件 '4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx' 在当前目录下")
        return
    
    all_comments = []
    total_sights = len(df)
    
    print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
    
    # 为Excel中的每个景点爬取所有评论页面
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        rating = row['评分（0-5分）']
        category = row['分类']
        city = row['所在城市']
        
        print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
        print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
        
        # 获取并解析所有评论页面（1-301页）
        comments = fetch_all_comments_for_sight(url, sight_name, max_pages=301)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '多页爬取'  # 标识这是多页爬取的结果
            })
        
        all_comments.extend(comments)
        
        print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
        
        # 避免请求过快
        time.sleep(3)  # 景点间延迟稍长
        
        # 每2个景点保存一次进度（防止数据丢失）
        if (index + 1) % 2 == 0:
            print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
            
            # 临时保存进度
            temp_df = pd.DataFrame(all_comments)
            temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
            print("  [临时备份] 评论数据已保存到 ctrip_comments_temp.csv")
    
    # 保存最终评论数据
    if all_comments:
        # 保存为CSV
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        
        # 保存为JSON
        with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[SUCCESS] 爬取完成！")
        print(f"景点数量: {total_sights}")
        print(f"评论总数: {len(all_comments)}")
        print(f"评论文件: {COMMENTS_CSV}")
        
        # 显示详细统计信息
        print("\n=== 详细统计 ===")
        from collections import Counter
        comment_counts = Counter([comment['sight_name'] for comment in all_comments])
        
        print("各景点评论数量:")
        for sight, count in comment_counts.most_common():
            print(f"  {sight}: {count}条")
            
        # 按城市统计
        print("\n按城市统计:")
        city_counts = Counter([comment['sight_city'] for comment in all_comments])
        for city, count in city_counts.most_common():
            print(f"  {city}: {count}条评论")
            
        # 总览
        print(f"\n总览:")
        print(f"  平均每个景点: {len(all_comments)/total_sights:.1f}条评论")
        print(f"  评论最多的景点: {comment_counts.most_common(1)[0][0]} ({comment_counts.most_common(1)[0][1]}条)")
        print(f"  评论最少的景点: {comment_counts.most_common()[-1][0]} ({comment_counts.most_common()[-1][1]}条)")
            
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    main()

[INFO] 成功读取Excel文件，共 80 个景点

前5个景点信息:
  外滩 - https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  湖南省博物馆 - https://you.ctrip.com/sight/changsha148/8981.html?scene=online
  湖北省博物馆 - https://you.ctrip.com/sight/wuhan145/8977.html?scene=online
  陆家嘴 - https://you.ctrip.com/sight/shanghai2/1815444.html?scene=online
  成都博物馆 - https://you.ctrip.com/sight/chengdu104/2006697.html?scene=online

=== 开始为 80 个景点爬取1-301页评论 ===

[1/80] 正在爬取: 外滩
  评分: 4.8, 分类: 遛娃宝藏地, 城市: 上海 
  开始爬取 外滩 的评论页面...
    正在爬取第1页: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
    第1页找到 10 条评论，累计 10 条
    正在爬取第2页: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p2
    第2页找到 10 条评论，累计 20 条
    正在爬取第3页: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p3
    第3页找到 10 条评论，累计 30 条
    正在爬取第4页: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p4
    第4页找到 10 条评论，累计 40 条
    正在爬取第5页: https://you.ctrip.com/sight/shanghai2/736.html?scene=online&p5
    第5页找到 10 条评论，累计 50 条
    正

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
import urllib.parse

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"  # 测试用的CSV文件名

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    return False

def fetch_all_comments_for_sight(url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    }
    
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3  # 测试模式只爬取3页
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    while page_num <= max_pages:
        try:
            # 构造分页URL
            if page_num == 1:
                comment_url = url
            else:
                comment_url = construct_comment_page_url(url, page_num)
            
            if test_mode:
                print(f"    [测试] 正在爬取第{page_num}页")
            else:
                print(f"    正在爬取第{page_num}页: {comment_url}")
            
            response = requests.get(comment_url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            
            if response.status_code != 200:
                print(f"    第{page_num}页请求失败，状态码: {response.status_code}")
                break
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(response.text, sight_name)
            
            if not page_comments:
                print(f"    第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            
            if test_mode:
                print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(response.text) or page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    没有更多评论页面或已达到最大页数{max_pages}，停止爬取")
                break
            
            page_num += 1
            time.sleep(1)  # 页面间延迟
            
        except Exception as e:
            print(f"    第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    # 爬取第一个景点的评论（测试模式只爬取3页）
    print(f"\n开始测试爬取...")
    comments = fetch_all_comments_for_sight(url, sight_name, max_pages=3, test_mode=True)
    
    # 为每条评论添加额外信息
    for comment in comments:
        comment.update({
            'sight_rating': rating,
            'sight_category': category,
            'sight_city': city,
            'sight_url': url,
            'comment_page': '测试模式-前3页'
        })
    
    # 保存测试结果
    if comments:
        # 保存为CSV
        with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(comments)
        
        # 保存为JSON
        with open("ctrip_test_comments.json", "w", encoding="utf-8") as f:
            json.dump(comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[测试完成]")
        print(f"爬取页数: 3页")
        print(f"评论数量: {len(comments)} 条")
        print(f"测试文件: {TEST_CSV}")
        print(f"JSON文件: ctrip_test_comments.json")
        
        # 显示前几条评论预览
        print(f"\n前5条评论预览:")
        for i, comment in enumerate(comments[:5], 1):
            print(f"  {i}. 用户: {comment['user_name']}")
            print(f"     内容: {comment['comment_content'][:50]}...")
            print()
            
        print(f"CSV文件列名: {fieldnames}")
        print(f"\n您现在可以查看 {TEST_CSV} 文件来确认数据格式")
        
    else:
        print("[测试失败] 未找到任何评论")

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    all_comments = []
    total_sights = len(df)
    
    print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
    
    # 为Excel中的每个景点爬取所有评论页面
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        rating = row['评分（0-5分）']
        category = row['分类']
        city = row['所在城市']
        
        print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
        print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
        
        # 获取并解析所有评论页面（1-301页）
        comments = fetch_all_comments_for_sight(url, sight_name, max_pages=301)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '多页爬取'
            })
        
        all_comments.extend(comments)
        
        print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
        
        time.sleep(3)
        
        if (index + 1) % 2 == 0:
            print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
            temp_df = pd.DataFrame(all_comments)
            temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
    
    # 保存最终评论数据
    if all_comments:
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        
        with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[SUCCESS] 爬取完成！")
        print(f"评论文件: {COMMENTS_CSV}")
        
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)


In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
import urllib.parse

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"  # 测试用的CSV文件名

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    return False

def fetch_all_comments_for_sight(url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    }
    
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3  # 测试模式只爬取3页
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    while page_num <= max_pages:
        try:
            # 构造分页URL
            if page_num == 1:
                comment_url = url
            else:
                comment_url = construct_comment_page_url(url, page_num)
            
            if test_mode:
                print(f"    [测试] 正在爬取第{page_num}页")
            else:
                print(f"    正在爬取第{page_num}页: {comment_url}")
            
            response = requests.get(comment_url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            
            if response.status_code != 200:
                print(f"    第{page_num}页请求失败，状态码: {response.status_code}")
                break
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(response.text, sight_name)
            
            if not page_comments:
                print(f"    第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            
            if test_mode:
                print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(response.text) or page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    没有更多评论页面或已达到最大页数{max_pages}，停止爬取")
                break
            
            page_num += 1
            time.sleep(1)  # 页面间延迟
            
        except Exception as e:
            print(f"    第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    # 爬取第一个景点的评论（测试模式只爬取3页）
    print(f"\n开始测试爬取...")
    comments = fetch_all_comments_for_sight(url, sight_name, max_pages=3, test_mode=True)
    
    # 为每条评论添加额外信息
    for comment in comments:
        comment.update({
            'sight_rating': rating,
            'sight_category': category,
            'sight_city': city,
            'sight_url': url,
            'comment_page': '测试模式-前3页'
        })
    
    # 保存测试结果
    if comments:
        # 保存为CSV
        with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(comments)
        
        # 保存为JSON
        with open("ctrip_test_comments.json", "w", encoding="utf-8") as f:
            json.dump(comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[测试完成]")
        print(f"爬取页数: 3页")
        print(f"评论数量: {len(comments)} 条")
        print(f"测试文件: {TEST_CSV}")
        print(f"JSON文件: ctrip_test_comments.json")
        
        # 显示前几条评论预览
        print(f"\n前5条评论预览:")
        for i, comment in enumerate(comments[:5], 1):
            print(f"  {i}. 用户: {comment['user_name']}")
            print(f"     内容: {comment['comment_content'][:50]}...")
            print()
            
        print(f"CSV文件列名: {fieldnames}")
        print(f"\n您现在可以查看 {TEST_CSV} 文件来确认数据格式")
        
    else:
        print("[测试失败] 未找到任何评论")

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    all_comments = []
    total_sights = len(df)
    
    print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
    
    # 为Excel中的每个景点爬取所有评论页面
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        rating = row['评分（0-5分）']
        category = row['分类']
        city = row['所在城市']
        
        print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
        print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
        
        # 获取并解析所有评论页面（1-301页）
        comments = fetch_all_comments_for_sight(url, sight_name, max_pages=301)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '多页爬取'
            })
        
        all_comments.extend(comments)
        
        print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
        
        time.sleep(3)
        
        if (index + 1) % 2 == 0:
            print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
            temp_df = pd.DataFrame(all_comments)
            temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
    
    # 保存最终评论数据
    if all_comments:
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        
        with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[SUCCESS] 爬取完成！")
        print(f"评论文件: {COMMENTS_CSV}")
        
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
import urllib.parse

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"  # 测试用的CSV文件名

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    return False

def fetch_all_comments_for_sight(url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    }
    
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3  # 测试模式只爬取3页
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    while page_num <= max_pages:
        try:
            # 构造分页URL
            if page_num == 1:
                comment_url = url
            else:
                comment_url = construct_comment_page_url(url, page_num)
            
            if test_mode:
                print(f"    [测试] 正在爬取第{page_num}页")
            else:
                print(f"    正在爬取第{page_num}页: {comment_url}")
            
            response = requests.get(comment_url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            
            if response.status_code != 200:
                print(f"    第{page_num}页请求失败，状态码: {response.status_code}")
                break
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(response.text, sight_name)
            
            if not page_comments:
                print(f"    第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            
            if test_mode:
                print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(response.text) or page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    没有更多评论页面或已达到最大页数{max_pages}，停止爬取")
                break
            
            page_num += 1
            time.sleep(1)  # 页面间延迟
            
        except Exception as e:
            print(f"    第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    # 爬取第一个景点的评论（测试模式只爬取3页）
    print(f"\n开始测试爬取...")
    comments = fetch_all_comments_for_sight(url, sight_name, max_pages=3, test_mode=True)
    
    # 为每条评论添加额外信息
    for comment in comments:
        comment.update({
            'sight_rating': rating,
            'sight_category': category,
            'sight_city': city,
            'sight_url': url,
            'comment_page': '测试模式-前3页'
        })
    
    # 保存测试结果
    if comments:
        # 保存为CSV
        with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(comments)
        
        # 保存为JSON
        with open("ctrip_test_comments.json", "w", encoding="utf-8") as f:
            json.dump(comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[测试完成]")
        print(f"爬取页数: 3页")
        print(f"评论数量: {len(comments)} 条")
        print(f"测试文件: {TEST_CSV}")
        print(f"JSON文件: ctrip_test_comments.json")
        
        # 显示前几条评论预览
        print(f"\n前5条评论预览:")
        for i, comment in enumerate(comments[:5], 1):
            print(f"  {i}. 用户: {comment['user_name']}")
            print(f"     内容: {comment['comment_content'][:50]}...")
            print()
            
        print(f"CSV文件列名: {fieldnames}")
        print(f"\n您现在可以查看 {TEST_CSV} 文件来确认数据格式")
        
    else:
        print("[测试失败] 未找到任何评论")

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 读取Excel文件中的景点数据
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
    except Exception as e:
        print(f"[ERROR] 读取Excel文件失败: {e}")
        return
    
    all_comments = []
    total_sights = len(df)
    
    print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
    
    # 为Excel中的每个景点爬取所有评论页面
    for index, row in df.iterrows():
        sight_name = row['name']
        url = row['url']
        rating = row['评分（0-5分）']
        category = row['分类']
        city = row['所在城市']
        
        print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
        print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
        
        # 获取并解析所有评论页面（1-301页）
        comments = fetch_all_comments_for_sight(url, sight_name, max_pages=301)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '多页爬取'
            })
        
        all_comments.extend(comments)
        
        print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
        
        time.sleep(3)
        
        if (index + 1) % 2 == 0:
            print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
            temp_df = pd.DataFrame(all_comments)
            temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
    
    # 保存最终评论数据
    if all_comments:
        with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
            fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                         'user_name', 'comment_content', 'sight_url', 'comment_page']
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(all_comments)
        
        with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
            json.dump(all_comments, f, ensure_ascii=False, indent=2)
            
        print(f"\n[SUCCESS] 爬取完成！")
        print(f"评论文件: {COMMENTS_CSV}")
        
    else:
        print("[WARNING] 未找到任何评论")

if __name__ == "__main__":
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
无效选择，默认运行测试模式
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
    [测试] 正在爬取第1页
    [测试] 第1页找到 10 条评论，累计 10 条




    [测试] 正在爬取第2页
    [测试] 第2页找到 10 条评论，累计 20 条
    [测试] 正在爬取第3页
    [测试] 第3页找到 10 条评论，累计 30 条
    [测试] 测试完成，共爬取 3 页

[测试完成]
爬取页数: 3页
评论数量: 30 条
测试文件: ctrip_test_first_sight.csv
JSON文件: ctrip_test_comments.json

前5条评论预览:
  1. 用户: Annie长翅膀的喵
     内容: 外滩，上海的名片。
        以前出名的情人墙和远东第一弯，现在都弱化了，万国建筑群是依旧盛名...

  2. 用户: Grace_cyt
     内容: 夜景很漂亮，在乍浦路桥上可以看外白渡桥和人民英雄纪念碑印衬托下的浦东美景，然后向外白渡桥的方向走，过...

  3. 用户: 胡di 卖火柴的老男孩
     内容: 上海外滩不愧为万国建筑博览。浦东浦西两岸的景色都很惊艳！上海外滩不愧为万国建筑博览。浦东浦西两岸的景...

  4. 用户: 行者2004
     内容: 上海外滩闻名世界，推荐到外滩3号7楼POP酒吧露台看看无敌江景和游船，东方明珠和上海中心近在咫尺。P...

  5. 用户: Annie长翅膀的喵
     内容: 黄浦江的夜，向来是亮的。不是星月交辉的亮，而是人造的亮，是电的亮。这亮光排开了夜色，将江水也染成了五...

CSV文件列名: ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 'user_name', 'comment_content', 'sight_url', 'comment_page']

您现在可以查看 ctrip_test_first_sight.csv 文件来确认数据格式


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # 无头模式
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def click_time_sort(driver, url):
    """点击时间排序按钮"""
    try:
        print("  正在访问页面...")
        driver.get(url)
        
        # 等待页面加载
        time.sleep(5)
        
        # 滚动到评论区域
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        print("  正在查找排序按钮...")
        
        # 多种可能的选择器来找到排序区域
        sort_selectors = [
            ".commentModuleRef-commentModule-sortList",
            ".commentModule-sortList",
            ".sortList",
            "[class*='sort']",
            ".comment-sort"
        ]
        
        sort_container = None
        for selector in sort_selectors:
            try:
                sort_container = driver.find_element(By.CSS_SELECTOR, selector)
                if sort_container:
                    print(f"  找到排序容器: {selector}")
                    break
            except:
                continue
        
        if not sort_container:
            print("  未找到排序容器，尝试直接查找时间排序按钮")
            # 直接查找时间排序按钮
            time_sort_selectors = [
                "button:contains('时间排序')",
                "span:contains('时间排序')",
                "div:contains('时间排序')",
                "a:contains('时间排序')",
                "[class*='time']:contains('时间')",
                "[class*='sort']:contains('时间')"
            ]
            
            for selector in time_sort_selectors:
                try:
                    time_sort_btn = driver.find_element(By.XPATH, f"//*[contains(text(), '时间排序')]")
                    if time_sort_btn:
                        print("  找到时间排序按钮")
                        driver.execute_script("arguments[0].click();", time_sort_btn)
                        time.sleep(3)
                        return True
                except:
                    continue
            return False
        
        # 在排序容器中查找时间排序标签
        print("  在排序容器中查找时间排序...")
        time_sort_selectors = [
            ".sortTag:contains('时间排序')",
            "[class*='sortTag']:contains('时间')",
            "button:contains('时间排序')",
            "span:contains('时间排序')"
        ]
        
        for selector in time_sort_selectors:
            try:
                # 使用XPath在排序容器内查找包含"时间排序"的元素
                time_sort_elements = sort_container.find_elements(By.XPATH, ".//*[contains(text(), '时间排序')]")
                if time_sort_elements:
                    time_sort_btn = time_sort_elements[0]
                    print("  找到时间排序按钮，正在点击...")
                    
                    # 使用JavaScript点击，避免元素不可点击的问题
                    driver.execute_script("arguments[0].click();", time_sort_btn)
                    time.sleep(3)
                    
                    print("  时间排序点击成功")
                    return True
            except Exception as e:
                print(f"  点击时间排序失败: {e}")
                continue
        
        print("  未找到时间排序按钮")
        return False
        
    except Exception as e:
        print(f"  点击时间排序时出错: {e}")
        return False

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def construct_comment_page_url(base_url, page_num):
    """构造评论分页URL"""
    # 方法1：在URL中添加分页参数
    if '?' in base_url:
        return f"{base_url}&p{page_num}"
    else:
        return f"{base_url}?p{page_num}"

def has_next_comment_page(html_content):
    """检查是否还有下一页评论"""
    soup = BeautifulSoup(html_content, 'lxml')
    
    # 检查下一页按钮是否存在且可用
    next_selectors = [
        'a:contains("下一页")',
        'a:contains("next")',
        '.next-page',
        'a[class*="next"]',
        'button:contains("下一页")'
    ]
    
    for selector in next_selectors:
        try:
            next_element = soup.select_one(selector)
            if next_element:
                # 检查是否被禁用
                if next_element.has_attr('disabled') or 'disabled' in next_element.get('class', []):
                    return False
                return True
        except:
            continue
    
    return False

def fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3  # 测试模式只爬取3页
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    # 首先点击时间排序
    print("  正在设置时间排序...")
    sort_success = click_time_sort(driver, url)
    if not sort_success:
        print("  时间排序设置失败，将继续使用默认排序")
    
    while page_num <= max_pages:
        try:
            # 获取当前页面HTML（排序后的页面）
            html_content = driver.page_source
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(html_content, sight_name)
            
            if not page_comments:
                print(f"    第{page_num}页未找到评论，停止爬取")
                break
            
            all_comments.extend(page_comments)
            
            if test_mode:
                print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            
            # 检查是否还有更多页面
            if not has_next_comment_page(html_content) or page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    没有更多评论页面或已达到最大页数{max_pages}，停止爬取")
                break
            
            # 点击下一页
            page_num += 1
            next_page_url = construct_comment_page_url(url, page_num)
            print(f"    正在跳转到第{page_num}页...")
            driver.get(next_page_url)
            time.sleep(3)
            
        except Exception as e:
            print(f"    第{page_num}页爬取出错: {e}")
            break
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
        # 爬取第一个景点的评论（测试模式只爬取3页）
        print(f"\n开始测试爬取...")
        comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=3, test_mode=True)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '测试模式-前3页-时间排序'
            })
        
        # 保存测试结果
        if comments:
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'comment_page']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(comments)
            
            with open("ctrip_test_comments.json", "w", encoding="utf-8") as f:
                json.dump(comments, f, ensure_ascii=False, indent=2)
                
            print(f"\n[测试完成]")
            print(f"爬取页数: 3页")
            print(f"评论数量: {len(comments)} 条")
            print(f"测试文件: {TEST_CSV}")
            
            # 显示前几条评论预览
            print(f"\n前5条评论预览:")
            for i, comment in enumerate(comments[:5], 1):
                print(f"  {i}. 用户: {comment['user_name']}")
                print(f"     内容: {comment['comment_content'][:50]}...")
                print()
                
        else:
            print("[测试失败] 未找到任何评论")
            
    finally:
        driver.quit()

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        all_comments = []
        total_sights = len(df)
        
        print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
        
        # 为Excel中的每个景点爬取所有评论页面
        for index, row in df.iterrows():
            sight_name = row['name']
            url = row['url']
            rating = row['评分（0-5分）']
            category = row['分类']
            city = row['所在城市']
            
            print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
            print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
            
            # 获取并解析所有评论页面（1-301页）
            comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301)
            
            # 为每条评论添加额外信息
            for comment in comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url,
                    'comment_page': '多页爬取-时间排序'
                })
            
            all_comments.extend(comments)
            
            print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
            
            time.sleep(3)
            
            if (index + 1) % 2 == 0:
                print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
                temp_df = pd.DataFrame(all_comments)
                temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
        
        # 保存最终评论数据
        if all_comments:
            with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'comment_page']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
                json.dump(all_comments, f, ensure_ascii=False, indent=2)
                
            print(f"\n[SUCCESS] 爬取完成！")
            print(f"评论文件: {COMMENTS_CSV}")
            
        else:
            print("[WARNING] 未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    # 检查是否安装了selenium
    try:
        from selenium import webdriver
    except ImportError:
        print("错误：需要安装selenium")
        print("请运行: pip install selenium")
        print("并下载ChromeDriver: https://chromedriver.chromium.org/")
        exit(1)
    
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
  正在设置时间排序...
  正在访问页面...
  正在查找排序按钮...
  找到排序容器: .sortList
  在排序容器中查找时间排序...
  找到时间排序按钮，正在点击...
  时间排序点击成功
    [测试] 第1页找到 10 条评论，累计 10 条
    正在跳转到第2页...
    [测试] 第2页找到 10 条评论，累计 20 条
    正在跳转到第3页...
    [测试] 第3页找到 10 条评论，累计 30 条
    [测试] 测试完成，共爬取 3 页

[测试完成]
爬取页数: 3页
评论数量: 30 条
测试文件: ctrip_test_first_sight.csv

前5条评论预览:
  1. 用户: _TI***pr
     内容: 你不应该在晚上错过男人...

  2. 用户: YoYo_2Y0I2W6N
     内容: 很好很好很好很好...

  3. 用户: 熊爹爹
     内容: 非常不错！值得...

  4. 用户: _TI***99
     内容: 游船顶部有一个相当小的观景台,所以人多,感觉挺局促的。自助餐,菜品没有贴标签,很难识别每个。而且,我...

  5. 用户: 匿名用户
     内容: 不错不错值得一游...



In [4]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # 先注释掉，便于调试
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("  正在查找排序按钮...")
        
        # 多种可能的选择器来找到排序区域
        sort_selectors = [
            ".commentModuleRef-commentModule-sortList",
            ".commentModule-sortList",
            ".sortList",
            "[class*='sort']",
            ".comment-sort"
        ]
        
        sort_container = None
        for selector in sort_selectors:
            try:
                sort_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                if sort_container:
                    print(f"  找到排序容器: {selector}")
                    break
            except:
                continue
        
        if not sort_container:
            print("  未找到排序容器，尝试直接查找时间排序按钮")
            # 直接查找时间排序按钮
            try:
                time_sort_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), '时间排序')]"))
                )
                print("  找到时间排序按钮")
                driver.execute_script("arguments[0].click();", time_sort_btn)
                time.sleep(3)
                return True
            except:
                return False
        
        # 在排序容器中查找时间排序标签
        print("  在排序容器中查找时间排序...")
        try:
            # 使用XPath在排序容器内查找包含"时间排序"的元素
            time_sort_btn = WebDriverWait(sort_container, 10).until(
                EC.element_to_be_clickable((By.XPATH, ".//*[contains(text(), '时间排序')]"))
            )
            print("  找到时间排序按钮，正在点击...")
            
            # 使用JavaScript点击，避免元素不可点击的问题
            driver.execute_script("arguments[0].click();", time_sort_btn)
            time.sleep(3)
            
            print("  时间排序点击成功")
            return True
        except Exception as e:
            print(f"  点击时间排序失败: {e}")
            return False
        
    except Exception as e:
        print(f"  点击时间排序时出错: {e}")
        return False

def click_next_page(driver):
    """点击下一页按钮"""
    try:
        print("  正在查找下一页按钮...")
        
        # 使用您提供的分页路径
        next_page_selectors = [
            ".commentModuleRef-commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".myPagination-ant-pagination .ant-pagination-next",
            ".ant-pagination-next",
            "a[class*='next']",
            "button[class*='next']",
            "li[class*='next']"
        ]
        
        for selector in next_page_selectors:
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                
                # 检查下一页按钮是否可用（没有被禁用）
                is_disabled = next_btn.get_attribute("disabled") or "ant-pagination-disabled" in next_btn.get_attribute("class")
                if is_disabled:
                    print("  下一页按钮被禁用，没有更多页面")
                    return False
                
                print(f"  找到下一页按钮: {selector}，正在点击...")
                driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(3)
                
                # 等待页面加载完成
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".commentModuleRef-commentModule-sortList, .commentModule-sortList, [class*='comment']"))
                )
                
                print("  下一页加载成功")
                return True
                
            except TimeoutException:
                continue
            except Exception as e:
                print(f"  点击下一页失败 ({selector}): {e}")
                continue
        
        print("  未找到可用的下一页按钮")
        return False
        
    except Exception as e:
        print(f"  点击下一页时出错: {e}")
        return False

def parse_comments_from_html(html_content, sight_name):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    # 如果选择器没找到，尝试正则匹配
    text_content = comment_element.get_text()
    user_match = re.search(r'用户\w+|\w+用户|游客\w+', text_content)
    if user_match:
        return user_match.group()
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:  # 过滤太短的内容
                return text
    
    # 如果选择器没找到，尝试获取整个评论元素的文本
    full_text = comment_element.get_text(strip=True)
    # 清理文本，移除可能的用户名、时间等信息
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content
                })
                
        except Exception as e:
            continue
    
    return comments

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    # 常见的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3  # 测试模式只爬取3页
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    try:
        # 访问景点页面
        print(f"  正在访问: {url}")
        driver.get(url)
        
        # 等待页面加载
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
        time.sleep(3)
        
        # 滚动到评论区域
        print("  滚动到评论区域...")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        # 首先点击时间排序
        print("  正在设置时间排序...")
        sort_success = click_time_sort(driver)
        if not sort_success:
            print("  时间排序设置失败，将继续使用默认排序")
        
        # 开始爬取页面
        while page_num <= max_pages:
            print(f"  正在处理第 {page_num} 页...")
            
            # 获取当前页面HTML
            html_content = driver.page_source
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(html_content, sight_name)
            
            if page_comments:
                all_comments.extend(page_comments)
                if test_mode:
                    print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                else:
                    print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页未找到评论")
            
            # 检查是否达到最大页数
            if page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    已达到最大页数 {max_pages}，停止爬取")
                break
            
            # 尝试点击下一页
            print(f"    尝试跳转到第 {page_num + 1} 页...")
            next_success = click_next_page(driver)
            
            if not next_success:
                print("    没有更多页面，停止爬取")
                break
            
            page_num += 1
            
    except Exception as e:
        print(f"  爬取过程中出错: {e}")
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
        # 爬取第一个景点的评论（测试模式只爬取3页）
        print(f"\n开始测试爬取...")
        comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=3, test_mode=True)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url,
                'comment_page': '测试模式-时间排序'
            })
        
        # 保存测试结果
        if comments:
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'comment_page']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(comments)
            
            with open("ctrip_test_comments.json", "w", encoding="utf-8") as f:
                json.dump(comments, f, ensure_ascii=False, indent=2)
                
            print(f"\n[测试完成]")
            print(f"爬取页数: {min(3, len(comments))}页")
            print(f"评论数量: {len(comments)} 条")
            print(f"测试文件: {TEST_CSV}")
            
            # 显示前几条评论预览
            print(f"\n前5条评论预览:")
            for i, comment in enumerate(comments[:5], 1):
                print(f"  {i}. 用户: {comment['user_name']}")
                print(f"     内容: {comment['comment_content'][:50]}...")
                print()
                
        else:
            print("[测试失败] 未找到任何评论")
            
    finally:
        driver.quit()

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        all_comments = []
        total_sights = len(df)
        
        print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
        
        # 为Excel中的每个景点爬取所有评论页面
        for index, row in df.iterrows():
            sight_name = row['name']
            url = row['url']
            rating = row['评分（0-5分）']
            category = row['分类']
            city = row['所在城市']
            
            print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
            print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
            
            # 获取并解析所有评论页面（1-301页）
            comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301)
            
            # 为每条评论添加额外信息
            for comment in comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url,
                    'comment_page': '完整爬取-时间排序'
                })
            
            all_comments.extend(comments)
            
            print(f"  {sight_name} 总计找到 {len(comments)} 条评论")
            
            time.sleep(3)
            
            if (index + 1) % 2 == 0:
                print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
                temp_df = pd.DataFrame(all_comments)
                temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
        
        # 保存最终评论数据
        if all_comments:
            with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'comment_page']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            with open("ctrip_all_pages_comments.json", "w", encoding="utf-8") as f:
                json.dump(all_comments, f, ensure_ascii=False, indent=2)
                
            print(f"\n[SUCCESS] 爬取完成！")
            print(f"评论文件: {COMMENTS_CSV}")
            
        else:
            print("[WARNING] 未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    # 检查是否安装了selenium
    try:
        from selenium import webdriver
    except ImportError:
        print("错误：需要安装selenium")
        print("请运行: pip install selenium")
        print("并下载ChromeDriver: https://chromedriver.chromium.org/")
        exit(1)
    
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
  正在访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  滚动到评论区域...
  正在设置时间排序...
  正在查找排序按钮...
  找到排序容器: .sortList
  在排序容器中查找时间排序...
  找到时间排序按钮，正在点击...
  时间排序点击成功
  正在处理第 1 页...
    [测试] 第1页找到 10 条评论，累计 10 条
    尝试跳转到第 2 页...
  正在查找下一页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  下一页加载成功
  正在处理第 2 页...
    [测试] 第2页找到 10 条评论，累计 20 条
    尝试跳转到第 3 页...
  正在查找下一页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  下一页加载成功
  正在处理第 3 页...
    [测试] 第3页找到 10 条评论，累计 30 条
    [测试] 测试完成，共爬取 3 页

[测试完成]
爬取页数: 3页
评论数量: 30 条
测试文件: ctrip_test_first_sight.csv

前5条评论预览:
  1. 用户: _TI***pr
     内容: 你不应该在晚上错过男人...

  2. 用户: YoYo_2Y0I2W6N
     内容: 很好很好很好很好...

  3. 用户: 熊爹爹
     内容: 非常不错！值得...

  4. 用户: _TI***99
     内容: 游船顶部有一个相当小的观景台,所以人多,

In [5]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # 先注释掉，便于调试
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("  正在查找排序按钮...")
        
        # 多种可能的选择器来找到排序区域
        sort_selectors = [
            ".commentModuleRef-commentModule-sortList",
            ".commentModule-sortList",
            ".sortList",
            "[class*='sort']",
            ".comment-sort"
        ]
        
        sort_container = None
        for selector in sort_selectors:
            try:
                sort_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                if sort_container:
                    print(f"  找到排序容器: {selector}")
                    break
            except:
                continue
        
        if not sort_container:
            print("  未找到排序容器，尝试直接查找时间排序按钮")
            # 直接查找时间排序按钮
            try:
                time_sort_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), '时间排序')]"))
                )
                print("  找到时间排序按钮")
                driver.execute_script("arguments[0].click();", time_sort_btn)
                time.sleep(3)
                return True
            except:
                return False
        
        # 在排序容器中查找时间排序标签
        print("  在排序容器中查找时间排序...")
        try:
            # 使用XPath在排序容器内查找包含"时间排序"的元素
            time_sort_btn = WebDriverWait(sort_container, 10).until(
                EC.element_to_be_clickable((By.XPATH, ".//*[contains(text(), '时间排序')]"))
            )
            print("  找到时间排序按钮，正在点击...")
            
            # 使用JavaScript点击，避免元素不可点击的问题
            driver.execute_script("arguments[0].click();", time_sort_btn)
            time.sleep(3)
            
            print("  时间排序点击成功")
            return True
        except Exception as e:
            print(f"  点击时间排序失败: {e}")
            return False
        
    except Exception as e:
        print(f"  点击时间排序时出错: {e}")
        return False

def click_next_page(driver, current_page_num):
    """点击下一页按钮并验证页面是否更新"""
    try:
        print(f"  正在查找第{current_page_num + 1}页按钮...")
        
        # 获取当前页面的评论内容作为参考
        old_comments = get_page_comments_preview(driver)
        
        # 使用您提供的分页路径
        next_page_selectors = [
            ".commentModuleRef-commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".myPagination-ant-pagination .ant-pagination-next",
            ".ant-pagination-next",
            "a[class*='next']",
            "button[class*='next']",
            "li[class*='next']"
        ]
        
        for selector in next_page_selectors:
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                
                # 检查下一页按钮是否可用（没有被禁用）
                is_disabled = next_btn.get_attribute("disabled") or "ant-pagination-disabled" in next_btn.get_attribute("class") or "disabled" in next_btn.get_attribute("class")
                if is_disabled:
                    print("  下一页按钮被禁用，没有更多页面")
                    return False
                
                print(f"  找到下一页按钮: {selector}，正在点击...")
                
                # 点击下一页
                driver.execute_script("arguments[0].click();", next_btn)
                
                # 等待页面更新 - 检查评论内容是否变化
                print("  等待页面更新...")
                if wait_for_page_update(driver, old_comments):
                    print(f"  成功跳转到第{current_page_num + 1}页")
                    return True
                else:
                    print("  页面内容没有变化，可能翻页失败")
                    continue
                
            except TimeoutException:
                continue
            except Exception as e:
                print(f"  点击下一页失败 ({selector}): {e}")
                continue
        
        print("  未找到可用的下一页按钮")
        return False
        
    except Exception as e:
        print(f"  点击下一页时出错: {e}")
        return False

def get_page_comments_preview(driver):
    """获取当前页面评论的预览，用于检测页面是否更新"""
    try:
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        
        # 获取前几个评论的内容作为参考
        comments = []
        comment_elements = soup.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for i, element in enumerate(comment_elements[:3]):  # 只取前3个评论
            text = element.get_text(strip=True)
            if text:
                comments.append(text[:50])  # 只取前50个字符
        
        return comments
    except:
        return []

def wait_for_page_update(driver, old_comments, timeout=10):
    """等待页面内容更新"""
    start_time = time.time()
    
    while time.time() - start_time < timeout:
        try:
            # 等待可能的加载动画消失
            time.sleep(2)
            
            # 获取新的评论内容
            new_comments = get_page_comments_preview(driver)
            
            # 如果评论内容发生变化，说明页面已更新
            if new_comments and new_comments != old_comments:
                return True
            
            # 检查是否有加载中的状态
            loading_selectors = [
                ".ant-spin",
                ".loading",
                "[class*='spin']",
                "[class*='loading']"
            ]
            
            is_loading = False
            for selector in loading_selectors:
                try:
                    loading_element = driver.find_element(By.CSS_SELECTOR, selector)
                    if loading_element.is_displayed():
                        is_loading = True
                        break
                except:
                    continue
            
            if is_loading:
                print("    页面仍在加载中...")
                time.sleep(1)
                continue
            else:
                # 如果没有加载动画且内容没变，可能已经是最后一页
                break
                
        except Exception as e:
            print(f"    检测页面更新时出错: {e}")
            time.sleep(1)
    
    return False

def get_current_page_number(driver):
    """获取当前页码（如果页面有显示的话）"""
    try:
        # 查找当前活跃的页码
        current_page_selectors = [
            ".ant-pagination-item-active",
            ".ant-pagination-item-active a",
            "[class*='pagination'] [class*='active']",
            ".current-page",
            ".active[class*='page']"
        ]
        
        for selector in current_page_selectors:
            try:
                current_page_element = driver.find_element(By.CSS_SELECTOR, selector)
                page_text = current_page_element.text.strip()
                if page_text.isdigit():
                    return int(page_text)
            except:
                continue
        
        # 如果找不到当前页码，返回-1表示未知
        return -1
    except:
        return -1

def parse_comments_from_html(html_content, sight_name, page_num):
    """
    从HTML中解析评论的用户名和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 方案1：查找commentlist相关的div
    comment_containers = soup.find_all('div', class_=re.compile(r'commentlist|commentList|comment-list'))
    
    for container in comment_containers:
        # 在评论容器中查找每个评论项
        comment_items = container.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for item in comment_items:
            try:
                # 提取用户名 - 多种可能的选择器
                user_name = extract_user_name(item)
                
                # 提取评论内容 - 多种可能的选择器
                comment_content = extract_comment_content(item)
                
                if user_name and comment_content:
                    comments.append({
                        'sight_name': sight_name,
                        'user_name': user_name,
                        'comment_content': comment_content,
                        'page_number': page_num  # 记录页码
                    })
                    
            except Exception as e:
                continue
    
    # 方案2：如果上面没找到，尝试更通用的选择器
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name, page_num)
    
    return comments

def fallback_comment_parsing(soup, sight_name, page_num):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                
        except Exception as e:
            continue
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    return "未知用户"

def extract_comment_content(comment_element):
    """提取评论内容"""
    content_selectors = [
        '.contentInfo .commentDetail',
        '.content-info .comment-detail',
        '.commentContent',
        '.comment-content',
        '.content',
        '.detail',
        '.comment-text',
        '.commentDetail',
        'p',
        'span[class*="content"]',
        'div[class*="content"]'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:
                return text
    
    full_text = comment_element.get_text(strip=True)
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    try:
        # 访问景点页面
        print(f"  正在访问: {url}")
        driver.get(url)
        
        # 等待页面加载
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
        time.sleep(3)
        
        # 滚动到评论区域
        print("  滚动到评论区域...")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        # 首先点击时间排序
        print("  正在设置时间排序...")
        sort_success = click_time_sort(driver)
        if not sort_success:
            print("  时间排序设置失败，将继续使用默认排序")
        
        # 开始爬取页面
        while page_num <= max_pages:
            print(f"  正在处理第 {page_num} 页...")
            
            # 获取当前页面HTML
            html_content = driver.page_source
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(html_content, sight_name, page_num)
            
            if page_comments:
                all_comments.extend(page_comments)
                if test_mode:
                    print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                else:
                    print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"    第{page_num}页未找到评论")
                # 如果连续2页都没有评论，可能已经到底了
                if page_num > 1 and len(all_comments) == 0:
                    print("    连续多页没有评论，停止爬取")
                    break
            
            # 检查是否达到最大页数
            if page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    已达到最大页数 {max_pages}，停止爬取")
                break
            
            # 尝试点击下一页
            next_success = click_next_page(driver, page_num)
            
            if not next_success:
                print("    没有更多页面，停止爬取")
                break
            
            page_num += 1
            time.sleep(2)  # 页面间延迟
            
    except Exception as e:
        print(f"  爬取过程中出错: {e}")
    
    return all_comments

# 其他函数保持不变（test_first_sight, main等）
def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
        # 爬取第一个景点的评论（测试模式只爬取3页）
        print(f"\n开始测试爬取...")
        comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=3, test_mode=True)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,
                'sight_category': category,
                'sight_city': city,
                'sight_url': url
            })
        
        # 保存测试结果
        if comments:
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'page_number']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(comments)
            
            print(f"\n[测试完成]")
            print(f"爬取页数: {len(set([c['page_number'] for c in comments]))}页")
            print(f"评论数量: {len(comments)} 条")
            print(f"测试文件: {TEST_CSV}")
            
        else:
            print("[测试失败] 未找到任何评论")
            
    finally:
        driver.quit()

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        all_comments = []
        total_sights = len(df)
        
        print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
        
        # 为Excel中的每个景点爬取所有评论页面
        for index, row in df.iterrows():
            sight_name = row['name']
            url = row['url']
            rating = row['评分（0-5分）']
            category = row['分类']
            city = row['所在城市']
            
            print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
            print(f"  评分: {rating}, 分类: {category}, 城市: {city}")
            
            # 获取并解析所有评论页面（1-301页）
            comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301)
            
            # 为每条评论添加额外信息
            for comment in comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url
                })
            
            all_comments.extend(comments)
            
            print(f"  {sight_name} 总计找到 {len(comments)} 条评论，来自 {len(set([c['page_number'] for c in comments]))} 页")
            
            time.sleep(3)
            
            if (index + 1) % 2 == 0:
                print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
                temp_df = pd.DataFrame(all_comments)
                temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
        
        # 保存最终评论数据
        if all_comments:
            with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'comment_content', 'sight_url', 'page_number']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            print(f"\n[SUCCESS] 爬取完成！")
            print(f"评论文件: {COMMENTS_CSV}")
            
        else:
            print("[WARNING] 未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    # 检查是否安装了selenium
    try:
        from selenium import webdriver
    except ImportError:
        print("错误：需要安装selenium")
        print("请运行: pip install selenium")
        print("并下载ChromeDriver: https://chromedriver.chromium.org/")
        exit(1)
    
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
  正在访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  滚动到评论区域...
  正在设置时间排序...
  正在查找排序按钮...
  找到排序容器: .sortList
  在排序容器中查找时间排序...
  找到时间排序按钮，正在点击...
  时间排序点击成功
  正在处理第 1 页...
    [测试] 第1页找到 10 条评论，累计 10 条
  正在查找第2页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  等待页面更新...
  成功跳转到第2页
  正在处理第 2 页...
    [测试] 第2页找到 10 条评论，累计 20 条
  正在查找第3页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  等待页面更新...
  成功跳转到第3页
  正在处理第 3 页...
    [测试] 第3页找到 10 条评论，累计 30 条
    [测试] 测试完成，共爬取 3 页

[测试完成]
爬取页数: 3页
评论数量: 30 条
测试文件: ctrip_test_first_sight.csv


In [8]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # 先注释掉，便于调试
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("  正在查找排序按钮...")
        
        # 多种可能的选择器来找到排序区域
        sort_selectors = [
            ".commentModuleRef-commentModule-sortList",
            ".commentModule-sortList",
            ".sortList",
            "[class*='sort']",
            ".comment-sort"
        ]
        
        sort_container = None
        for selector in sort_selectors:
            try:
                sort_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                if sort_container:
                    print(f"  找到排序容器: {selector}")
                    break
            except:
                continue
        
        if not sort_container:
            print("  未找到排序容器，尝试直接查找时间排序按钮")
            # 直接查找时间排序按钮
            try:
                time_sort_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), '时间排序')]"))
                )
                print("  找到时间排序按钮")
                driver.execute_script("arguments[0].click();", time_sort_btn)
                time.sleep(3)
                return True
            except:
                return False
        
        # 在排序容器中查找时间排序标签
        print("  在排序容器中查找时间排序...")
        try:
            # 使用XPath在排序容器内查找包含"时间排序"的元素
            time_sort_btn = WebDriverWait(sort_container, 10).until(
                EC.element_to_be_clickable((By.XPATH, ".//*[contains(text(), '时间排序')]"))
            )
            print("  找到时间排序按钮，正在点击...")
            
            # 使用JavaScript点击，避免元素不可点击的问题
            driver.execute_script("arguments[0].click();", time_sort_btn)
            time.sleep(3)
            
            print("  时间排序点击成功")
            return True
        except Exception as e:
            print(f"  点击时间排序失败: {e}")
            return False
        
    except Exception as e:
        print(f"  点击时间排序时出错: {e}")
        return False

def click_next_page(driver, current_page_num):
    """点击下一页按钮并验证页面是否更新"""
    try:
        print(f"  正在查找第{current_page_num + 1}页按钮...")
        
        # 获取当前页面的评论内容作为参考
        old_comments = get_page_comments_preview(driver)
        
        # 使用您提供的分页路径
        next_page_selectors = [
            ".commentModuleRef-commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".myPagination-ant-pagination .ant-pagination-next",
            ".ant-pagination-next",
            "a[class*='next']",
            "button[class*='next']",
            "li[class*='next']"
        ]
        
        for selector in next_page_selectors:
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                
                # 检查下一页按钮是否可用（没有被禁用）
                is_disabled = next_btn.get_attribute("disabled") or "ant-pagination-disabled" in next_btn.get_attribute("class") or "disabled" in next_btn.get_attribute("class")
                if is_disabled:
                    print("  下一页按钮被禁用，没有更多页面")
                    return False
                
                print(f"  找到下一页按钮: {selector}，正在点击...")
                
                # 点击下一页
                driver.execute_script("arguments[0].click();", next_btn)
                
                # 等待页面更新 - 检查评论内容是否变化
                print("  等待页面更新...")
                if wait_for_page_update(driver, old_comments):
                    print(f"  成功跳转到第{current_page_num + 1}页")
                    return True
                else:
                    print("  页面内容没有变化，可能翻页失败")
                    continue
                
            except TimeoutException:
                continue
            except Exception as e:
                print(f"  点击下一页失败 ({selector}): {e}")
                continue
        
        print("  未找到可用的下一页按钮")
        return False
        
    except Exception as e:
        print(f"  点击下一页时出错: {e}")
        return False

def get_page_comments_preview(driver):
    """获取当前页面评论的预览，用于检测页面是否更新"""
    try:
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        
        # 获取前几个评论的内容作为参考
        comments = []
        comment_elements = soup.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for i, element in enumerate(comment_elements[:3]):  # 只取前3个评论
            text = element.get_text(strip=True)
            if text:
                comments.append(text[:50])  # 只取前50个字符
        
        return comments
    except:
        return []

def wait_for_page_update(driver, old_comments, timeout=10):
    """等待页面内容更新"""
    start_time = time.time()
    
    while time.time() - start_time < timeout:
        try:
            # 等待可能的加载动画消失
            time.sleep(2)
            
            # 获取新的评论内容
            new_comments = get_page_comments_preview(driver)
            
            # 如果评论内容发生变化，说明页面已更新
            if new_comments and new_comments != old_comments:
                return True
            
            # 检查是否有加载中的状态
            loading_selectors = [
                ".ant-spin",
                ".loading",
                "[class*='spin']",
                "[class*='loading']"
            ]
            
            is_loading = False
            for selector in loading_selectors:
                try:
                    loading_element = driver.find_element(By.CSS_SELECTOR, selector)
                    if loading_element.is_displayed():
                        is_loading = True
                        break
                except:
                    continue
            
            if is_loading:
                print("    页面仍在加载中...")
                time.sleep(1)
                continue
            else:
                # 如果没有加载动画且内容没变，可能已经是最后一页
                break
                
        except Exception as e:
            print(f"    检测页面更新时出错: {e}")
            time.sleep(1)
    
    return False

def parse_comments_from_html(html_content, sight_name, page_num):
    """
    从HTML中解析评论的用户名、评分和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 查找评论项
    comment_items = soup.find_all('div', class_=re.compile(r'commentList-commentItem'))
    
    for item in comment_items:
        try:
            # 提取用户名
            user_name = extract_user_name(item)
            
            # 提取评分
            rating = extract_rating(item)
            
            # 提取评论内容
            comment_content = extract_comment_content(item)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'user_rating': rating,  # 用户评分
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                
        except Exception as e:
            print(f"    解析评论时出错: {e}")
            continue
    
    # 如果上面的选择器没找到，尝试备选方案
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name, page_num)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    return "未知用户"

def extract_rating(comment_element):
    """提取用户评分"""
    try:
        # 使用您提供的评分路径
        rating_selectors = [
            '.commentList-commentItem-contentInfo-averageScore',
            '.contentInfo-averageScore',
            '.averageScore',
            '[class*="averageScore"]',
            '[class*="score"]',
            '.rating',
            '[class*="rating"]'
        ]
        
        for selector in rating_selectors:
            rating_element = comment_element.select_one(selector)
            if rating_element:
                # 方法1: 直接从元素文本提取数字
                rating_text = rating_element.get_text(strip=True)
                rating_match = re.search(r'(\d+(?:\.\d+)?)', rating_text)
                if rating_match:
                    return float(rating_match.group(1))
                
                # 方法2: 从class名中提取评分（如averageScore-5）
                class_attr = rating_element.get('class', [])
                for class_name in class_attr:
                    score_match = re.search(r'(\d+(?:\.\d+)?)', class_name)
                    if score_match:
                        return float(score_match.group(1))
        
        # 方法3: 查找包含数字的文本
        full_text = comment_element.get_text()
        rating_match = re.search(r'(\d+(?:\.\d+)?)\s*分', full_text)
        if rating_match:
            return float(rating_match.group(1))
            
        # 方法4: 查找星星评分
        star_elements = comment_element.select('.ant-rate-star-full, [class*="star"]')
        if star_elements:
            return len(star_elements)
        
        return None
        
    except Exception as e:
        print(f"    提取评分时出错: {e}")
        return None

def extract_comment_content(comment_element):
    """提取评论内容"""
    # 使用您提供的评论文本路径
    content_selectors = [
        '.commentList-commentItem-contentInfo-commentDetail',
        '.contentInfo-commentDetail',
        '.commentDetail',
        '[class*="commentDetail"]',
        '.comment-content',
        '.content',
        '.detail',
        'p'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:
                return clean_comment_text(text)
    
    # 备选方案
    full_text = comment_element.get_text(strip=True)
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name, page_num):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            # 尝试提取评分
            rating = extract_rating_from_text(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'user_rating': rating,
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                
        except Exception as e:
            continue
    
    return comments

def extract_rating_from_text(text):
    """从文本中提取评分"""
    try:
        # 查找评分模式
        rating_match = re.search(r'(\d+(?:\.\d+)?)\s*分', text)
        if rating_match:
            return float(rating_match.group(1))
        
        # 查找星星模式
        star_match = re.search(r'[★☆♥❤♡]', text)
        if star_match:
            # 简单计数星星
            star_count = text.count('★') + text.count('♥') + text.count('❤')
            if star_count > 0:
                return min(star_count, 5)  # 最多5分
        
        return None
    except:
        return None

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    try:
        # 访问景点页面
        print(f"  正在访问: {url}")
        driver.get(url)
        
        # 等待页面加载
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
        time.sleep(3)
        
        # 滚动到评论区域
        print("  滚动到评论区域...")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        # 首先点击时间排序
        print("  正在设置时间排序...")
        sort_success = click_time_sort(driver)
        if not sort_success:
            print("  时间排序设置失败，将继续使用默认排序")
        
        # 开始爬取页面
        while page_num <= max_pages:
            print(f"  正在处理第 {page_num} 页...")
            
            # 获取当前页面HTML
            html_content = driver.page_source
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(html_content, sight_name, page_num)
            
            if page_comments:
                all_comments.extend(page_comments)
                if test_mode:
                    print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                else:
                    print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                
                # 显示第一条评论的详细信息（用于调试）
                if page_num == 1 and page_comments:
                    first_comment = page_comments[0]
                    print(f"    示例评论 - 用户: {first_comment['user_name']}, 评分: {first_comment['user_rating']}, 内容: {first_comment['comment_content'][:30]}...")
            else:
                print(f"    第{page_num}页未找到评论")
                # 如果连续2页都没有评论，可能已经到底了
                if page_num > 1 and len(all_comments) == 0:
                    print("    连续多页没有评论，停止爬取")
                    break
            
            # 检查是否达到最大页数
            if page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    已达到最大页数 {max_pages}，停止爬取")
                break
            
            # 尝试点击下一页
            next_success = click_next_page(driver, page_num)
            
            if not next_success:
                print("    没有更多页面，停止爬取")
                break
            
            page_num += 1
            time.sleep(2)  # 页面间延迟
            
    except Exception as e:
        print(f"  爬取过程中出错: {e}")
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  景点评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
        # 爬取第一个景点的评论（测试模式只爬取3页）
        print(f"\n开始测试爬取...")
        comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=3, test_mode=True)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,  # 景点评分
                'sight_category': category,
                'sight_city': city,
                'sight_url': url
            })
        
        # 保存测试结果
        if comments:
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = [
                    'sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                    'user_name', 'user_rating', 'comment_content', 'sight_url', 'page_number'
                ]
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(comments)
            
            print(f"\n[测试完成]")
            print(f"爬取页数: {len(set([c['page_number'] for c in comments]))}页")
            print(f"评论数量: {len(comments)} 条")
            print(f"测试文件: {TEST_CSV}")
            
            # 显示统计信息
            rated_comments = [c for c in comments if c['user_rating'] is not None]
            if rated_comments:
                avg_rating = sum(c['user_rating'] for c in rated_comments) / len(rated_comments)
                print(f"用户评分统计: 平均 {avg_rating:.2f}分 ({len(rated_comments)}/{len(comments)} 条评论有评分)")
            
            # 显示前几条评论预览
            print(f"\n前3条评论预览:")
            for i, comment in enumerate(comments[:3], 1):
                print(f"  {i}. 用户: {comment['user_name']}")
                print(f"     评分: {comment['user_rating']}")
                print(f"     内容: {comment['comment_content'][:50]}...")
                print()
                
        else:
            print("[测试失败] 未找到任何评论")
            
    finally:
        driver.quit()

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        all_comments = []
        total_sights = len(df)
        
        print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
        
        # 为Excel中的每个景点爬取所有评论页面
        for index, row in df.iterrows():
            sight_name = row['name']
            url = row['url']
            rating = row['评分（0-5分）']
            category = row['分类']
            city = row['所在城市']
            
            print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
            print(f"  景点评分: {rating}, 分类: {category}, 城市: {city}")
            
            # 获取并解析所有评论页面（1-301页）
            comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301)
            
            # 为每条评论添加额外信息
            for comment in comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url
                })
            
            all_comments.extend(comments)
            
            # 统计信息
            rated_count = len([c for c in comments if c['user_rating'] is not None])
            print(f"  {sight_name} 总计: {len(comments)} 条评论 ({rated_count} 条有评分)，来自 {len(set([c['page_number'] for c in comments]))} 页")
            
            time.sleep(3)
            
            if (index + 1) % 2 == 0:
                print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
                temp_df = pd.DataFrame(all_comments)
                temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
        
        # 保存最终评论数据
        if all_comments:
            with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = [
                    'sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                    'user_name', 'user_rating', 'comment_content', 'sight_url', 'page_number'
                ]
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            print(f"\n[SUCCESS] 爬取完成！")
            print(f"评论文件: {COMMENTS_CSV}")
            
            # 最终统计
            total_rated = len([c for c in all_comments if c['user_rating'] is not None])
            print(f"总评论数: {len(all_comments)}")
            print(f"有评分的评论: {total_rated} ({total_rated/len(all_comments)*100:.1f}%)")
            
        else:
            print("[WARNING] 未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    # 检查是否安装了selenium
    try:
        from selenium import webdriver
    except ImportError:
        print("错误：需要安装selenium")
        print("请运行: pip install selenium")
        print("并下载ChromeDriver: https://chromedriver.chromium.org/")
        exit(1)
    
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  景点评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
  正在访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  滚动到评论区域...
  正在设置时间排序...
  正在查找排序按钮...
  找到排序容器: .sortList
  在排序容器中查找时间排序...
  找到时间排序按钮，正在点击...
  时间排序点击成功
  正在处理第 1 页...
    [测试] 第1页找到 1 条评论，累计 1 条
    示例评论 - 用户: YoYo_9T0R7W0Q4分满意性价比不高, 评分: 4.0, 内容: 难得去下可以2025-11-09IP属地：江苏举报点赞...
  正在查找第2页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  等待页面更新...
  成功跳转到第2页
  正在处理第 2 页...
    [测试] 第2页找到 4 条评论，累计 5 条
  正在查找第3页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  等待页面更新...
  成功跳转到第3页
  正在处理第 3 页...
    第3页未找到评论
    [测试] 测试完成，共爬取 3 页

[测试完成]
爬取页数: 2页
评论数量: 5 条
测试文件: ctrip_test_first_sight.csv
用户评分统计: 平均 4.75分 (4/5 条评论有评分)

前3条评论预览:
  1. 用户: YoYo_9T0R7W0Q4分满意性价比不高
     评分: 4.0
     内容: 难得去下可以2025-11-09IP属地：江苏举报点赞.

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def debug_page_structure(driver, url):
    """调试函数：查看页面实际结构"""
    print("=== 开始调试页面结构 ===")
    
    try:
        driver.get(url)
        time.sleep(5)
        
        # 滚动到评论区域
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        # 获取页面HTML
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        
        print("1. 查找评论相关元素:")
        
        # 查找所有包含comment的class
        comment_elements = soup.find_all(class_=re.compile(r'comment'))
        print(f"  找到 {len(comment_elements)} 个包含'comment'的class")
        
        for i, element in enumerate(comment_elements[:10]):
            class_names = element.get('class', [])
            print(f"  元素 {i+1}: class={class_names}")
            text_preview = element.get_text(strip=True)[:100]
            print(f"      文本: {text_preview}...")
            print()
        
        print("2. 查找评分相关元素:")
        rating_elements = soup.find_all(class_=re.compile(r'score|rating|average'))
        print(f"  找到 {len(rating_elements)} 个包含'score/rating/average'的class")
        
        for i, element in enumerate(rating_elements[:5]):
            class_names = element.get('class', [])
            print(f"  评分元素 {i+1}: class={class_names}")
            print(f"      文本: {element.get_text(strip=True)}")
            print()
        
        print("3. 查找分页元素:")
        pagination_elements = soup.find_all(class_=re.compile(r'pagination|page|next'))
        print(f"  找到 {len(pagination_elements)} 个分页相关元素")
        
        return html_content
        
    except Exception as e:
        print(f"调试出错: {e}")
        return None

def parse_comments_improved(html_content, sight_name, page_num):
    """
    改进的评论解析函数 - 使用更通用的选择器
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    print(f"  开始解析第{page_num}页评论...")
    
    # 方法1: 查找所有可能的评论容器
    comment_containers = []
    
    # 多种评论容器选择器
    container_selectors = [
        'div[class*="comment"]',
        'li[class*="comment"]',
        'div[class*="review"]',
        'li[class*="review"]',
        'div[class*="点评"]',
        '.comment-item',
        '.review-item',
        '.comment-list',
        '.review-list'
    ]
    
    for selector in container_selectors:
        elements = soup.select(selector)
        if elements:
            print(f"    使用选择器 '{selector}' 找到 {len(elements)} 个元素")
            comment_containers.extend(elements)
    
    # 去重
    seen = set()
    unique_containers = []
    for container in comment_containers:
        container_id = id(container)
        if container_id not in seen:
            seen.add(container_id)
            unique_containers.append(container)
    
    print(f"  总共找到 {len(unique_containers)} 个可能的评论容器")
    
    # 解析每个评论容器
    for i, container in enumerate(unique_containers):
        try:
            # 提取用户名
            user_name = extract_user_name_improved(container)
            
            # 提取评分
            rating = extract_rating_improved(container)
            
            # 提取评论内容
            comment_content = extract_comment_content_improved(container)
            
            # 过滤无效评论
            if comment_content and len(comment_content) >= 10:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name or "匿名用户",
                    'user_rating': rating,
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                print(f"    找到评论 {len(comments)}: 用户={user_name}, 评分={rating}, 内容长度={len(comment_content)}")
                
        except Exception as e:
            print(f"    解析评论 {i+1} 时出错: {e}")
            continue
    
    print(f"  第{page_num}页最终解析出 {len(comments)} 条有效评论")
    return comments

def extract_user_name_improved(element):
    """改进的用户名提取"""
    user_selectors = [
        '.userName',
        '.user-name',
        '.username',
        '.name',
        '[class*="user"]',
        '.user-info',
        '.userInfo'
    ]
    
    for selector in user_selectors:
        user_elements = element.select(selector)
        for user_element in user_elements:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1 and len(text) < 20:  # 合理的用户名长度
                # 过滤掉明显不是用户名的文本
                if not any(word in text for word in ['评分', '点评', '回复', '发表于']):
                    return text
    
    # 如果选择器没找到，尝试从整个元素中提取
    full_text = element.get_text()
    # 查找可能的用户名模式
    user_patterns = [
        r'^([\u4e00-\u9fa5A-Za-z0-9_]{2,15})\s',
        r'用户([A-Za-z0-9_]{3,10})',
        r'游客([A-Za-z0-9_]+)'
    ]
    
    for pattern in user_patterns:
        match = re.search(pattern, full_text)
        if match:
            return match.group(1)
    
    return None

def extract_rating_improved(element):
    """改进的评分提取"""
    try:
        # 方法1: 查找评分元素
        rating_selectors = [
            '.averageScore',
            '.score',
            '.rating',
            '[class*="score"]',
            '[class*="rating"]',
            '[class*="average"]',
            '.ant-rate',
            '.star'
        ]
        
        for selector in rating_selectors:
            rating_elements = element.select(selector)
            for rating_element in rating_elements:
                # 从文本中提取数字
                text = rating_element.get_text(strip=True)
                rating_match = re.search(r'(\d+(?:\.\d+)?)', text)
                if rating_match:
                    rating = float(rating_match.group(1))
                    if 0 <= rating <= 5:  # 合理的评分范围
                        return rating
                
                # 从class名中提取
                class_names = rating_element.get('class', [])
                for class_name in class_names:
                    score_match = re.search(r'(\d+)', class_name)
                    if score_match:
                        rating = float(score_match.group(1))
                        if 0 <= rating <= 5:
                            return rating
        
        # 方法2: 从整个文本中查找评分
        full_text = element.get_text()
        rating_patterns = [
            r'(\d+(?:\.\d+)?)\s*分',
            r'评分\s*(\d+(?:\.\d+)?)',
            r'score\s*(\d+(?:\.\d+)?)',
            r'rating\s*(\d+(?:\.\d+)?)'
        ]
        
        for pattern in rating_patterns:
            match = re.search(pattern, full_text, re.IGNORECASE)
            if match:
                rating = float(match.group(1))
                if 0 <= rating <= 5:
                    return rating
        
        # 方法3: 查找星星
        star_selectors = [
            '.ant-rate-star-full',
            '.star-full',
            '[class*="star"]'
        ]
        
        for selector in star_selectors:
            stars = element.select(selector)
            if stars:
                return min(len(stars), 5)  # 最多5分
        
        return None
        
    except Exception as e:
        print(f"    提取评分出错: {e}")
        return None

def extract_comment_content_improved(element):
    """改进的评论内容提取"""
    # 优先选择器
    content_selectors = [
        '.commentDetail',
        '.comment-content',
        '.content',
        '.detail',
        '.text',
        '.comment-text',
        'p',
        'span'
    ]
    
    for selector in content_selectors:
        content_elements = element.select(selector)
        for content_element in content_elements:
            text = content_element.get_text(strip=True)
            # 过滤条件：长度适中，不包含特定关键词
            if (text and len(text) >= 20 and len(text) <= 500 and
                not any(word in text for word in ['暂无点评', '我要点评', '条点评', '评分', '发表于'])):
                return clean_comment_text(text)
    
    # 备选：获取整个元素的文本并清理
    full_text = element.get_text(strip=True)
    cleaned_text = clean_comment_text(full_text)
    
    # 过滤出可能是评论的文本
    if (cleaned_text and len(cleaned_text) >= 20 and 
        not any(word in cleaned_text for word in ['暂无点评', '我要点评', '条点评'])):
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    if not text:
        return text
    
    # 移除常见的前缀和后缀
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+年\d+月\d+日\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+(?:\.\d+)?',
        r'点击回复',
        r'回复',
        r'举报',
        r'有用',
        r'展开'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def test_first_sight_debug():
    """调试模式：分析第一个景点的页面结构"""
    print("=== 调试模式：分析页面结构 ===")
    
    driver = setup_driver()
    
    try:
        # 读取Excel文件
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        if len(df) == 0:
            print("Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        
        print(f"测试景点: {sight_name}")
        print(f"URL: {url}")
        
        # 调试页面结构
        html_content = debug_page_structure(driver, url)
        
        if html_content:
            # 尝试解析评论
            print("\n=== 尝试解析评论 ===")
            comments = parse_comments_improved(html_content, sight_name, 1)
            
            if comments:
                print(f"\n成功解析出 {len(comments)} 条评论:")
                for i, comment in enumerate(comments[:5], 1):
                    print(f"{i}. 用户: {comment['user_name']}")
                    print(f"   评分: {comment['user_rating']}")
                    print(f"   内容: {comment['comment_content'][:80]}...")
                    print()
            else:
                print("未能解析出任何评论")
                
    finally:
        driver.quit()

def test_first_sight_full():
    """完整测试：爬取第一个景点的评论"""
    print("=== 完整测试模式 ===")
    
    driver = setup_driver()
    
    try:
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"开始爬取: {sight_name}")
        
        all_comments = []
        page_num = 1
        max_pages = 5  # 测试时只爬取5页
        
        # 访问页面
        driver.get(url)
        time.sleep(5)
        
        # 滚动到评论区域
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(3)
        
        while page_num <= max_pages:
            print(f"处理第 {page_num} 页...")
            
            html_content = driver.page_source
            comments = parse_comments_improved(html_content, sight_name, page_num)
            
            if comments:
                all_comments.extend(comments)
                print(f"第{page_num}页找到 {len(comments)} 条评论，累计 {len(all_comments)} 条")
            else:
                print(f"第{page_num}页没有找到评论，停止爬取")
                break
            
            # 尝试翻页（简单版本）
            try:
                next_btn = driver.find_element(By.CSS_SELECTOR, '.ant-pagination-next:not(.ant-pagination-disabled)')
                driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(3)
                page_num += 1
            except:
                print("没有下一页或翻页失败")
                break
        
        # 保存结果
        if all_comments:
            for comment in all_comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url
                })
            
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = ['sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                             'user_name', 'user_rating', 'comment_content', 'sight_url', 'page_number']
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            print(f"\n测试完成！共爬取 {len(all_comments)} 条评论")
            print(f"保存到: {TEST_CSV}")
        else:
            print("未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    print("请选择调试模式:")
    print("1. 页面结构分析（推荐先运行）")
    print("2. 完整测试爬取")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight_debug()
    elif choice == "2":
        test_first_sight_full()
    else:
        print("默认运行页面结构分析")
        test_first_sight_debug()

请选择调试模式:
1. 页面结构分析（推荐先运行）
2. 完整测试爬取
=== 调试模式：分析页面结构 ===
测试景点: 外滩
URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
=== 开始调试页面结构 ===
1. 查找评论相关元素:
  找到 162 个包含'comment'的class
  元素 1: class=['comment']
      文本: 4.8/5分157588条点评...

  元素 2: class=['commentScore']
      文本: 4.8/5分...

  元素 3: class=['commentScoreNum']
      文本: 4.8...

  元素 4: class=['commentScoreText']
      文本: /5分...

  元素 5: class=['commentCount']
      文本: 157588条点评...

  元素 6: class=['commentCountArrow', 'common-iconfont']
      文本: ...

  元素 7: class=['commentModuleRef']
      文本: 用户点评(157588)4.8/5分全部(157588)好评(128660)消费后评价(1902)差评(668)智能排序时间排序Annie长翅膀的喵5分超棒外滩，上海的名片。
        以前出名...

  元素 8: class=['commentModule', 'normalModule']
      文本: 用户点评(157588)4.8/5分全部(157588)好评(128660)消费后评价(1902)差评(668)智能排序时间排序Annie长翅膀的喵5分超棒外滩，上海的名片。
        以前出名...

  元素 9: class=['commentList']
      文本: Annie长翅膀的喵5分超棒外滩，上海的名片。
        以前出名的情人墙和远东第一弯，现在都弱化了，万国建筑群是依旧盛名的。
        好天气，浦江两岸的建筑显得尤其壮观，中山东一路的厚...

  元素 10: clas

In [11]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# 常量定义
COMMENTS_CSV = "ctrip_all_pages_comments.csv"
TEST_CSV = "ctrip_test_first_sight.csv"

def setup_driver():
    """设置Chrome驱动"""
    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # 先注释掉，便于调试
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("  正在查找排序按钮...")
        
        # 多种可能的选择器来找到排序区域
        sort_selectors = [
            ".commentModuleRef-commentModule-sortList",
            ".commentModule-sortList",
            ".sortList",
            "[class*='sort']",
            ".comment-sort"
        ]
        
        sort_container = None
        for selector in sort_selectors:
            try:
                sort_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                if sort_container:
                    print(f"  找到排序容器: {selector}")
                    break
            except:
                continue
        
        if not sort_container:
            print("  未找到排序容器，尝试直接查找时间排序按钮")
            # 直接查找时间排序按钮
            try:
                time_sort_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), '时间排序')]"))
                )
                print("  找到时间排序按钮")
                driver.execute_script("arguments[0].click();", time_sort_btn)
                time.sleep(3)
                return True
            except:
                return False
        
        # 在排序容器中查找时间排序标签
        print("  在排序容器中查找时间排序...")
        try:
            # 使用XPath在排序容器内查找包含"时间排序"的元素
            time_sort_btn = WebDriverWait(sort_container, 10).until(
                EC.element_to_be_clickable((By.XPATH, ".//*[contains(text(), '时间排序')]"))
            )
            print("  找到时间排序按钮，正在点击...")
            
            # 使用JavaScript点击，避免元素不可点击的问题
            driver.execute_script("arguments[0].click();", time_sort_btn)
            time.sleep(3)
            
            print("  时间排序点击成功")
            return True
        except Exception as e:
            print(f"  点击时间排序失败: {e}")
            return False
        
    except Exception as e:
        print(f"  点击时间排序时出错: {e}")
        return False

def click_next_page(driver, current_page_num):
    """点击下一页按钮并验证页面是否更新"""
    try:
        print(f"  正在查找第{current_page_num + 1}页按钮...")
        
        # 获取当前页面的评论内容作为参考
        old_comments = get_page_comments_preview(driver)
        
        # 使用您提供的分页路径
        next_page_selectors = [
            ".commentModuleRef-commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".commentModule-myPagination-ant-pagination .ant-pagination-next",
            ".myPagination-ant-pagination .ant-pagination-next",
            ".ant-pagination-next",
            "a[class*='next']",
            "button[class*='next']",
            "li[class*='next']"
        ]
        
        for selector in next_page_selectors:
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                
                # 检查下一页按钮是否可用（没有被禁用）
                is_disabled = next_btn.get_attribute("disabled") or "ant-pagination-disabled" in next_btn.get_attribute("class") or "disabled" in next_btn.get_attribute("class")
                if is_disabled:
                    print("  下一页按钮被禁用，没有更多页面")
                    return False
                
                print(f"  找到下一页按钮: {selector}，正在点击...")
                
                # 点击下一页
                driver.execute_script("arguments[0].click();", next_btn)
                
                # 等待页面更新 - 检查评论内容是否变化
                print("  等待页面更新...")
                if wait_for_page_update(driver, old_comments):
                    print(f"  成功跳转到第{current_page_num + 1}页")
                    return True
                else:
                    print("  页面内容没有变化，可能翻页失败")
                    continue
                
            except TimeoutException:
                continue
            except Exception as e:
                print(f"  点击下一页失败 ({selector}): {e}")
                continue
        
        print("  未找到可用的下一页按钮")
        return False
        
    except Exception as e:
        print(f"  点击下一页时出错: {e}")
        return False

def get_page_comments_preview(driver):
    """获取当前页面评论的预览，用于检测页面是否更新"""
    try:
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        
        # 获取前几个评论的内容作为参考
        comments = []
        comment_elements = soup.find_all('div', class_=re.compile(r'comment-item|commentItem|comment_item'))
        
        for i, element in enumerate(comment_elements[:3]):  # 只取前3个评论
            text = element.get_text(strip=True)
            if text:
                comments.append(text[:50])  # 只取前50个字符
        
        return comments
    except:
        return []

def wait_for_page_update(driver, old_comments, timeout=10):
    """等待页面内容更新"""
    start_time = time.time()
    
    while time.time() - start_time < timeout:
        try:
            # 等待可能的加载动画消失
            time.sleep(2)
            
            # 获取新的评论内容
            new_comments = get_page_comments_preview(driver)
            
            # 如果评论内容发生变化，说明页面已更新
            if new_comments and new_comments != old_comments:
                return True
            
            # 检查是否有加载中的状态
            loading_selectors = [
                ".ant-spin",
                ".loading",
                "[class*='spin']",
                "[class*='loading']"
            ]
            
            is_loading = False
            for selector in loading_selectors:
                try:
                    loading_element = driver.find_element(By.CSS_SELECTOR, selector)
                    if loading_element.is_displayed():
                        is_loading = True
                        break
                except:
                    continue
            
            if is_loading:
                print("    页面仍在加载中...")
                time.sleep(1)
                continue
            else:
                # 如果没有加载动画且内容没变，可能已经是最后一页
                break
                
        except Exception as e:
            print(f"    检测页面更新时出错: {e}")
            time.sleep(1)
    
    return False

def parse_comments_from_html(html_content, sight_name, page_num):
    """
    从HTML中解析评论的用户名、评分和内容
    """
    soup = BeautifulSoup(html_content, 'lxml')
    comments = []
    
    # 查找评论项
    comment_items = soup.find_all('div', class_=re.compile(r'commentList-commentItem'))
    
    for item in comment_items:
        try:
            # 提取用户名
            user_name = extract_user_name(item)
            
            # 提取评分
            rating = extract_rating(item)
            
            # 提取评论内容
            comment_content = extract_comment_content(item)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'user_rating': rating,  # 用户评分
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                
        except Exception as e:
            print(f"    解析评论时出错: {e}")
            continue
    
    # 如果上面的选择器没找到，尝试备选方案
    if not comments:
        comments = fallback_comment_parsing(soup, sight_name, page_num)
    
    return comments

def extract_user_name(comment_element):
    """提取用户名"""
    user_selectors = [
        '.userInfo .userName',
        '.user-info .user-name',
        '.username',
        '.userName',
        '.user-name',
        'span[class*="user"]',
        'div[class*="user"]',
        '.name'
    ]
    
    for selector in user_selectors:
        user_element = comment_element.select_one(selector)
        if user_element:
            text = user_element.get_text(strip=True)
            if text and len(text) > 1:
                return text
    
    return "未知用户"

def extract_rating(comment_element):
    """提取用户评分"""
    try:
        # 使用您提供的评分路径
        rating_selectors = [
            '.commentList-commentItem-contentInfo-averageScore',
            '.contentInfo-averageScore',
            '.averageScore',
            '[class*="averageScore"]',
            '[class*="score"]',
            '.rating',
            '[class*="rating"]'
        ]
        
        for selector in rating_selectors:
            rating_element = comment_element.select_one(selector)
            if rating_element:
                # 方法1: 直接从元素文本提取数字
                rating_text = rating_element.get_text(strip=True)
                rating_match = re.search(r'(\d+(?:\.\d+)?)', rating_text)
                if rating_match:
                    return float(rating_match.group(1))
                
                # 方法2: 从class名中提取评分（如averageScore-5）
                class_attr = rating_element.get('class', [])
                for class_name in class_attr:
                    score_match = re.search(r'(\d+(?:\.\d+)?)', class_name)
                    if score_match:
                        return float(score_match.group(1))
        
        # 方法3: 查找包含数字的文本
        full_text = comment_element.get_text()
        rating_match = re.search(r'(\d+(?:\.\d+)?)\s*分', full_text)
        if rating_match:
            return float(rating_match.group(1))
            
        # 方法4: 查找星星评分
        star_elements = comment_element.select('.ant-rate-star-full, [class*="star"]')
        if star_elements:
            return len(star_elements)
        
        return None
        
    except Exception as e:
        print(f"    提取评分时出错: {e}")
        return None

def extract_comment_content(comment_element):
    """提取评论内容"""
    # 使用您提供的评论文本路径
    content_selectors = [
        '.commentList-commentItem-contentInfo-commentDetail',
        '.contentInfo-commentDetail',
        '.commentDetail',
        '[class*="commentDetail"]',
        '.comment-content',
        '.content',
        '.detail',
        'p'
    ]
    
    for selector in content_selectors:
        content_element = comment_element.select_one(selector)
        if content_element:
            text = content_element.get_text(strip=True)
            if text and len(text) >= 5:
                return clean_comment_text(text)
    
    # 备选方案
    full_text = comment_element.get_text(strip=True)
    cleaned_text = clean_comment_text(full_text)
    if cleaned_text and len(cleaned_text) >= 10:
        return cleaned_text
    
    return None

def clean_comment_text(text):
    """清理评论文本"""
    patterns_to_remove = [
        r'用户\d+\s*',
        r'游客\d+\s*',
        r'\d{4}-\d{2}-\d{2}\s*',
        r'\d+月\d+日\s*',
        r'发表于\s*',
        r'评分：\d+',
        r'点击回复',
        r'回复'
    ]
    
    cleaned = text
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned)
    
    return cleaned.strip()

def fallback_comment_parsing(soup, sight_name, page_num):
    """备选方案：更通用的评论解析"""
    comments = []
    
    # 查找所有可能包含评论的div
    potential_comments = soup.find_all('div', class_=re.compile(r'comment|review|点评|评价'))
    
    for element in potential_comments:
        try:
            # 获取完整文本
            full_text = element.get_text(strip=True)
            
            # 跳过太短或包含特定关键词的文本
            if (len(full_text) < 20 or 
                '暂无点评' in full_text or 
                '我要点评' in full_text or
                '条点评' in full_text):
                continue
            
            # 尝试分离用户名和评论内容
            user_name, comment_content = separate_user_and_content(full_text)
            
            # 尝试提取评分
            rating = extract_rating_from_text(full_text)
            
            if user_name and comment_content:
                comments.append({
                    'sight_name': sight_name,
                    'user_name': user_name,
                    'user_rating': rating,
                    'comment_content': comment_content,
                    'page_number': page_num
                })
                
        except Exception as e:
            continue
    
    return comments

def extract_rating_from_text(text):
    """从文本中提取评分"""
    try:
        # 查找评分模式
        rating_match = re.search(r'(\d+(?:\.\d+)?)\s*分', text)
        if rating_match:
            return float(rating_match.group(1))
        
        # 查找星星模式
        star_match = re.search(r'[★☆♥❤♡]', text)
        if star_match:
            # 简单计数星星
            star_count = text.count('★') + text.count('♥') + text.count('❤')
            if star_count > 0:
                return min(star_count, 5)  # 最多5分
        
        return None
    except:
        return None

def separate_user_and_content(text):
    """尝试从文本中分离用户名和评论内容"""
    user_patterns = [
        r'^([\u4e00-\u9fa5a-zA-Z0-9_]{2,10})\s+',
        r'用户(\d+)\s*',
        r'游客(\d+)\s*',
        r'^(\w+)\s+'
    ]
    
    user_name = None
    comment_content = text
    
    for pattern in user_patterns:
        match = re.match(pattern, text)
        if match:
            user_name = match.group(1)
            comment_content = text[match.end():].strip()
            break
    
    return user_name, comment_content

def fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301, test_mode=False):
    """获取单个景点的所有评论页面（1-301页）"""
    all_comments = []
    page_num = 1
    
    if test_mode:
        print(f"  [测试模式] 开始爬取 {sight_name} 的评论页面...")
        max_pages = 3
    else:
        print(f"  开始爬取 {sight_name} 的评论页面...")
    
    try:
        # 访问景点页面
        print(f"  正在访问: {url}")
        driver.get(url)
        
        # 等待页面加载
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
        time.sleep(3)
        
        # 滚动到评论区域
        print("  滚动到评论区域...")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(2)
        
        # 首先点击时间排序
        print("  正在设置时间排序...")
        sort_success = click_time_sort(driver)
        if not sort_success:
            print("  时间排序设置失败，将继续使用默认排序")
        
        # 开始爬取页面
        while page_num <= max_pages:
            print(f"  正在处理第 {page_num} 页...")
            
            # 获取当前页面HTML
            html_content = driver.page_source
            
            # 解析当前页的评论
            page_comments = parse_comments_from_html(html_content, sight_name, page_num)
            
            if page_comments:
                all_comments.extend(page_comments)
                if test_mode:
                    print(f"    [测试] 第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                else:
                    print(f"    第{page_num}页找到 {len(page_comments)} 条评论，累计 {len(all_comments)} 条")
                
                # 显示第一条评论的详细信息（用于调试）
                if page_num == 1 and page_comments:
                    first_comment = page_comments[0]
                    print(f"    示例评论 - 用户: {first_comment['user_name']}, 评分: {first_comment['user_rating']}, 内容: {first_comment['comment_content'][:30]}...")
            else:
                print(f"    第{page_num}页未找到评论")
                # 如果连续2页都没有评论，可能已经到底了
                if page_num > 1 and len(all_comments) == 0:
                    print("    连续多页没有评论，停止爬取")
                    break
            
            # 检查是否达到最大页数
            if page_num >= max_pages:
                if test_mode:
                    print(f"    [测试] 测试完成，共爬取 {page_num} 页")
                else:
                    print(f"    已达到最大页数 {max_pages}，停止爬取")
                break
            
            # 尝试点击下一页
            next_success = click_next_page(driver, page_num)
            
            if not next_success:
                print("    没有更多页面，停止爬取")
                break
            
            page_num += 1
            time.sleep(2)  # 页面间延迟
            
    except Exception as e:
        print(f"  爬取过程中出错: {e}")
    
    return all_comments

def test_first_sight():
    """测试模式：只爬取第一个景点"""
    print("=== 测试模式：只爬取第一个景点 ===")
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        if len(df) == 0:
            print("[ERROR] Excel文件中没有数据")
            return
            
        # 获取第一个景点
        first_sight = df.iloc[0]
        sight_name = first_sight['name']
        url = first_sight['url']
        rating = first_sight['评分（0-5分）']
        category = first_sight['分类']
        city = first_sight['所在城市']
        
        print(f"\n测试景点信息:")
        print(f"  名称: {sight_name}")
        print(f"  URL: {url}")
        print(f"  景点评分: {rating}")
        print(f"  分类: {category}")
        print(f"  城市: {city}")
        
        # 爬取第一个景点的评论（测试模式只爬取3页）
        print(f"\n开始测试爬取...")
        comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=3, test_mode=True)
        
        # 为每条评论添加额外信息
        for comment in comments:
            comment.update({
                'sight_rating': rating,  # 景点评分
                'sight_category': category,
                'sight_city': city,
                'sight_url': url
            })
        
        # 保存测试结果
        if comments:
            with open(TEST_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = [
                    'sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                    'user_name', 'user_rating', 'comment_content', 'sight_url', 'page_number'
                ]
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(comments)
            
            print(f"\n[测试完成]")
            print(f"爬取页数: {len(set([c['page_number'] for c in comments]))}页")
            print(f"评论数量: {len(comments)} 条")
            print(f"测试文件: {TEST_CSV}")
            
            # 显示统计信息
            rated_comments = [c for c in comments if c['user_rating'] is not None]
            if rated_comments:
                avg_rating = sum(c['user_rating'] for c in rated_comments) / len(rated_comments)
                print(f"用户评分统计: 平均 {avg_rating:.2f}分 ({len(rated_comments)}/{len(comments)} 条评论有评分)")
            
            # 显示前几条评论预览
            print(f"\n前3条评论预览:")
            for i, comment in enumerate(comments[:3], 1):
                print(f"  {i}. 用户: {comment['user_name']}")
                print(f"     评分: {comment['user_rating']}")
                print(f"     内容: {comment['comment_content'][:50]}...")
                print()
                
        else:
            print("[测试失败] 未找到任何评论")
            
    finally:
        driver.quit()

def main():
    """主函数 - 为Excel中的每个景点爬取1-301页评论"""
    
    # 设置浏览器驱动
    driver = setup_driver()
    
    try:
        # 读取Excel文件中的景点数据
        df = pd.read_excel("4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx")
        print(f"[INFO] 成功读取Excel文件，共 {len(df)} 个景点")
        
        all_comments = []
        total_sights = len(df)
        
        print(f"\n=== 开始为 {total_sights} 个景点爬取1-301页评论 ===")
        
        # 为Excel中的每个景点爬取所有评论页面
        for index, row in df.iterrows():
            sight_name = row['name']
            url = row['url']
            rating = row['评分（0-5分）']
            category = row['分类']
            city = row['所在城市']
            
            print(f"\n[{index+1}/{total_sights}] 正在爬取: {sight_name}")
            print(f"  景点评分: {rating}, 分类: {category}, 城市: {city}")
            
            # 获取并解析所有评论页面（1-301页）
            comments = fetch_all_comments_for_sight(driver, url, sight_name, max_pages=301)
            
            # 为每条评论添加额外信息
            for comment in comments:
                comment.update({
                    'sight_rating': rating,
                    'sight_category': category,
                    'sight_city': city,
                    'sight_url': url
                })
            
            all_comments.extend(comments)
            
            # 统计信息
            rated_count = len([c for c in comments if c['user_rating'] is not None])
            print(f"  {sight_name} 总计: {len(comments)} 条评论 ({rated_count} 条有评分)，来自 {len(set([c['page_number'] for c in comments]))} 页")
            
            time.sleep(3)
            
            if (index + 1) % 2 == 0:
                print(f"[进度保存] 已处理 {index+1} 个景点，总计 {len(all_comments)} 条评论")
                temp_df = pd.DataFrame(all_comments)
                temp_df.to_csv('ctrip_comments_temp.csv', index=False, encoding='utf-8-sig')
        
        # 保存最终评论数据
        if all_comments:
            with open(COMMENTS_CSV, "w", newline="", encoding="utf-8-sig") as f:
                fieldnames = [
                    'sight_name', 'sight_rating', 'sight_category', 'sight_city', 
                    'user_name', 'user_rating', 'comment_content', 'sight_url', 'page_number'
                ]
                w = csv.DictWriter(f, fieldnames=fieldnames)
                w.writeheader()
                w.writerows(all_comments)
            
            print(f"\n[SUCCESS] 爬取完成！")
            print(f"评论文件: {COMMENTS_CSV}")
            
            # 最终统计
            total_rated = len([c for c in all_comments if c['user_rating'] is not None])
            print(f"总评论数: {len(all_comments)}")
            print(f"有评分的评论: {total_rated} ({total_rated/len(all_comments)*100:.1f}%)")
            
        else:
            print("[WARNING] 未找到任何评论")
            
    finally:
        driver.quit()

if __name__ == "__main__":
    # 检查是否安装了selenium
    try:
        from selenium import webdriver
    except ImportError:
        print("错误：需要安装selenium")
        print("请运行: pip install selenium")
        print("并下载ChromeDriver: https://chromedriver.chromium.org/")
        exit(1)
    
    # 询问用户要运行测试模式还是完整模式
    print("请选择运行模式:")
    print("1. 测试模式 (只爬取第一个景点的前3页)")
    print("2. 完整模式 (爬取所有景点的1-301页)")
    
    choice = input("请输入选择 (1 或 2): ").strip()
    
    if choice == "1":
        test_first_sight()
    elif choice == "2":
        main()
    else:
        print("无效选择，默认运行测试模式")
        test_first_sight()

请选择运行模式:
1. 测试模式 (只爬取第一个景点的前3页)
2. 完整模式 (爬取所有景点的1-301页)
=== 测试模式：只爬取第一个景点 ===
[INFO] 成功读取Excel文件，共 80 个景点

测试景点信息:
  名称: 外滩
  URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  景点评分: 4.8
  分类: 遛娃宝藏地
  城市: 上海 

开始测试爬取...
  [测试模式] 开始爬取 外滩 的评论页面...
  正在访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  滚动到评论区域...
  正在设置时间排序...
  正在查找排序按钮...
  找到排序容器: .sortList
  在排序容器中查找时间排序...
  找到时间排序按钮，正在点击...
  时间排序点击成功
  正在处理第 1 页...
    [测试] 第1页找到 1 条评论，累计 1 条
    示例评论 - 用户: YoYo_9T0R7W0Q4分满意性价比不高, 评分: 4.0, 内容: 难得去下可以2025-11-09IP属地：江苏举报点赞...
  正在查找第2页按钮...
  找到下一页按钮: .ant-pagination-next，正在点击...
  等待页面更新...
  页面内容没有变化，可能翻页失败
  找到下一页按钮: li[class*='next']，正在点击...
  等待页面更新...
  成功跳转到第2页
  正在处理第 2 页...
    [测试] 第2页找到 1 条评论，累计 2 条
  正在查找第3页按钮...


KeyboardInterrupt: 

In [12]:
# ============================================================
# 携程评论爬虫（本地版 - 每个景点单独CSV）
# ============================================================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random
import re
import os
from datetime import datetime


# ============================================================
# 浏览器配置
# ============================================================
def setup_driver():
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # 本地运行时可以注释掉便于调试
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver


# ============================================================
# 创建输出目录
# ============================================================
def create_output_directory():
    """创建输出目录"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"ctrip_reviews_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


# ============================================================
# 提取当前页评论函数
# ============================================================
def extract_comments_from_page(driver, attraction_name):
    """从当前页面提取评论数据"""
    try:
        comment_blocks = driver.find_elements(By.XPATH, "//div[contains(@class,'commentItem')]")
    except:
        comment_blocks = []

    reviews_data = []
    print(f"    找到 {len(comment_blocks)} 个评论块")

    for idx, block in enumerate(comment_blocks, 1):
        try:
            # 评分
            rating = None
            try:
                rating_elem = block.find_element(By.XPATH, ".//span[contains(@class,'averageScore')]")
                rating_text = rating_elem.text.strip()
                match = re.search(r"(\d+)", rating_text)
                if match:
                    rating = int(match.group(1))
            except:
                pass

            # 评论内容
            comment_text = ""
            try:
                text_elem = block.find_element(By.XPATH, ".//div[contains(@class,'commentDetail')]")
                comment_text = text_elem.text.strip()
            except:
                pass

            # 清理评论内容
            comment_text = re.sub(r".*", "", comment_text)
            comment_text = re.sub(r"\d{4}-\d{2}-\d{2}.*", "", comment_text)
            comment_text = comment_text.replace("\n", "").strip()

            # 用户名（可选）
            user_name = "匿名用户"
            try:
                user_elem = block.find_element(By.XPATH, ".//span[contains(@class,'userName')]")
                user_name = user_elem.text.strip()
            except:
                pass

            if comment_text and rating is not None:
                reviews_data.append({
                    "景点名称": attraction_name,
                    "用户名称": user_name,
                    "评分": rating,
                    "评论内容": comment_text,
                    "爬取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"      第{idx}条评论: 评分{rating}, 内容长度{len(comment_text)}")
                
        except Exception as e:
            print(f"      解析第{idx}条评论时出错: {e}")
            continue

    return reviews_data


# ============================================================
# 点击时间排序
# ============================================================
def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("    正在查找时间排序按钮...")
        
        # 多种可能的选择器
        sort_selectors = [
            "//*[contains(text(), '时间排序')]",
            "//span[contains(text(), '时间排序')]",
            "//a[contains(text(), '时间排序')]",
            "//button[contains(text(), '时间排序')]"
        ]
        
        for selector in sort_selectors:
            try:
                sort_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                driver.execute_script("arguments[0].click();", sort_btn)
                print("    ✓ 点击时间排序成功")
                time.sleep(3)
                return True
            except:
                continue
        
        print("    ⚠️ 未找到时间排序按钮")
        return False
        
    except Exception as e:
        print(f"    ✗ 点击时间排序失败: {e}")
        return False


# ============================================================
# 自动翻页爬取
# ============================================================
def scrape_all_pages(driver, url, attraction_name, max_pages=300):
    """爬取单个景点的所有评论页面"""
    print(f"  开始访问: {url}")
    driver.get(url)
    time.sleep(6)

    # 点击"用户点评"标签（如果存在）
    try:
        tab = driver.find_element(By.XPATH, "//a[contains(text(),'用户点评') or contains(text(),'点评')]")
        driver.execute_script("arguments[0].click();", tab)
        print("  ✓ 点击'用户点评'成功")
        time.sleep(4)
    except:
        print("  ⚠️ 未找到'用户点评'标签，继续当前页面")

    # 点击时间排序
    click_time_sort(driver)

    # 滚动加载第一页
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(3)

    all_reviews = []
    page = 1
    consecutive_empty_pages = 0  # 连续空页计数器

    while page <= max_pages:
        print(f"    --- 正在爬取第 {page} 页 ---")
        
        # 滚动到页面底部以加载更多内容
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        page_reviews = extract_comments_from_page(driver, attraction_name)
        
        if page_reviews:
            all_reviews.extend(page_reviews)
            consecutive_empty_pages = 0  # 重置空页计数器
            print(f"    ✓ 本页提取 {len(page_reviews)} 条评论，累计 {len(all_reviews)} 条")
        else:
            consecutive_empty_pages += 1
            print(f"    ⚠️ 第{page}页未找到评论")
            
            # 如果连续3页都没有评论，停止爬取
            if consecutive_empty_pages >= 3:
                print("    ⚠️ 连续多页无评论，停止爬取")
                break

        # 尝试点击"下一页"
        try:
            next_selectors = [
                "//a[contains(text(),'下一页')]",
                "//button[contains(text(),'下一页')]",
                "//span[contains(text(),'下一页')]",
                "//a[@aria-label='Next']",
                "//li[contains(@class,'next')]//a"
            ]
            
            next_found = False
            for selector in next_selectors:
                try:
                    next_btn = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, selector))
                    )
                    
                    # 检查下一页按钮是否被禁用
                    if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
                        print("    ⚠️ 下一页按钮被禁用，到达最后一页")
                        break
                    
                    driver.execute_script("arguments[0].click();", next_btn)
                    print("    → 点击下一页成功")
                    time.sleep(random.uniform(3, 5))
                    page += 1
                    next_found = True
                    break
                    
                except:
                    continue
            
            if not next_found:
                print("    ⚠️ 未找到可用的下一页按钮，停止爬取")
                break
                
        except Exception as e:
            print(f"    ✗ 翻页时出错: {e}")
            break

    print(f"  ✅ {attraction_name} 爬取完成，共 {len(all_reviews)} 条评论（{page} 页）")
    return all_reviews


# ============================================================
# 保存单个景点的评论
# ============================================================
def save_single_attraction_reviews(reviews_data, attraction_name, output_dir):
    """保存单个景点的评论到CSV文件"""
    if not reviews_data:
        print(f"  ⚠️ {attraction_name} 没有评论数据，跳过保存")
        return None
    
    # 清理文件名中的非法字符
    safe_filename = re.sub(r'[\\/*?:"<>|]', "", attraction_name)
    filename = f"{safe_filename}_评论.csv"
    filepath = os.path.join(output_dir, filename)
    
    df = pd.DataFrame(reviews_data)
    df.to_csv(filepath, index=False, encoding='utf-8-sig')
    
    print(f"  ✅ 已保存 {len(reviews_data)} 条评论到: {filepath}")
    return filepath


# ============================================================
# 主程序
# ============================================================
def main():
    print("=" * 70)
    print("携程评论爬虫 - 本地版")
    print("=" * 70)
    
    # 创建输出目录
    output_dir = create_output_directory()
    print(f"输出目录: {output_dir}")
    
    # 读取Excel文件（请修改为您的文件路径）
    excel_path = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"  # 修改为您的文件路径
    
    if not os.path.exists(excel_path):
        print(f"❌ 文件不存在: {excel_path}")
        print("请将Excel文件放在当前目录下，或修改excel_path变量")
        return
    
    try:
        df = pd.read_excel(excel_path)
        print(f"✓ 成功读取Excel文件，共 {len(df)} 个景点")
        print(f"列名: {df.columns.tolist()}")
    except Exception as e:
        print(f"❌ 读取Excel文件失败: {e}")
        return
    
    # 初始化浏览器
    driver = setup_driver()
    print("✓ 浏览器初始化完成")
    
    # 统计信息
    total_attractions = len(df)
    successful_attractions = 0
    
    try:
        for index, row in df.iterrows():
            attraction_name = row['name']
            url = row['url']
            
            print(f"\n{'='*50}")
            print(f"处理第 {index+1}/{total_attractions} 个景点: {attraction_name}")
            print(f"URL: {url}")
            print(f"{'='*50}")
            
            # 检查URL有效性
            if not isinstance(url, str) or not url.startswith("http"):
                print(f"❌ 跳过 {attraction_name}: URL无效")
                continue
            
            try:
                # 爬取该景点的所有评论
                reviews = scrape_all_pages(driver, url, attraction_name, max_pages=300)
                
                # 保存单个景点的评论
                if reviews:
                    save_single_attraction_reviews(reviews, attraction_name, output_dir)
                    successful_attractions += 1
                else:
                    print(f"  ⚠️ {attraction_name} 未获取到评论")
                
                # 景点间延迟
                delay = random.uniform(5, 10)
                print(f"  等待 {delay:.1f} 秒后处理下一个景点...")
                time.sleep(delay)
                
            except Exception as e:
                print(f"❌ 处理 {attraction_name} 时出错: {e}")
                continue
        
        # 最终统计
        print(f"\n{'='*70}")
        print("爬取完成!")
        print(f"总景点数: {total_attractions}")
        print(f"成功爬取: {successful_attractions}")
        print(f"输出目录: {output_dir}")
        print(f"{'='*70}")
        
    finally:
        driver.quit()
        print("✓ 浏览器已关闭")


# ============================================================
# 运行
# ============================================================
if __name__ == "__main__":
    main()

携程评论爬虫 - 本地版
输出目录: ctrip_reviews_20251111_225233
✓ 成功读取Excel文件，共 80 个景点
列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']
✓ 浏览器初始化完成

处理第 1/80 个景点: 外滩
URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  开始访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  ✓ 点击'用户点评'成功
    正在查找时间排序按钮...
    ✓ 点击时间排序成功
    --- 正在爬取第 1 页 ---
    找到 10 个评论块
      第1条评论: 评分3, 内容长度23
      第2条评论: 评分5, 内容长度11
      第3条评论: 评分5, 内容长度8
      第4条评论: 评分5, 内容长度7
      第5条评论: 评分3, 内容长度66
      第6条评论: 评分4, 内容长度8
      第7条评论: 评分5, 内容长度200
      第8条评论: 评分5, 内容长度5
      第9条评论: 评分4, 内容长度12
      第10条评论: 评分5, 内容长度9
    ✓ 本页提取 10 条评论，累计 10 条
    → 点击下一页成功
    --- 正在爬取第 2 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度58
      第2条评论: 评分5, 内容长度4
      第3条评论: 评分5, 内容长度3
      第4条评论: 评分5, 内容长度13
      第5条评论: 评分5, 内容长度17
      第6条评论: 评分4, 内容长度130
      第7条评论: 评分5, 内容长度27
      第8条评论: 评分5, 内容长度14
      第9条评论: 评分5, 内容长度10
      第10条评论: 评分5, 内容长度22
    ✓ 本页提取 10 条评论，累计 20 条
    → 点击下一页成

In [13]:
# ============================================================
# 携程评论爬虫（本地版 - 每个景点单独CSV）
# ============================================================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random
import re
import os
from datetime import datetime


# ============================================================
# 浏览器配置
# ============================================================
def setup_driver():
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # 本地运行时可以注释掉便于调试
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver


# ============================================================
# 创建输出目录
# ============================================================
def create_output_directory():
    """创建输出目录"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"ctrip_reviews_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


# ============================================================
# 提取当前页评论函数
# ============================================================
def extract_comments_from_page(driver, attraction_name):
    """从当前页面提取评论数据"""
    try:
        comment_blocks = driver.find_elements(By.XPATH, "//div[contains(@class,'commentItem')]")
    except:
        comment_blocks = []

    reviews_data = []
    print(f"    找到 {len(comment_blocks)} 个评论块")

    for idx, block in enumerate(comment_blocks, 1):
        try:
            # 评分
            rating = None
            try:
                rating_elem = block.find_element(By.XPATH, ".//span[contains(@class,'averageScore')]")
                rating_text = rating_elem.text.strip()
                match = re.search(r"(\d+)", rating_text)
                if match:
                    rating = int(match.group(1))
            except:
                pass

            # 评论内容
            comment_text = ""
            try:
                text_elem = block.find_element(By.XPATH, ".//div[contains(@class,'commentDetail')]")
                comment_text = text_elem.text.strip()
            except:
                pass

            # 清理评论内容
            comment_text = re.sub(r".*", "", comment_text)
            comment_text = re.sub(r"\d{4}-\d{2}-\d{2}.*", "", comment_text)
            comment_text = comment_text.replace("\n", "").strip()

            # 用户名（可选）
            user_name = "匿名用户"
            try:
                user_elem = block.find_element(By.XPATH, ".//span[contains(@class,'userName')]")
                user_name = user_elem.text.strip()
            except:
                pass

            if comment_text and rating is not None:
                reviews_data.append({
                    "景点名称": attraction_name,
                    "用户名称": user_name,
                    "评分": rating,
                    "评论内容": comment_text,
                    "爬取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"      第{idx}条评论: 评分{rating}, 内容长度{len(comment_text)}")
                
        except Exception as e:
            print(f"      解析第{idx}条评论时出错: {e}")
            continue

    return reviews_data


# ============================================================
# 点击时间排序
# ============================================================
def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("    正在查找时间排序按钮...")
        
        # 多种可能的选择器
        sort_selectors = [
            "//*[contains(text(), '时间排序')]",
            "//span[contains(text(), '时间排序')]",
            "//a[contains(text(), '时间排序')]",
            "//button[contains(text(), '时间排序')]"
        ]
        
        for selector in sort_selectors:
            try:
                sort_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                driver.execute_script("arguments[0].click();", sort_btn)
                print("    ✓ 点击时间排序成功")
                time.sleep(3)
                return True
            except:
                continue
        
        print("    ⚠️ 未找到时间排序按钮")
        return False
        
    except Exception as e:
        print(f"    ✗ 点击时间排序失败: {e}")
        return False


# ============================================================
# 自动翻页爬取
# ============================================================
def scrape_all_pages(driver, url, attraction_name, max_pages=300):
    """爬取单个景点的所有评论页面"""
    print(f"  开始访问: {url}")
    driver.get(url)
    time.sleep(6)

    # 点击"用户点评"标签（如果存在）
    try:
        tab = driver.find_element(By.XPATH, "//a[contains(text(),'用户点评') or contains(text(),'点评')]")
        driver.execute_script("arguments[0].click();", tab)
        print("  ✓ 点击'用户点评'成功")
        time.sleep(4)
    except:
        print("  ⚠️ 未找到'用户点评'标签，继续当前页面")

    # 点击时间排序
    click_time_sort(driver)

    # 滚动加载第一页
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(3)

    all_reviews = []
    page = 1
    consecutive_empty_pages = 0  # 连续空页计数器

    while page <= max_pages:
        print(f"    --- 正在爬取第 {page} 页 ---")
        
        # 滚动到页面底部以加载更多内容
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        page_reviews = extract_comments_from_page(driver, attraction_name)
        
        if page_reviews:
            all_reviews.extend(page_reviews)
            consecutive_empty_pages = 0  # 重置空页计数器
            print(f"    ✓ 本页提取 {len(page_reviews)} 条评论，累计 {len(all_reviews)} 条")
        else:
            consecutive_empty_pages += 1
            print(f"    ⚠️ 第{page}页未找到评论")
            
            # 如果连续3页都没有评论，停止爬取
            if consecutive_empty_pages >= 3:
                print("    ⚠️ 连续多页无评论，停止爬取")
                break

        # 尝试点击"下一页"
        try:
            next_selectors = [
                "//a[contains(text(),'下一页')]",
                "//button[contains(text(),'下一页')]",
                "//span[contains(text(),'下一页')]",
                "//a[@aria-label='Next']",
                "//li[contains(@class,'next')]//a"
            ]
            
            next_found = False
            for selector in next_selectors:
                try:
                    next_btn = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, selector))
                    )
                    
                    # 检查下一页按钮是否被禁用
                    if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
                        print("    ⚠️ 下一页按钮被禁用，到达最后一页")
                        break
                    
                    driver.execute_script("arguments[0].click();", next_btn)
                    print("    → 点击下一页成功")
                    time.sleep(random.uniform(3, 5))
                    page += 1
                    next_found = True
                    break
                    
                except:
                    continue
            
            if not next_found:
                print("    ⚠️ 未找到可用的下一页按钮，停止爬取")
                break
                
        except Exception as e:
            print(f"    ✗ 翻页时出错: {e}")
            break

    print(f"  ✅ {attraction_name} 爬取完成，共 {len(all_reviews)} 条评论（{page} 页）")
    return all_reviews


# ============================================================
# 保存单个景点的评论
# ============================================================
def save_single_attraction_reviews(reviews_data, attraction_name, output_dir):
    """保存单个景点的评论到CSV文件"""
    if not reviews_data:
        print(f"  ⚠️ {attraction_name} 没有评论数据，跳过保存")
        return None
    
    # 清理文件名中的非法字符
    safe_filename = re.sub(r'[\\/*?:"<>|]', "", attraction_name)
    filename = f"{safe_filename}_评论.csv"
    filepath = os.path.join(output_dir, filename)
    
    df = pd.DataFrame(reviews_data)
    df.to_csv(filepath, index=False, encoding='utf-8-sig')
    
    print(f"  ✅ 已保存 {len(reviews_data)} 条评论到: {filepath}")
    return filepath


# ============================================================
# 主程序
# ============================================================
def main():
    print("=" * 70)
    print("携程评论爬虫 - 本地版")
    print("=" * 70)
    
    # 创建输出目录
    output_dir = create_output_directory()
    print(f"输出目录: {output_dir}")
    
    # 读取Excel文件（请修改为您的文件路径）
    excel_path = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"  # 修改为您的文件路径
    
    if not os.path.exists(excel_path):
        print(f"❌ 文件不存在: {excel_path}")
        print("请将Excel文件放在当前目录下，或修改excel_path变量")
        return
    
    try:
        df = pd.read_excel(excel_path)
        print(f"✓ 成功读取Excel文件，共 {len(df)} 个景点")
        print(f"列名: {df.columns.tolist()}")
    except Exception as e:
        print(f"❌ 读取Excel文件失败: {e}")
        return
    
    # 初始化浏览器
    driver = setup_driver()
    print("✓ 浏览器初始化完成")
    
    # 统计信息
    total_attractions = len(df)
    successful_attractions = 0
    
    try:
        for index, row in df.iterrows():
            attraction_name = row['name']
            url = row['url']
            
            print(f"\n{'='*50}")
            print(f"处理第 {index+1}/{total_attractions} 个景点: {attraction_name}")
            print(f"URL: {url}")
            print(f"{'='*50}")
            
            # 检查URL有效性
            if not isinstance(url, str) or not url.startswith("http"):
                print(f"❌ 跳过 {attraction_name}: URL无效")
                continue
            
            try:
                # 爬取该景点的所有评论
                reviews = scrape_all_pages(driver, url, attraction_name, max_pages=5)
                
                # 保存单个景点的评论
                if reviews:
                    save_single_attraction_reviews(reviews, attraction_name, output_dir)
                    successful_attractions += 1
                else:
                    print(f"  ⚠️ {attraction_name} 未获取到评论")
                
                # 景点间延迟
                delay = random.uniform(5, 10)
                print(f"  等待 {delay:.1f} 秒后处理下一个景点...")
                time.sleep(delay)
                
            except Exception as e:
                print(f"❌ 处理 {attraction_name} 时出错: {e}")
                continue
        
        # 最终统计
        print(f"\n{'='*70}")
        print("爬取完成!")
        print(f"总景点数: {total_attractions}")
        print(f"成功爬取: {successful_attractions}")
        print(f"输出目录: {output_dir}")
        print(f"{'='*70}")
        
    finally:
        driver.quit()
        print("✓ 浏览器已关闭")


# ============================================================
# 运行
# ============================================================
if __name__ == "__main__":
    main()

携程评论爬虫 - 本地版
输出目录: ctrip_reviews_20251111_225822
✓ 成功读取Excel文件，共 80 个景点
列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']
✓ 浏览器初始化完成

处理第 1/80 个景点: 外滩
URL: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  开始访问: https://you.ctrip.com/sight/shanghai2/736.html?scene=online
  ✓ 点击'用户点评'成功
    正在查找时间排序按钮...
    ✓ 点击时间排序成功
    --- 正在爬取第 1 页 ---
    找到 10 个评论块
      第1条评论: 评分3, 内容长度23
      第2条评论: 评分5, 内容长度11
      第3条评论: 评分5, 内容长度8
      第4条评论: 评分5, 内容长度7
      第5条评论: 评分3, 内容长度66
      第6条评论: 评分4, 内容长度8
      第7条评论: 评分5, 内容长度200
      第8条评论: 评分5, 内容长度5
      第9条评论: 评分4, 内容长度12
      第10条评论: 评分5, 内容长度9
    ✓ 本页提取 10 条评论，累计 10 条
    → 点击下一页成功
    --- 正在爬取第 2 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度58
      第2条评论: 评分5, 内容长度4
      第3条评论: 评分5, 内容长度3
      第4条评论: 评分5, 内容长度13
      第5条评论: 评分5, 内容长度17
      第6条评论: 评分4, 内容长度130
      第7条评论: 评分5, 内容长度27
      第8条评论: 评分5, 内容长度14
      第9条评论: 评分5, 内容长度10
      第10条评论: 评分5, 内容长度22
    ✓ 本页提取 10 条评论，累计 20 条
    → 点击下一页成

In [15]:
# ============================================================
# 携程评论爬虫（本地版 - 每个景点单独CSV）
# ============================================================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random
import re
import os
from datetime import datetime


# ============================================================
# 浏览器配置
# ============================================================
def setup_driver():
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # 本地运行时可以注释掉便于调试
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver


# ============================================================
# 创建输出目录
# ============================================================
def create_output_directory():
    """创建输出目录"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"ctrip_reviews_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


# ============================================================
# 提取当前页评论函数
# ============================================================
def extract_comments_from_page(driver, attraction_name):
    """从当前页面提取评论数据"""
    try:
        comment_blocks = driver.find_elements(By.XPATH, "//div[contains(@class,'commentItem')]")
    except:
        comment_blocks = []

    reviews_data = []
    print(f"    找到 {len(comment_blocks)} 个评论块")

    for idx, block in enumerate(comment_blocks, 1):
        try:
            # 评分
            rating = None
            try:
                rating_elem = block.find_element(By.XPATH, ".//span[contains(@class,'averageScore')]")
                rating_text = rating_elem.text.strip()
                match = re.search(r"(\d+)", rating_text)
                if match:
                    rating = int(match.group(1))
            except:
                pass

            # 评论内容
            comment_text = ""
            try:
                text_elem = block.find_element(By.XPATH, ".//div[contains(@class,'commentDetail')]")
                comment_text = text_elem.text.strip()
            except:
                pass

            # 清理评论内容
            comment_text = re.sub(r".*", "", comment_text)
            comment_text = re.sub(r"\d{4}-\d{2}-\d{2}.*", "", comment_text)
            comment_text = comment_text.replace("\n", "").strip()

            # 用户名（可选）
            user_name = "匿名用户"
            try:
                user_elem = block.find_element(By.XPATH, ".//span[contains(@class,'userName')]")
                user_name = user_elem.text.strip()
            except:
                pass

            if comment_text and rating is not None:
                reviews_data.append({
                    "景点名称": attraction_name,
                    "用户名称": user_name,
                    "评分": rating,
                    "评论内容": comment_text,
                    "爬取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"      第{idx}条评论: 评分{rating}, 内容长度{len(comment_text)}")
                
        except Exception as e:
            print(f"      解析第{idx}条评论时出错: {e}")
            continue

    return reviews_data


# ============================================================
# 点击时间排序
# ============================================================
def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("    正在查找时间排序按钮...")
        
        # 多种可能的选择器
        sort_selectors = [
            "//*[contains(text(), '时间排序')]",
            "//span[contains(text(), '时间排序')]",
            "//a[contains(text(), '时间排序')]",
            "//button[contains(text(), '时间排序')]"
        ]
        
        for selector in sort_selectors:
            try:
                sort_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                driver.execute_script("arguments[0].click();", sort_btn)
                print("    ✓ 点击时间排序成功")
                time.sleep(3)
                return True
            except:
                continue
        
        print("    ⚠️ 未找到时间排序按钮")
        return False
        
    except Exception as e:
        print(f"    ✗ 点击时间排序失败: {e}")
        return False


# ============================================================
# 自动翻页爬取
# ============================================================
def scrape_all_pages(driver, url, attraction_name, max_pages=300):
    """爬取单个景点的所有评论页面"""
    print(f"  开始访问: {url}")
    driver.get(url)
    time.sleep(6)

    # 点击"用户点评"标签（如果存在）
    try:
        tab = driver.find_element(By.XPATH, "//a[contains(text(),'用户点评') or contains(text(),'点评')]")
        driver.execute_script("arguments[0].click();", tab)
        print("  ✓ 点击'用户点评'成功")
        time.sleep(4)
    except:
        print("  ⚠️ 未找到'用户点评'标签，继续当前页面")

    # 点击时间排序
    click_time_sort(driver)

    # 滚动加载第一页
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(3)

    all_reviews = []
    page = 1
    consecutive_empty_pages = 0  # 连续空页计数器

    while page <= max_pages:
        print(f"    --- 正在爬取第 {page} 页 ---")
        
        # 滚动到页面底部以加载更多内容
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        page_reviews = extract_comments_from_page(driver, attraction_name)
        
        if page_reviews:
            all_reviews.extend(page_reviews)
            consecutive_empty_pages = 0  # 重置空页计数器
            print(f"    ✓ 本页提取 {len(page_reviews)} 条评论，累计 {len(all_reviews)} 条")
        else:
            consecutive_empty_pages += 1
            print(f"    ⚠️ 第{page}页未找到评论")
            
            # 如果连续3页都没有评论，停止爬取
            if consecutive_empty_pages >= 3:
                print("    ⚠️ 连续多页无评论，停止爬取")
                break

        # 尝试点击"下一页"
        try:
            next_selectors = [
                "//a[contains(text(),'下一页')]",
                "//button[contains(text(),'下一页')]",
                "//span[contains(text(),'下一页')]",
                "//a[@aria-label='Next']",
                "//li[contains(@class,'next')]//a"
            ]
            
            next_found = False
            for selector in next_selectors:
                try:
                    next_btn = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, selector))
                    )
                    
                    # 检查下一页按钮是否被禁用
                    if "disabled" in next_btn.get_attribute("class") or not next_btn.is_enabled():
                        print("    ⚠️ 下一页按钮被禁用，到达最后一页")
                        break
                    
                    driver.execute_script("arguments[0].click();", next_btn)
                    print("    → 点击下一页成功")
                    time.sleep(random.uniform(3, 5))
                    page += 1
                    next_found = True
                    break
                    
                except:
                    continue
            
            if not next_found:
                print("    ⚠️ 未找到可用的下一页按钮，停止爬取")
                break
                
        except Exception as e:
            print(f"    ✗ 翻页时出错: {e}")
            break

    print(f"  ✅ {attraction_name} 爬取完成，共 {len(all_reviews)} 条评论（{page} 页）")
    return all_reviews


# ============================================================
# 保存单个景点的评论
# ============================================================
def save_single_attraction_reviews(reviews_data, attraction_name, output_dir):
    """保存单个景点的评论到CSV文件"""
    if not reviews_data:
        print(f"  ⚠️ {attraction_name} 没有评论数据，跳过保存")
        return None
    
    # 清理文件名中的非法字符
    safe_filename = re.sub(r'[\\/*?:"<>|]', "", attraction_name)
    filename = f"{safe_filename}_评论.csv"
    filepath = os.path.join(output_dir, filename)
    
    df = pd.DataFrame(reviews_data)
    df.to_csv(filepath, index=False, encoding='utf-8-sig')
    
    print(f"  ✅ 已保存 {len(reviews_data)} 条评论到: {filepath}")
    return filepath


# ============================================================
# 主程序
# ============================================================
def main():
    print("=" * 70)
    print("携程评论爬虫 - 本地版")
    print("=" * 70)
    
    # 创建输出目录
    output_dir = create_output_directory()
    print(f"输出目录: {output_dir}")
    
    # 读取Excel文件（请修改为您的文件路径）
    excel_path = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"  # 修改为您的文件路径
    
    if not os.path.exists(excel_path):
        print(f"❌ 文件不存在: {excel_path}")
        print("请将Excel文件放在当前目录下，或修改excel_path变量")
        return
    
    try:
        df = pd.read_excel(excel_path)
        print(f"✓ 成功读取Excel文件，共 {len(df)} 个景点")
        print(f"列名: {df.columns.tolist()}")
    except Exception as e:
        print(f"❌ 读取Excel文件失败: {e}")
        return
    
    # 初始化浏览器
    driver = setup_driver()
    print("✓ 浏览器初始化完成")
    
    # 统计信息
    total_attractions = len(df)
    successful_attractions = 0
    
    try:
        for index, row in df.iterrows():
            attraction_name = row['name']
            url = row['url']
            
            print(f"\n{'='*50}")
            print(f"处理第 {index+1}/{total_attractions} 个景点: {attraction_name}")
            print(f"URL: {url}")
            print(f"{'='*50}")
            
            # 检查URL有效性
            if not isinstance(url, str) or not url.startswith("http"):
                print(f"❌ 跳过 {attraction_name}: URL无效")
                continue
            
            try:
                # 爬取该景点的所有评论
                reviews = scrape_all_pages(driver, url, attraction_name, max_pages=300)
                
                # 保存单个景点的评论
                if reviews:
                    save_single_attraction_reviews(reviews, attraction_name, output_dir)
                    successful_attractions += 1
                else:
                    print(f"  ⚠️ {attraction_name} 未获取到评论")
                
                # 景点间延迟
                delay = random.uniform(5, 10)
                print(f"  等待 {delay:.1f} 秒后处理下一个景点...")
                time.sleep(delay)
                
            except Exception as e:
                print(f"❌ 处理 {attraction_name} 时出错: {e}")
                continue
        
        # 最终统计
        print(f"\n{'='*70}")
        print("爬取完成!")
        print(f"总景点数: {total_attractions}")
        print(f"成功爬取: {successful_attractions}")
        print(f"输出目录: {output_dir}")
        print(f"{'='*70}")
        
    finally:
        driver.quit()
        print("✓ 浏览器已关闭")


# ============================================================
# 运行
# ============================================================
if __name__ == "__main__":
    main()

携程评论爬虫 - 本地版
输出目录: ctrip_reviews_20251112_123733
✓ 成功读取Excel文件，共 65 个景点
列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']
✓ 浏览器初始化完成

处理第 1/65 个景点: 河南博物院
URL: https://you.ctrip.com/sight/zhengzhou157/9593.html?scene=online
  开始访问: https://you.ctrip.com/sight/zhengzhou157/9593.html?scene=online
  ✓ 点击'用户点评'成功
    正在查找时间排序按钮...
    ✓ 点击时间排序成功
    --- 正在爬取第 1 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度32
      第2条评论: 评分5, 内容长度11
      第3条评论: 评分5, 内容长度13
      第4条评论: 评分5, 内容长度16
      第5条评论: 评分5, 内容长度80
      第6条评论: 评分5, 内容长度9
      第7条评论: 评分5, 内容长度27
      第8条评论: 评分5, 内容长度7
      第9条评论: 评分5, 内容长度53
      第10条评论: 评分5, 内容长度6
    ✓ 本页提取 10 条评论，累计 10 条
    → 点击下一页成功
    --- 正在爬取第 2 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度8
      第2条评论: 评分1, 内容长度480
      第3条评论: 评分5, 内容长度11
      第4条评论: 评分5, 内容长度6
      第5条评论: 评分5, 内容长度7
      第6条评论: 评分5, 内容长度41
      第7条评论: 评分5, 内容长度15
      第8条评论: 评分5, 内容长度23
      第9条评论: 评分5, 内容长度13
      第10条评论: 评分5, 内容长度17
    ✓ 本页提取 10 条评论，累计 20 条
 

KeyboardInterrupt: 

In [18]:
# ============================================================
# 携程评论爬虫（本地版 - 每个景点单独CSV）
# ============================================================

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random
import re
import os
from datetime import datetime


# ============================================================
# 浏览器配置
# ============================================================
def setup_driver():
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # 本地运行时可以注释掉便于调试
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver


# ============================================================
# 创建输出目录
# ============================================================
def create_output_directory():
    """创建输出目录"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"ctrip_reviews_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


# ============================================================
# 提取当前页评论函数
# ============================================================
def extract_comments_from_page(driver, attraction_name):
    """从当前页面提取评论数据"""
    try:
        comment_blocks = driver.find_elements(By.XPATH, "//div[contains(@class,'commentItem')]")
    except:
        comment_blocks = []

    reviews_data = []
    print(f"    找到 {len(comment_blocks)} 个评论块")

    for idx, block in enumerate(comment_blocks, 1):
        try:
            # 评分
            rating = None
            try:
                rating_elem = block.find_element(By.XPATH, ".//span[contains(@class,'averageScore')]")
                rating_text = rating_elem.text.strip()
                match = re.search(r"(\d+)", rating_text)
                if match:
                    rating = int(match.group(1))
            except:
                pass

            # 评论内容
            comment_text = ""
            try:
                text_elem = block.find_element(By.XPATH, ".//div[contains(@class,'commentDetail')]")
                comment_text = text_elem.text.strip()
            except:
                pass

            # 清理评论内容
            comment_text = re.sub(r".*", "", comment_text)
            comment_text = re.sub(r"\d{4}-\d{2}-\d{2}.*", "", comment_text)
            comment_text = comment_text.replace("\n", "").strip()

            # 用户名（可选）
            user_name = "匿名用户"
            try:
                user_elem = block.find_element(By.XPATH, ".//span[contains(@class,'userName')]")
                user_name = user_elem.text.strip()
            except:
                pass

            if comment_text and rating is not None:
                reviews_data.append({
                    "景点名称": attraction_name,
                    "用户名称": user_name,
                    "评分": rating,
                    "评论内容": comment_text,
                    "爬取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"      第{idx}条评论: 评分{rating}, 内容长度{len(comment_text)}")
                
        except Exception as e:
            print(f"      解析第{idx}条评论时出错: {e}")
            continue

    return reviews_data


# ============================================================
# 点击时间排序
# ============================================================
def click_time_sort(driver):
    """点击时间排序按钮"""
    try:
        print("    正在查找时间排序按钮...")
        
        # 多种可能的选择器
        sort_selectors = [
            "//*[contains(text(), '时间排序')]",
            "//span[contains(text(), '时间排序')]",
            "//a[contains(text(), '时间排序')]",
            "//button[contains(text(), '时间排序')]"
        ]
        
        for selector in sort_selectors:
            try:
                sort_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                driver.execute_script("arguments[0].click();", sort_btn)
                print("    ✓ 点击时间排序成功")
                time.sleep(3)
                return True
            except:
                continue
        
        print("    ⚠️ 未找到时间排序按钮")
        return False
        
    except Exception as e:
        print(f"    ✗ 点击时间排序失败: {e}")
        return False


# ============================================================
# 检查是否到达最后一页
# ============================================================
def is_last_page(driver):
    """检查是否到达最后一页"""
    try:
        # 检查下一页按钮是否被禁用
        disabled_selectors = [
            "//a[contains(text(),'下一页')][contains(@class,'disabled')]",
            "//button[contains(text(),'下一页')][@disabled]",
            "//li[contains(@class,'next') and contains(@class,'disabled')]",
            "//a[@aria-label='Next' and contains(@class,'disabled')]"
        ]
        
        for selector in disabled_selectors:
            try:
                disabled_btn = driver.find_element(By.XPATH, selector)
                if disabled_btn:
                    print("    ⚠️ 检测到下一页按钮被禁用，到达最后一页")
                    return True
            except:
                continue
        
        # 检查是否有"没有更多数据"的提示
        no_data_selectors = [
            "//*[contains(text(), '没有更多')]",
            "//*[contains(text(), '暂无数据')]",
            "//*[contains(text(), '没有数据')]",
            "//*[contains(text(), '加载完毕')]"
        ]
        
        for selector in no_data_selectors:
            try:
                no_data_elem = driver.find_element(By.XPATH, selector)
                if no_data_elem.is_displayed():
                    print("    ⚠️ 检测到没有更多数据的提示，到达最后一页")
                    return True
            except:
                continue
                
        return False
        
    except Exception as e:
        print(f"    检查最后一页时出错: {e}")
        return False


# ============================================================
# 点击下一页
# ============================================================
def click_next_page(driver):
    """尝试点击下一页按钮"""
    try:
        next_selectors = [
            "//a[contains(text(),'下一页')]",
            "//button[contains(text(),'下一页')]",
            "//span[contains(text(),'下一页')]",
            "//a[@aria-label='Next']",
            "//li[contains(@class,'next')]//a",
            "//*[contains(@class, 'next-btn')]",
            "//*[contains(@class, 'next-page')]"
        ]
        
        for selector in next_selectors:
            try:
                next_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, selector))
                )
                
                # 检查按钮是否可见和可用
                if next_btn.is_displayed() and next_btn.is_enabled():
                    driver.execute_script("arguments[0].click();", next_btn)
                    print("    → 点击下一页成功")
                    time.sleep(random.uniform(3, 5))
                    return True
                    
            except:
                continue
        
        print("    ⚠️ 未找到可用的下一页按钮")
        return False
        
    except Exception as e:
        print(f"    ✗ 点击下一页时出错: {e}")
        return False


# ============================================================
# 自动翻页爬取
# ============================================================
def scrape_all_pages(driver, url, attraction_name, max_pages=300):
    """爬取单个景点的所有评论页面"""
    print(f"  开始访问: {url}")
    driver.get(url)
    time.sleep(6)

    # 点击"用户点评"标签（如果存在）
    try:
        tab = driver.find_element(By.XPATH, "//a[contains(text(),'用户点评') or contains(text(),'点评')]")
        driver.execute_script("arguments[0].click();", tab)
        print("  ✓ 点击'用户点评'成功")
        time.sleep(4)
    except:
        print("  ⚠️ 未找到'用户点评'标签，继续当前页面")

    # 点击时间排序
    click_time_sort(driver)

    # 滚动加载第一页
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(3)

    all_reviews = []
    page = 1
    consecutive_empty_pages = 0  # 连续空页计数器
    last_page_reached = False

    while page <= max_pages and not last_page_reached:
        print(f"    --- 正在爬取第 {page} 页 ---")
        
        # 滚动到页面底部以加载更多内容
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        page_reviews = extract_comments_from_page(driver, attraction_name)
        
        if page_reviews:
            all_reviews.extend(page_reviews)
            consecutive_empty_pages = 0  # 重置空页计数器
            print(f"    ✓ 本页提取 {len(page_reviews)} 条评论，累计 {len(all_reviews)} 条")
        else:
            consecutive_empty_pages += 1
            print(f"    ⚠️ 第{page}页未找到评论")
            
            # 如果连续3页都没有评论，停止爬取
            if consecutive_empty_pages >= 3:
                print("    ⚠️ 连续多页无评论，停止爬取")
                break

        # 检查是否到达最后一页
        if is_last_page(driver):
            print("    ✅ 已到达最后一页")
            last_page_reached = True
            break

        # 尝试点击"下一页"
        if page < max_pages and not last_page_reached:
            if not click_next_page(driver):
                print("    ⚠️ 无法点击下一页，可能已到达最后一页")
                last_page_reached = True
            else:
                page += 1
        else:
            break

    print(f"  ✅ {attraction_name} 爬取完成，共 {len(all_reviews)} 条评论（{page} 页）")
    return all_reviews


# ============================================================
# 保存单个景点的评论
# ============================================================
def save_single_attraction_reviews(reviews_data, attraction_name, output_dir):
    """保存单个景点的评论到CSV文件"""
    if not reviews_data:
        print(f"  ⚠️ {attraction_name} 没有评论数据，跳过保存")
        return None
    
    # 清理文件名中的非法字符
    safe_filename = re.sub(r'[\\/*?:"<>|]', "", attraction_name)
    filename = f"{safe_filename}_评论.csv"
    filepath = os.path.join(output_dir, filename)
    
    df = pd.DataFrame(reviews_data)
    df.to_csv(filepath, index=False, encoding='utf-8-sig')
    
    print(f"  ✅ 已保存 {len(reviews_data)} 条评论到: {filepath}")
    return filepath


# ============================================================
# 主程序
# ============================================================
def main():
    print("=" * 70)
    print("携程评论爬虫 - 本地版")
    print("=" * 70)
    
    # 创建输出目录
    output_dir = create_output_directory()
    print(f"输出目录: {output_dir}")
    
    # 读取Excel文件（请修改为您的文件路径）
    excel_path = "4. 下一步研究的景点列表（80个免费4.8分以上）.xlsx"  # 修改为您的文件路径
    
    if not os.path.exists(excel_path):
        print(f"❌ 文件不存在: {excel_path}")
        print("请将Excel文件放在当前目录下，或修改excel_path变量")
        return
    
    try:
        df = pd.read_excel(excel_path)
        print(f"✓ 成功读取Excel文件，共 {len(df)} 个景点")
        print(f"列名: {df.columns.tolist()}")
    except Exception as e:
        print(f"❌ 读取Excel文件失败: {e}")
        return
    
    # 初始化浏览器
    driver = setup_driver()
    print("✓ 浏览器初始化完成")
    
    # 统计信息
    total_attractions = len(df)
    successful_attractions = 0
    
    try:
        for index, row in df.iterrows():
            attraction_name = row['name']
            url = row['url']
            
            print(f"\n{'='*50}")
            print(f"处理第 {index+1}/{total_attractions} 个景点: {attraction_name}")
            print(f"URL: {url}")
            print(f"{'='*50}")
            
            # 检查URL有效性
            if not isinstance(url, str) or not url.startswith("http"):
                print(f"❌ 跳过 {attraction_name}: URL无效")
                continue
            
            try:
                # 爬取该景点的所有评论
                reviews = scrape_all_pages(driver, url, attraction_name, max_pages=300)
                
                # 保存单个景点的评论
                if reviews:
                    save_single_attraction_reviews(reviews, attraction_name, output_dir)
                    successful_attractions += 1
                else:
                    print(f"  ⚠️ {attraction_name} 未获取到评论")
                
                # 景点间延迟
                delay = random.uniform(5, 10)
                print(f"  等待 {delay:.1f} 秒后处理下一个景点...")
                time.sleep(delay)
                
            except Exception as e:
                print(f"❌ 处理 {attraction_name} 时出错: {e}")
                continue
        
        # 最终统计
        print(f"\n{'='*70}")
        print("爬取完成!")
        print(f"总景点数: {total_attractions}")
        print(f"成功爬取: {successful_attractions}")
        print(f"输出目录: {output_dir}")
        print(f"{'='*70}")
        
    finally:
        driver.quit()
        print("✓ 浏览器已关闭")


# ============================================================
# 运行
# ============================================================
if __name__ == "__main__":
    main()

携程评论爬虫 - 本地版
输出目录: ctrip_reviews_20251112_205651
✓ 成功读取Excel文件，共 1 个景点
列名: ['page', 'name', 'url', 'free', '分类', '评分（0-5分）', '点评数', '所在城市']
✓ 浏览器初始化完成

处理第 1/1 个景点: 敦煌書局
URL: https://you.ctrip.com/sight/dunhuang8/140039556.html?scene=online
  开始访问: https://you.ctrip.com/sight/dunhuang8/140039556.html?scene=online
  ✓ 点击'用户点评'成功
    正在查找时间排序按钮...
    ✓ 点击时间排序成功
    --- 正在爬取第 1 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度8
      第2条评论: 评分5, 内容长度19
      第3条评论: 评分5, 内容长度30
      第4条评论: 评分3, 内容长度7
      第5条评论: 评分5, 内容长度13
      第6条评论: 评分5, 内容长度49
      第7条评论: 评分5, 内容长度23
      第8条评论: 评分4, 内容长度9
      第9条评论: 评分4, 内容长度9
      第10条评论: 评分5, 内容长度35
    ✓ 本页提取 10 条评论，累计 10 条
    → 点击下一页成功
    --- 正在爬取第 2 页 ---
    找到 10 个评论块
      第1条评论: 评分5, 内容长度26
      第2条评论: 评分5, 内容长度6
      第3条评论: 评分5, 内容长度21
      第4条评论: 评分5, 内容长度63
      第5条评论: 评分4, 内容长度10
      第6条评论: 评分5, 内容长度4
      第7条评论: 评分3, 内容长度77
      第8条评论: 评分5, 内容长度16
      第9条评论: 评分5, 内容长度8
      第10条评论: 评分5, 内容长度17
    ✓ 本页提取 10 条评论，累计 20 条
  