In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
携程景点爬取 - 只爬第二个类别 + 修复位置信息 + 爬取300页
"""

import re
import time
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime

# 常量设置
BASE_URL = "https://you.ctrip.com/sight/china110000/s0-p{}.html"

# 请求头
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://you.ctrip.com/sight/china110000.html"
}

def setup_session():
    """设置请求会话"""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session

def fetch_page(session, url, page_no, retry_count=3):
    """获取页面内容，支持重试"""
    print(f"[INFO] 正在加载第 {page_no} 页")
    
    for attempt in range(retry_count):
        try:
            time.sleep(random.uniform(2, 5))
            response = session.get(url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            html = response.text
            
            if len(html) < 1000:
                print(f"[WARNING] 第 {page_no} 页HTML内容过短，尝试重试... ({attempt + 1}/{retry_count})")
                continue
                
            print(f"[SUCCESS] 第 {page_no} 页获取成功")
            return html
            
        except Exception as e:
            print(f"[ERROR] 第 {page_no} 页请求失败 (尝试 {attempt + 1}/{retry_count}): {e}")
            if attempt < retry_count - 1:
                time.sleep(5)
    
    return None

def extract_category_info(card):
    """提取类别信息 - 修复：只爬取第二个类别，忽略第一个"""
    category_info = {
        "category_tags": [],  # 所有类别标签列表（现在只包含第二个类别）
        "primary_category": None,  # 主要类别（现在设为第二个类别）
        "all_categories": []  # 所有类别
    }
    
    try:
        # 使用精确的CSS选择器查找所有类别标签
        category_selectors = [
            '.rankInfoModule_tag_list_view__4_nZC .rankInfoModule_tag_view__u3_W3 .rankInfoModule_tag_text__FCSHe',
            '.rankInfoModule_tag_list_view__4_nZC .rankInfoModule_tag_text__FCSHe',
            '.rankInfoModule_tag_text__FCSHe'
        ]
        
        for selector in category_selectors:
            tag_elements = card.select(selector)
            if tag_elements and len(tag_elements) >= 2:
                # 只取第二个类别，忽略第一个
                second_tag = tag_elements[1]  # 索引1表示第二个元素
                tag_text = second_tag.get_text(strip=True)
                
                if tag_text and len(tag_text) < 50:
                    category_info["category_tags"] = [tag_text]  # 只包含第二个类别
                    category_info["all_categories"] = [tag_text]  # 只包含第二个类别
                    category_info["primary_category"] = tag_text  # 第二个类别作为主要类别
                    print(f"[SUCCESS] 找到第二个类别: {tag_text} (忽略第一个类别)")
                    break
                else:
                    print(f"[DEBUG] 第二个类别文本无效: {tag_text}")
            
            elif tag_elements and len(tag_elements) == 1:
                # 如果只有一个类别，就不提取（因为您只要第二个）
                print(f"[DEBUG] 只有一个类别，跳过提取")
                break
        
        # 如果没有找到第二个类别，尝试其他方法查找
        if not category_info["category_tags"]:
            # 查找包含tag相关类名的所有元素
            tag_elements = card.find_all(class_=re.compile(r'tag'))
            if len(tag_elements) >= 2:
                second_tag = tag_elements[1]
                tag_text = second_tag.get_text(strip=True)
                if tag_text and len(tag_text) < 20:
                    category_info["category_tags"] = [tag_text]
                    category_info["all_categories"] = [tag_text]
                    category_info["primary_category"] = tag_text
                    print(f"[SUCCESS] 通过通用方法找到第二个类别: {tag_text}")
        
    except Exception as e:
        print(f"[ERROR] 提取类别信息时出错: {e}")
    
    return category_info

def extract_location_info(card):
    """提取位置信息 - 修复位置信息提取"""
    location = None
    
    try:
        # 方法1: 使用CSS选择器查找位置信息
        location_selectors = [
            '.distanceView_desc-text__jb8H9',
            '[class*="location"]',
            '[class*="address"]',
            '[class*="位置"]',
            '[class*="地址"]'
        ]
        
        for selector in location_selectors:
            location_elem = card.select_one(selector)
            if location_elem:
                location_text = location_elem.get_text(strip=True)
                if location_text and len(location_text) > 0:
                    location = location_text
                    print(f"[DEBUG] 使用选择器 '{selector}' 找到位置: {location}")
                    break
        
        # 方法2: 如果没有找到，从整个卡片文本中搜索位置关键词
        if not location:
            card_text = card.get_text(' ', strip=True)
            
            # 常见的位置模式
            location_patterns = [
                r'距离([^，。！？!?]{5,30}?)约',
                r'位于([^，。！？!?]{5,30}?)附近',
                r'靠近([^，。！？!?]{5,30}?)',
                r'在([^，。！？!?]{5,30}?)周边'
            ]
            
            for pattern in location_patterns:
                match = re.search(pattern, card_text)
                if match:
                    location = match.group(1).strip()
                    print(f"[DEBUG] 从文本中找到位置: {location}")
                    break
        
        # 方法3: 查找包含距离信息的元素
        if not location:
            distance_elems = card.select('[class*="distance"], [class*="位置"]')
            for elem in distance_elems:
                text = elem.get_text(strip=True)
                if text and ('距离' in text or '位于' in text or '靠近' in text):
                    location = text
                    print(f"[DEBUG] 从距离元素找到位置: {location}")
                    break
                    
    except Exception as e:
        print(f"[ERROR] 提取位置信息时出错: {e}")
    
    return location

def parse_html_with_css(html, page_no):
    """使用精准CSS选择器解析 - 修复：只爬第二个类别 + 修复位置信息"""
    if not html:
        return []
        
    try:
        soup = BeautifulSoup(html, "html.parser")
        items = []
        
        # 查找所有景点卡片
        sight_cards = soup.find_all('div', class_=re.compile(r'sightItemCard_box__'))
        print(f"[DEBUG] 找到 {len(sight_cards)} 个景点卡片")
        
        for card in sight_cards:
            try:
                # 提取景点名称
                name_elem = card.select_one('.titleModule_name__Li4Tv')
                if not name_elem:
                    continue
                    
                name = name_elem.get_text(strip=True)
                if not name:
                    continue
                
                # 提取景点链接
                link_elem = card.find('a', href=re.compile(r'/sight/'))
                href = link_elem.get('href', '') if link_elem else ''
                if href and not href.startswith('http'):
                    href = 'https://you.ctrip.com' + href
                
                # ========== 修复评论数提取 ==========
                reviews = None
                card_text = card.get_text(' ', strip=True)
                
                # 从整个卡片文本中搜索评论数
                patterns = [
                    r'(\d+(?:\.\d+)?)(万?)条点评',
                    r'(\d+(?:\.\d+)?)(万?)条评论',
                    r'点评\((\d+(?:\.\d+)?)(万?)\)',
                    r'评论\((\d+(?:\.\d+)?)(万?)\)',
                    r'(\d+(?:\.\d+)?)(万?)条评价'
                ]
                
                for pattern in patterns:
                    match = re.search(pattern, card_text)
                    if match:
                        try:
                            num = float(match.group(1))
                            unit = match.group(2)
                            if unit:  # 如果有"万"字
                                reviews = int(num * 10000)
                            else:
                                reviews = int(num)
                            break
                        except ValueError:
                            pass
                
                # ========== 提取评分 ==========
                rating = None
                rating_elem = card.select_one('.commentInfoModule_comment-score_value__iUsa8')
                if rating_elem:
                    rating_text = rating_elem.get_text(strip=True)
                    try:
                        rating = float(rating_text)
                    except ValueError:
                        pass
                
                # 如果评分元素没找到，从其他位置查找
                if rating is None:
                    score_elems = card.select('[class*="score"], [class*="rating"]')
                    for elem in score_elems:
                        text = elem.get_text(strip=True)
                        match = re.search(r'(\d+\.?\d*)', text)
                        if match:
                            try:
                                rating = float(match.group(1))
                                break
                            except ValueError:
                                pass
                
                # ========== 修复位置信息提取 ==========
                location = extract_location_info(card)
                
                # 提取价格信息
                price = None
                free = False
                
                # 检查是否免费
                free = any(word in card_text for word in ['免费', '免票', '无需门票'])
                
                # 提取价格数字
                if not free:
                    price_match = re.search(r'¥\s*(\d+(?:\.\d+)?)', card_text)
                    if price_match:
                        try:
                            price = float(price_match.group(1))
                        except ValueError:
                            pass
                
                # 提取等级信息
                level = None
                level_match = re.search(r'(\d+[Aa])', card_text)
                if level_match:
                    level = level_match.group(1)
                
                # ========== 修复类别信息提取：只爬第二个类别 ==========
                category_info = extract_category_info(card)
                
                # 构建景点数据
                item_data = {
                    "page": page_no,
                    "name": name,
                    "url": href,
                    "rating": rating,
                    "reviews": reviews,
                    "price": price,
                    "free": free,
                    "location": location,  # 修复：添加位置信息
                    "level": level,
                    "primary_category": category_info["primary_category"],  # 现在这是第二个类别
                    "category_tags": ", ".join(category_info["category_tags"]) if category_info["category_tags"] else "",
                    "all_categories": ", ".join(category_info["all_categories"]) if category_info["all_categories"] else "",
                    "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                items.append(item_data)
                
                # 简化的调试信息（包含位置和类别）
                rating_debug = rating if rating else "无"
                reviews_debug = reviews if reviews else "无"
                price_debug = "免费" if free else f"¥{price}" if price else "未知"
                location_debug = location if location else "无"
                category_debug = category_info["primary_category"] if category_info["primary_category"] else "无"
                print(f"  ✓ {name}")
                print(f"    评分: {rating_debug} | 评论: {reviews_debug} | 价格: {price_debug}")
                print(f"    位置: {location_debug} | 类别: {category_debug}")
                
            except Exception as e:
                print(f"[ERROR] 解析景点卡片时出错: {e}")
                continue
        
        print(f"[INFO] 第 {page_no} 页: 成功解析 {len(items)} 个景点")
        return items
        
    except Exception as e:
        print(f"[ERROR] 解析第 {page_no} 页HTML失败: {e}")
        return []

def check_page_end(html, page_no):
    """检查是否到达最后一页"""
    if not html:
        return True
        
    try:
        soup = BeautifulSoup(html, "html.parser")
        
        # 检查是否有"没有找到相关景点"等提示
        no_result_texts = ['没有找到相关景点', '暂无数据', '未找到相关内容']
        page_text = soup.get_text()
        
        for text in no_result_texts:
            if text in page_text:
                print(f"[INFO] 第 {page_no} 页提示 '{text}'，可能已到达最后一页")
                return True
        
        # 检查景点卡片数量
        sight_cards = soup.find_all('div', class_=re.compile(r'sightItemCard_box__'))
        if len(sight_cards) == 0:
            print(f"[INFO] 第 {page_no} 页没有找到景点卡片，可能已到达最后一页")
            return True
            
        return False
        
    except Exception as e:
        print(f"[ERROR] 检查页面结束状态失败: {e}")
        return False

def save_final_csv(data, filename=None):
    """保存最终完整的CSV文件"""
    if not data:
        print("[WARNING] 没有数据可保存")
        return None
    
    try:
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"ctrip_sights_300pages_{timestamp}.csv"
        
        # 转换为DataFrame
        df = pd.DataFrame(data)
        
        # 保存CSV
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        
        # 统计信息
        total_items = len(data)
        items_with_rating = df['rating'].notna().sum()
        items_with_reviews = df['reviews'].notna().sum()
        items_with_price = (df['price'].notna() | df['free']).sum()
        items_with_location = df['location'].notna().sum()
        items_with_category = df['primary_category'].notna().sum()
        
        print(f"\n" + "="*60)
        print(f"[SUCCESS] 完整CSV文件已保存!")
        print(f"文件: {filename}")
        print(f"总景点数: {total_items}")
        print(f"有评分的景点: {items_with_rating} ({items_with_rating/total_items*100:.1f}%)")
        print(f"有评论数的景点: {items_with_reviews} ({items_with_reviews/total_items*100:.1f}%)")
        print(f"有价格信息的景点: {items_with_price} ({items_with_price/total_items*100:.1f}%)")
        print(f"有位置信息的景点: {items_with_location} ({items_with_location/total_items*100:.1f}%)")
        print(f"有类别信息的景点: {items_with_category} ({items_with_category/total_items*100:.1f}%)")
        print("="*60)
        
        return filename
        
    except Exception as e:
        print(f"[ERROR] 保存CSV失败: {e}")
        return None

def main():
    """主函数"""
    print("=" * 60)
    print("携程景点爬虫 - 只爬第二个类别 + 修复位置信息 + 爬取300页")
    print("=" * 60)
    
    all_items = []
    session = setup_session()
    start_time = time.time()
    
    # 爬取300页
    total_pages = 300
    
    try:
        print(f"[INFO] 开始爬取，从第1页到第{total_pages}页...")
        print(f"[INFO] 预计时间: {total_pages * 5 / 60:.1f} 分钟")
        
        for pno in range(1, total_pages + 1):
            url = BASE_URL.format(pno)
            
            html = fetch_page(session, url, pno)
            if not html:
                continue
            
            # 检查是否到达最后一页
            if check_page_end(html, pno):
                print(f"[INFO] 检测到第 {pno} 页为最后一页，停止爬取")
                break
            
            # 解析页面
            items = parse_html_with_css(html, pno)
            all_items.extend(items)
            
            # 显示进度
            elapsed_time = time.time() - start_time
            items_per_minute = len(all_items) / (elapsed_time / 60) if elapsed_time > 0 else 0
            remaining_pages = total_pages - pno
            estimated_remaining = (remaining_pages * 5) / 60  # 预估剩余时间（分钟）
            
            print(f"[PROGRESS] 第{pno:03d}/{total_pages}页 | 累计{len(all_items):04d}景点 | "
                  f"速度:{items_per_minute:.1f}个/分钟 | 剩余:{estimated_remaining:.1f}分钟")
            
            # 每10页显示一次详细统计
            if pno % 10 == 0:
                current_rating = len([x for x in all_items if x['rating'] is not None])
                current_reviews = len([x for x in all_items if x['reviews'] is not None])
                current_location = len([x for x in all_items if x['location'] is not None])
                current_category = len([x for x in all_items if x['primary_category'] is not None])
                print(f"[STATS] 当前统计 - 有评分: {current_rating} | 有评论: {current_reviews} | 有位置: {current_location} | 有类别: {current_category}")
            
            # 随机延迟
            time.sleep(random.uniform(3, 6))
                
    except KeyboardInterrupt:
        print("\n[INFO] 用户中断程序")
    except Exception as e:
        print(f"[ERROR] 运行出错: {e}")
    
    # 保存最终完整的CSV文件
    if all_items:
        csv_file = save_final_csv(all_items)
        
        # 显示数据预览
        print(f"\n=== 数据预览（前10个景点）===")
        for i, item in enumerate(all_items[:10], 1):
            rating_str = f"{item['rating']:.1f}" if item['rating'] else '无'
            reviews_str = f"{item['reviews']:,}" if item['reviews'] else '无'
            price_str = "免费" if item['free'] else f"¥{item['price']}" if item['price'] else '未知'
            location_str = item['location'] if item['location'] else '无'
            category_str = item['primary_category'] if item['primary_category'] else '无'
            print(f"{i:2d}. {item['name']}")
            print(f"    评分: {rating_str} | 评论: {reviews_str} | 价格: {price_str}")
            print(f"    位置: {location_str} | 类别: {category_str}")
            
    else:
        print("[ERROR] 没有找到任何景点数据")

if __name__ == "__main__":
    main()

携程景点爬虫 - 只爬第二个类别 + 修复位置信息 + 爬取300页
[INFO] 开始爬取，从第1页到第300页...
[INFO] 预计时间: 25.0 分钟
[INFO] 正在加载第 1 页
[SUCCESS] 第 1 页获取成功
[DEBUG] 找到 10 个景点卡片
[DEBUG] 使用选择器 '.distanceView_desc-text__jb8H9' 找到位置: 北京 · 天安门/王府井地区
[SUCCESS] 找到第二个类别: 展馆展览 (忽略第一个类别)
  ✓ 故宫博物院5A
    评分: 4.8 | 评论: 187000 | 价格: 未知
    位置: 北京 · 天安门/王府井地区 | 类别: 展馆展览
[DEBUG] 使用选择器 '.distanceView_desc-text__jb8H9' 找到位置: 北京 · 八达岭长城/野生动物园地区
[SUCCESS] 找到第二个类别: 历史建筑 (忽略第一个类别)
  ✓ 八达岭长城5A
    评分: 4.7 | 评论: 55000 | 价格: ¥35.0
    位置: 北京 · 八达岭长城/野生动物园地区 | 类别: 历史建筑
[DEBUG] 使用选择器 '.distanceView_desc-text__jb8H9' 找到位置: 上海 · 外滩核心区
[SUCCESS] 找到第二个类别: 遛娃宝藏地 (忽略第一个类别)
  ✓ 外滩
    评分: 4.8 | 评论: 158000 | 价格: 免费
    位置: 上海 · 外滩核心区 | 类别: 遛娃宝藏地
[DEBUG] 使用选择器 '.distanceView_desc-text__jb8H9' 找到位置: 西安 · 兵马俑/华清宫旅游区
[SUCCESS] 找到第二个类别: 历史建筑 (忽略第一个类别)
  ✓ 秦始皇帝陵博物院(兵马俑)5A
    评分: 4.7 | 评论: 129000 | 价格: ¥120.0
    位置: 西安 · 兵马俑/华清宫旅游区 | 类别: 历史建筑
[DEBUG] 使用选择器 '.distanceView_desc-text__jb8H9' 找到位置: 香港 · 香港迪士尼乐园度假区
[SUCCESS] 找到第二个类别: 嗨玩乐园 (忽略第一个类别)
  ✓ 香港迪士尼乐园
    评