In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Request headers that make the scraper look like an ordinary desktop browser.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
_ACCEPT_LANGUAGE = 'en-US, en;q=0.5'

# Sent with every HTTP request issued by the functions below.
headers = dict()
headers['User-Agent'] = _USER_AGENT
headers['Accept-Language'] = _ACCEPT_LANGUAGE

def scrape_amazon_eco_products(pages=3, timeout=10):
    """Scrape Amazon search results for "eco friendly" products.

    Each search-result page is fetched in turn; for every result that carries
    an ASIN the product detail page is fetched through
    ``get_product_details`` and its fields are merged into the row.

    Args:
        pages: Number of result pages to fetch, starting at page 1. A value
            below 1 fetches nothing and yields an empty DataFrame.
        timeout: Seconds to wait for each search-page response before the
            request is abandoned.

    Returns:
        pandas.DataFrame with one row per product containing ``title``,
        ``price``, ``rating``, ``reviews``, ``asin``, ``url`` and every key
        returned by ``get_product_details``. A page that fails is reported on
        stdout and skipped; rows collected so far are kept.
    """
    base_url = "https://www.amazon.com/s?k=eco+friendly&i=aps"
    products = []

    for page in range(1, pages + 1):
        print(f"正在爬取第 {page} 页...")
        url = f"{base_url}&page={page}"

        # Deliberately broad: this is the best-effort boundary for one page,
        # so a failure here must not discard the rows already collected.
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            # Surface 4xx/5xx responses instead of silently parsing an error
            # page as "zero results".
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            items = soup.select('div[data-component-type="s-search-result"]')

            for item in items:
                # The ASIN is needed to build the detail-page link; a result
                # container without one cannot be followed, so skip it rather
                # than request a bogus ".../dp/" URL.
                asin = item.get('data-asin')
                if not asin:
                    continue
                detail_url = f"https://www.amazon.com/dp/{asin}"

                # Basic listing fields; each is None when its element is absent.
                title_elem = item.select_one('h2 a span')
                title = title_elem.text.strip() if title_elem else None

                price_elem = item.select_one('.a-price .a-offscreen')
                price = price_elem.text.strip() if price_elem else None

                # Keep only the first token of the rating text; guard against
                # an empty string so one odd item cannot abort the whole page.
                rating_elem = item.select_one('i.a-icon-star-small .a-icon-alt')
                rating_tokens = rating_elem.text.split() if rating_elem else []
                rating = rating_tokens[0] if rating_tokens else None

                # NOTE(review): this takes the first `span.a-size-base` in the
                # result, which is assumed to be the review count - confirm
                # against the current page markup.
                review_elem = item.select_one('span.a-size-base')
                reviews = review_elem.text.replace(',', '') if review_elem else "0"

                # Fetch the detail page and merge its fields into the row.
                product_detail = get_product_details(detail_url)

                products.append({
                    'title': title,
                    'price': price,
                    'rating': rating,
                    'reviews': reviews,
                    'asin': asin,
                    'url': detail_url,
                    **product_detail
                })

                # Random delay between items to reduce the chance of being blocked.
                time.sleep(random.uniform(1.0, 3.0))

        except Exception as e:
            print(f"第 {page} 页爬取出错: {str(e)}")

    return pd.DataFrame(products)

def get_product_details(url, timeout=10):
    """Fetch a product detail page and extract descriptive fields from it.

    Args:
        url: Address of the product detail page.
        timeout: Seconds to wait for the response before the request is
            abandoned.

    Returns:
        dict with the keys ``description`` (str or None), ``features`` (list
        of bullet texts), ``eco_certifications`` (list of matched keywords)
        and ``technical_specs`` (dict of lower-cased spec name to value).
        Every key is always present; on failure the error is printed and the
        fields extracted so far (defaults otherwise) are returned.
    """
    # All keys are initialised up front so that rows built from this dict
    # have the same schema whether or not the request succeeds.
    details = {
        'description': None,
        'features': [],
        'eco_certifications': [],
        'technical_specs': {}
    }

    # Deliberately broad: detail extraction is best-effort and must never
    # abort the caller's scraping loop.
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Do not mine an error page (4xx/5xx) for keywords.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Product description
        description_elem = soup.select_one('#productDescription')
        if description_elem:
            details['description'] = description_elem.text.strip()

        # Feature bullet points
        feature_bullets = soup.select('#feature-bullets li')
        details['features'] = [li.text.strip() for li in feature_bullets]

        # Eco-related keywords, matched as plain substrings of the full
        # lower-cased page text.
        eco_keywords = ['organic', 'eco-friendly', 'sustainable', 'recycled',
                        'biodegradable', 'certified', 'green', 'energy star']
        text_content = soup.get_text().lower()
        details['eco_certifications'] = [
            keyword for keyword in eco_keywords if keyword in text_content
        ]

        # Technical specification table. Header cells are accepted as well
        # as data cells so rows laid out as <th>name</th><td>value</td> are
        # captured in addition to two-<td> rows.
        tech_specs = {}
        for row in soup.select('#productDetails_techSpec_section_1 tr'):
            cells = row.find_all(['th', 'td'])
            if len(cells) == 2:
                key = cells[0].text.strip().lower()
                value = cells[1].text.strip()
                tech_specs[key] = value

        details['technical_specs'] = tech_specs

        # Random delay after a successful fetch.
        time.sleep(random.uniform(0.5, 1.5))

    except Exception as e:
        print(f"详情页爬取出错: {str(e)}")

    return details

# Run the scraper when this file is executed directly.
if __name__ == "__main__":
    # Scrape 3 pages of search results.
    eco_df = scrape_amazon_eco_products(pages=3)
    
    # Save the data to CSV. The timestamp is taken after scraping has
    # finished, so the file name reflects when the run completed.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"amazon_eco_products_{timestamp}.csv"
    # index=False leaves the DataFrame's row index out of the file.
    eco_df.to_csv(filename, index=False)
    
    # Report how many rows were collected and where they were written.
    print(f"爬取完成! 共获取 {len(eco_df)} 件商品数据")
    print(f"数据已保存至: {filename}")

正在爬取第 1 页...
正在爬取第 2 页...
正在爬取第 3 页...
爬取完成! 共获取 180 件商品数据
数据已保存至: amazon_eco_products_20250530-202714.csv
