# In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import csv
import time
import re

def convert_sales_volume(sales_str):
    """Convert a sales-volume label such as ``"2K+"`` into an integer.

    Args:
        sales_str: Label text, e.g. ``"50+"``, ``"2K+"`` or ``"1.5K+"``.
            ``+`` signs, thousands separators and surrounding whitespace
            are ignored; a trailing ``K``/``k`` multiplies by 1000.

    Returns:
        The volume as an ``int``, or ``None`` when the label cannot be
        parsed (including non-string input).
    """
    if not isinstance(sales_str, str):
        return None

    cleaned = sales_str.replace('+', '').replace(',', '').strip()

    multiplier = 1
    if cleaned[-1:] in ('K', 'k'):
        multiplier = 1000
        cleaned = cleaned[:-1].strip()

    # Plain integers take the exact path; decimals such as "1.5" (from
    # "1.5K+") fall through to the float path below.
    try:
        return int(cleaned) * multiplier
    except ValueError:
        pass

    try:
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return None  # Unparseable label.

# Launch the Chrome driver.
driver = webdriver.Chrome()

# Everything that uses the browser runs inside try/finally so the Chrome
# process is shut down even when scraping fails part-way through.
try:
    # Open the Amazon search results page.
    url = 'https://www.amazon.com/s?k=pet+memorial&language=en_US&crid=2XEALO4SBIQ18&sprefix=pet+memorial%2Caps%2C707&ref=nb_sb_noss_1'
    driver.get(url)
    wait = WebDriverWait(driver, 10)  # NOTE(review): created but not used below.

    # Open the CSV file for writing.
    with open('amazon_pet_memorial.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Price", "Rating", "Rating Count", "ASIN", "Monthly Sales", "Image URL", "Product URL"])

        # Process one results page per iteration.
        while True:
            # Give the page time to load.
            time.sleep(3)

            # Scroll down in 1000px steps for 15 seconds so lazily loaded
            # results get rendered.
            end_time = time.time() + 15
            while time.time() < end_time:
                driver.execute_script("window.scrollBy(0, 1000);")
                time.sleep(1)  # Pause between steps while content loads.

            # Locate the container that holds all result cards.
            container = driver.find_element(By.CLASS_NAME, 's-main-slot.s-result-list.s-search-results.sg-row')

            # Every element carrying a data-asin attribute is a candidate product.
            items = container.find_elements(By.XPATH, './/div[@data-asin]')

            for item in items:
                # Bound before the try so the error message below can always
                # reference it, even if reading the attribute itself fails.
                asin = "N/A"
                try:
                    # ASIN
                    asin = item.get_attribute('data-asin')

                    # Title
                    title_elements = item.find_elements(By.XPATH, './/h2/a/span')
                    title = title_elements[0].text if title_elements else "N/A"

                    # Price: whole part plus fraction when the card shows one.
                    price = "N/A"
                    price_elements = item.find_elements(By.XPATH, './/span[@class="a-price-whole"]')
                    if price_elements:
                        price_whole = price_elements[0].text.replace(',', '')
                        fraction_elements = item.find_elements(By.XPATH, './/span[@class="a-price-fraction"]')
                        if fraction_elements:
                            price = f"{price_whole}.{fraction_elements[0].text}"
                        else:
                            price = price_whole

                    # Rating: first token of the star icon's inner text.
                    rating_elements = item.find_elements(By.XPATH, './/i[contains(@class, "a-icon-star")]/span')
                    rating = rating_elements[0].get_attribute("innerHTML").split(' ')[0] if rating_elements else "N/A"

                    # Rating count
                    count_elements = item.find_elements(By.XPATH, './/span[@class="a-size-base s-underline-text"]')
                    rating_count = count_elements[0].text.replace(',', '') if count_elements else "N/A"

                    # Monthly sales ("... bought in past month" label)
                    monthly_sales_elements = item.find_elements(By.XPATH,
                                                                 './/span[contains(@class, "a-color-secondary") and contains(text(), "bought in past month")]')
                    monthly_sales = monthly_sales_elements[0].text if monthly_sales_elements else "N/A"
                    # Guard against a label that does not carry the expected
                    # "<number>[K]+" prefix instead of raising and losing the row.
                    sales_match = re.search(r'\d+(?:\.\d+)?K?\+', monthly_sales) if monthly_sales != "N/A" else None
                    # NOTE(review): the parsed number is not written to the CSV;
                    # the raw label text is. Confirm which one is intended.
                    monthly_sales_number = convert_sales_volume(sales_match.group(0)) if sales_match else "N/A"

                    # Image URL
                    image_elements = item.find_elements(By.XPATH, './/img[@class="s-image"]')
                    image_url = image_elements[0].get_attribute("src") if image_elements else "N/A"

                    # Product URL
                    link_elements = item.find_elements(By.XPATH, './/h2/a')
                    product_url = link_elements[0].get_attribute("href") if link_elements else "N/A"

                    # Skip cards that yielded no usable data at all.
                    if not any([title, price, rating, rating_count, image_url, product_url]) or all(
                            [x == "N/A" for x in [title, price, rating, rating_count, monthly_sales, image_url, product_url]]):
                        continue
                    # Write the row to the CSV file.
                    writer.writerow([title, price, rating, rating_count, asin, monthly_sales, image_url, product_url])
                except Exception as e:
                    # Best effort: report the failing card and keep going.
                    print(f"Error processing item with ASIN {asin}: {e}")

            # Try to find the "Next" link and click it to move to the next page.
            try:
                next_button = driver.find_element(By.XPATH, '//a[contains(@class, "s-pagination-next")]')
                driver.execute_script("arguments[0].click();", next_button)  # Click via JS.
                time.sleep(3)  # Wait for the new page to load.
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
                print("No 'Next' button found or unable to click. Ending extraction.")
                break
finally:
    driver.quit()

print("Data extraction completed and saved to 'amazon_pet_memorial.csv'")

import csv
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


def extract_star_rating(text):
    """Return the first number (decimal or integer) found in *text*, else "N/A"."""
    found = re.search(r'(\d+\.\d+|\d+)', text)
    if found is None:
        return "N/A"
    return found.group(0)


def extract_percentage(text):
    """Strip leading/trailing '%' characters from *text*; empty input gives "N/A"."""
    if not text:
        return "N/A"
    return text.strip('%')


def extract_date(text):
    """Return the date portion of a review-date line.

    The expected shape is
    ``'Reviewed in the United States on June 11, 2023'``; everything after
    the standalone word "on" is returned with surrounding whitespace removed.

    Args:
        text: The review-date line. ``None`` or an empty string is accepted.

    Returns:
        The text following "on", or ``"N/A"`` when no standalone "on" is
        present. The date is returned as written; it is not reformatted.
    """
    # \b keeps the match from starting inside a word that merely ends in
    # "on" (e.g. "Lebanon on March 1, 2023").
    match = re.search(r'\bon\s+(.*)', text or "")
    return match.group(1).strip() if match else "N/A"

def scroll_until_element_found(driver, selector, timeout=30):
    """Scroll down step by step until *selector* matches an element.

    Looks the CSS selector up; on a miss, scrolls 1000px, waits one second
    and tries again. Returns the element once found and raises
    TimeoutException when *timeout* seconds pass without a match.
    """
    deadline = time.time() + timeout
    while True:
        if time.time() >= deadline:
            raise TimeoutException(f"在 {timeout} 秒内未找到选择器为 {selector} 的元素")
        try:
            found = driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            print(f"滚动页面 - 选择器 '{selector}' 的元素尚未找到")
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(1)
            continue
        print(f"找到元素，选择器：'{selector}'")
        return found


def scroll_to_bottom(driver):
    """Jump to the bottom of the page, then pause so more content can load."""
    jump_script = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_script)
    # Fixed three-second pause while the page loads further content.
    time.sleep(3)


def _parse_review(review):
    """Turn one review element into a row.

    Returns ``[reviewer, rating, title, date, body, image_urls]``; a field
    whose element is missing is ``"N/A"``. Image URLs are joined with "; ".
    """
    try:
        reviewer = clean_text(review.find_element(By.CSS_SELECTOR, 'span.a-profile-name').text)
    except NoSuchElementException:
        reviewer = "N/A"

    # Star rating
    try:
        rating = extract_star_rating(
            review.find_element(By.CSS_SELECTOR,
                                'i[data-hook="review-star-rating"] span').get_attribute("textContent"))
    except NoSuchElementException:
        rating = "N/A"

    # Review title
    try:
        title = clean_text(
            review.find_element(By.CSS_SELECTOR, 'a[data-hook="review-title"]').get_attribute("textContent"))
    except NoSuchElementException:
        title = "N/A"

    # Review date
    try:
        date = extract_date(
            review.find_element(By.CSS_SELECTOR, 'span[data-hook="review-date"]').text)
    except NoSuchElementException:
        date = "N/A"

    # Review body
    try:
        body = clean_text(
            review.find_element(By.CSS_SELECTOR, 'span[data-hook="review-body"] span').text)
    except NoSuchElementException:
        body = "N/A"

    # Image links. find_elements returns an empty list rather than raising,
    # so "no images" is detected by emptiness. The URLs are joined exactly
    # once; joining the already-joined string again would split it into
    # single characters.
    image_sources = (img.get_attribute('src')
                     for img in review.find_elements(By.CSS_SELECTOR, 'img[data-hook="review-image-tile"]'))
    image_urls = [src for src in image_sources if src]
    images = "; ".join(image_urls) if image_urls else "N/A"

    return [reviewer, rating, title, date, body, images]


def _collect_reviews(driver):
    """Collect review rows from the current review page and all following pages."""
    review_data = []
    while True:
        scroll_to_bottom(driver)  # Scroll down so the page loads its reviews.
        try:
            review_list = driver.find_element(By.CSS_SELECTOR, "#cm_cr-review_list")
        except NoSuchElementException:
            # No review list on this page: keep what was collected so far
            # instead of aborting the whole run.
            print("未找到评论列表，停止提取评论。")
            break

        for review in review_list.find_elements(By.CSS_SELECTOR, 'div[data-hook="review"]')[:10]:
            review_row = _parse_review(review)
            print(review_row)
            review_data.append(review_row)

        # Follow the "next page" link while there is one.
        try:
            next_page = driver.find_element(By.CSS_SELECTOR, 'li.a-last a')
        except NoSuchElementException:
            print("没有更多评论页。")
            break
        next_page_url = next_page.get_attribute('href')
        if not next_page_url:
            print("没有更多评论页。")
            break
        driver.get(next_page_url)
        time.sleep(5)  # Wait for the page to load.
    return review_data


def append_star_ratings(input_file, output_file):
    """Augment a product CSV with star-rating data and dump each product's reviews.

    For every row of *input_file* the product page is opened, the star
    histogram percentages and the count shown on the "media reviews only"
    page are appended, and the row is written to *output_file*. The
    product's reviews are written to ``reviews/<ASIN>.csv``.

    Args:
        input_file: Path of the source CSV. The product URL is read from the
            8th column and the ASIN from the 5th.
        output_file: Path of the CSV to create; it has the input columns plus
            "5 stars" ... "1 star" and "Total Ratings with Reviews".

    Rows whose URL is "N/A", or whose review link / histogram cannot be found
    in time, get six "N/A" values and no review file.
    """
    import os  # Local import: only needed to create the reviews directory.

    driver = webdriver.Chrome()
    # try/finally so the browser is closed even when scraping fails.
    try:
        driver.get("https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo3MDU3Nzg5NDQyOTA5OTk4OjE3MzEwNjU5NjI6c3BfYXRmOjIwMDA0MTU2OTMzNDMyMTo6MDo6&url=%2FMemorial-Markers-Tombstones-Outdoor-Sympaty%2Fdp%2FB07PDBY43T%2Fref%3Dsr_1_1_sspa%3Fcrid%3D2XEALO4SBIQ18%26dib%3DeyJ2IjoiMSJ9.hF8za7UiCXUjLAl8wSr5Dy_xXEyVFY0k8wIXA91D9QVeZm27E-1RowcJh-LR-oyp4xx6y4WLp3j-JHGrVpvXnS4am11gqNJXxjNxPS9wFSWJp7wSO77zUUSRRXP9r0iXo08-wH0pnqcA8HCdsaZA_3o27pxeua9C1EPC4w_1z1hHr9kdYsxNJPWRWPYG4s4AxDk1Beo4yMySWuloyDevBueeSN-F935k5A7lQxHOxdbLMuER9BZ07cQ_J0c3tMsQQmuN6kmIVG_o2eLrMw7vgln08Rhd1QB9a5qdfnWMmrs.FP6LiiV8CmZgG5y8AzdEC-JpHzJ5C3eYLPXMxclBV2k%26dib_tag%3Dse%26keywords%3Dpet%2Bmemorial%26qid%3D1731065962%26sprefix%3Dpet%2Bmemorial%252Caps%252C707%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1")
        # Block until the operator confirms login / language are set up.
        # The typed value is irrelevant; pressing Enter continues.
        input("是否已登录完毕，语言是否已切换完毕？（完成请摁1）")

        # The per-product review files go here; make sure the directory exists.
        os.makedirs("reviews", exist_ok=True)

        with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
                open(output_file, 'w', newline='', encoding='utf-8') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            headers = next(reader)
            writer.writerow(headers + ["5 stars", "4 stars", "3 stars", "2 stars", "1 star", "Total Ratings with Reviews"])

            for row in reader:
                product_url = row[7]  # Product URL is expected in the 8th column.
                asin = row[4]  # ASIN is expected in the 5th column.
                if product_url == "N/A":
                    # No product URL: pass the row through with "N/A" fillers.
                    writer.writerow(row + ["N/A"] * 6)
                    continue

                print(f"打开产品页面: {product_url}")
                driver.get(product_url)
                time.sleep(5)  # Wait for the page to load.

                # Try to click the "See all reviews" link.
                try:
                    see_more_reviews_link = scroll_until_element_found(driver, 'a[data-hook="see-all-reviews-link-foot"]')
                    see_more_reviews_link.click()
                    print("点击 'See more reviews'")
                    time.sleep(5)  # Wait for the page to load.
                except TimeoutException:
                    print(f"找不到 'See more reviews' 链接，跳过该产品: {product_url}")
                    writer.writerow(row + ["N/A"] * 6)
                    continue

                current_url = driver.current_url  # URL of the all-reviews page.
                # Extract the star histogram percentages.
                try:
                    scroll_until_element_found(driver, 'ul#histogramTable')  # Scroll to the histogram.
                    stars = driver.find_elements(By.CSS_SELECTOR,
                                                 'ul#histogramTable .a-align-center .a-text-right span')
                    # Slice instead of indexing 0..4 so fewer than five matches
                    # do not raise; the padding below fills the gap.
                    star_data = [star.get_attribute("textContent").strip().strip('%') for star in stars[:5]]
                    print(star_data)
                    # Pad with "N/A" when fewer than five percentages were found.
                    while len(star_data) < 5:
                        star_data.append("N/A")

                    # Switch to the "reviews with images and videos only" page.
                    media_only_url = current_url.replace("ref=cm_cr_arp_d_viewopt_actns","ref=cm_cr_arp_d_viewopt_mdrvw").replace("mediaType=all_contents", "mediaType=media_reviews_only")
                    print(f"切换到“仅图片和视频评论”页面，URL: {media_only_url}")
                    driver.get(media_only_url)
                    time.sleep(5)  # Wait for the page to load.

                    # Try to read the total count shown for media reviews.
                    try:
                        total_ratings_text = driver.find_element(By.CSS_SELECTOR,
                                                                 'div[data-hook="cr-filter-info-review-rating-count"]').text
                        print(f"带图片和视频的评论总数: {total_ratings_text}")
                    except NoSuchElementException:
                        total_ratings_text = "N/A"
                        print("未找到带图片和视频的评论总数，添加 'N/A'。")

                except TimeoutException:
                    print(f"未找到星级数据，跳过该产品: {product_url}")
                    writer.writerow(row + ["N/A"] * 6)
                    continue

                # Write the star data and the media-review total.
                writer.writerow(row + star_data + [total_ratings_text])

                # Switch back to the page listing all reviews.
                all_reviews_url = current_url.replace("ref=cm_cr_arp_d_viewopt_mdrvw",
                                                      "ref=cm_cr_arp_d_viewopt_actns").replace(
                    "mediaType=media_reviews_only", "mediaType=all_contents")
                print(f"切换到“全部评论”网址: {all_reviews_url}")
                driver.get(all_reviews_url)
                time.sleep(5)  # Wait for the page to load.

                # Extract the details of every review, page by page.
                review_data = _collect_reviews(driver)

                # Write the reviews to a separate CSV file named after the ASIN.
                output_file_name = f"reviews/{asin}.csv"
                with open(output_file_name, 'w', newline='', encoding='utf-8') as review_file:
                    review_writer = csv.writer(review_file)
                    review_writer.writerow(["Reviewer", "Rating", "Title", "Date", "Review Text", "Image URLs"])
                    review_writer.writerows(review_data)

                print(f"评论已保存到 {output_file_name}")
    finally:
        driver.quit()


# Example usage: read the search-results CSV and write the augmented copy.
append_star_ratings(
    input_file='amazon_pet_memorial.csv',
    output_file='updated_amazon_pet_memorial.csv',
)