# Create CSV file for Review data

In [1]:
import re
from bs4 import BeautifulSoup
import requests
import os

# Value sent as the "session-token" cookie below; read from the AMAZON_TOKEN
# environment variable (None when the variable is not set).
token = os.getenv("AMAZON_TOKEN")


def extract_integer(s):
    """
    Return the first integer found in ``s`` as a string of digits.

    Both comma-grouped numbers ("1,234") and plain digit runs ("1234") are
    recognised; thousands separators are stripped from the result.

    Args:
        s: Text to search, e.g. "1,234 total ratings".

    Returns:
        The digits of the first number found, as a string (not an int),
        or None when ``s`` is empty/None or contains no digits.
    """
    if not s:
        return None

    # Try a properly comma-grouped number first; otherwise fall back to a
    # plain run of digits so that "1234" is not truncated to "123".
    pattern = r"\d{1,3}(?:,\d{3})+|\d+"
    match = re.search(pattern, s)

    if match:
        # Remove commas and return the integer as a string
        return match.group(0).replace(",", "")
    return None


def extract_float_from_phrase(s):
    """
    Pull the first number out of a phrase and return it as a float.

    Example: "2.0 out of 5 stars" -> 2.0
    A decimal number is preferred over a bare integer at the same position.
    Returns None when the phrase contains no digits.
    """
    found = re.search(r"\d+\.\d+|\d+", s)
    return float(found.group()) if found else None


def scrape_review_page(page: str):
    """
    Parse the HTML of a product review page into a dictionary.

    Elements are located through their ``data-hook`` attribute. Any element
    that is missing from the page leaves its field at a placeholder value
    instead of aborting the whole parse.

    Args:
        page: HTML source of the review page.

    Returns:
        A dict with the keys:
          - "product_name": text of the "product-link" element ("" if absent)
          - "overall_rating": float parsed from "rating-out-of-text" ("" if absent)
          - "total_review_count": digit string parsed from "total-review-count"
            ("" if absent)
          - "review_list": one dict per "review" element
        or None if an unexpected error occurs while parsing (the error is
        printed).
    """
    rtn = {
        "product_name": "",
        "overall_rating": "",
        "total_review_count": "",
        "review_list": [],
    }
    try:
        html = BeautifulSoup(page, "html.parser")

        product_name = html.find(attrs={"data-hook": "product-link"})
        if product_name is not None:
            rtn["product_name"] = product_name.get_text()

        overall_rating = html.find(attrs={"data-hook": "rating-out-of-text"})
        if overall_rating is not None:
            rtn["overall_rating"] = extract_float_from_phrase(
                overall_rating.get_text()
            )

        total_review_count = html.find(attrs={"data-hook": "total-review-count"})
        if total_review_count is not None:
            rtn["total_review_count"] = extract_integer(
                total_review_count.get_text()
            )

        for review in html.find_all(attrs={"data-hook": "review"}):
            review_id = review.get("id", "N/A")

            review_title = review.find(attrs={"data-hook": "review-title"})
            # Child 0 of the title element is fed to the rating parser and
            # child 3 is used as the title text; guard both indexes so a
            # different layout does not raise IndexError.
            title_parts = review_title.contents if review_title is not None else []
            review_rating = (
                extract_float_from_phrase(title_parts[0].get_text())
                if len(title_parts) > 0
                else None
            )
            title_text = title_parts[3].get_text() if len(title_parts) > 3 else "N/A"
            review_href = (
                review_title.get("href", "N/A") if review_title is not None else "N/A"
            )

            # Reset for every review: otherwise a review without a parsable
            # date would reuse the previous review's values, or hit an
            # unbound local on the first iteration.
            country = None
            date = None
            review_date_field = review.find(attrs={"data-hook": "review-date"})
            match = (
                re.search(r"Reviewed in (.+?) on (.+)", review_date_field.get_text())
                if review_date_field is not None
                else None
            )
            if match:
                country = match.group(1)
                date = match.group(2)
            else:
                print("Pattern not found for review date.")

            review_body = review.find(attrs={"data-hook": "review-body"})
            verified_purchase = review.find(attrs={"data-hook": "avp-badge"})
            found_helpful = review.find(attrs={"data-hook": "helpful-vote-statement"})

            rtn["review_list"].append(
                {
                    "review_id": review_id,
                    "review_rating": review_rating,
                    "review_title": title_text,
                    "review_href": review_href,
                    "review_country": country if country else "N/A",
                    "review_date": date if date else "N/A",
                    "review_body": (
                        review_body.get_text() if review_body is not None else "N/A"
                    ),
                    "verified_purchase": (
                        verified_purchase.get_text()
                        if verified_purchase is not None
                        else "N/A"
                    ),
                    "found_helpful": (
                        found_helpful.get_text()
                        if found_helpful is not None
                        else "N/A"
                    ),
                }
            )
        return rtn

    except Exception as e:
        # Best-effort boundary: report the problem and signal failure to the
        # caller with None rather than propagating the exception.
        print(f"Error scraping review page: {e}")
        return None


# URL of the review page to fetch. The last path segment looks like a product
# identifier (presumably an Amazon ASIN — confirm).
url = "https://www.amazon.com/product-reviews/B09C6HGX1W"

# HTTP headers sent with the request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
}

# Cookies sent with the request. Only "session-token" comes from the
# environment (via `token`); every other value is a literal.
# NOTE(review): several literals below look like account/session credentials
# (e.g. "at-main", "sess-at-main", "sst-main", "x-main", "session-id").
# If they are real, they should be rotated and loaded from the environment
# like "session-token" instead of being committed in source — confirm.
cookies = {
    "session-id": "131-2797845-9695924",
    "i18n-prefs": "USD",
    "skin": "noskin",
    "ubid-main": "132-8139505-0179230",
    "lc-main": "en_US",
    "av-timezone": "America/Indianapolis",
    "at-main": "Atza|IwEBIKwUfNTXyLhyUtWDwLm-OhLSMEFWVqf5ZJ-9zWNgUDAG1Cy3vsUsBNftXekacwztgitKcEMAZ9OgVMifQyuDBq8dOSinK8NDQfC5Z6KJnDtTMERcHujfQf2WDEogJy7Pq8c57tEn6fH_RA7GvDWcbM_jzguokpTuv4uMq_lnDw5HYGXPbm6GeuwgXHFZ_jtU1X2SIeGGqbHxHkl2qIQdjZ__KNanE26HyU5AK6nqX6_xA5wJKLiEntqX3-qPOF-QQZQ",
    "sess-at-main": '"liFtyHKrXtJyyOs0gYL3So4t7TlyUlC6vEaEmeITiQM="',
    "sst-main": "Sst1|PQFwQGHFc_NTcqmeWoTc_H52CRf7n7_SHNX4FjJX9Erboq8CVVlx0Vz50qLGqk_uTTWL7VdF3yZAVIYjv0UMnaCP1EvI2Vuh0oWesOyAgPxUkAxJZ8Rg6hI2NcLy5wam4GZWLKHb8-ls8i3tuHySA9N9UUcbEO_QiHNjmM7CgekGHBK5bckNP5TmSz_LSXW6QfrlmD9Q1be1nP_zJJNc9B5e_ZRCeYyfhT9IxSvpqFXNve1M-CCSvgWPnIPLNL0nAi_mLcaV3-EsKF1zobNtdd9QemtvvrS3TK4j4ksO97e6CCU",
    "session-id-time": "2082787201l",
    "x-main": "8wVqNxiOZwbL6bxOoTFJI@d9o@B4gPuL5a3Ru7QDH4FHIVAkKl7t9mEkUhZq8EJL",
    "appstore-devportal-locale": "en_US",
    "_mkto_trk": "id:365-EFI-026&token:_mch-amazon.com-de91e154c048b644782603e76c4486b2",
    "at_check": "true",
    "s_plt": "2.47",
    "s_pltp": "undefined",
    "AMCVS_4A8581745834114C0A495E2B%40AdobeOrg": "1",
    "s_ips": "1134",
    "s_cc": "true",
    "AMCV_4A8581745834114C0A495E2B%40AdobeOrg": "179643557%7CMCIDTS%7C20049%7CMCMID%7C80552200343015942363501208162927885838%7CMCAAMLH-1732807500%7C9%7CMCAAMB-1732807500%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1732209901s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C5.5.0",
    "session-token": token,
}

# Send the GET request. requests applies no timeout by default, so set one
# explicitly to keep the script from hanging on an unresponsive server.
response = requests.get(
    url,
    headers=headers,
    cookies=cookies,
    timeout=30,
)

# On success, parse the page and report how many reviews were extracted.
if response.status_code == 200:
    rtn = scrape_review_page(response.text)
    # Guard against a None result so a parse failure does not surface as an
    # AttributeError on `.get`.
    if rtn is None:
        print("Failed to parse the review page.")
    else:
        print(len(rtn.get("review_list")))
else:
    print(f"Request failed with status code: {response.status_code}")

10
