In [None]:
import dotenv
import os
from playwright.async_api import async_playwright
import time
import random
import json
import hashlib

In [10]:
# Load environment variables via python-dotenv; login() later reads
# AMAZON_EMAIL and AMAZON_PASSWORD with os.getenv().
# NOTE(review): `env` holds load_dotenv()'s return value, which is presumably a
# success flag rather than the variables themselves — confirm nothing expects a mapping.
env = dotenv.load_dotenv()

In [11]:
# Configurations
# Landing page opened right after the browser starts (see the page.goto call below).
amazon_home = "https://www.amazon.com/"

In [18]:
# Launch Browser
async def start_browser(headless=False):
    """Start Playwright, launch Chromium and open a fresh page.

    Args:
        headless: Run the browser without a visible window. Defaults to
            False so the session can be watched (and helped along by hand).

    Returns:
        A ``(playwright, browser, context, page)`` tuple. The caller owns all
        four objects and should call ``browser.close()`` followed by
        ``playwright.stop()`` when finished.

    Raises:
        Whatever Playwright raises when the launch, context creation or page
        creation fails; the Playwright driver is stopped before re-raising.
    """
    p = await async_playwright().start()
    try:
        browser = await p.chromium.launch(headless=headless)
        context = await browser.new_context()
        page = await context.new_page()
    except BaseException:
        # The driver was started manually (no `async with`), so nothing else
        # will stop it if setup fails part-way through.
        await p.stop()
        raise
    return p, browser, context, page

# Execute this block to start the browser.
# The four handles become notebook-level names; `page` is used by login() and by
# the scraping cell, and `browser` by the final close cell.
p, browser, context, page = await start_browser()
# Open the Amazon home page; as the cell's last expression, the returned
# Response object is echoed as the cell output.
await page.goto(amazon_home)

<Response url='https://www.amazon.com/' request=<Request url='https://www.amazon.com/' method='GET'>>

In [19]:
# Log in (automated; credentials come from environment variables)
async def login():
    """Sign in to Amazon using the notebook-level ``page``.

    Credentials are read from the ``AMAZON_EMAIL`` and ``AMAZON_PASSWORD``
    environment variables. They are validated before any page interaction so
    that a missing value fails immediately with a clear message instead of
    failing part-way through the sign-in flow.

    No handling for CAPTCHA or one-time-password prompts is implemented here.

    Raises:
        RuntimeError: If either environment variable is unset or empty.
    """
    email = os.getenv("AMAZON_EMAIL")
    password = os.getenv("AMAZON_PASSWORD")
    missing = [
        var_name
        for var_name, value in (("AMAZON_EMAIL", email), ("AMAZON_PASSWORD", password))
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    # Open the sign-in form from the navigation bar.
    sign_in_button = await page.wait_for_selector("//a[@data-nav-role='signin']")
    await sign_in_button.click()

    # Step 1: e-mail address, then "Continue".
    email_input = await page.wait_for_selector("input[type='email']")
    await email_input.fill(email)
    continue_button = await page.wait_for_selector("//span[@id='continue']//input")
    await continue_button.click()

    # Step 2: password, then "Sign in".
    password_input = await page.wait_for_selector("input[type='password']")
    await password_input.fill(password)
    sign_in_button = await page.wait_for_selector("//input[@id='signInSubmit']")
    await sign_in_button.click()

# Execute this block to log in.
# Requires `page` from the browser-start cell; login() reads AMAZON_EMAIL and
# AMAZON_PASSWORD from the environment.
await login()

In [None]:
reviewer_list_array = []

# Load existing reviewer list if exists
if os.path.exists("output/reviewer_list.json"):
    with open("output/reviewer_list.json", "r") as file:
        reviewer_list = json.load(file)

for filename in os.listdir("output"):
    if filename.endswith(".json") and filename != "reviewer_list.json":
        with open(os.path.join("output", filename), "r") as file:
            data = json.load(file)

        # loop through each product
        for product in data["products"]:
            # loop through each review
            for review in product["reviews"]:
                name = review["name"]
                profile_url = review["profile_url"]

                # hash of the profile url one way hashing
                hash_object = hashlib.sha256(profile_url.encode())
                hash_hex = hash_object.hexdigest()

                # Optionally shorten (first 12 characters for example)
                unique_id = hash_hex[:5]


                reviews = []

                # Visit reviewer's profile
                await page.goto(profile_url)
                time.sleep(random.randint(1, 3))
                current_url = page.url

                count = 1
                while True:
                    # Find the review URL
                    review_link = await page.query_selector(
                        f"(//div[contains(@class, 'review-card-container')]//a)[{count}]"
                    )
                    if review_link is None:
                        break

                    await review_link.click()
                    time.sleep(random.randint(1, 3))

                    single_review = {}

                    # Get product name and URL
                    product_name_el = await page.query_selector("//a[@data-hook='product-link']")
                    if product_name_el:
                        product_url = await product_name_el.get_attribute("href")
                        product_name_text = await product_name_el.inner_text()
                        single_review["product_name"] = product_name_text
                        single_review["product_url"] = f"https://www.amazon.com{product_url}" 

                    # Get review title
                    review_title_el = await page.query_selector("//a[@data-hook='review-title']//span[2]")
                    if review_title_el:
                        review_title_text = await review_title_el.inner_text()
                        single_review["review_title"] = review_title_text

                    # Get stars
                    stars_el = await page.query_selector("//i[@data-hook='review-star-rating']//span")
                    if stars_el:
                        stars_text = await stars_el.inner_text()
                        # str(stars).split(" ")[0]
                        stars_text = stars_text.split(" ")[0]
                        single_review["stars"] = stars_text

                    # Get date
                    date_el = await page.query_selector("//span[@data-hook='review-date']")
                    if date_el:
                        date_text = await date_el.inner_text()
                        date_text = date_text.split("on ")[1]
                        single_review["date"] = date_text

                    # Get review text
                    review_body_el = await page.query_selector("//span[@data-hook='review-body']")
                    if review_body_el:
                        review_body_text = await review_body_el.inner_text()
                        single_review["review_text"] = review_body_text

                    reviews.append(single_review)

                    count += 1

                    # Go back to profile page
                    await page.goto(current_url)
                    time.sleep(random.randint(1, 2))

                # Save this reviewer’s data
                reviewer_list = {
                    "name": name,
                    "profile_url": profile_url,
                    "reviews": reviews,
                }
                reviewer_list_array.append(reviewer_list)

                # Save to file after every reviewer
                output_dir = f"output/reviewer_info/{filename}"
                os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

                file_path = f"{output_dir}/reviewer_{name}_{unique_id}.json"
                with open(file_path, "w") as file:
                    json.dump(reviewer_list_array, file, indent=4)

                print(f"Saved {name}'s reviews to {file_path}.")

                print(f"Saved {name}'s reviews.")


Saved MB's reviews.
Saved MARCUS RITTER's reviews.
Saved Dominic Barone's reviews.
Saved Heather Johnson's reviews.
Saved Krista S.'s reviews.
Saved Lauren Reed's reviews.
Saved Avid Amazon Enthusiast's reviews.
Saved Danielle's reviews.
Saved g's reviews.
Saved Bryceson Charlton's reviews.
Saved Eric Niederhelman's reviews.
Saved Samantha McHugh's reviews.
Saved jeremy walker's reviews.
Saved Audrey's reviews.
Saved Hannah H's reviews.
Saved Lorrie's reviews.
Saved Sean Burnette's reviews.
Saved K Harinezumi's reviews.
Saved Ella Niasoff's reviews.
Saved Our Favorite Find's reviews.
Saved Alex Cano's reviews.
Saved Melissa Helmig's reviews.
Saved Christi loves's reviews.
Saved Kylee L's reviews.
Saved 10's reviews.
Saved Amazon Customer's reviews.
Saved Ervin's reviews.
Saved Beth R's reviews.
Saved TH's reviews.
Saved Gwen Ramey's reviews.
Saved Ashley Connolly's reviews.
Saved J_Reyes's reviews.
Saved DB's reviews.
Saved Kim Wainwright's reviews.
Saved Rachael's reviews.
Saved Rafae

CancelledError: 

In [17]:
# close
await browser.close()