### Scraping Amazon Reviews

In [4]:
import random
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Selenium

In [5]:
from selenium import webdriver
import xlsxwriter
import requests
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Set up EdgeOptions
edge_options = Options()
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54")
edge_options.add_argument("window-size=1920x1080")
edge_options.add_argument("disable-extensions")
edge_options.add_argument("--disable-blink-features=AutomationControlled")
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])

def get_amazon_reviews(url, pages):
    star_filters = ['one_star', 'two_star', 'three_star', 'four_star', 'five_star']
    all_reviews = []

    driver = webdriver.Edge()  # Use a single browser instance

    for star_filter in star_filters:
        filtered_url = f"{url}&filterByStar={star_filter}"
        driver.get(filtered_url)
        time.sleep(5)

        reviews = []

        try:
            for page in range(pages):
                WebDriverWait(driver, 25).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#cm_cr-review_list'))
                )
                review_elements = driver.find_elements(By.CSS_SELECTOR, '#cm_cr-review_list .review')
                for element in review_elements:
                    # print(element)
                    try:
                        name = element.find_element(By.CSS_SELECTOR, '.a-profile-name').text
                        title = element.find_element(By.CSS_SELECTOR, '[data-hook="review-title"]').text
                        date = element.find_element(By.CSS_SELECTOR, '[data-hook="review-date"]').text.split(' on ')[-1]
                        stars = element.find_element(By.CSS_SELECTOR, '.a-icon-alt').text.split(' out')[0]
                        body = element.find_element(By.CSS_SELECTOR, '[data-hook="review-body"] span').text

                        reviews.append({
                            'Name': name,
                            'Title': title,
                            'Date': date,
                            'Stars': stars,
                            'Review': body
                        })
                    except NoSuchElementException:
                        continue

                print(f"Collected {len(reviews)} reviews for {star_filter} on page {page + 1}.")

                try:
                    next_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '.a-last a'))
                    )
                    driver.execute_script("arguments[0].click();", next_button)
                    time.sleep(2)
                except TimeoutException:
                    print(f"No more pages for {star_filter}.")
                    break

            all_reviews.extend(reviews)

        except TimeoutException:
            print(f"Timeout for {star_filter}.")
            continue

    driver.quit()
    return all_reviews

# Example usage:
url = 'https://www.amazon.com/product-reviews/B09XMYFTB7/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
reviews = get_amazon_reviews(url, 10)
print(f"Total reviews collected: {len(reviews)}")

Timeout for one_star.
Collected 10 reviews for two_star on page 1.
Collected 20 reviews for two_star on page 2.
Collected 30 reviews for two_star on page 3.
Collected 40 reviews for two_star on page 4.
Collected 50 reviews for two_star on page 5.
Collected 60 reviews for two_star on page 6.
Collected 70 reviews for two_star on page 7.
Collected 80 reviews for two_star on page 8.
Collected 89 reviews for two_star on page 9.
Collected 99 reviews for two_star on page 10.
No more pages for two_star.
Collected 10 reviews for three_star on page 1.
Collected 20 reviews for three_star on page 2.
Collected 30 reviews for three_star on page 3.
Collected 40 reviews for three_star on page 4.
Collected 50 reviews for three_star on page 5.
Collected 60 reviews for three_star on page 6.
Collected 70 reviews for three_star on page 7.
Collected 80 reviews for three_star on page 8.
Collected 90 reviews for three_star on page 9.
Collected 100 reviews for three_star on page 10.
No more pages for three_sta

In [6]:
# driver = webdriver.Edge(options=edge_options)
# driver.get(url)

In [28]:
df = pd.DataFrame(reviews)

In [29]:
df.head()

Unnamed: 0,Name,Title,Date,Stars,Review
0,Lynn Thomas,Too sticky,"May 3, 2025",,Very liquid consistency that feels sticky to t...
1,Amazon Customer,Feels sticky,"February 21, 2025",,This product did not meet my expectations. It ...
2,:(,Glue,"May 10, 2025",,Feels and smells like glue
3,Lisa,Not really thrilled,"January 2, 2025",,I love this brand so was looking forward to tr...
4,jen,Meh,"March 12, 2025",,Doesn't hold my makeup on & it actually makes ...


In [30]:
df['Source'] = 'amazon.au'
df['Product'] = 'E.L.F. COSMETICS POWER GRIP PRIMER'

In [31]:
df.head()

Unnamed: 0,Name,Title,Date,Stars,Review,Source,Product
0,Lynn Thomas,Too sticky,"May 3, 2025",,Very liquid consistency that feels sticky to t...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
1,Amazon Customer,Feels sticky,"February 21, 2025",,This product did not meet my expectations. It ...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
2,:(,Glue,"May 10, 2025",,Feels and smells like glue,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
3,Lisa,Not really thrilled,"January 2, 2025",,I love this brand so was looking forward to tr...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
4,jen,Meh,"March 12, 2025",,Doesn't hold my makeup on & it actually makes ...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER


In [36]:
# Specify the desired column order
desired_columns = ["Date", "Title", "Review", "Source", "Product"]

# Reorder the columns
df = df[desired_columns]

In [37]:
df = df.drop_duplicates()
df.shape

(399, 5)

In [41]:
df.head()

Unnamed: 0,Date,Title,Review,Source,Product
0,"May 3, 2025",Too sticky,Very liquid consistency that feels sticky to t...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
1,"February 21, 2025",Feels sticky,This product did not meet my expectations. It ...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
2,"May 10, 2025",Glue,Feels and smells like glue,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
3,"January 2, 2025",Not really thrilled,I love this brand so was looking forward to tr...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER
4,"March 12, 2025",Meh,Doesn't hold my makeup on & it actually makes ...,amazon.au,E.L.F. COSMETICS POWER GRIP PRIMER


In [42]:
df.to_excel(f'2_{df['Product'].iloc[0]}.xlsx', index=False)