### Scraping Amazon Reviews

In [1]:
import random
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Selenium

In [2]:
from selenium import webdriver
import xlsxwriter
import requests
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Set up EdgeOptions
edge_options = Options()
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54")
edge_options.add_argument("window-size=1920x1080")
edge_options.add_argument("disable-extensions")
edge_options.add_argument("--disable-blink-features=AutomationControlled")
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])

def get_amazon_reviews(url, pages):
    star_filters = ['one_star', 'two_star', 'three_star', 'four_star', 'five_star']
    all_reviews = []

    driver = webdriver.Edge()  # Use a single browser instance

    for star_filter in star_filters:
        filtered_url = f"{url}&filterByStar={star_filter}"
        driver.get(filtered_url)
        time.sleep(5)

        reviews = []

        try:
            for page in range(pages):
                WebDriverWait(driver, 25).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#cm_cr-review_list'))
                )
                review_elements = driver.find_elements(By.CSS_SELECTOR, '#cm_cr-review_list .review')
                for element in review_elements:
                    # print(element)
                    try:
                        name = element.find_element(By.CSS_SELECTOR, '.a-profile-name').text
                        title = element.find_element(By.CSS_SELECTOR, '[data-hook="review-title"]').text
                        date = element.find_element(By.CSS_SELECTOR, '[data-hook="review-date"]').text.split(' on ')[-1]
                        stars = element.find_element(By.CSS_SELECTOR, '.a-icon-alt').text.split(' out')[0]
                        body = element.find_element(By.CSS_SELECTOR, '[data-hook="review-body"] span').text

                        reviews.append({
                            'Name': name,
                            'Title': title,
                            'Date': date,
                            'Stars': stars,
                            'Review': body
                        })
                    except NoSuchElementException:
                        continue

                print(f"Collected {len(reviews)} reviews for {star_filter} on page {page + 1}.")

                try:
                    next_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '.a-last a'))
                    )
                    driver.execute_script("arguments[0].click();", next_button)
                    time.sleep(2)
                except TimeoutException:
                    print(f"No more pages for {star_filter}.")
                    break

            all_reviews.extend(reviews)

        except TimeoutException:
            print(f"Timeout for {star_filter}.")
            continue

    driver.quit()
    return all_reviews

# Example usage:
url = 'https://https://www.amazon.com.au/product-reviews/B0DFMW6VZV/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
reviews = get_amazon_reviews(url, 10)
print(f"Total reviews collected: {len(reviews)}")

WebDriverException: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: MicrosoftEdge=136.0.3240.92)
Stacktrace:
	GetHandleVerifier [0x00007FF61BD0F935+25029]
	(No symbol) [0x00007FF61BC64940]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF61BF6AA3A+1947706]
	(No symbol) [0x00007FF61BA28A30]
	(No symbol) [0x00007FF61BA1CE6F]
	(No symbol) [0x00007FF61BA1E4CD]
	(No symbol) [0x00007FF61BA1D0D9]
	(No symbol) [0x00007FF61BA1CCDC]
	(No symbol) [0x00007FF61BA1CA28]
	(No symbol) [0x00007FF61BA1AA69]
	(No symbol) [0x00007FF61BA1B03C]
	(No symbol) [0x00007FF61BA2F06A]
	(No symbol) [0x00007FF61BAB6D1E]
	(No symbol) [0x00007FF61BA9908A]
	(No symbol) [0x00007FF61BA6E15D]
	(No symbol) [0x00007FF61BAB6698]
	(No symbol) [0x00007FF61BA98DF3]
	(No symbol) [0x00007FF61BA6D6A6]
	(No symbol) [0x00007FF61BA6CBB3]
	(No symbol) [0x00007FF61BA6D4D3]
	(No symbol) [0x00007FF61BB75F0D]
	(No symbol) [0x00007FF61BB83AAF]
	(No symbol) [0x00007FF61BB7C49F]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF61BDD49EA+284650]
	(No symbol) [0x00007FF61BC722D1]
	(No symbol) [0x00007FF61BC6AD74]
	(No symbol) [0x00007FF61BC6AEC3]
	(No symbol) [0x00007FF61BC5CB06]
	BaseThreadInitThunk [0x00007FFEAE32E8D7+23]
	RtlUserThreadStart [0x00007FFEAF41C5DC+44]


In [None]:
# driver = webdriver.Edge(options=edge_options)
# driver.get(url)

In [None]:
df = pd.DataFrame(reviews)

In [None]:
df.head()

In [None]:
df['Source'] = 'amazon.au'
df['Product'] = 'E.L.F. GLOW REVIVER LIP OIL'

In [None]:
df.head()

In [None]:
# Specify the desired column order
desired_columns = ["Date", "Title", "Review", "Source", "Product"]

# Reorder the columns
df = df[desired_columns]

In [None]:
df = df.drop_duplicates()
df.shape

In [None]:
df.head()

In [None]:
df.to_excel(f'5_{df['Product'].iloc[0]}.xlsx', index=False)