## Selenium

In [1]:
import os
import time
from getpass import getpass

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser options shared by every WebDriver session created in this notebook.
edge_options = Options()
# Present a desktop Edge user-agent string to the site.
edge_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54")
# Chromium expects "width,height" here; the previous "1920x1080" form is not
# the documented format for this switch.
edge_options.add_argument("--window-size=1920,1080")
edge_options.add_argument("--disable-extensions")
# These two settings reduce the automation markers the browser exposes.
edge_options.add_argument("--disable-blink-features=AutomationControlled")
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])

def login_amazon(driver, username, password, timeout=10):
    """Sign in to Amazon when the browser is currently on a sign-in page.

    Does nothing when the current URL is not a sign-in URL, so it is safe to
    call unconditionally after navigating to a product page.

    Args:
        driver: Selenium WebDriver instance to operate on.
        username: Account e-mail / user name typed into the first form step.
        password: Account password typed into the second form step.
        timeout: Seconds to wait for each form element to become usable.

    Raises:
        TimeoutException: If a form element does not appear within `timeout`.
    """
    # Match the sign-in path on any Amazon domain. The previous check was
    # pinned to amazon.in, so it never fired for the amazon.com URL that this
    # notebook actually opens.
    if '/ap/signin' not in driver.current_url:
        return

    wait = WebDriverWait(driver, timeout)

    # NOTE(review): the original looked up id "ap_user" only; Amazon's form has
    # commonly used "ap_email". Accept either - confirm against the live page.
    user_field = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#ap_email, #ap_user'))
    )
    user_field.send_keys(username)
    wait.until(EC.element_to_be_clickable((By.ID, 'continue'))).click()

    # Explicit waits replace the fixed sleeps: proceed as soon as the next
    # step is rendered instead of guessing how long it takes.
    password_field = wait.until(
        EC.visibility_of_element_located((By.ID, 'ap_password'))
    )
    password_field.send_keys(password)
    wait.until(EC.element_to_be_clickable((By.ID, 'signInSubmit'))).click()

    try:
        wait.until(lambda drv: '/ap/signin' not in drv.current_url)
    except TimeoutException:
        # Best effort, like the fixed pause it replaces: report and carry on.
        print("Still on the sign-in page after submitting; a CAPTCHA or OTP prompt may need manual attention.")

def _first_text(parent, selector):
    """Return the text of the first element under `parent` matching `selector`, or '' if none."""
    matches = parent.find_elements(By.CSS_SELECTOR, selector)
    return matches[0].text if matches else ''


def collect_reviews(driver, url, star_filter, max_pages=11):
    """Scrape the reviews for one star filter, following "next page" links.

    Args:
        driver: Selenium WebDriver instance to operate on.
        url: Reviews page URL; the star filter is appended as a query parameter.
        star_filter: Value for the `filterByStar` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        A list of dicts with the keys 'Name', 'Title', 'Date', 'Stars' and
        'Review'. A field that is absent from a review is stored as ''.
    """
    review_locator = (By.CSS_SELECTOR, 'div[data-hook="review"]')

    # Use '?' when the URL has no query string yet, otherwise extend it.
    separator = '&' if '?' in url else '?'
    driver.get(f"{url}{separator}filterByStar={star_filter}")
    time.sleep(10)  # fixed pause kept from the original pacing

    reviews = []
    try:
        for _ in range(max_pages):
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(review_locator)
            )

            for element in driver.find_elements(*review_locator):
                # Look fields up with find_elements so that one review missing
                # a field does not abort the whole collection run.
                reviews.append({
                    'Name': _first_text(element, '.a-profile-name'),
                    'Title': _first_text(element, '.review-title'),
                    'Date': _first_text(element, '.review-date').split(' on ')[-1],
                    'Stars': _first_text(element, '.review-rating').split(' out')[0],
                    'Review': _first_text(element, '.review-text'),
                })

            try:
                next_button = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.a-last a'))
                )
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", next_button)
                time.sleep(2)  # Respectful delay
            except TimeoutException:
                break  # No more pages to load for this star filter

    except TimeoutException:
        if reviews:
            # The wait failed on a later page; keep what was already gathered.
            print(f"Timed out after collecting {len(reviews)} reviews for {star_filter}; keeping partial results.")
        else:
            print(f"No reviews found for {star_filter}, skipping to the next filter.")

    return reviews

## Scraping Amazon Reviews

In [3]:
# Amazon account: credentials must not live in the notebook. Read them from
# the environment and fall back to an interactive prompt when they are unset.
# NOTE(review): the credentials previously committed in this cell should be
# treated as compromised and rotated.
username = os.environ.get('AMAZON_USERNAME') or input('Amazon username: ')
password = os.environ.get('AMAZON_PASSWORD') or getpass('Amazon password: ')

# Product reviews URL and the star filters to scrape, one pass per filter
url = 'https://www.amazon.com/product-reviews/B07N7672D4/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
star_filters = ['one_star', 'two_star', 'three_star', 'four_star', 'five_star']
all_reviews = []

# Initialize WebDriver
driver = webdriver.Edge(options=edge_options)
try:
    driver.get(url)
    time.sleep(10)

    login_amazon(driver, username, password)

    for star_filter in star_filters:
        reviews = collect_reviews(driver, url, star_filter)
        all_reviews.extend(reviews)
        print(f"Collected {len(reviews)} reviews for {star_filter}")
finally:
    # Always release the browser, including when a filter pass raises;
    # reviews gathered before the failure remain available in `all_reviews`.
    driver.quit()

No reviews found for one_star, skipping to the next filter.
Collected 0 reviews for one_star


InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: MicrosoftEdge=135.0.3179.98)
Stacktrace:
	GetHandleVerifier [0x00007FF7B9817A55+24981]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7B976AC60+444240]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7B9A9E0EA+2011274]
	(No symbol) [0x00007FF7B952B880]
	(No symbol) [0x00007FF7B954A24A]
	(No symbol) [0x00007FF7B95AED66]
	(No symbol) [0x00007FF7B95C61EA]
	(No symbol) [0x00007FF7B95A8E03]
	(No symbol) [0x00007FF7B957D686]
	(No symbol) [0x00007FF7B957CB92]
	(No symbol) [0x00007FF7B957D4B3]
	(No symbol) [0x00007FF7B967900D]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF7B9686943+41539]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF7B967FC79+13689]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7B98FA04A+290794]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7B9777D31+497697]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7B9770BB4+468644]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7B9770D03+468979]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7B9762E16+411910]
	BaseThreadInitThunk [0x00007FFFD953E8D7+23]
	RtlUserThreadStart [0x00007FFFDA8FC5DC+44]


In [None]:
# Report how many review records were accumulated in `all_reviews`.
print(f"Total reviews collected: {len(all_reviews)}")

In [None]:
# Build the frame with an explicit schema so that an empty scrape still yields
# the expected columns instead of a column-less DataFrame.
df = pd.DataFrame(all_reviews, columns=['Name', 'Title', 'Date', 'Stars', 'Review'])
df.head()

In [None]:
# Derive a human-readable product label from the first path segment of `url`.
# For URLs shaped like .../product-reviews/<id>/... that segment is the
# literal "product-reviews", which would label every export "product reviews";
# in that case use the segment that follows it instead.
url_parts = url.split('/')
product_slug = url_parts[3]
if product_slug == 'product-reviews' and len(url_parts) > 4:
    product_slug = url_parts[4]
product = product_slug.replace('-', ' ')
print(product)

In [None]:
review_columns = ['Name', 'Title', 'Date', 'Review']
desired_columns = ['Index', 'Name', 'Title', 'Date', 'Review', 'Product', 'Source']

# De-duplicate on the review content BEFORE numbering the rows: once a unique
# 'Index' column exists, no two rows compare equal and drop_duplicates() can
# never remove anything. reindex() also keeps this cell working when the
# frame is empty and safe to re-run.
df = (
    df.reindex(columns=review_columns)
      .drop_duplicates()
      .reset_index(drop=True)
)
df['Index'] = range(1, len(df) + 1)
df['Product'] = product
df['Source'] = 'Amazon'

df = df[desired_columns]
df.head()

In [None]:
# Show the (rows, columns) dimensions of `df`.
df.shape

In [None]:
# Write `df` to an Excel workbook named after `product`; index=False leaves
# the DataFrame's row index out of the sheet.
df.to_excel(f"{product}_review.xlsx", index=False)