In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

# Extra HTTP request headers; the scraping loop hands this mapping to
# Chrome through the DevTools `Network.setExtraHTTPHeaders` command.
headers = {
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # Safari 17.1 on macOS user-agent string, split only for readability.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.1 Safari/605.1.15"
    ),
}

# Number of review pages to request.
# Example value; replace with the actual number of pages.
total_pages = 10

# Collects the DataFrame(s) that are concatenated before the CSV export.
final_list = []

# Accumulators shared by the whole scraping run: one list per review field,
# each receiving one entry per scraped review.
# NOTE(review): `scraped_reviews` is not used in the visible part of the script.
scraped_reviews = []
r_author, r_rating, r_title, r_content = [], [], [], []
r_date, r_verified, r_image = [], [], []

def _text_or_nan(element):
    """Return ``element.text``, or NaN when the selector matched nothing.

    ``np.nan`` must never be used as the "missing element" placeholder
    itself: it is truthy, so a later ``x.text if x else np.nan`` check would
    try to read ``.text`` from a float.
    """
    return np.nan if element is None else element.text


# The Chrome options are identical for every page, so build them once.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser UI)
chrome_options.add_argument("--disable-gpu")

# Iterate through all pages
for page in range(1, total_pages + 1):
    # Initialize a fresh Chrome WebDriver for each page
    driver = webdriver.Chrome(options=chrome_options)

    url = f"https://www.amazon.com/BERIBES-Cancelling-Transparent-Soft-Earpads-Charging-Black/product-reviews/B0CDC4X65Q/?pageNumber={page}"

    try:
        # Use Chrome DevTools Protocol (CDP) to set custom headers.
        # Done inside the try block so the browser is still closed by the
        # `finally` clause if one of these commands fails.
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})

        # Navigate to the URL
        driver.get(url)

        # Clear cookies
        driver.delete_all_cookies()

        # Get the page source after the JavaScript has executed
        html = driver.page_source

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        review_elements = soup.select("div.review")

        # Index of the first review of this page inside the shared lists,
        # used below to report on the current page only.
        page_start = len(r_author)

        for review in review_elements:
            # Extract every field first and append afterwards, so that a
            # failure on one review can never leave the column lists with
            # different lengths (which would break the DataFrame later).
            r_author_element = review.select_one("span.a-profile-name")
            r_rating_element = review.select_one("i.review-rating")
            r_title_element = review.select_one("a.review-title")
            r_title_span_element = (
                r_title_element.select_one("span:not([class])")
                if r_title_element is not None
                else None
            )
            r_content_element = review.select_one("span.review-text")
            r_date_element = review.select_one("span.review-date")
            r_verified_element = review.select_one("span.a-size-mini")
            r_image_element = review.select_one("img.review-image-tile")

            author = _text_or_nan(r_author_element)
            rating = (
                r_rating_element.text.replace("out of 5 stars", "")
                if r_rating_element is not None
                else np.nan
            )
            title = _text_or_nan(r_title_span_element)
            content = _text_or_nan(r_content_element)
            date = _text_or_nan(r_date_element)
            verified = _text_or_nan(r_verified_element)
            # An <img> without a `src` attribute is recorded as missing
            # instead of raising KeyError.
            image = (
                r_image_element.get("src", np.nan)
                if r_image_element is not None
                else np.nan
            )

            r_author.append(author)
            r_rating.append(rating)
            r_title.append(title)
            r_content.append(content)
            r_date.append(date)
            r_verified.append(verified)
            r_image.append(image)

        # Print the first and last review of the current page (for debugging purposes)
        print(f"Page: {page}")
        if len(r_author) > page_start:
            print("Author Name:", r_author[page_start], ':', r_author[-1])
            print("Ratings:", r_rating[page_start], ':', r_rating[-1])
            print("Title:", r_title[page_start], ':', r_title[-1])
            print("Content:", r_content[page_start], ':', r_content[-1])
            print("Date:", r_date[page_start], ':', r_date[-1])
            print("Verified:", r_verified[page_start], ':', r_verified[-1])
            print("Image:", r_image[page_start], ':', r_image[-1])
        else:
            # Nothing matched "div.review" on this page; indexing the lists
            # here would raise IndexError on an empty run.
            print(f"No reviews found on page {page}")

    except Exception as e:
        print(f"An error occurred on page {page}: {e}")

    finally:
        # Close the browser after processing each page
        driver.quit()

    # Pause between pages to avoid overwhelming the server with requests.
    # Kept outside the try block so a failed page is also followed by a delay.
    time.sleep(5)

# Combine all the scraped columns into a single DataFrame
review_dict = {
    'r_author': r_author,
    'r_rating': r_rating,
    'r_title': r_title,
    'r_content': r_content,
    'r_date': r_date,
    'r_verified': r_verified,
    'r_image': r_image,
}

df = pd.DataFrame(review_dict)

# Only queue the frame when at least one review was scraped. Appending it
# unconditionally made the "no data" branch below unreachable and wrote a
# header-only CSV while reporting success.
if not df.empty:
    final_list.append(df)

# Concatenate all DataFrames in the list
if final_list:
    final = pd.concat(final_list, ignore_index=True)
    # Save the final DataFrame to a CSV file
    final.to_csv('/home/rakesh/Desktop/amazon_product_review.csv', index=False)
    print("Data saved to amazon_product_review.csv")
else:
    print("No data to concatenate into the final DataFrame.")


Page: 1
Author Name: RJR : Dragongravy
Ratings: 5.0  : 4.0 
Title: Great Headphones! : great set of headphones for the money
Content: 
Wow! I ordered these over-the-ear headphones because my Samsung eat buds were never comfortable for me and they would often fall out of my ear, which is really a problem when that happens on a plane and the darn things rolls to who knows where under someone else's seat! Well, I just decided to forget about stupid ear buds and just get a set of over-the-ear noise canceling headphones. I was not about to shell out big bucks for them because I didn't think I would use them so much, mostly just when I'm traveling. So I looked over many options in the under $50 range and I chose these. I received them same day from Amazon. That was awesome! I opened the box, charged them fully in 2 hours. The usb to usb-c cord is included but you have to supply your own electrical power plug to plug it into a wall outlet to get the fastest charging. It also came with a 3.5mm