In [1]:
pip install selenium-stealth

Note: you may need to restart the kernel to use updated packages.


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from bs4 import BeautifulSoup
import psycopg2
import time
import random

# Function to scrape data using Selenium and insert into PostgreSQL
def scrape_amazon_watches():
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Set up Chrome driver with Selenium Stealth to bypass CAPTCHAs
    cService = ChromeService(executable_path='chromedriver.exe') 
    driver = webdriver.Chrome(service=cService, options=options)
    driver.get('https://www.amazon.com/')

    # Apply stealth to bypass detection
    stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    # Visit the Amazon page for watches
    url = "https://www.amazon.com/s?i=fashion-mens-intl-ship&bbn=16225019011&rh=n%3A6358539011%2Cp_123%3A179010&dc&ds=v1%3AXSc7rAHtUNQhBoQ%2F5bRQK%2F4DbOdWCWJSkGhWifMVdhs&qid=1729009350"
    driver.get(url)
    time.sleep(random.uniform(3, 5))  # Random sleep to mimic human behavior

    # Extract page source and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Connect to PostgreSQL database
    conn = psycopg2.connect(
        host="localhost",
        database="watch_db1",  
        user="postgres",  
        password="1234" 
    )
    cursor = conn.cursor()

    # Create tables if they don't exist
    cursor.execute('''CREATE TABLE IF NOT EXISTS watches (
        id SERIAL PRIMARY KEY,
        title TEXT NOT NULL,
        price TEXT NOT NULL,
        specifications TEXT,
        image_url TEXT
    );''')

    cursor.execute('''CREATE TABLE IF NOT EXISTS reviews (
        id SERIAL PRIMARY KEY,
        watch_id INTEGER REFERENCES watches(id),
        rating TEXT
    );''')
    conn.commit()

    # Extract watch details
    for product in soup.select('.s-result-item'):
        # Extract the title of the watch
        title_element = product.select_one('h2 .a-size-base-plus.a-color-base.a-text-normal')
        title_text = title_element.get_text(strip=True) if title_element else 'Unknown'

        # Extract the price of the watch
        price_element = product.select_one('.a-price-whole')
        price_text = price_element.get_text(strip=True) if price_element else 'Unavailable'

        # Extract specifications or any additional details if available
        specifications = 'N/A' 
        
        # Extract image URL if available
        image_element = product.select_one('img')
        image_url = image_element['src'] if image_element else 'N/A'

        # Print extracted data to verify
        print(f"Title: {title_text}, Price: {price_text}, Image URL: {image_url}")

        try:
            # Insert watch data into the database
            cursor.execute('''INSERT INTO watches (title, price, specifications, image_url)
                              VALUES (%s, %s, %s, %s) RETURNING id;''', (title_text, price_text, specifications, image_url))
            watch_id = cursor.fetchone()[0]
            conn.commit()
            print(f"Watch data inserted successfully with ID: {watch_id}")

            # Extract average rating from the product listing
            rating_element = product.select_one('span[aria-label*="out of 5 stars"]')
            average_rating = rating_element.get_text(strip=True) if rating_element else 'N/A'
            print(f"Average Rating: {average_rating}")

            # Insert average rating into the reviews table
            cursor.execute('''INSERT INTO reviews (watch_id, rating)
                              VALUES (%s, %s);''', (watch_id, average_rating))
            conn.commit()
            print(f"Average rating inserted for watch ID {watch_id}")

        except Exception as e:
            print(f"Error inserting data: {e}")
            conn.rollback()  # Rollback the transaction to avoid blocking further inserts

    # Close the database connection
    cursor.close()
    conn.close()
    
    # Close the Selenium driver
    driver.quit()


scrape_amazon_watches()


Title: Unknown, Price: Unavailable, Image URL: N/A
Watch data inserted successfully with ID: 70
Average Rating: N/A
Average rating inserted for watch ID 70
Title: Fossil Nate Men's Watch with Oversized Chronograph Watch Dial and Stainless Steel or Leather Band, Price: 90., Image URL: https://m.media-amazon.com/images/I/71B-vrIIA3L._AC_UL320_.jpg
Watch data inserted successfully with ID: 71
Average Rating: 4.5 out of 5 stars
Average rating inserted for watch ID 71
Title: Fossil Grant Men's Watch with Chronograph or Automatic Display and Genuine Leather or Stainless Steel Band, Price: 80., Image URL: https://m.media-amazon.com/images/I/818lBoWqXtL._AC_UL320_.jpg
Watch data inserted successfully with ID: 72
Average Rating: 4.6 out of 5 stars
Average rating inserted for watch ID 72
Title: Fossil Men's Coachman Quartz Stainless Steel and Leather Chronograph Watch, Color: Silver, Brown (Model: CH2891), Price: 101., Image URL: https://m.media-amazon.com/images/I/71taSx89wUL._AC_UL320_.jpg
Wat