In [39]:
import random
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [40]:
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
]

In [41]:
def get_random_user_agent():
    """Return a random user agent from the list."""
    return random.choice(USER_AGENTS)

In [42]:
def initialize_driver():
    """Initialize the Selenium WebDriver with Edge."""
    edge_driver_path = r'C:\Users\hp\Downloads\msedgedriver.exe'  # Update this path if needed
    service = Service(edge_driver_path)
    options = webdriver.EdgeOptions()
    
    # Anti-detection measures
    options.add_argument(f"user-agent={get_random_user_agent()}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    
    driver = webdriver.Edge(service=service, options=options)
    return driver

In [43]:
def extract_car_info(driver):
    """Extract car information from the current page."""
    car_data = []
    
    # Wait for car elements to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="results-container"]/ul/li'))
        )
    except TimeoutException:
        print("Timed out waiting for car elements to load.")
        return car_data
    
    car_elements = driver.find_elements(By.XPATH, '//*[@id="results-container"]/ul/li')
    print(f"Found {len(car_elements)} car elements on the page.")
    
    for car_element in car_elements:
        try:
            year_make_model = car_element.find_element(By.XPATH, './/div[1]/h2/a/div[1]').text
            trim = car_element.find_element(By.XPATH, './/div[1]/h2/a/div[2]').text 
            price = car_element.find_element(By.XPATH, './/div[2]/div[2]/div[1]/span').text
            mileage = car_element.find_element(By.XPATH, './/ul[1]/li[1]/span[2]').text
            
            car_data.append({
                'Year_Make_Model': year_make_model,
                'Trim': trim,
                'Price': price,
                'Mileage': mileage
            })
        except Exception as e:
            print(f"Error extracting data: {e}")
    
    return car_data

In [44]:
def scrape_all_pages(base_url, max_pages=40):
    """Scrape car data from multiple pages."""
    driver = initialize_driver()
    all_car_data = []
    page_number = 1

    while page_number <= max_pages:
        print(f"Navigating to page {page_number}")
        driver.get(f"{base_url}?page={page_number}")
        time.sleep(random.uniform(10, 20))  # Random delay
        
        # Extract car information from the current page
        car_data = extract_car_info(driver)
        all_car_data.extend(car_data)
        
        # Check if there is a next page
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="results-container"]//a/span[contains(text(), "Next")]'))
            )
            print("Found the 'Next' button.")
            
            if "disabled" in next_button.get_attribute("class"):
                print("Reached the last page.")
                break
            
            next_button.click()
            time.sleep(random.uniform(10, 20))  # Random delay
        except (NoSuchElementException, TimeoutException):
            print("No more pages to load.")
            break
        
        # Move to the next page
        page_number += 1

    # Close the driver
    driver.quit()
    
    return all_car_data



In [45]:
def save_to_csv(data, filename="cars_data.csv"):
    """Save the scraped data to a CSV file."""
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

In [46]:
def main():
    """Main function to run the scraping process."""
    base_url = 'https://www.edmunds.com/inventory/srp.html'
    cars_data = scrape_all_pages(base_url)
    save_to_csv(cars_data)

In [47]:
if __name__ == "__main__":
    main()

Navigating to page 1
Found 22 car elements on the page.
Error extracting data: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div[1]/h2/a/div[1]"}
  (Session info: MicrosoftEdge=132.0.2957.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF705660AD5+13637]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF7058EBC04+2078900]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF7058466C6+1401718]
	(No symbol) [0x00007FF70541D84C]
	(No symbol) [0x00007FF70541DA0B]
	(No symbol) [0x00007FF705413E0C]
	(No symbol) [0x00007FF70543DE1F]
	(No symbol) [0x00007FF705413DB7]
	(No symbol) [0x00007FF705413C7D]
	(No symbol) [0x00007FF70543E0C0]
	(No symbol) [0x00007FF705413DB7]
	(No symbol) [0x00007FF70545890D]
	(No symbol) [0x00007FF70543DA43]
	(No symbol) [0x00007FF705413304]
	(No symbol) [0x00007F