In [3]:
!pip install selenium
!pip install beautifulsoup4
!pip install pandas



In [4]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import re
import json

# 1. Configure and launch Chrome.
# This form assumes chromedriver is preinstalled and on the PATH. If it is not,
# point Selenium at the binary explicitly:
#   chromedriver_path = './chromedriver.exe'
#   service = Service(chromedriver_path)

chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
print("WebDriver initialized successfully.")

# Give up on any single page load that exceeds 60 seconds.
driver.set_page_load_timeout(60)

# 2. Brands and cities to scrape
brands = ["honda"]  # only Honda for the current exercise
cities = ["mumbai", "bangalore", "new-delhi"]

# 3. URL template; {brand} and {city} are substituted per combination
base_url = "https://www.cars24.com/buy-used-{brand}-cars-{city}/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters"

# 4. One URL per (brand, city) pair, iterating cities within each brand
urls = [
    base_url.format(brand=brand_slug, city=city_slug)
    for brand_slug in brands
    for city_slug in cities
]

# Records scraped from every URL; one dict per listing card.
all_cars_data = []

# Record keys filled, in order, from a card's <p class="kvfdZL"> detail tags.
DETAIL_KEYS = ('KM Driven', 'Fuel Type', 'Transmission', 'Location_code')


def _extract_car_info(car):
    """Build one record from a single listing card.

    Args:
        car: The BeautifulSoup ``<a>`` tag that wraps one car listing.

    Returns:
        dict: Keys ``Year``, ``Make``, ``Model``, ``Price``, ``KM Driven``,
        ``Fuel Type``, ``Transmission``, ``Location_code``, ``Location`` and
        ``Link`` (in that order). A field that cannot be found is ``'N/A'``.
    """
    car_info = {'Year': 'N/A', 'Make': 'N/A', 'Model': 'N/A'}

    name_tag = car.find('span', class_='sc-braxZu kjFjan')
    model_tag = car.find('span', class_='sc-braxZu lmmumg')
    if name_tag is not None and model_tag is not None:
        name_parts = name_tag.get_text(strip=True).split()
        # Year/Make/Model are filled together or not at all, so a short name
        # leaves all three at 'N/A'.
        if len(name_parts) > 2:
            car_info['Year'] = name_parts[0]
            # NOTE(review): this takes the THIRD word of the name as the make.
            # If names look like "2019 Honda City", index 1 is the make and
            # index 2 is the model -- confirm against the live page.
            car_info['Make'] = name_parts[2]
            car_info['Model'] = model_tag.get_text(strip=True)

    price_tag = car.find('p', class_='cyPhJl')
    car_info['Price'] = price_tag.get_text(strip=True) if price_tag is not None else 'N/A'

    # The detail tags are positional; any missing trailing ones become 'N/A'.
    details = [p.get_text(strip=True) for p in car.find_all('p', class_='kvfdZL')]
    for position, key in enumerate(DETAIL_KEYS):
        car_info[key] = details[position] if position < len(details) else 'N/A'

    # The location wrapper may be absent; guard it so one incomplete card does
    # not raise and abort the rest of the page.
    city_div = car.find('div', class_='styles_ellipsis__uatjG')
    city_p = city_div.find('p') if city_div is not None else None
    car_info['Location'] = city_p.get_text(strip=True) if city_p is not None else 'N/A'

    car_info['Link'] = car['href'] if car.has_attr('href') else 'N/A'
    return car_info


# One driver is shared by all URLs; the finally clause closes it even if the
# run is interrupted part-way through.
try:
    for url in urls:
        try:
            print(f"Attempting to load URL: {url}")
            driver.get(url)

            # A static delay can be useful to mimic human behavior
            time.sleep(5)

            # Wait until at least one listing card has been rendered.
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "styles_carCardWrapper__sXLIp"))
            )
            print("Initial page loaded successfully.")

            # Only the cards present in the page source at this point are
            # parsed; the page is not scrolled to load further listings.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            car_listings = soup.find_all('a', class_='styles_carCardWrapper__sXLIp')

            for car in car_listings:
                all_cars_data.append(_extract_car_info(car))

            print(f"Successfully scraped {len(car_listings)} cars from {url}")

        except TimeoutException:
            # Raised by the page-load timeout or by the explicit wait above.
            print(f"Timed out while loading or waiting for listings on {url}")
        except Exception as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"An error occurred while processing {url}: {e}")
finally:
    driver.quit()
    print("WebDriver closed.")

# QC: echo every scraped record, one per line, before building the frame.
# (The guard keeps an empty result from printing a stray blank line.)
if all_cars_data:
    print(*all_cars_data, sep="\n")

# Collect the records into a DataFrame, one row per record.
df = pd.DataFrame(data=all_cars_data)

# Clean up the data
# Function to clean up 'KM driven' column and convert to numerical format
def convert_km_to_numeric(km_string):
    """Convert an odometer string such as ``"45k km"`` to kilometres.

    Thousands separators (commas) are ignored. A ``k`` suffix multiplies by
    1,000 and an ``L`` suffix by 100,000; a trailing ``km``/``kms`` is
    optional. Suffix matching is case-insensitive and tolerant of spacing,
    so ``"45K km"``, ``"45 k km"`` and ``"30km"`` are all accepted.

    Args:
        km_string: The raw value; anything ``pd.isna`` treats as missing
            yields ``None``. Other values are converted with ``str()``.

    Returns:
        float | None: The distance in kilometres, or ``None`` when the value
        is missing or is not a number in one of the formats above.
    """
    if pd.isna(km_string):
        return None

    text = str(km_string).replace(',', '').strip()  # drop thousands separators

    # <number> [k|l] [km|kms]; fullmatch lets the regex backtrack so that the
    # "k" in a bare "km" is not mistaken for the thousands suffix.
    match = re.fullmatch(
        r'(\d+(?:\.\d+)?|\.\d+)\s*([kl])?\s*(?:kms?)?',
        text,
        flags=re.IGNORECASE,
    )
    if match is None:
        return None

    number, unit = match.groups()
    value = float(number)
    if unit is None:
        return value
    return value * (1000 if unit.lower() == 'k' else 100000)

# Clean the scraped columns and export them to CSV.
# When every URL failed the frame has no rows and no columns, so indexing it
# below would raise KeyError; skip cleaning and export in that case.
if df.empty:
    print("No car data was scraped; skipping clean-up and CSV export.")
else:
    # Odometer text -> kilometres. pd.to_numeric guarantees a numeric column
    # even when every value failed to parse (apply alone would leave objects).
    df['KM Driven'] = pd.to_numeric(
        df['KM Driven'].apply(convert_km_to_numeric), errors='coerce'
    ).round()

    for text_column in ('Fuel Type', 'Transmission', 'Location_code'):
        df[text_column] = df[text_column].str.strip()

    # Strip the currency symbol and the " lakh" unit, then scale to rupees.
    # regex=False makes the literal replacement explicit, independent of the
    # pandas version's default. Anything left non-numeric (e.g. 'N/A') -> NaN.
    # NOTE(review): assumes every price is quoted in lakh -- confirm.
    price_text = (
        df['Price']
        .str.replace('₹', '', regex=False)
        .str.replace(' lakh', '', regex=False)
        .str.strip()
    )
    df['Price'] = (pd.to_numeric(price_text, errors='coerce') * 100000).round()
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

    # Converting dataframe to a CSV file for sharing and further work
    df.to_csv("all_cars.csv", index=False)
    print("Data saved to all_cars.csv")

WebDriver initialized successfully.
Attempting to load URL: https://www.cars24.com/buy-used-honda-cars-mumbai/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters
Initial page loaded successfully.
Successfully scraped 20 cars from https://www.cars24.com/buy-used-honda-cars-mumbai/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters
Attempting to load URL: https://www.cars24.com/buy-used-honda-cars-bangalore/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters
Initial page loaded successfully.
Successfully scraped 20 cars from https://www.cars24.com/buy-used-honda-cars-bangalore/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters
Attempting to load URL: https://www.cars24.com/buy-used-honda-cars-new-delhi/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters
Initial page loaded successfully.
Successfully scraped 40 cars from https://www.cars24.com/buy-used-honda-cars-new-delhi/?sort=bestmatch&serve