In [8]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Function to scrape car information from a given URL
def scrape_car_info(url, location, timeout=30):
    """Scrape every vehicle linked from one search-results page.

    Downloads ``url``, collects the anchors whose ``href`` starts with
    ``/vehicledetail/`` and hands each absolute detail URL to
    ``scrape_car_details``.

    Args:
        url: Address of the search-results page to download.
        location: Location key, forwarded unchanged to ``scrape_car_details``.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A list holding one dict per vehicle for which ``scrape_car_details``
        returned data (the list may be empty), or ``None`` when the page
        could not be fetched, contained no listing links, or an error
        was raised.
    """
    # NOTE(review): 'br' is advertised below; requests only decodes Brotli
    # transparently when a brotli package is installed - confirm it is.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive'
    }
    try:
        print("Scraping car info from:", url)
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would freeze the whole scrape on a single page.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to connect to: {url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        car_listings = soup.find_all("a", href=re.compile("^/vehicledetail/"))
        if not car_listings:
            print("No car listings found on this page.")
            return None

        car_data = []
        for car_link in car_listings:
            # The matched hrefs are site-relative, so prefix the host.
            car_url = "https://www.cars.com" + car_link['href']
            car_details = scrape_car_details(car_url, location)
            if car_details:
                car_data.append(car_details)
        return car_data
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller go on.
        print("Error:", e)
        return None
        
# Function to parse car details from a car's page
def scrape_car_details(url, location, timeout=30):
    """Scrape one vehicle detail page into a flat record.

    Reads and updates the module-level ``parsed_urls`` set so each URL is
    fetched at most once, and reads the module-level ``locations`` mapping
    to fill in the ZIP code for ``location``.

    Args:
        url: Absolute URL of the vehicle detail page.
        location: Key into ``locations``; stored in the record as-is.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A dict with the keys ``Location``, ``Zip_Code``, ``Status``,
        ``Stock_Number``, ``VIN``, ``Year``, ``Brand``, ``Model``,
        ``Mileage``, ``Exterior_Color``, ``Interior_Color``, ``Drive_Train``,
        ``MPG``, ``Fuel_Type``, ``Transmission``, ``Engine``, ``Price``,
        ``Seller_Name``, ``Seller_Rating``, ``Consumer_Rating``, ``Comfort``,
        ``Interior``, ``Performance``, ``Value``, ``Exterior``,
        ``Reliability``, ``Accidents_or_Damage`` and ``1_Owner_Vehicle``.
        Fields that are absent from the page are ``None`` (``MPG`` uses the
        string ``'NULL'`` instead). Returns ``None`` when the URL was already
        processed, the response status is not 200, or an error was raised.
    """
    try:
        if url in parsed_urls:
            return None

        # Recorded before the request is made, so a URL whose download fails
        # is not attempted again later in the run.
        parsed_urls.add(url)

        # NOTE(review): 'br' is advertised below; requests only decodes
        # Brotli transparently when a brotli package is installed - confirm.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive'
        }

        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to connect to: {url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        # Title is split on whitespace: first token, second token, the rest.
        title = _element_text(soup.find("h1", class_="listing-title"))
        year, brand, model = _split_listing_title(title)

        # Keep only the numeric pieces of the MPG text, joined by '-'.
        mpg_text = _spec_value(soup, "MPG")
        mpg = "-".join(re.findall(r'\b\d+-?\d*\b', mpg_text)) if mpg_text else 'NULL'

        # The first rating count on the page is stored as the seller rating
        # and the second, when present, as the consumer rating.
        rating_counts = soup.find_all("span", class_="sds-rating__count")
        seller_rating = rating_counts[0].text.strip() if rating_counts else None
        consumer_rating = rating_counts[1].text.strip() if len(rating_counts) > 1 else None

        return {
            "Location": location,
            "Zip_Code": locations[location],
            "Status": _element_text(soup.find("p", class_="new-used")),
            "Stock_Number": _spec_value(soup, "Stock #"),
            "VIN": _spec_value(soup, "VIN"),
            "Year": year,
            "Brand": brand,
            "Model": model,
            "Mileage": _spec_value(soup, "Mileage"),
            "Exterior_Color": _spec_value(soup, "Exterior color"),
            "Interior_Color": _spec_value(soup, "Interior color"),
            "Drive_Train": _spec_value(soup, "Drivetrain"),
            "MPG": mpg,
            "Fuel_Type": _spec_value(soup, "Fuel type"),
            "Transmission": _spec_value(soup, "Transmission"),
            "Engine": _spec_value(soup, "Engine"),
            "Price": _element_text(soup.find("span", {"data-qa": "primary-price"})),
            "Seller_Name": _element_text(soup.find("h3", class_="spark-heading-5 heading seller-name")),
            "Seller_Rating": seller_rating,
            "Consumer_Rating": consumer_rating,
            "Comfort": _review_score(soup, "Comfort"),
            "Interior": _review_score(soup, "Interior"),
            "Performance": _review_score(soup, "Performance"),
            "Value": _review_score(soup, "Value"),
            "Exterior": _review_score(soup, "Exterior"),
            "Reliability": _review_score(soup, "Reliability"),
            "Accidents_or_Damage": _element_text(soup.find("dd", {"data-qa": "accidents-or-damage-value"})),
            "1_Owner_Vehicle": _element_text(soup.find("dd", {"data-qa": "one-owner-value"}))
        }
    except Exception as e:
        # Best-effort scraping: report the problem and skip this vehicle.
        print("Error:", e)
        return None


def _element_text(element):
    """Return the stripped text of a parsed element, or None if it is missing."""
    return element.text.strip() if element is not None else None


def _spec_value(soup, label):
    """Return the text of the <dd> that follows the <dt> whose string is label."""
    term = soup.find("dt", string=label)
    if term is None:
        return None
    # find_next can come back empty; report the field as missing rather than
    # raising and losing every other field of the record.
    return _element_text(term.find_next("dd"))


def _review_score(soup, label):
    """Return the definition-list value that follows the display-name span for label."""
    name_span = soup.find("span", class_="sds-definition-list__display-name", string=label)
    if name_span is None:
        return None
    return _element_text(name_span.find_next("span", class_="sds-definition-list__value"))


def _split_listing_title(title):
    """Split a listing title into (year, brand, model) by whitespace position."""
    if not title:
        return None, None, None
    parts = title.split()
    year = parts[0] if parts else None
    brand = parts[1] if len(parts) > 1 else None
    model = " ".join(parts[2:]) if len(parts) > 2 else None
    return year, brand, model

# Locations to scrape, each mapped to the ZIP code used in the search query
locations = {
    "atlanta-ga": "30371",
    "chicago-il": "60608",
    "columbus-oh": "43201",
    "dallas-tx": "75371",
    "denver-co": "80238",
    "houston-tx": "77009",
    "los_angeles-ca": "90134",
    "new_york-ny": "11221",
    "philadelphia-pa": "19140",
    "phoenix-az": "85069",
    "san_diego-ca": "92142",
    "seattle-wa": "98191",
}

# Detail-page URLs that have already been handed to scrape_car_details
parsed_urls = set()

# Upper bound on the number of result pages requested per location
max_pages = 10

# Records gathered across every location
all_car_data = []

for location, zip_code in locations.items():
    page_count = 1
    while page_count <= max_pages:
        url = f"https://www.cars.com/shopping/results/?zip={zip_code}&page={page_count}&include_shippable=false"
        car_data = scrape_car_info(url, location)
        if car_data:
            all_car_data.extend(car_data)
            page_count += 1
        else:
            # None or an empty list ends the paging for this location
            break

# Put the collected records into a DataFrame
df = pd.DataFrame(all_car_data)

# Write the DataFrame to CSV without the index column
df.to_csv("cars_by_popular_city.csv", index=False)


Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=1&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=2&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=3&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=4&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=5&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=6&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=7&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=8&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?zip=30371&page=9&include_shippable=false
Scraping car info from: https://www.cars.com/shopping/results/?z