In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Grab all vehicles on each location's vehicle page
def scrape_car_info(url, location, timeout=30):
    """Scrape one search-results page and collect details for its vehicles.

    Fetches ``url``, gathers every anchor whose ``href`` starts with
    ``/vehicledetail/`` and passes each resulting detail URL to
    ``scrape_car_details``.

    Args:
        url: Address of the search-results page to fetch.
        location: Location label forwarded unchanged to
            ``scrape_car_details``.
        timeout: Seconds ``requests.get`` may wait on the server before
            raising, so a stalled connection cannot hang the crawl.

    Returns:
        A list holding every truthy result returned by
        ``scrape_car_details`` (possibly empty), or ``None`` when the page
        has no vehicle links, the status code is not 200, or an exception
        was raised while fetching or parsing.
    """
    # Accept-Encoding is intentionally NOT set here. Forcing "br" makes the
    # server free to answer with Brotli, which requests can only decode when
    # an optional Brotli library is installed; leaving the header out lets
    # requests advertise just the encodings it is able to decode.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }
    print("Scraping car info from:", url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to connect to: {url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        car_listings = soup.find_all("a", href=re.compile("^/vehicledetail/"))
        if not car_listings:
            print("No car listings found on this page.")
            return None

        # Several anchors can point at the same vehicle; keep the first
        # occurrence of each href (order preserved) so a detail page is only
        # requested once from this listing page.
        unique_hrefs = dict.fromkeys(link['href'] for link in car_listings)

        car_data = []
        for href in unique_hrefs:
            car_details = scrape_car_details("https://www.cars.com" + href, location)
            if car_details:
                car_data.append(car_details)
        return car_data
    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a best-effort
        # crawl, so one bad page is reported and skipped instead of aborting
        # the whole run.
        print("Error:", e)
        return None
        
# Parse each individual vehicle's detail page for data
def _spec_value(soup, label):
    """Return the stripped text of the <dd> following the <dt> whose string is ``label``, else None."""
    term = soup.find("dt", string=label)
    if term is None:
        return None
    definition = term.find_next("dd")
    # A <dt> without any following <dd> is treated as a missing value rather
    # than raising and discarding the whole vehicle record.
    return definition.text.strip() if definition is not None else None


def _parse_mpg(mpg_text):
    """Join the numeric tokens of ``mpg_text`` with '-', or return 'NULL' when there are none."""
    if not mpg_text:
        return 'NULL'
    numbers = re.findall(r'\b\d+-?\d*\b', mpg_text)
    # Text without any digits carries no MPG figure, so report it with the
    # same 'NULL' marker used for an absent MPG entry instead of "".
    return "-".join(numbers) if numbers else 'NULL'


def _split_title(title):
    """Split a listing title into (year, brand, model) by whitespace position."""
    # NOTE(review): assumes titles look like "<year> <brand> <model ...>";
    # a multi-word brand would be split between brand and model - confirm
    # against real listings.
    if not title:
        return None, None, None
    parts = title.split()
    year = parts[0] if parts else None
    brand = parts[1] if len(parts) > 1 else None
    model = " ".join(parts[2:]) if len(parts) > 2 else None
    return year, brand, model


def scrape_car_details(url, location, timeout=30):
    """Fetch one vehicle detail page and extract its attributes.

    Uses the module-level ``parsed_urls`` set to skip URLs that were already
    handled. The URL is recorded before the request is made, so a URL whose
    fetch fails is not attempted again.

    Args:
        url: Address of the vehicle detail page.
        location: Location label stored in the result under ``"Location"``.
        timeout: Seconds ``requests.get`` may wait on the server before
            raising, so a stalled connection cannot hang the crawl.

    Returns:
        A dict with the keys ``Location``, ``Exterior_Color``,
        ``Interior_Color``, ``Drive_Train``, ``MPG``, ``Fuel_Type``,
        ``Transmission``, ``Engine``, ``Mileage``, ``Status``, ``Year``,
        ``Brand`` and ``Model``. Fields not found on the page are ``None``,
        except ``MPG`` which is the string ``'NULL'``. Returns ``None`` when
        the URL was already parsed, the status code is not 200, or an
        exception was raised.
    """
    try:
        if url in parsed_urls:
            return None
        parsed_urls.add(url)  # Remember the URL so it is never fetched twice

        # Accept-Encoding is intentionally NOT set here. Forcing "br" makes
        # the server free to answer with Brotli, which requests can only
        # decode when an optional Brotli library is installed; leaving the
        # header out lets requests advertise only what it can decode.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to connect to: {url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        status_element = soup.find("p", class_="new-used")
        title_element = soup.find("h1", class_="listing-title")
        status = status_element.text.strip() if status_element else None
        title = title_element.text.strip() if title_element else None
        year, brand, model = _split_title(title)

        return {
            "Location": location,
            "Exterior_Color": _spec_value(soup, "Exterior color"),
            "Interior_Color": _spec_value(soup, "Interior color"),
            "Drive_Train": _spec_value(soup, "Drivetrain"),
            "MPG": _parse_mpg(_spec_value(soup, "MPG")),
            "Fuel_Type": _spec_value(soup, "Fuel type"),
            "Transmission": _spec_value(soup, "Transmission"),
            "Engine": _spec_value(soup, "Engine"),
            "Mileage": _spec_value(soup, "Mileage"),
            "Status": status,
            "Year": year,
            "Brand": brand,
            "Model": model
        }
    except Exception as e:
        # Deliberately broad: this is the per-vehicle boundary of a
        # best-effort crawl, so one bad page is reported and skipped instead
        # of aborting the whole run.
        print("Error:", e)
        return None

# Location slugs to scrape; each one is inserted into the search URL below
# and stored with every vehicle record as its "Location".
locations = [
    "atlanta-ga",
    "chicago-il",
    "columbus-oh",
    "dallas-tx",
    "denver-co",
    "houston-tx",
    "los-angeles-ca",
    "new-york-ny",
    "philadelphia-pa",
    "phoenix-az",
    "san-diego-ca",  # was "san_diego-ca"; every other slug is hyphen-separated
    "seattle-wa",
]

# Detail-page URLs already handled; read and updated by scrape_car_details
parsed_urls = set()

# Maximum number of result pages to request for each location
max_pages = 10

all_car_data = []  # Vehicle records accumulated across all locations

for location in locations:
    for page_count in range(1, max_pages + 1):
        url = f"https://www.cars.com/shopping/{location}/?page={page_count}"
        car_data = scrape_car_info(url, location)
        # None (failed request / no listings) and an empty list (nothing new
        # on the page) both end pagination for this location.
        if not car_data:
            break
        all_car_data.extend(car_data)

# Convert the collected records to a DataFrame
df = pd.DataFrame(all_car_data)

# Save the DataFrame to CSV without the index column
df.to_csv("cars_by_popular_city.csv", index=False)


Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=1
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=2
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=3
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=4
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=5
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=6
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=7
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=8
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=9
Scraping car info from: https://www.cars.com/shopping/atlanta-ga/?page=10
Scraping car info from: https://www.cars.com/shopping/chicago-il/?page=1
Scraping car info from: https://www.cars.com/shopping/chicago-il/?page=2
Scraping car info from: https://www.cars.com/shopping/chicago-il/?page=3
Scraping car info from: https://www.cars.com/shopp