## CV Final Project
## Scraping Car Images (2013 & Newer) from Craigslist

In [None]:
# Imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# Function to get the links of individual listings
def get_listing_links(url, timeout=30):
    """Return the listing URLs found on a Craigslist search-results page.

    Args:
        url: Search-results URL to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of href strings, one per search result that contains a link.
        An empty list is returned (and a message printed) when the request
        fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network-level failures like a bad status: report and return nothing.
        print(f"Failed to retrieve page: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # 'li' tags with this class contain the listing links
    listing_items = soup.find_all('li', class_='cl-static-search-result')

    links = []
    for item in listing_items:
        anchor = item.find('a')
        # .get() avoids a KeyError on an anchor that has no href attribute.
        href = anchor.get('href') if anchor else None
        if href:
            links.append(href)
    return links

In [None]:
# Smoke-test get_listing_links on the Chicago search URL
# (query filters: hasPic=1, min_auto_year=2013).
base_url = "https://chicago.craigslist.org/search/cta?bundleDuplicates=1&hasPic=1&min_auto_year=2013#search=1~gallery~0~0"
test_links = get_listing_links(base_url)

# Check the first few links (an empty list means the request failed or nothing matched)
print(test_links[:5])

['https://chicago.craigslist.org/chc/ctd/d/charlotte-2024-biz-on-wheels/7805177852.html', 'https://chicago.craigslist.org/chc/ctd/d/charlotte-2024-biz-on-wheels/7805177737.html', 'https://chicago.craigslist.org/chc/cto/d/wilmette-nissan-altima-2017-for-sale/7805173207.html', 'https://chicago.craigslist.org/sox/ctd/d/oak-forest-2018-audi-a6-30t-quattro/7805169904.html', 'https://chicago.craigslist.org/chc/ctd/d/arlington-heights-2019-chevrolet/7805166523.html']


In [None]:
# Function to extract image URLs from the 'thumbs' section of the individual listings html
def extract_images_from_thumbs(soup):
    """Collect the image links from a listing page's 'thumbs' container.

    Args:
        soup: Parsed listing page; must support ``find('div', id='thumbs')``.

    Returns:
        A list with the non-empty ``href`` of every ``<a>`` inside the
        ``div`` whose id is ``thumbs``, in document order. Returns an empty
        list when that div is not present.
    """
    gallery = soup.find('div', id='thumbs')
    if not gallery:
        return []

    # Anchors without an href (or with an empty one) are dropped.
    hrefs = (anchor.get('href') for anchor in gallery.find_all('a'))
    return [href for href in hrefs if href]

# Function to scrape images, year, make, model from each individual listing
def get_car_details(listing_url, max_images=5, timeout=30):
    """Scrape year, make/model and image URLs from a single listing page.

    Args:
        listing_url: URL of the individual listing.
        max_images: Maximum number of image URLs to keep. Defaults to 5,
            since only images of the exterior of the car are wanted.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with keys ``'year'``, ``'make_model'``, ``'images'`` and
        ``'url'``, or ``None`` (after printing a message) when the request
        fails or the response status is not 200. Missing fields are filled
        with the placeholder strings ``'Year not found'`` and
        ``'Make/Model not found'``.
    """
    try:
        response = requests.get(listing_url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable listing should be skipped, not abort the whole scrape.
        print(f"Failed to retrieve page: {exc} for {listing_url}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code} for {listing_url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the year
    year = soup.find('span', class_='valu year')
    year_text = year.get_text(strip=True) if year else 'Year not found'

    # Extract the make and model
    make_model = soup.find('span', class_='valu makemodel')
    make_model_text = make_model.get_text(strip=True) if make_model else 'Make/Model not found'

    # Keep only the first few thumbnails
    image_urls = extract_images_from_thumbs(soup)[:max_images]

    # Structure the details as a dictionary
    return {
        'year': year_text,
        'make_model': make_model_text,
        'images': image_urls,
        'url': listing_url
    }

In [None]:
# Smoke-test get_car_details on the first link in test_links and print the result
# (a dict with year / make_model / images / url, or None if the page could not be fetched).
# NOTE(review): test_links[0] raises IndexError if test_links is empty.
test_details = get_car_details(test_links[0])
print(test_details)

{'year': '2024', 'make_model': 'Biz On Wheels', 'images': ['https://images.craigslist.org/00k0k_6V83nSbEMFc_0cU09G_600x450.jpg', 'https://images.craigslist.org/00808_lOBHHc8PeI2_0cU09G_600x450.jpg', 'https://images.craigslist.org/00u0u_f2jLPtZb26x_0cU09G_600x450.jpg', 'https://images.craigslist.org/00s0s_kaEcmmi9p8L_0cU09G_600x450.jpg', 'https://images.craigslist.org/00F0F_9HAgSe3ZfYc_0cU09G_600x450.jpg'], 'url': 'https://chicago.craigslist.org/chc/ctd/d/charlotte-2024-biz-on-wheels/7805177852.html'}


In [None]:
# Main function to scrape multiple listings and store results
def scrape_craigslist(base_url, max_listings=10, delay=2):
    """Scrape details for up to ``max_listings`` listings from a search page.

    Args:
        base_url: Search-results URL passed to ``get_listing_links``.
        max_listings: Maximum number of listings to process (taken from the
            start of the link list).
        delay: Seconds to pause between consecutive listing requests to
            avoid rate limiting.

    Returns:
        A list of the detail dicts returned by ``get_car_details``; listings
        that could not be retrieved are skipped.
    """
    listing_links = get_listing_links(base_url)
    print(f"Found {len(listing_links)} listings. Processing up to {max_listings}.")

    # Limit to the desired number of listings
    listing_links = listing_links[:max_listings]
    total = len(listing_links)
    car_details_list = []

    for i, link in enumerate(listing_links, start=1):
        print(f"Processing listing {i}/{total}: {link}")
        try:
            details = get_car_details(link)
        except requests.RequestException as exc:
            # Keep what has been collected so far instead of losing it to
            # one network error partway through a long run.
            print(f"Request error for listing {i}: {exc}")
            details = None

        if details:
            car_details_list.append(details)
            print(f"Retrieved details for listing {i}.")
        else:
            print(f"Skipping listing {i} due to errors.")

        # No request follows the last listing, so there is nothing to wait for.
        if i < total:
            time.sleep(delay)  # delay to avoid rate limiting

    return car_details_list

In [None]:
# Smoke-test scrape_craigslist on the Chicago search: first 5 listings only,
# 2 s pause between requests, then print the collected records.
base_url = "https://chicago.craigslist.org/search/cta?bundleDuplicates=1&hasPic=1&min_auto_year=2013#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])

Found 342 listings. Processing up to 5.
Processing listing 1/5: https://chicago.craigslist.org/chc/ctd/d/charlotte-2024-biz-on-wheels/7805177852.html
Retrieved details for listing 1.
Processing listing 2/5: https://chicago.craigslist.org/chc/ctd/d/charlotte-2024-biz-on-wheels/7805177737.html
Retrieved details for listing 2.
Processing listing 3/5: https://chicago.craigslist.org/chc/cto/d/wilmette-nissan-altima-2017-for-sale/7805173207.html
Retrieved details for listing 3.
Processing listing 4/5: https://chicago.craigslist.org/sox/ctd/d/oak-forest-2018-audi-a6-30t-quattro/7805169904.html
Retrieved details for listing 4.
Processing listing 5/5: https://chicago.craigslist.org/chc/ctd/d/arlington-heights-2019-chevrolet/7805166523.html
Retrieved details for listing 5.
[{'year': '2024', 'make_model': 'Biz On Wheels', 'images': ['https://images.craigslist.org/00k0k_6V83nSbEMFc_0cU09G_600x450.jpg', 'https://images.craigslist.org/00808_lOBHHc8PeI2_0cU09G_600x450.jpg', 'https://images.craigslist

### Clean and transform the data

In [None]:
# Function to clean and transform the data
def clean_and_transform_data(car_data):
    """Flatten scraped listings into a DataFrame with one row per image.

    Args:
        car_data: Iterable of dicts with keys ``'year'``, ``'make_model'``,
            ``'images'`` (list of URLs) and ``'url'``.

    Returns:
        A DataFrame with columns ``year``, ``make_model``, ``make``,
        ``model``, ``label``, ``image_url`` and ``listing_url``. The columns
        are present even when no rows survive the filtering. ``year`` is
        kept as a string. An empty make/model yields empty ``make`` and
        ``model`` strings instead of raising.
    """
    columns = ['year', 'make_model', 'make', 'model', 'label', 'image_url', 'listing_url']
    rows = []

    for car in car_data:
        # Skip listings with only one image since the single image usually
        # just says "details coming soon"
        if len(car['images']) <= 1:
            continue

        year = car['year'].strip()

        # Splitting on any whitespace run keeps make_model consistent with
        # make + model and avoids double underscores in the label.
        tokens = car['make_model'].strip().lower().split()
        make = tokens[0] if tokens else ''  # Assume the first word is the make
        model = '_'.join(tokens[1:])        # ...and the rest is the model
        make_model_label = '_'.join(tokens)

        label = f"{year}_{make_model_label}"  # Combines year and make_model

        # Split each image into a separate row
        for image_url in car['images']:
            rows.append({
                'year': year,  # Keeping year as a string
                'make_model': make_model_label,
                'make': make,
                'model': model,
                'label': label,
                'image_url': image_url,
                'listing_url': car['url']
            })

    # Explicit columns give an empty result the same schema as a populated one.
    df = pd.DataFrame(rows, columns=columns)
    print(f"Transformed data into {len(df)} rows.")
    return df

In [None]:
# Test clean and transform the data
# df = clean_and_transform_data(car_data)
# print(df.head(5))

#### Chicago

In [None]:
# Full Chicago scrape: up to 343 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_chicago = "https://chicago.craigslist.org/search/cta?bundleDuplicates=1&hasPic=1&min_auto_year=2013#search=1~gallery~0~0"
car_data_chicago = scrape_craigslist(base_url_chicago, max_listings=343, delay=2)
print(car_data_chicago[:5])

Found 343 listings. Processing up to 343.
Processing listing 1/343: https://chicago.craigslist.org/nwi/ctd/d/highland-2014-buick-lacrosse-leather/7805250709.html
Retrieved details for listing 1.
Processing listing 2/343: https://chicago.craigslist.org/wcl/cto/d/schiller-park-new-2023-ford-maverick/7805249697.html
Retrieved details for listing 2.
Processing listing 3/343: https://chicago.craigslist.org/chc/ctd/d/villa-park-all-credit-approved-buy-here/7805237960.html
Retrieved details for listing 3.
Processing listing 4/343: https://chicago.craigslist.org/chc/cto/d/chicago-hyundai-santa-fe/7805228802.html
Retrieved details for listing 4.
Processing listing 5/343: https://chicago.craigslist.org/nwc/ctd/d/eau-claire-500-horsepower-supercharged/7805209734.html
Retrieved details for listing 5.
Processing listing 6/343: https://chicago.craigslist.org/sox/ctd/d/oak-forest-2020-ford-expedition-max-xlt/7805209230.html
Retrieved details for listing 6.
Processing listing 7/343: https://chicago.cr

In [None]:
# Flatten the Chicago listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
chicago_car_df = clean_and_transform_data(car_data_chicago)

Transformed data into 1689 rows.


#### Milwaukee

In [None]:
# Smoke-test the scraper on the Milwaukee search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://milwaukee.craigslist.org/search/milwaukee-wi/cta?bundleDuplicates=1&hasPic=1&lat=43.07&lon=-87.975&min_auto_year=2013&search_distance=12#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 263 listings. Processing up to 5.
Processing listing 1/5: https://milwaukee.craigslist.org/cto/d/new-berlin-2018-honda-civic/7805256794.html
Retrieved details for listing 1.
Processing listing 2/5: https://milwaukee.craigslist.org/ctd/d/new-berlin-2018-ford-150-xlt-4x4/7805212555.html
Retrieved details for listing 2.
Processing listing 3/5: https://milwaukee.craigslist.org/ctd/d/new-berlin-2013-hyundai-tucson-gls-awd/7805212198.html
Retrieved details for listing 3.
Processing listing 4/5: https://milwaukee.craigslist.org/cto/d/hales-corners-2017-hyundai-elantra-se/7805187281.html
Retrieved details for listing 4.
Processing listing 5/5: https://milwaukee.craigslist.org/cto/d/greendale-jeep-compass-2015/7805131292.html
Retrieved details for listing 5.
[{'year': '2018', 'make_model': 'honda civic', 'images': ['https://images.craigslist.org/00c0c_4cW8Tqcp2fT_0t20CI_600x450.jpg', 'https://images.craigslist.org/00Z0Z_bdsEiD883ns_0t20CI_600x450.jpg', 'https://images.craigslist.org/00P0P

In [None]:
# Full Milwaukee scrape: up to 263 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_milwaukee = "https://milwaukee.craigslist.org/search/milwaukee-wi/cta?bundleDuplicates=1&hasPic=1&lat=43.07&lon=-87.975&min_auto_year=2013&search_distance=12#search=1~gallery~0~0"
car_data_milwaukee = scrape_craigslist(base_url_milwaukee, max_listings=263, delay=2)
print(car_data_milwaukee[:5])


Found 263 listings. Processing up to 263.
Processing listing 1/263: https://milwaukee.craigslist.org/cto/d/new-berlin-2018-honda-civic/7805256794.html
Retrieved details for listing 1.
Processing listing 2/263: https://milwaukee.craigslist.org/ctd/d/new-berlin-2018-ford-150-xlt-4x4/7805212555.html
Retrieved details for listing 2.
Processing listing 3/263: https://milwaukee.craigslist.org/ctd/d/new-berlin-2013-hyundai-tucson-gls-awd/7805212198.html
Retrieved details for listing 3.
Processing listing 4/263: https://milwaukee.craigslist.org/cto/d/hales-corners-2017-hyundai-elantra-se/7805187281.html
Retrieved details for listing 4.
Processing listing 5/263: https://milwaukee.craigslist.org/cto/d/greendale-jeep-compass-2015/7805131292.html
Retrieved details for listing 5.
Processing listing 6/263: https://milwaukee.craigslist.org/ctd/d/milwaukee-2017-chrysler-pacifica/7805041581.html
Retrieved details for listing 6.
Processing listing 7/263: https://milwaukee.craigslist.org/ctd/d/milwaukee-

In [None]:
# Flatten the Milwaukee listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
milwaukee_car_df = clean_and_transform_data(car_data_milwaukee)

Transformed data into 1269 rows.


#### Detroit

In [None]:
# Smoke-test the scraper on the Detroit search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://detroit.craigslist.org/search/detroit-mi/cta?bundleDuplicates=1&hasPic=1&lat=42.363&lon=-83.139&min_auto_year=2013&search_distance=13#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 338 listings. Processing up to 5.
Processing listing 1/5: https://detroit.craigslist.org/wyn/ctd/d/2016-chevy-traverse-awd-3rd-row-buy/7805261323.html
Retrieved details for listing 1.
Processing listing 2/5: https://detroit.craigslist.org/wyn/ctd/d/redford-2023-chevrolet-trailblazer-lt/7805259001.html
Retrieved details for listing 2.
Processing listing 3/5: https://detroit.craigslist.org/wyn/cto/d/dearborn-heights-2013-chevy-equinox-lt/7805255645.html
Retrieved details for listing 3.
Processing listing 4/5: https://detroit.craigslist.org/wyn/ctd/d/redford-2020-ford-explorer-xlt-4wd/7805246335.html
Retrieved details for listing 4.
Processing listing 5/5: https://detroit.craigslist.org/wyn/ctd/d/detroit-2022-chrysler-pacifica-touring/7805245387.html
Retrieved details for listing 5.
[{'year': '2016', 'make_model': 'chevy traverse lt', 'images': ['https://images.craigslist.org/00w0w_lo0CrLo7IWW_0CI0t2_600x450.jpg', 'https://images.craigslist.org/01414_70ikxziIgJC_0CI0t2_600x450.jpg',

In [None]:
# Full Detroit scrape: up to 338 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_detroit = "https://detroit.craigslist.org/search/detroit-mi/cta?bundleDuplicates=1&hasPic=1&lat=42.363&lon=-83.139&min_auto_year=2013&search_distance=13#search=1~gallery~0~0"
car_data_detroit = scrape_craigslist(base_url_detroit, max_listings=338, delay=2)
print(car_data_detroit[:5])

Found 338 listings. Processing up to 338.
Processing listing 1/338: https://detroit.craigslist.org/wyn/ctd/d/2016-chevy-traverse-awd-3rd-row-buy/7805261323.html
Retrieved details for listing 1.
Processing listing 2/338: https://detroit.craigslist.org/wyn/ctd/d/redford-2023-chevrolet-trailblazer-lt/7805259001.html
Retrieved details for listing 2.
Processing listing 3/338: https://detroit.craigslist.org/wyn/cto/d/dearborn-heights-2013-chevy-equinox-lt/7805255645.html
Retrieved details for listing 3.
Processing listing 4/338: https://detroit.craigslist.org/wyn/ctd/d/redford-2020-ford-explorer-xlt-4wd/7805246335.html
Retrieved details for listing 4.
Processing listing 5/338: https://detroit.craigslist.org/wyn/ctd/d/detroit-2022-chrysler-pacifica-touring/7805245387.html
Retrieved details for listing 5.
Processing listing 6/338: https://detroit.craigslist.org/wyn/ctd/d/detroit-2022-nissan-pathfinder-platinum/7805245183.html
Retrieved details for listing 6.
Processing listing 7/338: https://d

In [None]:
# Flatten the Detroit listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
detroit_car_df = clean_and_transform_data(car_data_detroit)


Transformed data into 1656 rows.


#### Cleveland

In [None]:
# Smoke-test the scraper on the Cleveland search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://cleveland.craigslist.org/search/cleveland-oh/cta?bundleDuplicates=1&hasPic=1&lat=41.467&lon=-81.649&min_auto_year=2013&search_distance=15#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 245 listings. Processing up to 5.
Processing listing 1/5: https://cleveland.craigslist.org/ctd/d/cleveland-2014-jeep-cherokee-latitude/7805261431.html
Retrieved details for listing 1.
Processing listing 2/5: https://cleveland.craigslist.org/ctd/d/cleveland-2016-ford-f250-4x4-super-cab/7805174315.html
Retrieved details for listing 2.
Processing listing 3/5: https://cleveland.craigslist.org/ctd/d/cleveland-2016-hyundai-sonata-se-low/7805160827.html
Retrieved details for listing 3.
Processing listing 4/5: https://toledo.craigslist.org/ctd/d/cleveland-2015-ram-1500-laramie-crew/7805156299.html
Retrieved details for listing 4.
Processing listing 5/5: https://cleveland.craigslist.org/cto/d/north-royalton-2015-white-ford-fusion/7805155580.html
Retrieved details for listing 5.
[{'year': '2014', 'make_model': 'Jeep Cherokee Latitude', 'images': ['https://images.craigslist.org/00202_gilglh9EaDo_0xZ0CI_600x450.jpg', 'https://images.craigslist.org/00Z0Z_74AOnKH6KsK_0CI0sS_600x450.jpg', 'http

In [None]:
# Full Cleveland scrape: up to 245 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_cleveland = "https://cleveland.craigslist.org/search/cleveland-oh/cta?bundleDuplicates=1&hasPic=1&lat=41.467&lon=-81.649&min_auto_year=2013&search_distance=15#search=1~gallery~0~0"
car_data_cleveland = scrape_craigslist(base_url_cleveland, max_listings=245, delay=2)
print(car_data_cleveland[:5])

Found 245 listings. Processing up to 245.
Processing listing 1/245: https://cleveland.craigslist.org/ctd/d/cleveland-2014-jeep-cherokee-latitude/7805261431.html
Retrieved details for listing 1.
Processing listing 2/245: https://cleveland.craigslist.org/ctd/d/cleveland-2016-ford-f250-4x4-super-cab/7805174315.html
Retrieved details for listing 2.
Processing listing 3/245: https://cleveland.craigslist.org/ctd/d/cleveland-2016-hyundai-sonata-se-low/7805160827.html
Retrieved details for listing 3.
Processing listing 4/245: https://toledo.craigslist.org/ctd/d/cleveland-2015-ram-1500-laramie-crew/7805156299.html
Retrieved details for listing 4.
Processing listing 5/245: https://cleveland.craigslist.org/cto/d/north-royalton-2015-white-ford-fusion/7805155580.html
Retrieved details for listing 5.
Processing listing 6/245: https://cleveland.craigslist.org/cto/d/broadview-heights-2013-ford-150-xlt-4wd/7805144567.html
Retrieved details for listing 6.
Processing listing 7/245: https://cleveland.crai

In [None]:
# Flatten the Cleveland listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
cleveland_car_df = clean_and_transform_data(car_data_cleveland)


Transformed data into 1186 rows.


#### Indianapolis

In [None]:
# Smoke-test the scraper on the Indianapolis search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://indianapolis.craigslist.org/search/indianapolis-in/cta?bundleDuplicates=1&hasPic=1&lat=39.792&lon=-86.133&min_auto_year=2013&search_distance=19#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 287 listings. Processing up to 5.
Processing listing 1/5: https://indianapolis.craigslist.org/ctd/d/indianapolis-2016-chevrolet-suburban/7805266965.html
Retrieved details for listing 1.
Processing listing 2/5: https://indianapolis.craigslist.org/cto/d/westfield-2021-nissan-versa/7805230412.html
Retrieved details for listing 2.
Processing listing 3/5: https://indianapolis.craigslist.org/cto/d/greenwood-2016-jeep-compass-latitude-4x4/7805194947.html
Retrieved details for listing 3.
Processing listing 4/5: https://indianapolis.craigslist.org/ctd/d/indianapolis-2015-hyundai-accent/7805163037.html
Retrieved details for listing 4.
Processing listing 5/5: https://indianapolis.craigslist.org/ctd/d/indianapolis-2015-toyota-highlander-xle/7805162872.html
Retrieved details for listing 5.
[{'year': '2016', 'make_model': 'Chevrolet Suburban LTZ', 'images': ['https://images.craigslist.org/00N0N_gEE1TIXKCuM_0CI0t2_600x450.jpg', 'https://images.craigslist.org/00C0C_hKCcg5SQT44_0CI0t2_600x450.jpg

In [None]:
# Full Indianapolis scrape: up to 287 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_indianapolis = "https://indianapolis.craigslist.org/search/indianapolis-in/cta?bundleDuplicates=1&hasPic=1&lat=39.792&lon=-86.133&min_auto_year=2013&search_distance=19#search=1~gallery~0~0"
car_data_indianapolis = scrape_craigslist(base_url_indianapolis, max_listings=287, delay=2)
print(car_data_indianapolis[:5])

Found 287 listings. Processing up to 287.
Processing listing 1/287: https://indianapolis.craigslist.org/ctd/d/indianapolis-2016-chevrolet-suburban/7805266965.html
Retrieved details for listing 1.
Processing listing 2/287: https://indianapolis.craigslist.org/cto/d/westfield-2021-nissan-versa/7805230412.html
Retrieved details for listing 2.
Processing listing 3/287: https://indianapolis.craigslist.org/cto/d/greenwood-2016-jeep-compass-latitude-4x4/7805194947.html
Retrieved details for listing 3.
Processing listing 4/287: https://indianapolis.craigslist.org/ctd/d/indianapolis-2015-hyundai-accent/7805163037.html
Retrieved details for listing 4.
Processing listing 5/287: https://indianapolis.craigslist.org/ctd/d/indianapolis-2015-toyota-highlander-xle/7805162872.html
Retrieved details for listing 5.
Processing listing 6/287: https://indianapolis.craigslist.org/ctd/d/indianapolis-2020-kia-telluride-sx/7805162725.html
Retrieved details for listing 6.
Processing listing 7/287: https://indianap

In [None]:
# Flatten the Indianapolis listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
indianapolis_car_df = clean_and_transform_data(car_data_indianapolis)


Transformed data into 1430 rows.


#### Boston

In [None]:
# Smoke-test the scraper on the Boston search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://boston.craigslist.org/search/boston-ma/cta?bundleDuplicates=1&hasPic=1&lat=42.331&lon=-71.063&min_auto_year=2013&search_distance=9.5#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 282 listings. Processing up to 5.
Processing listing 1/5: https://boston.craigslist.org/gbs/cto/d/everett-nissan-altima-2013/7805183110.html
Retrieved details for listing 1.
Processing listing 2/5: https://boston.craigslist.org/gbs/cto/d/everett-ford-focus-2015/7805177269.html
Retrieved details for listing 2.
Processing listing 3/5: https://boston.craigslist.org/gbs/cto/d/everett-chevrolet-cruze-lt-sedan-4d/7805174502.html
Retrieved details for listing 3.
Processing listing 4/5: https://boston.craigslist.org/gbs/cto/d/everett-2014-nissan-sentra/7805172154.html
Retrieved details for listing 4.
Processing listing 5/5: https://boston.craigslist.org/gbs/cto/d/revere-2015-ford-explorer-awd-suv/7805142783.html
Retrieved details for listing 5.
[{'year': '2013', 'make_model': 'nissan altima', 'images': ['https://images.craigslist.org/00d0d_8Y8xPrlAMHp_0jm0pO_600x450.jpg', 'https://images.craigslist.org/00N0N_8G7D5ie4RDA_0jm0pO_600x450.jpg', 'https://images.craigslist.org/00d0d_2teb0mMKRc

In [None]:
# Full Boston scrape: up to 282 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_boston = "https://boston.craigslist.org/search/boston-ma/cta?bundleDuplicates=1&hasPic=1&lat=42.331&lon=-71.063&min_auto_year=2013&search_distance=9.5#search=1~gallery~0~0"
car_data_boston = scrape_craigslist(base_url_boston, max_listings=282, delay=2)
print(car_data_boston[:5])


Found 282 listings. Processing up to 282.
Processing listing 1/282: https://boston.craigslist.org/gbs/cto/d/everett-nissan-altima-2013/7805183110.html
Retrieved details for listing 1.
Processing listing 2/282: https://boston.craigslist.org/gbs/cto/d/everett-ford-focus-2015/7805177269.html
Retrieved details for listing 2.
Processing listing 3/282: https://boston.craigslist.org/gbs/cto/d/everett-chevrolet-cruze-lt-sedan-4d/7805174502.html
Retrieved details for listing 3.
Processing listing 4/282: https://boston.craigslist.org/gbs/cto/d/everett-2014-nissan-sentra/7805172154.html
Retrieved details for listing 4.
Processing listing 5/282: https://boston.craigslist.org/gbs/cto/d/revere-2015-ford-explorer-awd-suv/7805142783.html
Retrieved details for listing 5.
Processing listing 6/282: https://boston.craigslist.org/gbs/cto/d/boston-2015-acura-mdx-advance-and/7805136324.html
Retrieved details for listing 6.
Processing listing 7/282: https://boston.craigslist.org/gbs/ctd/d/somerville-2016-mazd

In [None]:
# Flatten the Boston listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
boston_car_df = clean_and_transform_data(car_data_boston)

Transformed data into 1367 rows.


#### New York

In [None]:
# Smoke-test the scraper on the New York search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://newyork.craigslist.org/search/new-york-ny/cta?bundleDuplicates=1&hasPic=1&lat=40.7741&lon=-73.9661&min_auto_year=2013&search_distance=10#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 331 listings. Processing up to 5.
Processing listing 1/5: https://newyork.craigslist.org/que/ctd/d/flushing-2019-mini-countryman-john/7805202874.html
Retrieved details for listing 1.
Processing listing 2/5: https://newjersey.craigslist.org/ctd/d/jersey-city-2020-chevrolet-chevy-malibu/7805201141.html
Retrieved details for listing 2.
Processing listing 3/5: https://newjersey.craigslist.org/ctd/d/hackensack-si-tienes-itin-licencia/7805197627.html
Retrieved details for listing 3.
Processing listing 4/5: https://newyork.craigslist.org/que/ctd/d/ridgewood-2016-chevrolet-chevy-cruze-lt/7805195514.html
Retrieved details for listing 4.
Processing listing 5/5: https://newjersey.craigslist.org/ctd/d/hackensack-tenemos-el-auto-que-buscas/7805195318.html
Retrieved details for listing 5.
[{'year': '2019', 'make_model': 'Mini Countryman John Cooper', 'images': ['https://images.craigslist.org/01515_i0qVVl49nRw_0fe0bq_600x450.jpg', 'https://images.craigslist.org/01414_5rJCrPZq0lo_0fe0bq_600x450.

In [None]:
# Full New York scrape: up to 331 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_new_york = "https://newyork.craigslist.org/search/new-york-ny/cta?bundleDuplicates=1&hasPic=1&lat=40.7741&lon=-73.9661&min_auto_year=2013&search_distance=10#search=1~gallery~0~0"
car_data_new_york = scrape_craigslist(base_url_new_york, max_listings=331, delay=2)
print(car_data_new_york[:5])

Found 331 listings. Processing up to 331.
Processing listing 1/331: https://newyork.craigslist.org/que/ctd/d/flushing-2019-mini-countryman-john/7805202874.html
Retrieved details for listing 1.
Processing listing 2/331: https://newjersey.craigslist.org/ctd/d/jersey-city-2020-chevrolet-chevy-malibu/7805201141.html
Retrieved details for listing 2.
Processing listing 3/331: https://newjersey.craigslist.org/ctd/d/hackensack-si-tienes-itin-licencia/7805197627.html
Retrieved details for listing 3.
Processing listing 4/331: https://newyork.craigslist.org/que/ctd/d/ridgewood-2016-chevrolet-chevy-cruze-lt/7805195514.html
Retrieved details for listing 4.
Processing listing 5/331: https://newjersey.craigslist.org/ctd/d/hackensack-tenemos-el-auto-que-buscas/7805195318.html
Retrieved details for listing 5.
Processing listing 6/331: https://newyork.craigslist.org/mnh/cto/d/hackensack-refrigerated-trucks-and-vans/7805190415.html
Retrieved details for listing 6.
Processing listing 7/331: https://newyor

In [None]:
# Flatten the New York listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
new_york_car_df = clean_and_transform_data(car_data_new_york)


Transformed data into 1576 rows.


#### Los Angeles

In [None]:
# Smoke-test the scraper on the Los Angeles search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://losangeles.craigslist.org/search/los-angeles-ca/cta?bundleDuplicates=1&hasPic=1&lat=34.037&lon=-118.305&min_auto_year=2013&search_distance=14#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 346 listings. Processing up to 5.
Processing listing 1/5: https://losangeles.craigslist.org/lac/cto/d/los-angeles-2014-hino-268-class-for-sale/7805209984.html
Retrieved details for listing 1.
Processing listing 2/5: https://losangeles.craigslist.org/wst/cto/d/santa-monica-tesla-model-performance-awd/7805193239.html
Retrieved details for listing 2.
Processing listing 3/5: https://losangeles.craigslist.org/wst/ctd/d/hermosa-beach-2023-mercedes-benz-300/7805189783.html
Retrieved details for listing 3.
Processing listing 4/5: https://losangeles.craigslist.org/wst/cto/d/lynwood-2015-chevrolet-camaro/7805187732.html
Retrieved details for listing 4.
Processing listing 5/5: https://inlandempire.craigslist.org/ctd/d/los-angeles-2021-chevy-chevrolet-spark/7805170792.html
Retrieved details for listing 5.
[{'year': '2014', 'make_model': "Hino 268 Diesel with 26' box + lift", 'images': ['https://images.craigslist.org/00C0C_kMUxtxNe1Ev_0lM0t2_600x450.jpg', 'https://images.craigslist.org/00H0H_

In [None]:
# Full Los Angeles scrape: up to 346 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_LA = "https://losangeles.craigslist.org/search/los-angeles-ca/cta?bundleDuplicates=1&hasPic=1&lat=34.037&lon=-118.305&min_auto_year=2013&search_distance=14#search=1~gallery~0~0"
car_data_LA = scrape_craigslist(base_url_LA, max_listings=346, delay=2)
print(car_data_LA[:5])

Found 346 listings. Processing up to 346.
Processing listing 1/346: https://losangeles.craigslist.org/lac/cto/d/los-angeles-2014-hino-268-class-for-sale/7805209984.html
Retrieved details for listing 1.
Processing listing 2/346: https://losangeles.craigslist.org/wst/cto/d/santa-monica-tesla-model-performance-awd/7805193239.html
Retrieved details for listing 2.
Processing listing 3/346: https://losangeles.craigslist.org/wst/ctd/d/hermosa-beach-2023-mercedes-benz-300/7805189783.html
Retrieved details for listing 3.
Processing listing 4/346: https://losangeles.craigslist.org/wst/cto/d/lynwood-2015-chevrolet-camaro/7805187732.html
Retrieved details for listing 4.
Processing listing 5/346: https://inlandempire.craigslist.org/ctd/d/los-angeles-2021-chevy-chevrolet-spark/7805170792.html
Retrieved details for listing 5.
Processing listing 6/346: https://losangeles.craigslist.org/lac/ctd/d/los-angeles-2015-chevy-chevrolet/7805170693.html
Retrieved details for listing 6.
Processing listing 7/346:

In [None]:
# Flatten the Los Angeles listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
LA_car_df = clean_and_transform_data(car_data_LA)


Transformed data into 1693 rows.


#### Seattle

In [None]:
# Smoke-test the scraper on the Seattle search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://seattle.craigslist.org/search/seattle-wa/cta?bundleDuplicates=1&hasPic=1&lat=47.606&lon=-122.332&min_auto_year=2013&search_distance=30#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 359 listings. Processing up to 5.
Processing listing 1/5: https://seattle.craigslist.org/est/ctd/d/kirkland-2024-subaru-crosstrek-awd-all/7805218175.html
Retrieved details for listing 1.
Processing listing 2/5: https://seattle.craigslist.org/skc/ctd/d/kent-2013-ford-taurus-awd-all-wheel/7805217936.html
Retrieved details for listing 2.
Processing listing 3/5: https://seattle.craigslist.org/tac/ctd/d/renton-2021-tesla-model-awd-all-wheel/7805217465.html
Retrieved details for listing 3.
Processing listing 4/5: https://seattle.craigslist.org/sno/ctd/d/everett-2014-chevrolet-silverado-x4-4wd/7805216312.html
Retrieved details for listing 4.
Processing listing 5/5: https://seattle.craigslist.org/tac/ctd/d/tacoma-lifted-2019-toyota-tundra-trd/7805215559.html
Retrieved details for listing 5.
[{'year': '2024', 'make_model': 'Subaru Crosstrek Premium', 'images': ['https://images.craigslist.org/00r0r_e5pZCbjlMfF_0gw0co_600x450.jpg', 'https://images.craigslist.org/00M0M_aNaSkLabtsC_0gw0co_600

In [None]:
# Full Seattle scrape: up to 359 listings (the count the search reported when this was run),
# 2 s pause between requests; then preview the first five records.
base_url_seattle = "https://seattle.craigslist.org/search/seattle-wa/cta?bundleDuplicates=1&hasPic=1&lat=47.606&lon=-122.332&min_auto_year=2013&search_distance=30#search=1~gallery~0~0"
car_data_seattle = scrape_craigslist(base_url_seattle, max_listings=359, delay=2)
print(car_data_seattle[:5])

Found 359 listings. Processing up to 359.
Processing listing 1/359: https://seattle.craigslist.org/est/ctd/d/kirkland-2024-subaru-crosstrek-awd-all/7805218175.html
Retrieved details for listing 1.
Processing listing 2/359: https://seattle.craigslist.org/skc/ctd/d/kent-2013-ford-taurus-awd-all-wheel/7805217936.html
Retrieved details for listing 2.
Processing listing 3/359: https://seattle.craigslist.org/tac/ctd/d/renton-2021-tesla-model-awd-all-wheel/7805217465.html
Retrieved details for listing 3.
Processing listing 4/359: https://seattle.craigslist.org/sno/ctd/d/everett-2014-chevrolet-silverado-x4-4wd/7805216312.html
Retrieved details for listing 4.
Processing listing 5/359: https://seattle.craigslist.org/tac/ctd/d/tacoma-lifted-2019-toyota-tundra-trd/7805215559.html
Retrieved details for listing 5.
Processing listing 6/359: https://seattle.craigslist.org/tac/ctd/d/tacoma-2015-ram-x4-4wd-dodge-longhorn/7805213697.html
Retrieved details for listing 6.
Processing listing 7/359: https://

In [None]:
# Flatten the Seattle listings into one row per image
# (listings with one image or none are dropped by clean_and_transform_data).
seattle_car_df = clean_and_transform_data(car_data_seattle)


Transformed data into 1754 rows.


#### Phoenix

In [None]:
# Smoke-test the scraper on the Phoenix search URL (first 5 listings only).
# Note: base_url and car_data are scratch names reused by every per-city test cell.
base_url = "https://phoenix.craigslist.org/search/phoenix-az/cta?bundleDuplicates=1&hasPic=1&lat=33.531&lon=-112.078&min_auto_year=2013&search_distance=26#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 344 listings. Processing up to 5.
Processing listing 1/5: https://phoenix.craigslist.org/nph/cto/d/phoenix-23-toyota-prius-limited/7805223179.html
Retrieved details for listing 1.
Processing listing 2/5: https://phoenix.craigslist.org/nph/cto/d/phoenix-2018-chevy-equinox-lt-owner/7805222962.html
Retrieved details for listing 2.
Processing listing 3/5: https://phoenix.craigslist.org/cph/ctd/d/phoenix-2016-cadillac-escalade-4x4-4wd/7805221987.html
Retrieved details for listing 3.
Processing listing 4/5: https://phoenix.craigslist.org/evl/ctd/d/scottsdale-2020-ram-1500-rho-fresh-off/7805221898.html
Retrieved details for listing 4.
Processing listing 5/5: https://phoenix.craigslist.org/wvl/cto/d/phoenix-2015-ram-1500-laramie/7805220521.html
Retrieved details for listing 5.
[{'year': '2023', 'make_model': 'Toyota', 'images': ['https://images.craigslist.org/00q0q_9HqoCDQiqJw_0CI0t2_600x450.jpg', 'https://images.craigslist.org/00e0e_9sknG2MeYHl_0t20CI_600x450.jpg'], 'url': 'https://phoe

In [None]:
# Full Phoenix scrape. max_listings=344 is hard-coded to the "Found 344 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_phoenix = "https://phoenix.craigslist.org/search/phoenix-az/cta?bundleDuplicates=1&hasPic=1&lat=33.531&lon=-112.078&min_auto_year=2013&search_distance=26#search=1~gallery~0~0"
car_data_phoenix = scrape_craigslist(base_url_phoenix, max_listings=344, delay=2)
print(car_data_phoenix[:5])

Found 344 listings. Processing up to 344.
Processing listing 1/344: https://phoenix.craigslist.org/nph/cto/d/phoenix-23-toyota-prius-limited/7805223179.html
Retrieved details for listing 1.
Processing listing 2/344: https://phoenix.craigslist.org/nph/cto/d/phoenix-2018-chevy-equinox-lt-owner/7805222962.html
Retrieved details for listing 2.
Processing listing 3/344: https://phoenix.craigslist.org/cph/ctd/d/phoenix-2016-cadillac-escalade-4x4-4wd/7805221987.html
Retrieved details for listing 3.
Processing listing 4/344: https://phoenix.craigslist.org/evl/ctd/d/scottsdale-2020-ram-1500-rho-fresh-off/7805221898.html
Retrieved details for listing 4.
Processing listing 5/344: https://phoenix.craigslist.org/wvl/cto/d/phoenix-2015-ram-1500-laramie/7805220521.html
Retrieved details for listing 5.
Processing listing 6/344: https://phoenix.craigslist.org/nph/ctd/d/phoenix-2014-honda-accord-sedan-ex-sedan/7805220313.html
Retrieved details for listing 6.
Processing listing 7/344: https://phoenix.cra

In [None]:
# Clean the raw Phoenix scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
phoenix_car_df = clean_and_transform_data(car_data_phoenix)


Transformed data into 1698 rows.


#### Dallas

In [None]:
# Smoke test for Dallas: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://dallas.craigslist.org/search/dallas-tx/cta?bundleDuplicates=1&hasPic=1&lat=32.789&lon=-96.796&min_auto_year=2013&search_distance=23#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 342 listings. Processing up to 5.
Processing listing 1/5: https://dallas.craigslist.org/dal/ctd/d/dallas-2014-toyota-tacoma-4wd-double/7805229311.html
Retrieved details for listing 1.
Processing listing 2/5: https://dallas.craigslist.org/dal/ctd/d/dallas-2021-chevrolet-express-cargo-van/7805228233.html
Retrieved details for listing 2.
Processing listing 3/5: https://dallas.craigslist.org/dal/ctd/d/dallas-2016-chevrolet-express-cargo-van/7805227556.html
Retrieved details for listing 3.
Processing listing 4/5: https://dallas.craigslist.org/dal/ctd/d/dallas-2016-chevrolet-express-cargo-van/7805227220.html
Retrieved details for listing 4.
Processing listing 5/5: https://dallas.craigslist.org/ftw/ctd/d/arlington-tx-owner-2020-kia-soul-line/7805227203.html
Retrieved details for listing 5.
[{'year': '2014', 'make_model': 'Toyota Tacoma', 'images': ['https://images.craigslist.org/00W0W_j8msg8d8cqj_0cU09G_600x450.jpg', 'https://images.craigslist.org/00D0D_foibwCaXCP_0cU09G_600x450.jpg', '

In [None]:
# Full Dallas scrape. max_listings=342 is hard-coded to the "Found 342 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_dallas = "https://dallas.craigslist.org/search/dallas-tx/cta?bundleDuplicates=1&hasPic=1&lat=32.789&lon=-96.796&min_auto_year=2013&search_distance=23#search=1~gallery~0~0"
car_data_dallas = scrape_craigslist(base_url_dallas, max_listings=342, delay=2)
print(car_data_dallas[:5])

Found 342 listings. Processing up to 342.
Processing listing 1/342: https://dallas.craigslist.org/dal/ctd/d/dallas-2014-toyota-tacoma-4wd-double/7805229311.html
Retrieved details for listing 1.
Processing listing 2/342: https://dallas.craigslist.org/dal/ctd/d/dallas-2021-chevrolet-express-cargo-van/7805228233.html
Retrieved details for listing 2.
Processing listing 3/342: https://dallas.craigslist.org/dal/ctd/d/dallas-2016-chevrolet-express-cargo-van/7805227556.html
Retrieved details for listing 3.
Processing listing 4/342: https://dallas.craigslist.org/dal/ctd/d/dallas-2016-chevrolet-express-cargo-van/7805227220.html
Retrieved details for listing 4.
Processing listing 5/342: https://dallas.craigslist.org/ftw/ctd/d/arlington-tx-owner-2020-kia-soul-line/7805227203.html
Retrieved details for listing 5.
Processing listing 6/342: https://dallas.craigslist.org/dal/ctd/d/dallas-2018-lincoln-navigator-4x2/7805226599.html
Retrieved details for listing 6.
Processing listing 7/342: https://dalla

In [None]:
# Clean the raw Dallas scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
dallas_car_df = clean_and_transform_data(car_data_dallas)


Transformed data into 1703 rows.


#### Denver

In [None]:
# Smoke test for Denver: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://denver.craigslist.org/search/denver-co/cta?bundleDuplicates=1&hasPic=1&lat=39.763&lon=-104.957&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 325 listings. Processing up to 5.
Processing listing 1/5: https://denver.craigslist.org/ctd/d/parker-2022-subaru-outback-awd-all/7805234598.html
Retrieved details for listing 1.
Processing listing 2/5: https://cosprings.craigslist.org/ctd/d/denver-2017-mini-clubman-cooper-all4/7805234466.html
Retrieved details for listing 2.
Processing listing 3/5: https://denver.craigslist.org/ctd/d/denver-2022-ford-maverick-awd-all-wheel/7805233194.html
Retrieved details for listing 3.
Processing listing 4/5: https://denver.craigslist.org/ctd/d/denver-2019-ram-x4-4wd-truck-dodge-big/7805233112.html
Retrieved details for listing 4.
Processing listing 5/5: https://denver.craigslist.org/ctd/d/lafayette-2020-bmw-m2-cs-alpine-white/7805232862.html
Retrieved details for listing 5.
[{'year': '2022', 'make_model': 'Subaru Outback Limited', 'images': ['https://images.craigslist.org/00m0m_6YJV0iRMcR9_0gw0co_600x450.jpg', 'https://images.craigslist.org/00K0K_1vHSJHziO1n_0gw0co_600x450.jpg', 'https://image

In [None]:
# Full Denver scrape. max_listings=325 is hard-coded to the "Found 325 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_denver = "https://denver.craigslist.org/search/denver-co/cta?bundleDuplicates=1&hasPic=1&lat=39.763&lon=-104.957&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data_denver = scrape_craigslist(base_url_denver, max_listings=325, delay=2)
print(car_data_denver[:5])

Found 325 listings. Processing up to 325.
Processing listing 1/325: https://denver.craigslist.org/ctd/d/parker-2022-subaru-outback-awd-all/7805234598.html
Retrieved details for listing 1.
Processing listing 2/325: https://cosprings.craigslist.org/ctd/d/denver-2017-mini-clubman-cooper-all4/7805234466.html
Retrieved details for listing 2.
Processing listing 3/325: https://denver.craigslist.org/ctd/d/denver-2022-ford-maverick-awd-all-wheel/7805233194.html
Retrieved details for listing 3.
Processing listing 4/325: https://denver.craigslist.org/ctd/d/denver-2019-ram-x4-4wd-truck-dodge-big/7805233112.html
Retrieved details for listing 4.
Processing listing 5/325: https://denver.craigslist.org/ctd/d/lafayette-2020-bmw-m2-cs-alpine-white/7805232862.html
Retrieved details for listing 5.
Processing listing 6/325: https://denver.craigslist.org/cto/d/littleton-2017-chevrolet-silverado-1500/7805232689.html
Retrieved details for listing 6.
Processing listing 7/325: https://denver.craigslist.org/ctd/

In [None]:
# Clean the raw Denver scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
denver_car_df = clean_and_transform_data(car_data_denver)


Transformed data into 1580 rows.


#### Miami

In [None]:
# Smoke test for Miami: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://miami.craigslist.org/search/miami-fl/cta?bundleDuplicates=1&hasPic=1&lat=25.73&lon=-80.529&min_auto_year=2013&search_distance=31#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 336 listings. Processing up to 5.
Processing listing 1/5: https://miami.craigslist.org/brw/cto/d/hollywood-bmw-528i/7805241078.html
Retrieved details for listing 1.
Processing listing 2/5: https://miami.craigslist.org/mdc/ctd/d/miami-2017-mazda-sport-sedan-46k-miles/7805240858.html
Retrieved details for listing 2.
Processing listing 3/5: https://miami.craigslist.org/brw/ctd/d/hollywood-2020-kia-forte-fe/7805240662.html
Retrieved details for listing 3.
Processing listing 4/5: https://miami.craigslist.org/brw/ctd/d/miami-dodge-challenger-sxt-coupe-2d/7805237389.html
Retrieved details for listing 4.
Processing listing 5/5: https://miami.craigslist.org/mdc/cto/d/miami-2020-bmw-z4-blue-white-low-miles/7805222115.html
Retrieved details for listing 5.
[{'year': '2016', 'make_model': 'bmw 528i', 'images': ['https://images.craigslist.org/00202_8gP9KbptTZL_0lM0t2_600x450.jpg', 'https://images.craigslist.org/00n0n_kOrUw1pj0xt_0lM0t2_600x450.jpg', 'https://images.craigslist.org/01515_eTPLdKq

In [None]:
# Full Miami scrape. max_listings=336 is hard-coded to the "Found 336 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_miami = "https://miami.craigslist.org/search/miami-fl/cta?bundleDuplicates=1&hasPic=1&lat=25.73&lon=-80.529&min_auto_year=2013&search_distance=31#search=1~gallery~0~0"
car_data_miami = scrape_craigslist(base_url_miami, max_listings=336, delay=2)
print(car_data_miami[:5])

Found 336 listings. Processing up to 336.
Processing listing 1/336: https://miami.craigslist.org/brw/cto/d/hollywood-bmw-528i/7805241078.html
Retrieved details for listing 1.
Processing listing 2/336: https://miami.craigslist.org/mdc/ctd/d/miami-2017-mazda-sport-sedan-46k-miles/7805240858.html
Retrieved details for listing 2.
Processing listing 3/336: https://miami.craigslist.org/brw/ctd/d/hollywood-2020-kia-forte-fe/7805240662.html
Retrieved details for listing 3.
Processing listing 4/336: https://miami.craigslist.org/brw/ctd/d/miami-dodge-challenger-sxt-coupe-2d/7805237389.html
Retrieved details for listing 4.
Processing listing 5/336: https://miami.craigslist.org/mdc/cto/d/miami-2020-bmw-z4-blue-white-low-miles/7805222115.html
Retrieved details for listing 5.
Processing listing 6/336: https://miami.craigslist.org/brw/cto/d/hollywood-moving-company-affordable/7805208921.html
Retrieved details for listing 6.
Processing listing 7/336: https://miami.craigslist.org/brw/cto/d/fort-lauderd

In [None]:
# Clean the raw Miami scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
miami_car_df = clean_and_transform_data(car_data_miami)


Transformed data into 1660 rows.


#### San Francisco

In [None]:
# Smoke test for San Francisco: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://sfbay.craigslist.org/search/san-francisco-ca/cta?bundleDuplicates=1&hasPic=1&lat=37.742&lon=-122.436&min_auto_year=2013&search_distance=8.3#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 287 listings. Processing up to 5.
Processing listing 1/5: https://sfbay.craigslist.org/eby/ctd/d/daly-city-2015-dodge-durango-limited/7805269273.html
Retrieved details for listing 1.
Processing listing 2/5: https://sfbay.craigslist.org/eby/ctd/d/daly-city-2016-chevrolet-tahoe-lt-sport/7805247862.html
Retrieved details for listing 2.
Processing listing 3/5: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2021-jeep-wrangler-4xe/7805241857.html
Retrieved details for listing 3.
Processing listing 4/5: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2022-subaru-ascent-awd/7805241086.html
Retrieved details for listing 4.
Processing listing 5/5: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2018-honda-odyssey/7805240387.html
Retrieved details for listing 5.
[{'year': '2015', 'make_model': 'Dodge Durango Limited Sport', 'images': ['https://images.craigslist.org/00c0c_6wdvGrseHH6_0fe0bq_600x450.jpg', 'https://images.craigslist.org/01515_cFEyJkzyd4V_0fe0bq_600x450.jpg', 'h

In [None]:
# Full San Francisco scrape. max_listings=287 is hard-coded to the "Found 287 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_SF = "https://sfbay.craigslist.org/search/san-francisco-ca/cta?bundleDuplicates=1&hasPic=1&lat=37.742&lon=-122.436&min_auto_year=2013&search_distance=8.3#search=1~gallery~0~0"
car_data_SF = scrape_craigslist(base_url_SF, max_listings=287, delay=2)
print(car_data_SF[:5])

Found 287 listings. Processing up to 287.
Processing listing 1/287: https://sfbay.craigslist.org/eby/ctd/d/daly-city-2015-dodge-durango-limited/7805269273.html
Retrieved details for listing 1.
Processing listing 2/287: https://sfbay.craigslist.org/eby/ctd/d/daly-city-2016-chevrolet-tahoe-lt-sport/7805247862.html
Retrieved details for listing 2.
Processing listing 3/287: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2021-jeep-wrangler-4xe/7805241857.html
Retrieved details for listing 3.
Processing listing 4/287: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2022-subaru-ascent-awd/7805241086.html
Retrieved details for listing 4.
Processing listing 5/287: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2018-honda-odyssey/7805240387.html
Retrieved details for listing 5.
Processing listing 6/287: https://sfbay.craigslist.org/sfc/ctd/d/san-francisco-2022-mini-countryman-awd/7805240354.html
Retrieved details for listing 6.
Processing listing 7/287: https://sfbay.craigslist.

In [None]:
# Clean the raw San Francisco scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
SF_car_df = clean_and_transform_data(car_data_SF)


Transformed data into 1358 rows.


#### Washington DC

In [None]:
# Smoke test for Washington DC: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://washingtondc.craigslist.org/search/washington-dc/cta?bundleDuplicates=1&hasPic=1&lat=38.905&lon=-77.011&min_auto_year=2013&search_distance=7.8#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 282 listings. Processing up to 5.
Processing listing 1/5: https://washingtondc.craigslist.org/mld/cto/d/temple-hills-2013-bmw-550i-xdrive-door/7805278092.html
Retrieved details for listing 1.
Processing listing 2/5: https://washingtondc.craigslist.org/mld/cto/d/temple-hills-2015-buick-enclave-awd/7805251520.html
Retrieved details for listing 2.
Processing listing 3/5: https://washingtondc.craigslist.org/nva/cto/d/alexandria-rental-cars-available/7805151801.html
Retrieved details for listing 3.
Processing listing 4/5: https://washingtondc.craigslist.org/nva/ctd/d/alexandria-2016-jeep-compass-latitude/7805021008.html
Retrieved details for listing 4.
Processing listing 5/5: https://washingtondc.craigslist.org/nva/cto/d/alexandria-2015-ford-edge-sel-sport/7805011449.html
Retrieved details for listing 5.
[{'year': '2013', 'make_model': 'bmw 550i xdrive', 'images': ['https://images.craigslist.org/00M0M_84K1SLywFcF_0CI0t2_600x450.jpg', 'https://images.craigslist.org/00t0t_hyWvu4Rnv19_0C

In [None]:
# Full Washington DC scrape. max_listings=282 is hard-coded to the "Found 282 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_DC = "https://washingtondc.craigslist.org/search/washington-dc/cta?bundleDuplicates=1&hasPic=1&lat=38.905&lon=-77.011&min_auto_year=2013&search_distance=7.8#search=1~gallery~0~0"
car_data_DC = scrape_craigslist(base_url_DC, max_listings=282, delay=2)
print(car_data_DC[:5])

Found 282 listings. Processing up to 282.
Processing listing 1/282: https://washingtondc.craigslist.org/mld/cto/d/temple-hills-2013-bmw-550i-xdrive-door/7805278092.html
Retrieved details for listing 1.
Processing listing 2/282: https://washingtondc.craigslist.org/mld/cto/d/temple-hills-2015-buick-enclave-awd/7805251520.html
Retrieved details for listing 2.
Processing listing 3/282: https://washingtondc.craigslist.org/nva/cto/d/alexandria-rental-cars-available/7805151801.html
Retrieved details for listing 3.
Processing listing 4/282: https://washingtondc.craigslist.org/nva/ctd/d/alexandria-2016-jeep-compass-latitude/7805021008.html
Retrieved details for listing 4.
Processing listing 5/282: https://washingtondc.craigslist.org/nva/cto/d/alexandria-2015-ford-edge-sel-sport/7805011449.html
Retrieved details for listing 5.
Processing listing 6/282: https://washingtondc.craigslist.org/doc/cto/d/washington-2023-vw-gti-se/7804982040.html
Retrieved details for listing 6.
Processing listing 7/282

In [None]:
# Clean the raw Washington DC scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
DC_car_df = clean_and_transform_data(car_data_DC)


Transformed data into 1376 rows.


#### Philadelphia

In [None]:
# Smoke test for Philadelphia: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://philadelphia.craigslist.org/search/philadelphia-pa/cta?bundleDuplicates=1&hasPic=1&lat=40.013&lon=-75.132&min_auto_year=2013&search_distance=14#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 286 listings. Processing up to 5.
Processing listing 1/5: https://philadelphia.craigslist.org/ctd/d/philadelphia-2015-nissan-nv-200-only/7805284871.html
Retrieved details for listing 1.
Processing listing 2/5: https://philadelphia.craigslist.org/ctd/d/philadelphia-2018-mercedes-sprinter/7805283611.html
Retrieved details for listing 2.
Processing listing 3/5: https://southjersey.craigslist.org/ctd/d/palmyra-2014-mercedes-sprinter-3500/7805283388.html
Retrieved details for listing 3.
Processing listing 4/5: https://southjersey.craigslist.org/ctd/d/palmyra-2022-tesla-model/7805278760.html
Retrieved details for listing 4.
Processing listing 5/5: https://philadelphia.craigslist.org/ctd/d/philadelphia-2015-honda-civic-sedan-4d/7805278640.html
Retrieved details for listing 5.
[{'year': '2015', 'make_model': 'nissan nv 200 sv', 'images': ['https://images.craigslist.org/00D0D_EtYCsK7Lnd_0CI0t2_600x450.jpg', 'https://images.craigslist.org/00X0X_iU7FfBykNVI_0CI0t2_600x450.jpg', 'https://ima

In [None]:
# Full Philadelphia scrape. max_listings=286 is hard-coded to the "Found 286 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_philadelphia = "https://philadelphia.craigslist.org/search/philadelphia-pa/cta?bundleDuplicates=1&hasPic=1&lat=40.013&lon=-75.132&min_auto_year=2013&search_distance=14#search=1~gallery~0~0"
car_data_philadelphia = scrape_craigslist(base_url_philadelphia, max_listings=286, delay=2)
print(car_data_philadelphia[:5])

Found 286 listings. Processing up to 286.
Processing listing 1/286: https://philadelphia.craigslist.org/ctd/d/philadelphia-2015-nissan-nv-200-only/7805284871.html
Retrieved details for listing 1.
Processing listing 2/286: https://philadelphia.craigslist.org/ctd/d/philadelphia-2018-mercedes-sprinter/7805283611.html
Retrieved details for listing 2.
Processing listing 3/286: https://southjersey.craigslist.org/ctd/d/palmyra-2014-mercedes-sprinter-3500/7805283388.html
Retrieved details for listing 3.
Processing listing 4/286: https://southjersey.craigslist.org/ctd/d/palmyra-2022-tesla-model/7805278760.html
Retrieved details for listing 4.
Processing listing 5/286: https://philadelphia.craigslist.org/ctd/d/philadelphia-2015-honda-civic-sedan-4d/7805278640.html
Retrieved details for listing 5.
Processing listing 6/286: https://philadelphia.craigslist.org/ctd/d/philadelphia-2015-mercedes-sprinter/7805277108.html
Retrieved details for listing 6.
Processing listing 7/286: https://philadelphia.cr

In [None]:
# Clean the raw Philadelphia scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
philadelphia_car_df = clean_and_transform_data(car_data_philadelphia)


Transformed data into 1409 rows.


#### Charlotte

In [None]:
# Smoke test for Charlotte: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://charlotte.craigslist.org/search/charlotte-nc/cta?bundleDuplicates=1&hasPic=1&lat=35.209&lon=-80.826&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 176 listings. Processing up to 5.
Processing listing 1/5: https://charlotte.craigslist.org/ctd/d/charlotte-2019-ram-promaster-city-cargo/7805282365.html
Retrieved details for listing 1.
Processing listing 2/5: https://charlotte.craigslist.org/cto/d/charlotte-2018-honda-accord-ex/7805271139.html
Retrieved details for listing 2.
Processing listing 3/5: https://charlotte.craigslist.org/cto/d/charlotte-2022-ram-2500-big-horn-64l-4x4/7805264195.html
Retrieved details for listing 3.
Processing listing 4/5: https://charlotte.craigslist.org/cto/d/paw-creek-2014-nissan-altima/7805259084.html
Retrieved details for listing 4.
Processing listing 5/5: https://charlotte.craigslist.org/cto/d/charlotte-mercedes-benz-gle-63s-amg/7805250971.html
Retrieved details for listing 5.
[{'year': '2019', 'make_model': 'Ram Promaster City', 'images': ['https://images.craigslist.org/00Z0Z_5uh5SeEvY3_0CI0t2_600x450.jpg', 'https://images.craigslist.org/00L0L_aVdr7hXo6J3_0CI0t2_600x450.jpg', 'https://images.cra

In [None]:
# Full Charlotte scrape. max_listings=176 is hard-coded to the "Found 176 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_charlotte = "https://charlotte.craigslist.org/search/charlotte-nc/cta?bundleDuplicates=1&hasPic=1&lat=35.209&lon=-80.826&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data_charlotte = scrape_craigslist(base_url_charlotte, max_listings=176, delay=2)
print(car_data_charlotte[:5])

Found 176 listings. Processing up to 176.
Processing listing 1/176: https://charlotte.craigslist.org/ctd/d/charlotte-2019-ram-promaster-city-cargo/7805282365.html
Retrieved details for listing 1.
Processing listing 2/176: https://charlotte.craigslist.org/cto/d/charlotte-2018-honda-accord-ex/7805271139.html
Retrieved details for listing 2.
Processing listing 3/176: https://charlotte.craigslist.org/cto/d/charlotte-2022-ram-2500-big-horn-64l-4x4/7805264195.html
Retrieved details for listing 3.
Processing listing 4/176: https://charlotte.craigslist.org/cto/d/paw-creek-2014-nissan-altima/7805259084.html
Retrieved details for listing 4.
Processing listing 5/176: https://charlotte.craigslist.org/cto/d/charlotte-mercedes-benz-gle-63s-amg/7805250971.html
Retrieved details for listing 5.
Processing listing 6/176: https://charlotte.craigslist.org/cto/d/gastonia-2013-cadilac-escalade-platinum/7805239074.html
Retrieved details for listing 6.
Processing listing 7/176: https://charlotte.craigslist.or

In [None]:
# Clean the raw Charlotte scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
charlotte_car_df = clean_and_transform_data(car_data_charlotte)


Transformed data into 872 rows.


#### Houston

In [None]:
# Smoke test for Houston: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://houston.craigslist.org/search/houston-tx/cta?bundleDuplicates=1&hasPic=1&lat=29.795&lon=-95.416&min_auto_year=2013&search_distance=28#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 353 listings. Processing up to 5.
Processing listing 1/5: https://houston.craigslist.org/ctd/d/houston-2023-ford-bronco-wildtrak-4x4/7805286328.html
Retrieved details for listing 1.
Processing listing 2/5: https://houston.craigslist.org/ctd/d/houston-2023-honda-hr-sport-ez-deals/7805286258.html
Retrieved details for listing 2.
Processing listing 3/5: https://dallas.craigslist.org/dal/ctd/d/houston-2018-ford-150-f150-150-stx-4x2/7805281665.html
Retrieved details for listing 3.
Processing listing 4/5: https://houston.craigslist.org/ctd/d/houston-2018-ford-150-f150-150-stx-4x2/7805280125.html
Retrieved details for listing 4.
Processing listing 5/5: https://houston.craigslist.org/ctd/d/houston-2021-lincoln-corsair-reserve/7805280041.html
Retrieved details for listing 5.
[{'year': '2023', 'make_model': 'FORD BRONCO', 'images': ['https://images.craigslist.org/00o0o_R7HQYflbB_0jm0ew_600x450.jpg', 'https://images.craigslist.org/00H0H_5R0QAXro7nu_0jm0ew_600x450.jpg', 'https://images.craig

In [None]:
# Full Houston scrape. max_listings=353 is hard-coded to the "Found 353 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_houston = "https://houston.craigslist.org/search/houston-tx/cta?bundleDuplicates=1&hasPic=1&lat=29.795&lon=-95.416&min_auto_year=2013&search_distance=28#search=1~gallery~0~0"
car_data_houston = scrape_craigslist(base_url_houston, max_listings=353, delay=2)
print(car_data_houston[:5])

Found 353 listings. Processing up to 353.
Processing listing 1/353: https://houston.craigslist.org/ctd/d/houston-2023-ford-bronco-wildtrak-4x4/7805286328.html
Retrieved details for listing 1.
Processing listing 2/353: https://houston.craigslist.org/ctd/d/houston-2023-honda-hr-sport-ez-deals/7805286258.html
Retrieved details for listing 2.
Processing listing 3/353: https://dallas.craigslist.org/dal/ctd/d/houston-2018-ford-150-f150-150-stx-4x2/7805281665.html
Retrieved details for listing 3.
Processing listing 4/353: https://houston.craigslist.org/ctd/d/houston-2018-ford-150-f150-150-stx-4x2/7805280125.html
Retrieved details for listing 4.
Processing listing 5/353: https://houston.craigslist.org/ctd/d/houston-2021-lincoln-corsair-reserve/7805280041.html
Retrieved details for listing 5.
Processing listing 6/353: https://houston.craigslist.org/ctd/d/houston-2022-hyundai-palisade-easy/7805278936.html
Retrieved details for listing 6.
Processing listing 7/353: https://houston.craigslist.org/c

In [None]:
# Clean the raw Houston scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
houston_car_df = clean_and_transform_data(car_data_houston)


Transformed data into 1758 rows.


#### Nashville

In [None]:
# Smoke test for Nashville: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://nashville.craigslist.org/search/nashville-tn/cta?bundleDuplicates=1&hasPic=1&lat=36.152&lon=-86.82&min_auto_year=2013&search_distance=23#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 256 listings. Processing up to 5.
Processing listing 1/5: https://nashville.craigslist.org/ctd/d/goodlettsville-2014-jeep-cherokee-4dr/7805242723.html
Retrieved details for listing 1.
Processing listing 2/5: https://nashville.craigslist.org/ctd/d/nashville-2017-bmw-x3/7805231680.html
Retrieved details for listing 2.
Processing listing 3/5: https://nashville.craigslist.org/ctd/d/nashville-2018-nissan-armada/7805230721.html
Retrieved details for listing 3.
Processing listing 4/5: https://nashville.craigslist.org/ctd/d/nashville-2019-honda-pilot/7805229908.html
Retrieved details for listing 4.
Processing listing 5/5: https://nashville.craigslist.org/ctd/d/nashville-2019-ford-transit/7805227410.html
Retrieved details for listing 5.
[{'year': '2014', 'make_model': 'JEEP CHEROKEE', 'images': ['https://images.craigslist.org/00A0A_1t0kGMuoT0c_0jm0ew_600x450.jpg', 'https://images.craigslist.org/00F0F_1eZQlFdqn7G_0jm0ew_600x450.jpg', 'https://images.craigslist.org/01616_bQQAOAxfZNh_0jm0ew_

In [None]:
# Full Nashville scrape. max_listings=256 is hard-coded to the "Found 256 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_nashville = "https://nashville.craigslist.org/search/nashville-tn/cta?bundleDuplicates=1&hasPic=1&lat=36.152&lon=-86.82&min_auto_year=2013&search_distance=23#search=1~gallery~0~0"
car_data_nashville = scrape_craigslist(base_url_nashville, max_listings=256, delay=2)
print(car_data_nashville[:5])

Found 256 listings. Processing up to 256.
Processing listing 1/256: https://nashville.craigslist.org/ctd/d/goodlettsville-2014-jeep-cherokee-4dr/7805242723.html
Retrieved details for listing 1.
Processing listing 2/256: https://nashville.craigslist.org/ctd/d/nashville-2017-bmw-x3/7805231680.html
Retrieved details for listing 2.
Processing listing 3/256: https://nashville.craigslist.org/ctd/d/nashville-2018-nissan-armada/7805230721.html
Retrieved details for listing 3.
Processing listing 4/256: https://nashville.craigslist.org/ctd/d/nashville-2019-honda-pilot/7805229908.html
Retrieved details for listing 4.
Processing listing 5/256: https://nashville.craigslist.org/ctd/d/nashville-2019-ford-transit/7805227410.html
Retrieved details for listing 5.
Processing listing 6/256: https://nashville.craigslist.org/ctd/d/nashville-2020-honda-civic/7805226472.html
Retrieved details for listing 6.
Processing listing 7/256: https://nashville.craigslist.org/ctd/d/goodlettsville-2020-ford-150-f150-150/

In [None]:
# Clean the raw Nashville scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
nashville_car_df = clean_and_transform_data(car_data_nashville)


Transformed data into 1267 rows.


#### Minneapolis

In [None]:
# Smoke test for Minneapolis: scrape only the first 5 listings to check the search URL
# and the parsed fields before running the full scrape.
# NOTE: reuses the generic `base_url` / `car_data` names shared by every city's test cell.
base_url = "https://minneapolis.craigslist.org/search/minneapolis-mn/cta?bundleDuplicates=1&hasPic=1&lat=45.018&lon=-93.316&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data = scrape_craigslist(base_url, max_listings=5, delay=2)
print(car_data[:5])


Found 353 listings. Processing up to 5.
Processing listing 1/5: https://minneapolis.craigslist.org/dak/ctd/d/saint-michael-2023-ford-f650-650-650/7805289115.html
Retrieved details for listing 1.
Processing listing 2/5: https://minneapolis.craigslist.org/ram/cto/d/saint-paul-2022-subaru-forester-premium/7805287993.html
Retrieved details for listing 2.
Processing listing 3/5: https://minneapolis.craigslist.org/ank/cto/d/minneapolis-2017-honda-ridgeline-rtl/7805285812.html
Retrieved details for listing 3.
Processing listing 4/5: https://minneapolis.craigslist.org/hnp/ctd/d/champlin-2014-subaru-impreza-20i-20-20/7805279712.html
Retrieved details for listing 4.
Processing listing 5/5: https://minneapolis.craigslist.org/ank/ctd/d/saint-paul-2014-subaru-impreza-wagon/7805271594.html
Retrieved details for listing 5.
[{'year': '2023', 'make_model': 'Ford F650 Base', 'images': ['https://images.craigslist.org/00I0I_fMOSwX6D0rv_0fe0bq_600x450.jpg', 'https://images.craigslist.org/00x0x_a2EJZK8v8v9_

In [None]:
# Full Minneapolis scrape. max_listings=353 is hard-coded to the "Found 353 listings" count
# printed when this notebook was run; the live count may differ on a re-run.
base_url_minneapolis = "https://minneapolis.craigslist.org/search/minneapolis-mn/cta?bundleDuplicates=1&hasPic=1&lat=45.018&lon=-93.316&min_auto_year=2013&search_distance=20#search=1~gallery~0~0"
car_data_minneapolis = scrape_craigslist(base_url_minneapolis, max_listings=353, delay=2)
print(car_data_minneapolis[:5])

Found 353 listings. Processing up to 353.
Processing listing 1/353: https://minneapolis.craigslist.org/ram/ctd/d/saint-paul-2015-ford-fusion-titanium/7805299714.html
Retrieved details for listing 1.
Processing listing 2/353: https://minneapolis.craigslist.org/ram/ctd/d/saint-paul-2017-jeep-wrangler-unlimited/7805299340.html
Retrieved details for listing 2.
Processing listing 3/353: https://minneapolis.craigslist.org/wsh/ctd/d/2016-subaru-legacy-25i-awd-back-up-cam/7805298499.html
Retrieved details for listing 3.
Processing listing 4/353: https://minneapolis.craigslist.org/ram/ctd/d/saint-paul-2013-chevrolet-volt-sedan/7805297087.html
Retrieved details for listing 4.
Processing listing 5/353: https://minneapolis.craigslist.org/ram/ctd/d/saint-paul-2016-lincoln-mkx-reserve/7805293705.html
Retrieved details for listing 5.
Processing listing 6/353: https://minneapolis.craigslist.org/hnp/ctd/d/minneapolis-2015-chevrolet-impala-ltd/7805293572.html
Retrieved details for listing 6.
Processing 

In [None]:
# Clean the raw Minneapolis scrape results and reshape them into a DataFrame
# (clean_and_transform_data reports the resulting row count).
minneapolis_car_df = clean_and_transform_data(car_data_minneapolis)


Transformed data into 1758 rows.


### Save City Specific DFs

In [None]:
# Persist each city's cleaned DataFrame as its own CSV (index column omitted).
# One mapping + loop replaces twenty copy-pasted to_csv lines, so adding a city
# is a single new entry. The key is the file-name prefix: "<key>_car_df.csv".
city_specific_dir = '/content/drive/MyDrive/Computer Vision Project/Scraped_Data/city_specific'
city_car_dfs = {
    'chicago': chicago_car_df,
    'milwaukee': milwaukee_car_df,
    'detroit': detroit_car_df,
    'cleveland': cleveland_car_df,
    'indianapolis': indianapolis_car_df,
    'boston': boston_car_df,
    'new_york': new_york_car_df,
    'LA': LA_car_df,
    'seattle': seattle_car_df,
    'phoenix': phoenix_car_df,
    'dallas': dallas_car_df,
    'denver': denver_car_df,
    'miami': miami_car_df,
    'SF': SF_car_df,
    'DC': DC_car_df,
    'philadelphia': philadelphia_car_df,
    'charlotte': charlotte_car_df,
    'houston': houston_car_df,
    'nashville': nashville_car_df,
    'minneapolis': minneapolis_car_df,
}
for city_name, city_car_df in city_car_dfs.items():
    city_car_df.to_csv(f'{city_specific_dir}/{city_name}_car_df.csv', index=False)

### Combine results

In [None]:
# Stack every city's DataFrame into one dataset; ignore_index=True renumbers
# the rows 0..N-1 instead of keeping each city's own index.
car_df = pd.concat(
    [
        chicago_car_df,
        milwaukee_car_df,
        detroit_car_df,
        cleveland_car_df,
        indianapolis_car_df,
        boston_car_df,
        new_york_car_df,
        LA_car_df,
        seattle_car_df,
        phoenix_car_df,
        dallas_car_df,
        denver_car_df,
        miami_car_df,
        SF_car_df,
        DC_car_df,
        philadelphia_car_df,
        charlotte_car_df,
        houston_car_df,
        nashville_car_df,
        minneapolis_car_df,
    ],
    ignore_index=True,
)

# Write the combined dataset to a single CSV (index column omitted)
car_df.to_csv('/content/drive/MyDrive/Computer Vision Project/Scraped_Data/craigslist_car_data2.csv', index=False)

In [None]:
# Sanity check on the combined dataset: overall (rows, columns) and the first 5 rows
print(car_df.shape)
car_df.head(5)

(30059, 7)


Unnamed: 0,year,make_model,make,model,label,image_url,listing_url
0,2014,buick_lacrosse_leather,buick,lacrosse_leather,2014_buick_lacrosse_leather,https://images.craigslist.org/00m0m_aHTPc450Pm...,https://chicago.craigslist.org/nwi/ctd/d/highl...
1,2014,buick_lacrosse_leather,buick,lacrosse_leather,2014_buick_lacrosse_leather,https://images.craigslist.org/00Q0Q_alevaLceOv...,https://chicago.craigslist.org/nwi/ctd/d/highl...
2,2014,buick_lacrosse_leather,buick,lacrosse_leather,2014_buick_lacrosse_leather,https://images.craigslist.org/00x0x_e5gmtvzqsB...,https://chicago.craigslist.org/nwi/ctd/d/highl...
3,2014,buick_lacrosse_leather,buick,lacrosse_leather,2014_buick_lacrosse_leather,https://images.craigslist.org/00L0L_gwbjbHkcqP...,https://chicago.craigslist.org/nwi/ctd/d/highl...
4,2014,buick_lacrosse_leather,buick,lacrosse_leather,2014_buick_lacrosse_leather,https://images.craigslist.org/00g0g_lQyfS5TZNx...,https://chicago.craigslist.org/nwi/ctd/d/highl...


### Download and save the images to Google Drive

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO

In [None]:
# Google Drive locations: the combined scrape results and the image output folder
csv_path = '/content/drive/MyDrive/Computer Vision Project/Scraped_Data/craigslist_car_data2.csv'
image_dir = '/content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2'

# Read the combined listing table back in from disk
df = pd.read_csv(csv_path)

# Create the image folder if it is not there yet (no error when it already exists)
os.makedirs(image_dir, exist_ok=True)

In [None]:
def download_and_save_images_with_mapping(df, image_dir, timeout=30):
    """Download every image referenced in ``df`` and save it under ``image_dir``.

    Each row's ``image_url`` is fetched with ``requests`` and written to
    ``<image_dir>/<label>_<row index>.jpg``. A row whose download or save fails
    is reported on stdout and left out of the result, so one bad listing does
    not abort the whole run.

    Args:
        df: DataFrame with at least ``label`` and ``image_url`` columns.
        image_dir: Directory to write images into; created if missing.
        timeout: Seconds to wait on the HTTP request before giving up, passed
            straight to ``requests.get``. Without it a single stalled
            connection would hang the loop indefinitely.

    Returns:
        DataFrame holding only the rows whose image was saved, with an added
        ``image_path`` column pointing at the saved file.
    """
    # Modes Pillow's JPEG writer accepts. Anything else (e.g. RGBA, P, LA) makes
    # ``save`` raise "cannot write mode ... as JPEG", so it is converted first.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    # Ensure the directory exists
    os.makedirs(image_dir, exist_ok=True)

    # Rows whose image was written successfully
    successful_rows = []

    for idx, row in df.iterrows():
        label = row['label']
        image_url = row['image_url']

        try:
            # Get the image content (a missing/NaN URL fails here and is
            # reported by the handler below)
            response = requests.get(image_url.strip(), timeout=timeout)
            response.raise_for_status()

            # Labels are built from scraped listing text: replace path
            # separators so the file always lands directly inside image_dir
            safe_label = str(label).replace('/', '_').replace('\\', '_')
            image_path = os.path.join(image_dir, f"{safe_label}_{idx}.jpg")

            # Decode and save; the context manager releases the image afterwards
            with Image.open(BytesIO(response.content)) as image:
                writable = image if image.mode in jpeg_modes else image.convert('RGB')
                writable.save(image_path)

            # Record the row together with where its image now lives; work on a
            # copy so the Series handed out by iterrows is left untouched
            saved_row = row.copy()
            saved_row['image_path'] = image_path
            successful_rows.append(saved_row)

            print(f"Saved {image_path}")
        except Exception as e:
            # Deliberately best-effort: log the failure and move on
            print(f"Failed to download {image_url} for label {label}: {e}")

    # Create a new DataFrame with successful rows only
    saved_df = pd.DataFrame(successful_rows)

    return saved_df

In [None]:
# Fetch every image listed in df into image_dir; the returned frame contains
# only the rows whose image was actually saved
saved_df = download_and_save_images_with_mapping(df=df, image_dir=image_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2019_hyundai_accent_25059.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2019_hyundai_accent_25060.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2020_kia_optima_25061.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2020_kia_optima_25062.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2020_kia_optima_25063.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2020_kia_optima_25064.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2020_kia_optima_25065.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_v2/2018_nissan_titan_25066.jpg
Saved /content/drive/MyDrive/Computer Vision Project/Scraped_Data/car_images_

In [None]:
# Persist the table of successfully saved images as CSV (row index not written)
saved_df.to_csv(
    '/content/drive/MyDrive/Computer Vision Project/Scraped_Data/craigslist_labels.csv',
    index=False,
)

In [None]:
# Reload the label table from Drive so later cells can start from the CSV
saved_df = pd.read_csv(
    '/content/drive/MyDrive/Computer Vision Project/Scraped_Data/craigslist_labels.csv'
)

In [None]:
# Show (rows, columns) of the label table
saved_df.shape

(29710, 10)