### TrueCar.com

In [None]:
import os
import pandas as pd
# List the working directory. The result is discarded, so the only visible
# effect is an error if '/content' does not exist.
os.listdir('/content')

# Workbook holding the locations to scrape.
zip_codes = '/content/Zips.xlsx'
df = pd.read_excel(zip_codes)

# Keep the piece of the ISO value after the first dash
# (presumably the state abbreviation -- verify against the sheet).
df['State Code'] = df['ISO'].str.split('-').str.get(1)

# Turn city names into URL slugs: lower case, spaces replaced by dashes.
df['City'] = df['City'].str.lower().str.replace(' ', '-')

# "<city-slug>-<state code>" in lower case, the form of 'los-angeles-ca'.
df['City-State'] = df['City'] + '-' + df['State Code'].str.lower()

def get_batch(df, start, end):
    """Return a positional slice of *df* reduced to its 'City-State' column.

    Args:
        df: DataFrame that contains a 'City-State' column.
        start: Position of the first row to include.
        end: Position one past the last row to include.

    Returns:
        A one-column DataFrame (not a Series) that keeps the original index
        labels of the selected rows.
    """
    return df[['City-State']].iloc[start:end]

# Positional window of locations to work on: rows 0 through 9.
start, end = 0, 10

# The rows selected here are the locations scraped later in the notebook.
batch = get_batch(df, start, end)
print(batch)

In [None]:
import requests
from lxml import html
import pandas as pd
import csv
import os
import time
import random

def get_vehicle_details_from_vin(vin, timeout=30):
    """Decode a VIN through the NHTSA vPIC ``DecodeVINValuesBatch`` endpoint.

    The lookup is best effort: any failure (network error, timeout, HTTP error
    status, malformed response) is printed and reported as "no data" rather
    than raised, so one bad VIN cannot abort a scraping run.

    Args:
        vin: VIN string, sent as the ``DATA`` form field.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A 6-tuple ``(make, model, trim, body_class, model_year,
        seating_capacity)`` read from the first entry of the response's
        ``Results`` list. A value is None when its key is absent from that
        entry; all six are None when ``Results`` is empty or the lookup fails.
    """
    url = "https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch/"
    data = {'DATA': vin, 'format': 'JSON'}
    # Response keys, in the order of the returned tuple.
    wanted_keys = ('Make', 'Model', 'Trim', 'BodyClass', 'ModelYear', 'Seats')

    try:
        response = requests.post(url, data=data, timeout=timeout)
        # Surface HTTP errors explicitly instead of failing later on an
        # unparsable or unexpected error body.
        response.raise_for_status()
        results = response.json()['Results']

        if not results:
            return (None,) * 6

        first = results[0]
        return tuple(first.get(key) for key in wanted_keys)

    except Exception as e:
        # Deliberately broad: this is the best-effort boundary described above.
        print(f"Error fetching VIN details for {vin}: {e}")
        return (None,) * 6

def get_all_car_hrefs(base_url, location, all_car_hrefs, pages=1, timeout=30):
    """Collect the listing links found on the result pages of one location.

    Pages ``1..pages`` of ``{base_url}{location}/`` are requested in order.
    A page that fails (non-200 status or any exception) is reported with
    ``print`` and skipped; the remaining pages are still attempted.

    Args:
        base_url: URL prefix that *location* is appended to.
        location: Location slug appended to *base_url*.
        all_car_hrefs: Not read by this function; kept so existing callers
            that pass it positionally keep working.
        pages: Number of result pages to request.
        timeout: Seconds to wait for each response. Without a timeout
            ``requests`` may block indefinitely on a stalled server.

    Returns:
        A set of href strings with any query string removed.
    """
    location_car_hrefs = set()

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        for page_number in range(1, pages + 1):
            try:
                res = session.get(
                    f'{base_url}{location}/',
                    params={'page': page_number},
                    timeout=timeout,
                )

                if res.status_code != 200:
                    print(f"Failed to fetch page {page_number}, status code: {res.status_code}")
                    continue

                page = html.fromstring(res.text)
                hrefs = page.xpath("//li[contains(@class, 'col-md-6')]//a/@href")
                location_car_hrefs.update(href.split('?')[0] for href in hrefs)

            except Exception as e:
                # Best effort: one broken page must not stop the crawl.
                print(f"Error processing page {page_number}: {e}")

            finally:
                # Pause after every attempt, including failed ones, so error
                # responses do not lead to rapid back-to-back requests.
                time.sleep(random.uniform(0.25, 2))

    print(f"Total car hrefs collected for location {location}: {len(location_car_hrefs)}")
    return location_car_hrefs


# Column order of the output CSV.
_CSV_FIELDNAMES = [
    'Car Name', 'Car Webpage', 'Car Health', 'Price', 'Car Exterior Color', 'Car Interior Color', 'Miles',
    'Fuel Type', 'Fuel Efficiency (mileage)', 'EPA Range', 'Transmission', 'Drivetrain',
    'Engine', 'Location', 'Listing Status', 'VIN', 'Stock Number', 'Accidents', 'Owners',
    'Car Title Status', 'Use Type', 'Vehicle Last Inspected', 'Make', 'Model', 'Trim', 'Body Class', 'Model Year', 'Seating Capacity'
]

# (CSV column, label text searched for inside a <div>, index of the wanted
# non-blank text node). Index 1 presumably skips the label itself -- verify
# against the page markup.
_LABELLED_SPECS = (
    ('Car Exterior Color', 'Exterior:', 1),
    ('Car Interior Color', 'Interior:', 1),
)

# (CSV column, href fragment of the <svg> icon preceding the value, index of
# the wanted non-blank text node in the icon's following sibling <div>s).
_ICON_SPECS = (
    ('Miles', 'speed', 0),
    ('Fuel Type', 'gas-can', 1),
    ('Fuel Efficiency (mileage)', 'gas-pump', 0),
    ('EPA Range', 'route', 1),
    ('Transmission', 'transmission', 0),
    ('Drivetrain', 'drivetrain', 0),
    ('Engine', 'engine', 0),
    ('Location', 'location_on', 0),
    ('Listing Status', 'calendar_month', 0),
    ('VIN', 'vin', 1),
    ('Stock Number', 'stock-number', 1),
)

# CSV columns filled from the page-level condition/history section.
_CONDITION_COLUMNS = (
    'Accidents', 'Owners', 'Car Title Status', 'Use Type', 'Vehicle Last Inspected',
)

# CSV columns filled from the VIN decoder, in the order of its return tuple.
_VIN_COLUMNS = ('Make', 'Model', 'Trim', 'Body Class', 'Model Year', 'Seating Capacity')


def _nth_text(texts, index):
    """Return the *index*-th non-blank, stripped string of *texts*, or None."""
    cleaned = [item.strip() for item in texts if item.strip()]
    return cleaned[index] if len(cleaned) > index else None


def _first_text(page, xpath, default):
    """Return the first stripped XPath text match on *page*, or *default*."""
    matches = page.xpath(xpath)
    return matches[0].strip() if matches else default


def _extract_specs(spec):
    """Read every per-spec CSV column from one spec container element."""
    values = {}
    for column, label, index in _LABELLED_SPECS:
        texts = spec.xpath(f".//div[contains(text(), '{label}')]//text()")
        values[column] = _nth_text(texts, index)
    for column, icon, index in _ICON_SPECS:
        texts = spec.xpath(f".//svg[use/@href='#{icon}']/following-sibling::div//text()")
        values[column] = _nth_text(texts, index)
    return values


def _extract_condition_history(page):
    """Read the condition/history columns of a page; all None if the section is absent."""
    history = dict.fromkeys(_CONDITION_COLUMNS)

    sections = page.xpath('//div[@data-test="vehicleConditionHistory"]')
    if not sections:
        return history
    section = sections[0]

    details = [detail.strip() for detail in section.xpath('.//div[@class="text-sm"]/text()')]

    if len(details) > 0:
        history['Accidents'] = details[0].replace('Accidents', '').strip()

    if len(details) > 1:
        owners = details[1]
        if owners:
            # Strip the plural or singular word; any other non-empty text is
            # treated as "not an owner count".
            if 'Owners' in owners:
                owners = owners.replace('Owners', '').strip()
            elif 'Owner' in owners:
                owners = owners.replace('Owner', '').strip()
            else:
                owners = None
        history['Owners'] = owners

    if len(details) > 2:
        history['Car Title Status'] = details[2]

    if len(details) > 3:
        history['Use Type'] = details[3].replace('Use', '').strip()

    inspected = section.xpath('.//div[@class="text-xs text-muted"]/text()')
    if inspected:
        history['Vehicle Last Inspected'] = (
            ''.join(inspected).replace('Condition data as of', '').strip()
        )

    return history


def scrape_car_details(all_car_hrefs, csv_file='CA2_TrueCar_details.csv', timeout=30):
    """Fetch each listing page and append its details to a CSV file.

    For every href the page ``https://www.truecar.com<href>`` is downloaded
    and one CSV row is written per ``<div class="row pt-3">`` element found on
    it. A value that cannot be located is written as empty instead of
    discarding the whole listing. When a VIN is found it is decoded with
    ``get_vehicle_details_from_vin`` to fill the make/model columns.

    Processing is best effort: a listing that fails (non-200 status or any
    exception) is reported with ``print`` and skipped.

    Args:
        all_car_hrefs: Iterable of site-relative listing hrefs; a query string,
            if present, is dropped.
        csv_file: Output path, opened in append mode. The header row is
            written only when the file is missing or empty.
        timeout: Seconds to wait for each listing page. Without a timeout
            ``requests`` may block indefinitely on a stalled server.

    Returns:
        None. Results are written to *csv_file*.
    """
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0

    # Both context managers guarantee cleanup: the session's pooled
    # connections are released and the CSV file is closed.
    with requests.Session() as session, \
            open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=_CSV_FIELDNAMES)

        if needs_header:
            writer.writeheader()

        for href in all_car_hrefs:
            try:
                href = href.split('?')[0]
                car_page_href = 'https://www.truecar.com' + href
                res = session.get(car_page_href, timeout=timeout)

                if res.status_code != 200:
                    # Do not parse error pages as if they were listings.
                    print(f"Failed to fetch {car_page_href}, status code: {res.status_code}")
                    continue

                page = html.fromstring(res.text)

                car_name = _first_text(
                    page, "//h1[contains(@class, 'heading-3_5')]//text()", "Unknown Car"
                )
                # Drop one pair of surrounding double quotes, if present.
                if car_name.startswith('"') and car_name.endswith('"'):
                    car_name = car_name[1:-1].strip()

                car_health = _first_text(
                    page, "//div[contains(@class, 'flex shrink-0')]//span//text()", "None"
                )
                price = _first_text(page, "//div[contains(@class, 'heading-2')]//text()", "None")

                # Page-level data: identical for every spec row, so read once.
                condition = _extract_condition_history(page)

                for spec in page.xpath("//div[@class='row pt-3']"):
                    row = {
                        'Car Name': car_name,
                        'Car Webpage': car_page_href,
                        'Car Health': car_health,
                        'Price': price,
                    }
                    row.update(_extract_specs(spec))
                    row.update(condition)

                    vin_text = row['VIN']
                    decoded = get_vehicle_details_from_vin(vin_text) if vin_text else (None,) * 6
                    row.update(zip(_VIN_COLUMNS, decoded))

                    writer.writerow(row)

                print(f"Successfully processed {car_name}")

            except Exception as e:
                # Best effort: one broken listing must not stop the run.
                print(f"Error processing {href}: {e}")

            finally:
                # Pause after every attempt, including failed ones, so error
                # responses do not lead to rapid back-to-back requests.
                time.sleep(random.uniform(0.25, 2))


def process_locations(df, pages=52):
    """Collect listing links for every location in *df*, then scrape them all.

    Args:
        df: DataFrame with a 'City-State' column of location slugs. Rows
            whose slug is missing (NaN) are skipped.
        pages: Number of result pages requested per location.

    Returns:
        None. Links are gathered with ``get_all_car_hrefs`` and handed to
        ``scrape_car_details`` in one call after all locations are done.
    """
    base_url = "https://www.truecar.com/used-cars-for-sale/listings/location-"
    all_car_hrefs = set()

    for location in df['City-State'].dropna():
        print(f"Processing location: {location}")
        location_hrefs = get_all_car_hrefs(base_url, location, all_car_hrefs, pages=pages)
        # A set, so a listing that shows up under several locations is kept once.
        all_car_hrefs.update(location_hrefs)

    print(f"Total Car hrefs starting to scrape: {len(all_car_hrefs)}")
    scrape_car_details(all_car_hrefs)
    print('*' * 60)


# Scrape only the selected batch of locations, not the whole sheet.
process_locations(batch)


In [None]:
# IPython shell escape: fetch ipecho.net/plain with curl and show the response.
# Presumably this prints the runtime's public IP address -- confirm against the service.
!curl ipecho.net/plain