# Carsized Data

This notebook scrapes car dimension data from carsized.com and exports it to CSV.

**Data collected:**
- Manufacturer, Car Name, Body Style
- Production Start/End years
- Dimensions: Length, Width, Width incl. mirrors, Height, Wheelbase, Ground Clearance
- Cargo Volume (EU/US), Cargo Volume Max (EU/US)
- Weight, Segment, Price (EU/US)

In [1]:
# Install required packages (run once); %pip installs into the running kernel's environment
# %pip install httpx selectolax pandas

In [2]:
import asyncio
import logging
import random
import re
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, asdict
from typing import Optional

import httpx
import pandas as pd
from selectolax.parser import HTMLParser

  from pandas.core import (


In [3]:
@dataclass
class CarData:
    """Specifications scraped from a single car page.

    Only ``url`` is required. Every other field defaults to ``None`` and is
    filled in by the page parser when the corresponding value is found.
    All values are kept as strings, exactly as extracted from the page.
    """
    # Page the record was scraped from
    url: str
    # Identity
    manufacturer: Optional[str] = None
    car_name: Optional[str] = None
    body_style: Optional[str] = None
    # Production range; production_end stays None for models still in production
    production_start: Optional[str] = None
    production_end: Optional[str] = None
    # Exterior dimensions (numeric part of the displayed value)
    length: Optional[str] = None
    width: Optional[str] = None
    width_incl_mirrors: Optional[str] = None
    height: Optional[str] = None
    wheelbase: Optional[str] = None
    ground_clearance: Optional[str] = None
    # Cargo volume, EU and US figures, normal and maximum
    cargo_volume_eu: Optional[str] = None
    cargo_volume_us: Optional[str] = None
    cargo_volume_max_eu: Optional[str] = None
    cargo_volume_max_us: Optional[str] = None
    # Weight, market segment and list prices
    weight: Optional[str] = None
    segment: Optional[str] = None
    price_eu: Optional[str] = None
    price_us: Optional[str] = None

In [4]:
# Configuration
# Site root; the numbered sitemap URLs are built from it.
BASE_URL = "https://www.carsized.com"
# NOTE(review): not referenced by any cell shown in this notebook — confirm it is still needed.
SITEMAP_INDEX_URL = f"{BASE_URL}/sitemap.xml"
NUM_SITEMAPS = 55  # sitemap0.xml through sitemap54.xml

# Rate limiting settings - be respectful to the server
# Each page request is preceded by a random pause drawn from [MIN_DELAY, MAX_DELAY].
MIN_DELAY = 1.0  # Minimum seconds between requests
MAX_DELAY = 2.0  # Maximum seconds between requests
MAX_CONCURRENT_REQUESTS = 3  # Limit concurrent requests (semaphore size in the async scraper)
MAX_RETRIES = 3  # Max retries on failure
RETRY_DELAY = 5  # Seconds to wait before retry (multiplied by the attempt number on HTTP 429)

# Request headers to mimic browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
}

In [5]:
def get_urls_from_sitemap(sitemap_url: str, client: httpx.Client) -> list[str]:
    """Return the page URLs listed in one sitemap XML file.

    URLs containing 'compare' (case-insensitive) are excluded.

    Args:
        sitemap_url: Address of the sitemap XML document.
        client: HTTP client used for the request; only ``get`` is called on it.

    Returns:
        The ``<loc>`` values of the sitemap, whitespace-stripped, in document
        order. An empty list is returned when the sitemap cannot be fetched or
        parsed; the failure is logged as a warning rather than raised, so one
        bad sitemap does not abort the whole collection run.
    """
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    try:
        response = client.get(sitemap_url, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except Exception as exc:
        # Best effort: report the problem instead of silently returning nothing.
        logging.warning("Could not read sitemap %s: %r", sitemap_url, exc)
        return []

    urls = []
    for loc in root.findall('.//ns:url/ns:loc', namespace):
        # <loc> text may be missing or padded with whitespace/newlines.
        url = (loc.text or '').strip()
        if url and 'compare' not in url.lower():
            urls.append(url)
    return urls

In [6]:
def get_all_car_urls(client: httpx.Client) -> list[str]:
    """Gather the car page URLs of every numbered sitemap.

    Walks sitemap0.xml .. sitemap{NUM_SITEMAPS - 1}.xml in order, pausing
    half a second after each one, and returns all URLs found as one list.
    """
    collected: list[str] = []
    sitemap_index = 0

    while sitemap_index < NUM_SITEMAPS:
        collected += get_urls_from_sitemap(f"{BASE_URL}/sitemap{sitemap_index}.xml", client)
        # Short pause between sitemap downloads
        time.sleep(0.5)
        sitemap_index += 1

    return collected

In [7]:
def extract_dimension_value(text: Optional[str]) -> Optional[str]:
    """Extract the first number from dimension text (e.g., '493.5 cm' -> '493.5').

    A decimal comma is treated as a decimal point ('493,5 cm' -> '493.5').

    Args:
        text: Raw value text; may be empty or None.

    Returns:
        The first number found, as a string, or None when the text is empty
        or contains no digits.
    """
    if not text:
        return None
    normalized = text.replace(',', '.')
    # Require at least one digit so stray punctuation ("n.a.", "approx.")
    # is never returned as a value.
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', normalized)
    return match.group() if match else None


def extract_price_value(text: str) -> Optional[str]:
    """Pull the leading run of digits out of a price string (e.g., '€ 87300' -> '87300').

    Currency symbols (€, $, £), commas and whitespace are dropped first.
    Returns None for empty input or when no digit is present.
    """
    if not text:
        return None

    stripped = re.sub(r'[€$£,\s]', '', text)
    found = re.search(r'[\d]+', stripped)
    if found is None:
        return None
    return found.group()

In [8]:
def clean_label(texts: list) -> str:
    """Build a lowercase label from every part after the first, minus trailing superscripts.

    The first element is skipped, empty parts are dropped, and a trailing run
    of digits/commas (e.g. the '2' in 'Width2') is removed.
    """
    label_parts = [part for part in texts[1:] if part]
    joined = ' '.join(label_parts).strip().lower()
    # Drop trailing superscript markers such as "1" or "1,2"
    without_superscript = re.sub(r'[\d,]+$', '', joined)
    return without_superscript.strip()


def _extract_model_name(model_elem) -> str:
    """Return the text of the model element with generation-code spans removed."""
    # Remove the generation-code nodes themselves (e.g., "4.1", "G05").
    # Deleting their text with str.replace would also strip every other
    # occurrence of that text, including digits of the model name.
    for gc_span in model_elem.css('span.carmodelgc'):
        gc_span.decompose()
    return model_elem.text(strip=True).strip()


def _parse_production_years(date_text: str) -> tuple[Optional[str], Optional[str]]:
    """Split '2018 - 2023' / '2023 - present' into (start, end); end is None for 'present'."""
    if ' - ' not in date_text:
        # Single value: treat it as the start year, end unknown
        return date_text, None
    parts = date_text.split(' - ')
    start = parts[0].strip()
    end = parts[1].strip()
    return start, (None if end.lower() == 'present' else end)


def _apply_spec(car: CarData, label: str, value_text: str) -> None:
    """Store one data-matrix value on ``car`` under the attribute its label maps to."""
    # Order matters: the more specific labels ('width incl', 'cargo volume max')
    # are tested before the generic ones they contain.
    if 'length' in label:
        car.length = extract_dimension_value(value_text)
    elif 'width incl' in label:
        car.width_incl_mirrors = extract_dimension_value(value_text)
    elif 'width' in label and 'incl' not in label:
        car.width = extract_dimension_value(value_text)
    elif 'height' in label:
        car.height = extract_dimension_value(value_text)
    elif 'wheelbase' in label:
        car.wheelbase = extract_dimension_value(value_text)
    elif 'ground clearance' in label:
        car.ground_clearance = extract_dimension_value(value_text)
    elif 'cargo volume max' in label and 'eu' in label:
        car.cargo_volume_max_eu = extract_dimension_value(value_text)
    elif 'cargo volume max' in label and 'us' in label:
        car.cargo_volume_max_us = extract_dimension_value(value_text)
    elif 'cargo volume' in label and 'eu' in label:
        car.cargo_volume_eu = extract_dimension_value(value_text)
    elif 'cargo volume' in label and 'us' in label:
        car.cargo_volume_us = extract_dimension_value(value_text)
    elif 'weight' in label:
        car.weight = extract_dimension_value(value_text)
    elif 'segment' in label:
        # Segment is a category name, kept verbatim
        car.segment = value_text
    elif 'price eu' in label:
        car.price_eu = extract_price_value(value_text)
    elif 'price us' in label:
        car.price_us = extract_price_value(value_text)


def parse_car_page(html: str, url: str) -> CarData:
    """Parse a car page and extract all specifications.

    Args:
        html: HTML source of the car page.
        url: Address the page was fetched from; stored on the result.

    Returns:
        A CarData record. Fields whose element or label is not found on the
        page keep their default of None.
    """
    tree = HTMLParser(html)
    car = CarData(url=url)

    # Manufacturer
    brand_elem = tree.css_first('[itemprop="brand"]')
    if brand_elem:
        car.manufacturer = brand_elem.text(strip=True)

    # Model name, without the generation code
    model_elem = tree.css_first('[itemprop="model"]')
    if model_elem:
        car.car_name = _extract_model_name(model_elem)

    # Body style
    body_elem = tree.css_first('[itemprop="bodyType"]')
    if body_elem:
        car.body_style = body_elem.text(strip=True)

    # Production years
    date_elem = tree.css_first('[itemprop="vehicleModelDate"]')
    if date_elem:
        car.production_start, car.production_end = _parse_production_years(
            date_elem.text(strip=True)
        )

    # Data matrix: in each row the first title cell holds the value and the
    # remaining cells hold the label (plus optional superscript markers).
    for row in tree.css('.contentmargin'):
        title_divs = row.css('.dmatrixtitle, .dmatrixtitlesup')
        texts = [d.text(strip=True) for d in title_divs]
        if len(texts) >= 2:
            _apply_spec(car, clean_label(texts), texts[0])

    return car

In [9]:
def scrape_car_page(url: str, client: httpx.Client) -> Optional[CarData]:
    """Fetch and parse a single car page, retrying transient failures.

    Retry policy (at most MAX_RETRIES attempts):
      * HTTP 429: wait RETRY_DELAY * attempt number, then retry.
      * HTTP 5xx and network/transport errors: wait RETRY_DELAY, then retry.
      * Any other HTTP error status (403, 404, ...): give up immediately,
        since repeating the request would only add load on the server.

    Args:
        url: Car page address.
        client: HTTP client used for the request.

    Returns:
        The parsed CarData, or None when the page could not be fetched or
        parsed. Failures are logged as warnings, never raised.
    """
    for attempt in range(MAX_RETRIES):
        try:
            response = client.get(url, timeout=30)
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            status = exc.response.status_code
            if status == 429:
                # Rate limited: back off longer on each attempt
                wait = RETRY_DELAY * (attempt + 1)
            elif status >= 500:
                wait = RETRY_DELAY
            else:
                logging.warning("Skipping %s: HTTP %d", url, status)
                return None
            failure = f"HTTP {status}"
        except Exception as exc:
            wait = RETRY_DELAY
            failure = repr(exc)
        else:
            # Parse outside the fetch handler so a parsing problem does not
            # trigger pointless re-downloads of the same page.
            try:
                return parse_car_page(response.text, url)
            except Exception as exc:
                logging.warning("Could not parse %s: %r", url, exc)
                return None

        if attempt == MAX_RETRIES - 1:
            # No sleep after the final attempt
            logging.warning("Giving up on %s after %d attempts (%s)", url, MAX_RETRIES, failure)
            return None
        time.sleep(wait)

    return None

In [10]:
def scrape_all_cars(urls: list[str], checkpoint_every: int = 500) -> list[CarData]:
    """Scrape all car pages sequentially with rate limiting and checkpointing.

    Each request is preceded by a random pause between MIN_DELAY and
    MAX_DELAY seconds. After every ``checkpoint_every`` processed URLs, the
    cars collected so far are written to ``carsized_checkpoint_<count>.csv``.

    Args:
        urls: List of car page URLs to scrape
        checkpoint_every: Save progress every N URLs; 0 (or None) disables
            checkpointing

    Returns:
        List of CarData objects for the pages that were scraped successfully
    """
    cars = []
    # A zero/negative interval would otherwise raise ZeroDivisionError or be meaningless
    checkpoints_enabled = bool(checkpoint_every) and checkpoint_every > 0

    with httpx.Client(headers=HEADERS, follow_redirects=True) as client:
        for processed, url in enumerate(urls, start=1):
            time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))

            car_data = scrape_car_page(url, client)
            if car_data:
                cars.append(car_data)

            # Skip the checkpoint while nothing has been collected: an empty
            # frame would produce a CSV without a header row.
            if checkpoints_enabled and processed % checkpoint_every == 0 and cars:
                checkpoint_df = pd.DataFrame([asdict(c) for c in cars])
                checkpoint_df.to_csv(f'carsized_checkpoint_{processed}.csv', index=False)

    return cars

In [11]:
async def scrape_car_page_async(url: str, client: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> Optional[CarData]:
    """Async version of the car page scraper, using the same retry policy as the sync one.

    The semaphore is held for the whole call, including the pauses, so it
    bounds how many pages are being worked on at once.

    Retry policy (at most MAX_RETRIES attempts):
      * HTTP 429: wait RETRY_DELAY * attempt number, then retry.
      * HTTP 5xx and network/transport errors: wait RETRY_DELAY, then retry.
      * Any other HTTP error status: give up immediately.

    Returns:
        The parsed CarData, or None when the page could not be fetched or
        parsed. Failures are logged as warnings, never raised.
    """
    async with semaphore:
        for attempt in range(MAX_RETRIES):
            # Polite random pause before every request
            await asyncio.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
            try:
                response = await client.get(url, timeout=30)
                response.raise_for_status()
            except httpx.HTTPStatusError as exc:
                status = exc.response.status_code
                if status == 429:
                    wait = RETRY_DELAY * (attempt + 1)
                elif status >= 500:
                    wait = RETRY_DELAY
                else:
                    logging.warning("Skipping %s: HTTP %d", url, status)
                    return None
                failure = f"HTTP {status}"
            except Exception as exc:
                wait = RETRY_DELAY
                failure = repr(exc)
            else:
                # Parse outside the fetch handler so a parsing problem does
                # not trigger re-downloads of the same page.
                try:
                    return parse_car_page(response.text, url)
                except Exception as exc:
                    logging.warning("Could not parse %s: %r", url, exc)
                    return None

            if attempt == MAX_RETRIES - 1:
                # No sleep after the final attempt
                logging.warning("Giving up on %s after %d attempts (%s)", url, MAX_RETRIES, failure)
                return None
            await asyncio.sleep(wait)
        return None


async def scrape_all_cars_async(urls: list[str]) -> list[CarData]:
    """Scrape every URL concurrently, with at most MAX_CONCURRENT_REQUESTS pages in flight.

    Pages that could not be scraped (None results) are left out of the
    returned list; the remaining records keep the order of ``urls``.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    async with httpx.AsyncClient(headers=HEADERS, follow_redirects=True) as client:
        outcomes = await asyncio.gather(
            *(scrape_car_page_async(page_url, client, limiter) for page_url in urls)
        )

    scraped = []
    for outcome in outcomes:
        if outcome is not None:
            scraped.append(outcome)
    return scraped

## Step 1: Collect all URLs from sitemaps

In [12]:
# Download every numbered sitemap with one shared client and collect the car page URLs.
with httpx.Client(headers=HEADERS, follow_redirects=True) as client:
    all_urls = get_all_car_urls(client)

# Number of URLs collected (displayed as the cell output)
len(all_urls)

2312

## Step 2: Test scraping on a few sample URLs

In [13]:
# Use the first five collected URLs; fall back to two hard-coded pages
# when the sitemap step returned nothing.
sample_urls = all_urls[:5] if all_urls else [
    "https://www.carsized.com/en/cars/bmw-x5-2023-suv/",
    "https://www.carsized.com/en/cars/audi-a3-2020-5-door-hatchback/",
]

# Scrape the sample and show it as a table to sanity-check the parser
sample_cars = scrape_all_cars(sample_urls)
sample_df = pd.DataFrame([asdict(c) for c in sample_cars])
sample_df

Unnamed: 0,url,manufacturer,car_name,body_style,production_start,production_end,length,width,width_incl_mirrors,height,wheelbase,ground_clearance,cargo_volume_eu,cargo_volume_us,cargo_volume_max_eu,cargo_volume_max_us,weight,segment,price_eu,price_us
0,https://www.carsized.com/en/cars/abarth-500-20...,Abarth,50032,3-door Hatchback,2008,2016.0,365.7,162.7,189.3,148.5,230.0,10.4,185.0,,610.0,,1110,Standard,17850.0,
1,https://www.carsized.com/en/cars/abarth-500-20...,Abarth,500,3-door Hatchback,2016,2022.0,366.0,162.7,189.3,148.5,230.0,10.4,185.0,,610.0,,1110,Standard,18490.0,
2,https://www.carsized.com/en/cars/abarth-500-20...,Abarth,500 500e,Semi-cabriolet,2022,,367.3,168.2,190.0,151.8,232.2,,185.0,,550.0,,1435,Standard,40990.0,
3,https://www.carsized.com/en/cars/abarth-punto-...,Abarth,Punto99,3-door Hatchback,2008,2010.0,404.1,172.6,,149.0,251.0,,275.0,,1030.0,,1260,Standard,18500.0,
4,https://www.carsized.com/en/cars/acura-integra...,Acura,Integra,Liftback,2022,,471.9,182.9,,141.0,273.6,13.0,,,,,1399,Premium,,31300.0


## Step 3: Full scrape

**Warning**: This will take a long time and make many requests to the server. Consider running in batches.

In [14]:
# Scrape in fixed-size batches, writing one CSV per batch so an interrupted
# run can be resumed by raising START_BATCH.
BATCH_SIZE = 500
START_BATCH = 0

# Ceiling division: `len // BATCH_SIZE + 1` would add a trailing empty batch
# whenever the URL count is an exact multiple of BATCH_SIZE.
num_batches = (len(all_urls) + BATCH_SIZE - 1) // BATCH_SIZE

all_cars = []
for batch_num in range(START_BATCH, num_batches):
    start_idx = batch_num * BATCH_SIZE
    batch_urls = all_urls[start_idx:start_idx + BATCH_SIZE]

    batch_cars = scrape_all_cars(batch_urls)
    all_cars.extend(batch_cars)

    if not batch_cars:
        # An empty frame would be written as a header-less CSV, which
        # pd.read_csv rejects when the batch files are combined later.
        logging.warning("Batch %d produced no rows; no CSV written", batch_num)
        continue

    batch_df = pd.DataFrame([asdict(c) for c in batch_cars])
    batch_df.to_csv(f'carsized_batch_{batch_num}.csv', index=False)

## Step 4: Export to CSV

In [15]:
# Combine the per-batch CSV files into tables/carsized_data.csv
import glob
import os

# Create tables directory if it doesn't exist
os.makedirs('tables', exist_ok=True)


def _batch_sort_key(path: str) -> tuple:
    """Order batch files by batch number (so batch_10 follows batch_9, not batch_1)."""
    match = re.search(r'carsized_batch_(\d+)\.csv$', path)
    # Files without a numeric suffix are kept and sorted last, by name
    return (int(match.group(1)) if match else float('inf'), path)


batch_files = sorted(glob.glob('carsized_batch_*.csv'), key=_batch_sort_key)

dfs = []
for batch_file in batch_files:
    try:
        # dtype=str keeps the scraped values exactly as written; type inference
        # would turn e.g. "2016" into 2016.0 in any column that has gaps.
        dfs.append(pd.read_csv(batch_file, dtype=str))
    except pd.errors.EmptyDataError:
        # A batch that scraped nothing leaves a file without a header row
        continue

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv('tables/carsized_data.csv', index=False)