# Script for scraping cars-data by fetching all specific variants per model of each brand available. Runtime: ~5hrs
This produces multiple .csv files as intermediary results. However, the one that should be used is cars-data-table.csv, containing all cars available on the website as entries and for each car:
- Year of fabrication
- Fuel type
- Transmission
- Price
- Body Type
- Number of seats
- Engine type
- Specific fuel type
- Maximum power
- Total maximum power both in kw and hp
- Maximum torque
- Fuel tank capacity
- Turbo specifications
- Top speed
- Acceleration 0 to 100 km/h
- Consumption details
- Battery details


In [5]:
!pip install beautifulsoup4



In [6]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

In [7]:
# Brand URLs

def fetch_brand_urls():
    url = "https://www.cars-data.com/en/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    brand_divs = soup.find_all("div", class_="col-2")
    brands = []

    for div in brand_divs:
        a_tag = div.find("a", href=True, title=True)
        if a_tag:
            brand_name = a_tag["title"].strip()
            brand_url = a_tag["href"]

            if brand_name.lower() == "station wagon cars":
                continue

            brands.append((brand_name, brand_url))

    return brands

brand_links = fetch_brand_urls()

def generate_brand_urls(brands):
    return [url for _, url in brands]

def print_brands_and_urls(brands):
    for name, url in brands:
      print(f"{name}: {url}")

# print_brands_and_urls(brand_links)

In [8]:
# Model URLs

urls = generate_brand_urls(brand_links)

# Function to get all model links for a brand
def get_model_links(brand_url):
    try:
        # Add a delay to be respectful to the website
        time.sleep(2)

        # Get the page content
        response = requests.get(brand_url)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for links with a title attribute that might contain model names
        model_links = soup.find_all('a', title=True, href=True)

        brand_name = brand_url.split('/')[-1]
        models = []

        for link in model_links:

            href = link['href']
            model_split = href.split('/')

            if brand_name in href and len(model_split) > 4:

                if '.html' in href or model_split[-1] == brand_url.split('/')[-1]:
                  continue

                models.append({
                    'brand': brand_name,
                    'model': model_split[-1],
                    'title': brand_name + " " + model_split[-1].replace('-',' '),
                    'url': href
                })

        return models

    except Exception as e:
        print(f"Error processing {brand_url}: {str(e)}")
        return []


def generate_model_urls():

  all_models = []

  # Iterate through each brand URL
  for brand_url in urls:
    #  print(f"Processing {brand_url}...")

      model_links = get_model_links(brand_url)
      for links in model_links:
          print(links)
      all_models.extend(model_links)

     # print(f"Found {len(model_links)} models for {brand_url}")

  return all_models

model_links = generate_model_urls()

model_df = pd.DataFrame(model_links)
model_df.to_csv('data/cars-data-model-links-table.csv', index=True)
#print(model_links[:100])

{'brand': 'abarth', 'model': '124-spider', 'title': 'abarth 124 spider', 'url': 'https://www.cars-data.com/en/abarth/124-spider'}
{'brand': 'abarth', 'model': '595', 'title': 'abarth 595', 'url': 'https://www.cars-data.com/en/abarth/595'}
{'brand': 'abarth', 'model': '595-cabrio', 'title': 'abarth 595 cabrio', 'url': 'https://www.cars-data.com/en/abarth/595-cabrio'}
{'brand': 'abarth', 'model': '695', 'title': 'abarth 695', 'url': 'https://www.cars-data.com/en/abarth/695'}
{'brand': 'abarth', 'model': '500c', 'title': 'abarth 500c', 'url': 'https://www.cars-data.com/en/abarth/500c'}
{'brand': 'abarth', 'model': 'punto-evo', 'title': 'abarth punto evo', 'url': 'https://www.cars-data.com/en/abarth/punto-evo'}
{'brand': 'abarth', 'model': '500', 'title': 'abarth 500', 'url': 'https://www.cars-data.com/en/abarth/500'}
{'brand': 'abarth', 'model': 'grande-punto', 'title': 'abarth grande punto', 'url': 'https://www.cars-data.com/en/abarth/grande-punto'}
{'brand': 'aiways', 'model': 'u5', 'ti

In [9]:
# Variant URLs

def extract_variant_links(model_url, model_title):
    try:
        # Add a delay to be respectful to the website
        time.sleep(1)

        # Get the page content
        response = requests.get(model_url)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the models section
        models_section = soup.find('section', class_='models')

        if not models_section:
            print(f"Could not find models section for {model_url}")
            return []

        # Find all links with title containing the model title
        variant_links = []
        for link in models_section.find_all('a', href=True, title=True):
            if model_title.lower() in link['title'].lower():
                variant_links.append({
                    'brand': model_title.split()[0],
                    'model': model_title.split()[1] if len(model_title.split()) > 1 else '',
                    'variant_url': link['href'],
                    'variant_title': link['title'],
                    'variant_id': link['href'].split('/')[-1] if '/' in link['href'] else ''
                })

        return variant_links

    except Exception as e:
        print(f"Error processing {model_url}: {str(e)}")
        return []


def process_models_for_variants(models_list):
    all_variants = []

    for model in models_list:
       # print(f"Processing variants for {model['title']}...")
        variants = extract_variant_links(model['url'], model['title'])
        all_variants.extend(variants)
       # print(f"Found {len(variants)} variants for {model['title']}")

    return all_variants

variant_links = process_models_for_variants(model_links)


variants_df = pd.DataFrame(variant_links)
variants_df.to_csv('data/cars-data-variants-links-table.csv', index=True)

variant_links

[{'brand': 'abarth',
  'model': '124',
  'variant_url': 'https://www.cars-data.com/en/abarth-124-spider-2016/3560',
  'variant_title': 'Abarth 124 Spider',
  'variant_id': '3560'},
 {'brand': 'abarth',
  'model': '595',
  'variant_url': 'https://www.cars-data.com/en/abarth-595-2016/3561',
  'variant_title': 'Abarth 595',
  'variant_id': '3561'},
 {'brand': 'abarth',
  'model': '595',
  'variant_url': 'https://www.cars-data.com/en/abarth-595-cabrio-2016/3562',
  'variant_title': 'Abarth 595 Cabrio',
  'variant_id': '3562'},
 {'brand': 'abarth',
  'model': '695',
  'variant_url': 'https://www.cars-data.com/en/abarth-695-2017/4118',
  'variant_title': 'Abarth 695',
  'variant_id': '4118'},
 {'brand': 'abarth',
  'model': '695',
  'variant_url': 'https://www.cars-data.com/en/abarth-695-2014/4115',
  'variant_title': 'Abarth 695',
  'variant_id': '4115'},
 {'brand': 'abarth',
  'model': '500c',
  'variant_url': 'https://www.cars-data.com/en/abarth-500c-2010/2',
  'variant_title': 'Abarth 50

In [10]:
# Spec URLs

def extract_specs_links(variant_url, variant_title):
    try:
        # Add a delay to be respectful to the website
        time.sleep(1)

        # Get the page content
        response = requests.get(variant_url)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all rows that might contain spec links
        specs_links = []

        # Look for links in h3 tags inside col-6 divs
        for div in soup.find_all('div', class_='col-6'):
            h3 = div.find('h3')
            if h3:
                link = h3.find('a', href=True, title=True)
                if link and variant_title.lower() in link['title'].lower():
                    # Extract year from the title using regex
                    # Looking for a 4-digit year at the beginning of the title
                    year_match = re.search(r'^(\d{4})', link['title'])
                    year = year_match.group(1) if year_match else "Unknown"

                    specs_links.append({
                        'brand': variant_title.split()[0],
                        'year': year,
                        'spec_title': link['title'],
                        'spec_url': link['href'] +'/tech',
                        'fuel_type': div.find_next_sibling('div', class_='col-2').text.strip() if div.find_next_sibling('div', class_='col-2') else '',
                        'transmission': div.find_next_sibling('div', class_='col-2').find_next_sibling('div', class_='col-2').text.strip() if div.find_next_sibling('div', class_='col-2') and div.find_next_sibling('div', class_='col-2').find_next_sibling('div', class_='col-2') else ''
                    })

        return specs_links

    except Exception as e:
        print(f"Error processing {variant_url}: {str(e)}")
        return []

def process_variants_for_specs(variants_list):

    all_specs = []

    for variant in variants_list:
        print(f"Processing specs for {variant['variant_title']}...")
        specs = extract_specs_links(variant['variant_url'], variant['variant_title'])
        all_specs.extend(specs)
        print(f"Found {len(specs)} specifications for {variant['variant_title']}")

    return all_specs

spec_links = process_variants_for_specs(variant_links)


spec_df = pd.DataFrame(spec_links)
spec_df.to_csv('data/cars-data-spec-links-table.csv', index=True)

spec_links


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing specs for Chevrolet Lumina...
Found 2 specifications for Chevrolet Lumina
Processing specs for Chevrolet Lumina APV...
Found 2 specifications for Chevrolet Lumina APV
Processing specs for Chevrolet Beretta...
Found 1 specifications for Chevrolet Beretta
Processing specs for Chevrolet Corsica...
Found 2 specifications for Chevrolet Corsica
Processing specs for Chevrolet Corsica...
Found 1 specifications for Chevrolet Corsica
Processing specs for Chrysler Grand Voyager...
Found 6 specifications for Chrysler Grand Voyager
Processing specs for Chrysler Grand Voyager...
Found 15 specifications for Chrysler Grand Voyager
Processing specs for Chrysler Grand Voyager...
Found 18 specifications for Chrysler Grand Voyager
Processing specs for Chrysler 300C...
Found 3 specifications for Chrysler 300C
Processing specs for Chrysler 300C Touring...
Found 5 specifications for Chrysler 300C Touring
Processing specs for Chrysler

[{'brand': 'Abarth',
  'year': '2016',
  'spec_title': '2016 Abarth 124 Spider 1.4 MultiAir 16v specs',
  'spec_url': 'https://www.cars-data.com/en/abarth-124-spider-1-4-multiair-16v-specs/73526/tech',
  'fuel_type': 'Petrol',
  'transmission': 'Manual'},
 {'brand': 'Abarth',
  'year': '2016',
  'spec_title': '2016 Abarth 124 Spider 1.4 MultiAir 16v specs',
  'spec_url': 'https://www.cars-data.com/en/abarth-124-spider-1-4-multiair-16v-specs/73527/tech',
  'fuel_type': 'Petrol',
  'transmission': 'Automatic'},
 {'brand': 'Abarth',
  'year': '2016',
  'spec_title': '2016 Abarth 595 1.4 T-Jet 145 specs',
  'spec_url': 'https://www.cars-data.com/en/abarth-595-1-4-t-jet-145-specs/73528/tech',
  'fuel_type': 'Petrol',
  'transmission': 'Manual'},
 {'brand': 'Abarth',
  'year': '2016',
  'spec_title': '2016 Abarth 595 1.4 T-Jet 145 specs',
  'spec_url': 'https://www.cars-data.com/en/abarth-595-1-4-t-jet-145-specs/73529/tech',
  'fuel_type': 'Petrol',
  'transmission': 'Automatic'},
 {'brand':

In [13]:
# Spec collection

def process_spec_page(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect data from tables
    tables = soup.find_all('table')

    specs = {}

    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 2:
                raw_key = cols[0].get_text().strip().lower()
                value = cols[1].get_text().strip()

                if value == '' or value is None:
                    continue

                # print(raw_key + " " + value + "\n")

                # GENERAL
                if 'price' in raw_key and 'road' not in raw_key:
                    specs['price'] = value
                elif 'body type' in raw_key:
                    specs['body_type'] = value
                elif 'number of seats' in raw_key:
                    specs['number_of_seats'] = value

                # ENGINE AND FUEL
                elif 'engine/motor type' in raw_key:
                    specs['engine_type'] = value
                elif 'fuel tank capacity' in raw_key:
                    specs['fuel_tank_capacity'] = value
                elif 'fuel type' in raw_key:
                    specs['fuel_type_detail'] = value
                elif 'max torque' in raw_key:
                    specs['max_torque'] = value

                # ELECTRIC
                elif 'battery range' in raw_key:
                    specs['battery_range'] = value
                elif 'battery capacity' in raw_key:
                    specs['battery_capacity'] = value
                elif 'charging time' in raw_key:
                    specs['charging_time'] = value
                elif 'power consumption' in raw_key:
                    specs['power_consumption'] = value

                # POWER
                elif 'max power' in raw_key:
                    specs['max_power'] = value
                elif 'total max. power' in raw_key and 'kw' in raw_key:
                    specs['total_max_power_kw'] = value
                elif 'total max. power' in raw_key and 'hp' in raw_key:
                    specs['total_max_power_hp'] = value

                # PERFORMANCE
                elif 'turbo' in raw_key:
                    specs['turbo'] = value
                elif 'top speed' in raw_key:
                    specs['top_speed'] = value
                elif 'acceleration 0-100' in raw_key:
                    specs['acceleration_0_100_kmh'] = value

                # CONSUMPTION
                elif 'extra-urban consumption' in raw_key:
                    specs['extra_urban_consumption'] = value
                elif 'urban consumption' in raw_key:
                    specs['urban_consumption'] = value
                elif 'combined consumption' in raw_key:
                    specs['combined_consumption'] = value
                elif 'co2 emissions' in raw_key:
                    specs['co2_emissions'] = value

    return specs

def enrich_car_data(car_entries):
    enriched_data = []
    for entry in car_entries:
        url = entry.get('spec_url')
        print(f"Processing: {entry.get('spec_title')}")

        details = process_spec_page(url)

        # Merge original with extracted details
        enriched_entry = {
            'year': entry.get('year'),
            'spec_title': entry.get('spec_title'),
            # 'spec_url': entry.get('spec_url'),
            'fuel_type': entry.get('fuel_type'),
            'transmission': entry.get('transmission')
        }

        enriched_entry.update({
            'price': details.get('price'),
            'body_type': details.get('body_type'),
            'number_of_seats': details.get('number_of_seats'),
            'engine_type': details.get('engine_type'),
            'fuel_type_detail': details.get('fuel_type_detail'),
            'max_power': details.get('max_power'),
            'total_max_power_kw': details.get('total_max_power_kw'),
            'total_max_power_hp': details.get('total_max_power_hp'),
            'max_torque': details.get('max_torque'),
            'fuel_tank_capacity': details.get('fuel_tank_capacity'),
            'turbo': details.get('turbo'),
            'top_speed': details.get('top_speed'),
            'acceleration_0_100_kmh': details.get('acceleration_0_100_kmh'),
            'urban_consumption': details.get('urban_consumption'),
            'extra_urban_consumption': details.get('extra_urban_consumption'),
            'combined_consumption': details.get('combined_consumption'),
            'co2_emissions': details.get('co2_emissions'),
            'battery_range': details.get('battery_range'),
            'battery_capacity': details.get('battery_capacity'),
            'charging_time': details.get('charging_time'),
            'power_consumption': details.get('power_consumption')
        })

        enriched_data.append(enriched_entry)

    return enriched_data


car_data = enrich_car_data(spec_links)

car_data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing: 2012 Subaru XV 2.0i Executive specs
Processing: 2012 Subaru XV 2.0i Executive specs
Processing: 2012 Subaru XV 2.0D Luxury specs
Processing: 2012 Subaru XV 2.0D Luxury Plus specs
Processing: 2011 Subaru Trezia 1.3 Intro specs
Processing: 2011 Subaru Trezia 1.3 Comfort specs
Processing: 2011 Subaru Trezia 1.3 Comfort S specs
Processing: 2011 Subaru Trezia 1.3 Luxury specs
Processing: 2011 Subaru Trezia 1.4D Intro specs
Processing: 2011 Subaru Trezia 1.4D Comfort specs
Processing: 2011 Subaru Trezia 1.4D Luxury specs
Processing: 2011 Subaru Trezia 1.4D Luxury specs
Processing: 2011 Subaru Trezia 1.4D Executive specs
Processing: 2011 Subaru Trezia 1.4D Executive specs
Processing: 2014 Subaru WRX STI 2.5T Sport specs
Processing: 2014 Subaru WRX STI 2.5T Sport Executive specs
Processing: 2010 Subaru WRX STI 2.5T Sport specs
Processing: 2010 Subaru WRX STI 2.5T Sport Executive specs
Processing: 2010 Subaru WRX STI 2

[{'year': '2016',
  'spec_title': '2016 Abarth 124 Spider 1.4 MultiAir 16v specs',
  'fuel_type': 'Petrol',
  'transmission': 'Manual',
  'price': '€ 44.760',
  'body_type': '2-doors, convertible',
  'number_of_seats': '2',
  'engine_type': 'fuel engine',
  'fuel_type_detail': 'gasoline',
  'max_power': '125 kw (170 hp)',
  'total_max_power_kw': '125',
  'total_max_power_hp': '170',
  'max_torque': '2500 tpm',
  'fuel_tank_capacity': '45 l',
  'turbo': 'yes, with intercooler',
  'top_speed': '232 km/h',
  'acceleration_0_100_kmh': '6,8 s',
  'urban_consumption': '8,5 l/100km',
  'extra_urban_consumption': '5,1 l/100km',
  'combined_consumption': '6,4 l/100km',
  'co2_emissions': '148 g/km',
  'battery_range': 'n/a',
  'battery_capacity': None,
  'charging_time': None,
  'power_consumption': 'n/a'},
 {'year': '2016',
  'spec_title': '2016 Abarth 124 Spider 1.4 MultiAir 16v specs',
  'fuel_type': 'Petrol',
  'transmission': 'Automatic',
  'price': '€ 47.160',
  'body_type': '2-doors, con

In [14]:
# DataFrame Conversion and Saving data

df = pd.DataFrame(car_data)
df.to_csv('cars-data-table.csv', index=True)

df.head()

Unnamed: 0,year,spec_title,fuel_type,transmission,price,body_type,number_of_seats,engine_type,fuel_type_detail,max_power,...,top_speed,acceleration_0_100_kmh,urban_consumption,extra_urban_consumption,combined_consumption,co2_emissions,battery_range,battery_capacity,charging_time,power_consumption
0,2016,2016 Abarth 124 Spider 1.4 MultiAir 16v specs,Petrol,Manual,€ 44.760,"2-doors, convertible",2,fuel engine,gasoline,125 kw (170 hp),...,232 km/h,"6,8 s","8,5 l/100km","5,1 l/100km","6,4 l/100km",148 g/km,,,,
1,2016,2016 Abarth 124 Spider 1.4 MultiAir 16v specs,Petrol,Automatic,€ 47.160,"2-doors, convertible",2,fuel engine,gasoline,125 kw (170 hp),...,229 km/h,"6,9 s","9,1 l/100km","5,2 l/100km","6,6 l/100km",153 g/km,,,,
2,2016,2016 Abarth 595 1.4 T-Jet 145 specs,Petrol,Manual,€ 25.330,"3-doors, hatchback",4,fuel engine,gasoline,107 kw (145 hp),...,210 km/h,"7,8 s","7,9 l/100km","4,9 l/100km","6,0 l/100km",139 g/km,,,,
3,2016,2016 Abarth 595 1.4 T-Jet 145 specs,Petrol,Automatic,€ 26.730,"3-doors, hatchback",4,fuel engine,gasoline,107 kw (145 hp),...,210 km/h,"8,0 s","7,6 l/100km","4,7 l/100km","5,8 l/100km",134 g/km,,,,
4,2016,2016 Abarth 595 1.4 T-Jet 165 Turismo specs,Petrol,Manual,€ 29.230,"3-doors, hatchback",4,fuel engine,gasoline,121 kw (165 hp),...,218 km/h,"7,3 s","7,9 l/100km","4,9 l/100km","6,0 l/100km",139 g/km,,,,
