In [None]:
!pip install beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
import os

In [None]:
def fetch_brand_urls():
    """Scrape the auto-data.net landing page for the list of car brands.

    Returns:
        list[tuple[str, str]]: ``(brand_name, brand_url)`` pairs in page
        order. ``brand_name`` is the anchor's ``title`` text before the first
        ``" - "``; ``brand_url`` is the absolute URL of the brand page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import keeps this cell runnable on its own (stdlib only).
    from urllib.parse import urljoin

    base_url = "https://www.auto-data.net/en/"
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(base_url, timeout=10)
    # Fail loudly rather than parsing an error page into an empty brand list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    brands = []
    # Brand tiles are the <a class="marki_blok"> anchors.
    for a_tag in soup.find_all("a", class_="marki_blok"):
        title = a_tag.get("title")
        href = a_tag.get("href")
        if title is None or href is None:
            continue

        # Keep only the text before the first " - " in the title.
        brand_name = title.split(" - ")[0].strip()

        # urljoin resolves root-relative ("/en/..."), relative and absolute
        # hrefs alike, unlike stripping "/en/" and concatenating by hand.
        brands.append((brand_name, urljoin(base_url, href)))

    return brands

def print_brands_and_urls2(brands):
    """Print one ``name: url`` line for every ``(name, url)`` pair in ``brands``."""
    for pair in brands:
        name, url = pair
        print(f"{name}: {url}")

# Fetch the brand list and print it. The guard is true when the cell runs in a
# notebook kernel, so this executes (and hits the network) on every cell run.
if __name__ == "__main__":
    brand_links = fetch_brand_urls()
    print_brands_and_urls2(brand_links)


Acura: https://www.auto-data.net/en/acura-brand-6
Alfa Romeo: https://www.auto-data.net/en/alfa-romeo-brand-11
Alpina: https://www.auto-data.net/en/alpina-brand-16
Aston Martin: https://www.auto-data.net/en/aston-martin-brand-36
Audi: https://www.auto-data.net/en/audi-brand-41
Bentley: https://www.auto-data.net/en/bentley-brand-66
BMW: https://www.auto-data.net/en/bmw-brand-86
Bugatti: https://www.auto-data.net/en/bugatti-brand-106
BYD: https://www.auto-data.net/en/byd-brand-116
Cadillac: https://www.auto-data.net/en/cadillac-brand-121
Chevrolet: https://www.auto-data.net/en/chevrolet-brand-156
Chrysler: https://www.auto-data.net/en/chrysler-brand-161
Citroen: https://www.auto-data.net/en/citroen-brand-166
Cupra: https://www.auto-data.net/en/cupra-brand-256
Dacia: https://www.auto-data.net/en/dacia-brand-181
Daewoo: https://www.auto-data.net/en/daewoo-brand-191
Daihatsu: https://www.auto-data.net/en/daihatsu-brand-2
Dodge: https://www.auto-data.net/en/dodge-brand-32
DS: https://www.aut

In [None]:
import os

# Model URLs for the auto-data.net website

# Fetch the (brand name, brand URL) pairs; this performs a network request
# every time the cell is executed.
brand_links = fetch_brand_urls()

def generate_brand_urls2(brands):
    """Return only the URL of each ``(name, url)`` pair, preserving order."""
    brand_urls = []
    for _name, link in brands:
        brand_urls.append(link)
    return brand_urls

# Flat list of brand page URLs. generate_model_urls() reads this module-level
# name directly, so it must be defined before that function is called.
urls = generate_brand_urls2(brand_links)

# Get all model links for each brand
def get_model_links(brand_url):
    """Fetch a brand page and return one record per model link on it.

    Args:
        brand_url: absolute URL of a brand page.

    Returns:
        list[dict]: one dict per ``<a class="modeli">`` anchor that has an
        ``href`` and a title of at least two words, with keys:
          * ``brand`` - first word of the title, lower-cased
          * ``model`` - second word of the title, lower-cased
          * ``title`` - full title, lower-cased
          * ``url``   - absolute model URL
        Returns ``[]`` (after printing the error) if the request or the
        parsing fails; this function never raises.
    """
    try:
        time.sleep(2)  # Be polite to the server

        # Timeout so a stalled connection cannot block the whole crawl.
        response = requests.get(brand_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        models = []

        for tag in soup.find_all('a', class_='modeli'):
            # Skip a malformed anchor instead of raising KeyError, which would
            # discard every model already collected for this brand.
            href = tag.get('href')
            if not href:
                continue

            title = (tag.get('title') or '').strip()
            if not title:
                continue

            title_parts = title.split()
            if len(title_parts) < 2:
                continue

            # NOTE(review): only the first two words are used, so a multi-word
            # brand such as "Alfa Romeo" yields brand="alfa", model="romeo".
            models.append({
                'brand': title_parts[0].lower(),
                'model': title_parts[1].lower(),
                'title': title.lower(),
                'url': "https://www.auto-data.net" + href
            })

        return models

    except Exception as e:
        # Deliberate best-effort: one failing brand must not abort the crawl.
        print(f"Error processing {brand_url}: {str(e)}")
        return []

# Generate and collect all model data
def generate_model_urls():
    """Collect the model records of every brand URL in the module-level ``urls``.

    Returns:
        list: the concatenation, in ``urls`` order, of what
        ``get_model_links`` returns for each brand URL.
    """
    collected = []
    for brand_url in urls:
        print(f"Processing {brand_url}...")
        model_links = get_model_links(brand_url)
        collected += model_links
        print(f"Found {len(model_links)} models.")
    return collected

# Crawl every brand page (slow: get_model_links sleeps 2 s per brand) and put
# the resulting records into a DataFrame.
model_links = generate_model_urls()
model_df = pd.DataFrame(model_links)

# Ensure the 'data' directory exists; later cells write into it as well.
os.makedirs('data', exist_ok=True)

# Save the file. index=True writes the row index as an extra first column.
model_df.to_csv('data/auto-data-model-links-table.csv', index=True)

print("Model data saved to 'data/auto-data-model-links-table.csv'")


Processing https://www.auto-data.net/en/acura-brand-6...
Found 17 models.
Processing https://www.auto-data.net/en/alfa-romeo-brand-11...
Found 33 models.
Processing https://www.auto-data.net/en/alpina-brand-16...
Found 22 models.
Processing https://www.auto-data.net/en/aston-martin-brand-36...
Found 25 models.
Processing https://www.auto-data.net/en/audi-brand-41...
Found 62 models.
Processing https://www.auto-data.net/en/bentley-brand-66...
Found 11 models.
Processing https://www.auto-data.net/en/bmw-brand-86...
Found 50 models.
Processing https://www.auto-data.net/en/bugatti-brand-106...
Found 11 models.
Processing https://www.auto-data.net/en/byd-brand-116...
Found 37 models.
Processing https://www.auto-data.net/en/cadillac-brand-121...
Found 30 models.
Processing https://www.auto-data.net/en/chevrolet-brand-156...
Found 80 models.
Processing https://www.auto-data.net/en/chrysler-brand-161...
Found 28 models.
Processing https://www.auto-data.net/en/citroen-brand-166...
Found 46 mode

In [None]:
#Variant url for auto-data
import requests
import time
import random
from bs4 import BeautifulSoup

# Safe request with headers, timeout, and polite delay
def safe_request(url, delay_range=(1.0, 2.0)):
    """GET ``url`` with a browser-like User-Agent, a timeout and a polite delay.

    Args:
        url: absolute URL to fetch.
        delay_range: ``(min_seconds, max_seconds)``. A random pause drawn from
            this range is taken before the request so that consecutive calls
            do not hammer the server. Pass ``None`` or ``(0, 0)`` to disable.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        failed, timed out, or returned an HTTP error status. The error is
        printed, not raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }

    # The polite delay announced for this helper; the other scrapers in this
    # notebook sleep between requests too.
    if delay_range:
        pause = random.uniform(*delay_range)
        if pause > 0:
            time.sleep(pause)

    try:
        print(f"Requesting: {url}")

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Variant URL extractor
def extract_variant_links(model_url, model_title):
    """Return one record per variant link listed on a model page.

    Args:
        model_url: absolute URL of the model page.
        model_title: title of the model; accepted for interface compatibility
            but not used by this function.

    Returns:
        list[dict]: one dict per anchor in the ``<table class="generr">``
        whose trimmed title has at least two words, with keys ``brand``,
        ``model``, ``variant_url``, ``variant_title`` and ``variant_id``.
        Returns ``[]`` when the page cannot be fetched, has no such table, or
        parsing fails (the error is printed, never raised).
    """
    try:
        response = safe_request(model_url)
        if not response:
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        table = soup.find('table', class_='generr')
        if not table:
            print(f"No variant table found for {model_url}")
            return []

        variant_links = []
        for a_tag in table.find_all('a', href=True, title=True):
            href = a_tag['href']

            # Trim the title to remove everything after ' - Technical'
            title_trimmed = a_tag['title'].split(' - Technical')[0].strip()

            title_parts = title_trimmed.split()
            if len(title_parts) < 2:
                continue

            # Titles may start with a year ("2025 Acura ADX"); drop it so the
            # brand is always the first remaining word.
            if title_parts[0].isdigit():
                name_parts = title_parts[1:]
            else:
                name_parts = title_parts
            brand = name_parts[0].lower()

            # The model is everything after the brand. The original code
            # started at index 1 in both cases, which put the brand into the
            # model whenever a year came first.
            # NOTE(review): multi-word brands ("Alfa Romeo") are still split
            # after the first word.
            model = ' '.join(name_parts[1:]).lower()

            variant_links.append({
                'brand': brand,
                'model': model,
                'variant_url': 'https://www.auto-data.net' + href,
                'variant_title': title_trimmed,
                # Text after the last '-' of the href.
                'variant_id': href.split('-')[-1]
            })

        return variant_links

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the crawl.
        print(f"Error processing {model_url}: {str(e)}")
        return []


def process_models_for_variants2(models_list, limit=None):
    """Collect the variant records of every model in ``models_list``.

    Args:
        models_list: sequence of model dicts with at least ``'url'`` and
            ``'title'`` keys.
        limit: optional maximum number of models to process, for quick test
            runs. ``None`` (the default) processes every model.

    Returns:
        list: the concatenation, in input order, of what
        ``extract_variant_links`` returns for each processed model.
    """
    selected_models = models_list if limit is None else models_list[:limit]

    all_variants = []
    for model in selected_models:
        # No print here: safe_request(), called by extract_variant_links(),
        # already logs "Requesting: <url>", so every URL used to appear twice.
        all_variants.extend(extract_variant_links(model['url'], model['title']))

    return all_variants

# Crawl every model page collected earlier (model_links must already exist).
variant_links = process_models_for_variants2(model_links)


# Persist the variant records. The 'data' directory is not created here, so
# it must already exist when this cell runs.
variants_df = pd.DataFrame(variant_links)
variants_df.to_csv('data/auto-data-variants-links-table.csv', index=True)

# Bare expression: displays the complete list as the cell output.
variant_links

Requesting: https://www.auto-data.net/en/acura-adx-model-3586
Requesting: https://www.auto-data.net/en/acura-adx-model-3586
Requesting: https://www.auto-data.net/en/acura-cl-model-138
Requesting: https://www.auto-data.net/en/acura-cl-model-138
Requesting: https://www.auto-data.net/en/acura-csx-model-143
Requesting: https://www.auto-data.net/en/acura-csx-model-143
Requesting: https://www.auto-data.net/en/acura-el-model-148
Requesting: https://www.auto-data.net/en/acura-el-model-148
Requesting: https://www.auto-data.net/en/acura-ilx-model-1994
Requesting: https://www.auto-data.net/en/acura-ilx-model-1994
Requesting: https://www.auto-data.net/en/acura-integra-model-2258
Requesting: https://www.auto-data.net/en/acura-integra-model-2258
Requesting: https://www.auto-data.net/en/acura-mdx-model-144
Requesting: https://www.auto-data.net/en/acura-mdx-model-144
Requesting: https://www.auto-data.net/en/acura-nsx-model-149
Requesting: https://www.auto-data.net/en/acura-nsx-model-149
Requesting: ht

[{'brand': 'acura',
  'model': 'acura adx',
  'variant_url': 'https://www.auto-data.net/en/acura-adx-generation-10407',
  'variant_title': '2025 Acura ADX',
  'variant_id': '10407'},
 {'brand': 'acura',
  'model': 'acura cl ii',
  'variant_url': 'https://www.auto-data.net/en/acura-cl-ii-generation-7196',
  'variant_title': '2001 Acura CL II',
  'variant_id': '7196'},
 {'brand': 'acura',
  'model': 'acura cl',
  'variant_url': 'https://www.auto-data.net/en/acura-cl-generation-304',
  'variant_title': '1997 Acura CL',
  'variant_id': '304'},
 {'brand': 'acura',
  'model': 'acura csx (facelift, 2009)',
  'variant_url': 'https://www.auto-data.net/en/acura-csx-facelift-2009-generation-7197',
  'variant_title': '2010 Acura CSX (facelift, 2009)',
  'variant_id': '7197'},
 {'brand': 'acura',
  'model': 'acura csx',
  'variant_url': 'https://www.auto-data.net/en/acura-csx-generation-313',
  'variant_title': '2006 Acura CSX',
  'variant_id': '313'},
 {'brand': 'acura',
  'model': 'acura el',
  '

In [None]:
from collections import defaultdict
import requests
import time
import re
import pandas as pd
from bs4 import BeautifulSoup

# --- Get fuel type & transmission from spec page ---
def get_fuel_and_transmission(spec_url):
    """Fetch a spec page and read its fuel type and gearbox description.

    Args:
        spec_url: absolute URL of the spec page.

    Returns:
        tuple[str, str]: ``(fuel_type, transmission)``. A value is ``''`` when
        its row is not found; both are ``''`` when the request or parsing
        fails (the error is printed, never raised).
    """
    try:
        # Timeout added so one stalled spec page cannot hang the crawl; the
        # other request helpers in this notebook use 10 s as well.
        response = requests.get(spec_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        fuel_type = ''
        transmission = ''

        # Spec rows are <tr><th>label</th><td>value</td></tr>; if a label
        # occurs more than once, the last occurrence wins.
        for row in soup.find_all('tr'):
            th = row.find('th')
            td = row.find('td')
            if not (th and td):
                continue

            label = th.text.strip()
            value = td.text.strip()

            if label == 'Fuel Type':
                fuel_type = value
            elif 'Number of gears and type of gearbox' in label:
                transmission = value

        return fuel_type, transmission

    except Exception as e:
        # Deliberate best-effort: a failing spec page yields empty values.
        print(f"Error in spec URL {spec_url}: {e}")
        return '', ''

# --- Main variant page scraper ---
def extract_specs_links(variant_url, variant_title):
    """Return one record per spec link listed on a variant page.

    NOTE: a function with the same name is defined again in a later cell and
    replaces this one once that cell has run.

    Args:
        variant_url: absolute URL of the variant page.
        variant_title: variant title such as ``"2025 Acura ADX"``; the year
            and the brand are derived from it.

    Returns:
        list[dict]: records with keys ``brand``, ``year``, ``spec_title``,
        ``spec_url``, ``fuel_type`` and ``transmission``. Returns ``[]``
        (after printing the error) if fetching or parsing fails.
    """
    try:
        time.sleep(1)

        # Timeout so a stalled connection cannot hang the crawl.
        response = requests.get(variant_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Year and brand depend only on the title, so compute them once
        # instead of once per spec row.
        year_match = re.search(r'\b(19|20)\d{2}\b', variant_title)
        year = year_match.group(0) if year_match else "Unknown"

        # Titles may start with the year ("2025 Acura ADX"). Taking the first
        # word unconditionally stored the year in the brand column.
        # NOTE(review): multi-word brands are still cut to their first word.
        title_tokens = variant_title.split()
        if title_tokens and re.fullmatch(r'(19|20)\d{2}', title_tokens[0]):
            title_tokens = title_tokens[1:]
        brand = title_tokens[0] if title_tokens else ''

        specs_links = []
        for th in soup.find_all('th', class_='i'):
            link = th.find('a', href=True)
            if not link:
                continue

            spec_url = 'https://www.auto-data.net' + link['href'] + '/tech'

            # One extra request per spec: fuel and gearbox come from /tech.
            fuel, trans = get_fuel_and_transmission(spec_url)

            specs_links.append({
                'brand': brand,
                'year': year,
                'spec_title': link.text.strip(),
                'spec_url': spec_url,
                'fuel_type': fuel,
                'transmission': trans
            })

        return specs_links

    except Exception as e:
        # Deliberate best-effort: one bad variant must not abort the crawl.
        print(f"Error processing {variant_url}: {str(e)}")
        return []

# --- Run it across a list of variants with brand limit ---
def process_variants_for_specs(variants_list, limit_per_brand=3):
    """Collect spec records for at most ``limit_per_brand`` variants per brand.

    Args:
        variants_list: iterable of variant dicts with ``'brand'``,
            ``'variant_url'`` and ``'variant_title'`` keys.
        limit_per_brand: number of variants processed per (lower-cased)
            brand; further variants of that brand are skipped.

    Returns:
        list: concatenated results of ``extract_specs_links`` for every
        processed variant, in input order.
    """
    processed_per_brand = {}
    all_specs = []

    for variant in variants_list:
        brand_key = variant['brand'].lower()
        already_done = processed_per_brand.get(brand_key, 0)
        if already_done >= limit_per_brand:
            continue

        print(f"Processing specs for {variant['variant_title']}...")
        specs = extract_specs_links(variant['variant_url'], variant['variant_title'])
        all_specs.extend(specs)
        processed_per_brand[brand_key] = already_done + 1
        print(f"Found {len(specs)} specs for {variant['variant_title']}")

    return all_specs

# --- Run ---
# Only the first 3 variants of each brand are processed (limit_per_brand=3).
spec_links = process_variants_for_specs(variant_links, limit_per_brand=3)

# The explicit column list fixes the column order and keeps the header even
# when spec_links is empty.
spec_df = pd.DataFrame(spec_links, columns=[
    'brand', 'year', 'spec_title', 'spec_url', 'fuel_type', 'transmission'
])
# NOTE(review): this file is named 'cars-data-...' while the other outputs of
# this notebook are named 'auto-data-...'; confirm the name is intended.
spec_df.to_csv('data/cars-data-spec-links-table.csv', index=False)


Processing specs for 2025 Acura ADX...
Found 1 specs for 2025 Acura ADX
Processing specs for 2001 Acura CL II...
Found 3 specs for 2001 Acura CL II
Processing specs for 1997 Acura CL...
Found 3 specs for 1997 Acura CL
Processing specs for 1999 Alfa Romeo 145 (930, facelift 1999)...
Found 5 specs for 1999 Alfa Romeo 145 (930, facelift 1999)
Processing specs for 1997 Alfa Romeo 145 (930, facelift 1997)...
Found 7 specs for 1997 Alfa Romeo 145 (930, facelift 1997)
Processing specs for 1994 Alfa Romeo 145 (930)...
Found 5 specs for 1994 Alfa Romeo 145 (930)
Processing specs for 1997 Alpina B10 Touring (E39)...
Found 4 specs for 1997 Alpina B10 Touring (E39)
Processing specs for 1997 Alpina B10 (E39)...
Found 5 specs for 1997 Alpina B10 (E39)
Processing specs for 1993 Alpina B10 Touring (E34)...
Found 3 specs for 1993 Alpina B10 Touring (E34)
Processing specs for 1972 Aston Martin AMV8 Volante...
Found 6 specs for 1972 Aston Martin AMV8 Volante
Processing specs for 1972 Aston Martin AMV8...

In [None]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Shared HTTP session with connection-level retries and a large connection
# pool. No timeout is configured here; get_cached_page() passes timeout=10 on
# each request.
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
    max_retries=3,
    pool_connections=100,
    pool_maxsize=100
)
# Use the same adapter for both plain and TLS URLs.
session.mount('http://', adapter)
session.mount('https://', adapter)
# Default headers sent with every request made through this session.
session.headers.update({
    'User-Agent': 'Mozilla/5.0',
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive'
})

# In-memory page cache: url -> {'content': parsed page, 'timestamp': time.time()}
PAGE_CACHE = {}
CACHE_EXPIRY = 3600  # 1 hour in seconds

def get_cached_page(url):
    """Return the parsed page for ``url``, reusing a cached copy when fresh.

    A cached entry is reused while it is younger than ``CACHE_EXPIRY``
    seconds. Otherwise the page is fetched through the shared ``session``,
    parsed and stored in ``PAGE_CACHE``.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` if fetching or
        parsing failed (the error is printed and nothing is cached).
    """
    now = time.time()

    cached = PAGE_CACHE.get(url)
    if cached is not None and now - cached['timestamp'] < CACHE_EXPIRY:
        return cached['content']

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

    # The timestamp is the time this call started, not the time it finished.
    PAGE_CACHE[url] = {'content': parsed, 'timestamp': now}
    return parsed

def extract_model_name_from_generation(soup):
    """Return the link text next to the 'Generation' header cell, or ``None``.

    Looks for a ``<th>`` whose string is exactly ``'Generation'``, takes its
    following ``<td>`` sibling and returns the stripped text of the first
    ``<a>`` inside it. ``None`` is returned if any of these is missing.
    """
    header_cell = soup.find('th', string='Generation')
    if not header_cell:
        return None

    value_cell = header_cell.find_next_sibling('td')
    if not value_cell:
        return None

    anchor = value_cell.find('a')
    if not anchor:
        return None

    return anchor.text.strip()

def extract_model_name_from_header(soup):
    """Return the second word of the ``<h2 class="car">`` heading.

    Returns:
        The second whitespace-separated word of the heading text,
        ``'Unknown'`` if the heading has fewer than two words, or ``None`` if
        there is no such heading.
    """
    heading = soup.find('h2', class_='car')
    if not heading:
        return None

    tokens = heading.text.strip().split()
    if len(tokens) > 1:
        return tokens[1]
    return 'Unknown'


def get_model_name(soup, brand=None):
    """Return the model name found on a page, or ``'Unknown'``.

    Tries the 'Generation' table row first and falls back to the page
    heading. ``brand`` is accepted but not used.
    """
    extractors = (extract_model_name_from_generation, extract_model_name_from_header)
    for extractor in extractors:
        candidate = extractor(soup)
        if candidate:
            return candidate
    return 'Unknown'


def parse_spec_details(soup):
    """Read the spec table of a parsed spec page into a flat dict.

    Returns:
        dict: ``'model'`` (from ``get_model_name``) followed by one string
        entry per known spec field. A field stays ``''`` when its row is not
        on the page; if several rows map to the same field, the last wins.
    """
    blank_fields = (
        'fuel_type', 'transmission', 'engine_modification',
        'start_production', 'body_type', 'seats', 'doors',
        'power', 'drive_wheel', 'engine_aspiration',
        'fuel_injection_system', 'number_of_cylinders',
        'engine_configuration', 'tires_size',
        'fuel_consumption_urban', 'fuel_consumption_extra_urban',
    )
    details = {'model': get_model_name(soup)}
    details.update(dict.fromkeys(blank_fields, ''))

    # Row label on the page (trailing ':' removed) -> key in ``details``.
    label_to_field = {
        'Fuel Type': 'fuel_type',
        'Start of production': 'start_production',
        'Drive wheel': 'drive_wheel',
        'Engine aspiration': 'engine_aspiration',
        'Fuel injection system': 'fuel_injection_system',
        'Number of cylinders': 'number_of_cylinders',
        'Engine configuration': 'engine_configuration',
        'Tires size': 'tires_size',
        'Power': 'power',
        'Modification (Engine)': 'engine_modification',
        'Number of gears and type of gearbox': 'transmission',
        'Number of seats': 'seats',
        'Seats': 'seats',
        'Number of doors': 'doors',
        'Doors': 'doors',
        'Fuel consumption (economy) - urban': 'fuel_consumption_urban',
        'Fuel consumption (economy) - extra urban': 'fuel_consumption_extra_urban',
        'Body type': 'body_type',
        'Car body': 'body_type'
    }

    for row in soup.find_all('tr'):
        header_cell = row.find('th')
        value_cell = row.find('td')
        if header_cell and value_cell:
            field = label_to_field.get(header_cell.text.strip().rstrip(':'))
            if field is not None:
                details[field] = value_cell.text.strip()

    return details

# Spec fields (besides 'model') that make up a details record; used to build
# the all-empty fallback record when a spec page cannot be read.
_SPEC_DETAIL_FIELDS = (
    'fuel_type', 'transmission', 'engine_modification',
    'start_production', 'body_type', 'seats', 'doors',
    'power', 'drive_wheel', 'engine_aspiration',
    'fuel_injection_system', 'number_of_cylinders',
    'engine_configuration', 'tires_size',
    'fuel_consumption_urban', 'fuel_consumption_extra_urban',
)


def _empty_spec_details():
    """Return a fresh details record with model 'Unknown' and blank fields."""
    return {'model': 'Unknown', **{field: '' for field in _SPEC_DETAIL_FIELDS}}


def get_spec_details(spec_url):
    """Fetch and parse the spec page at ``spec_url``.

    Returns:
        dict: the record produced by ``parse_spec_details``. If the page
        cannot be fetched or parsing raises, a record with ``'model'`` set to
        ``'Unknown'`` and every other field ``''`` is returned instead.
        This function never raises.
    """
    try:
        soup = get_cached_page(spec_url)
        if not soup:
            # The original fallback referenced an undefined name ``details``
            # and raised NameError, so the entry was lost.
            return _empty_spec_details()
        return parse_spec_details(soup)
    except Exception as e:
        # Deliberate best-effort: a broken page yields an empty record.
        print(f"Error in spec URL {spec_url}: {e}")
        return _empty_spec_details()

def extract_years_from_content(soup):
    """Return every 4-digit number in the ``<h3>`` of the ``#ulkey`` table.

    Returns:
        list: the numbers as ``int`` in order of appearance, or
        ``['Unknown']`` when the table, its ``<h3>`` or any 4-digit number is
        missing.
    """
    table = soup.find('table', id='ulkey')
    heading = table.find('h3') if table else None
    if heading:
        found = [int(token) for token in re.findall(r'\b\d{4}\b', heading.text)]
        if found:
            return found
    return ['Unknown']

def process_spec_link(th):
    """Turn a spec header cell into a ``{'spec_title', 'spec_url'}`` record.

    Returns:
        dict with the stripped link text and the absolute URL of the link
        target with ``/tech`` appended, or ``None`` if the cell contains no
        ``<a href=...>``.
    """
    anchor = th.find('a', href=True)
    if not anchor:
        return None

    target = anchor['href']
    label = anchor.text.strip()
    return {
        'spec_title': label,
        'spec_url': 'https://www.auto-data.net' + target + '/tech'
    }

def extract_specs_links(variant_url, variant_title):
    """Return one entry per spec link on a variant page, with its model years.

    NOTE: this redefines ``extract_specs_links`` from an earlier cell; once
    this cell has run, this version is the one in effect.

    Args:
        variant_url: absolute URL of the variant page. The brand is the first
            hyphen-separated token of the fifth ``/``-separated URL part,
            capitalised.
        variant_title: accepted for interface compatibility; not used.

    Returns:
        list[dict]: entries with keys ``brand``, ``model``, ``years``,
        ``spec_title`` and ``spec_url``. ``years`` is a list of ints, or
        ``['Unknown']`` when no year could be found. Returns ``[]`` if the
        page cannot be fetched or parsing fails (the error is printed).
    """
    try:
        soup = get_cached_page(variant_url)
        if not soup:
            return []

        # NOTE(review): multi-word brands are cut to their first token,
        # e.g. ".../en/alfa-romeo-..." yields "Alfa".
        brand = variant_url.split('/')[4].split('-')[0].capitalize()
        model = get_model_name(soup)
        years = extract_years_from_content(soup)

        spec_entries = []
        for th in soup.find_all('th', class_='i'):
            spec = process_spec_link(th)
            if spec is None:
                continue

            # Page-level years apply to every spec; only when the page has
            # none do we fall back to 4-digit numbers in the spec title.
            spec_years = years
            if years == ['Unknown']:
                title_years = [int(y) for y in re.findall(r'\b\d{4}\b', spec['spec_title'])]
                if title_years:
                    spec_years = title_years

            # A title containing " - " denotes a range: expand first..last.
            if len(spec_years) > 1 and ' - ' in spec['spec_title']:
                try:
                    spec_years = list(range(min(spec_years), max(spec_years) + 1))
                except (TypeError, ValueError):
                    # Non-numeric years: keep the list unexpanded. This was a
                    # bare ``except:``, which also swallowed KeyboardInterrupt.
                    pass

            spec_entries.append({
                'brand': brand,
                'model': model,
                'years': spec_years,
                'spec_title': spec['spec_title'],
                'spec_url': spec['spec_url']
            })

        return spec_entries
    except Exception as e:
        # Deliberate best-effort: one bad variant must not abort the crawl.
        print(f"Error processing {variant_url}: {e}")
        return []

def process_variant(variant):
    """Log the variant title and return the spec entries for that variant.

    Args:
        variant: mapping with ``'variant_url'`` and ``'variant_title'`` keys.
    """
    print(f"Extracting spec links: {variant['variant_title']}")
    url, title = variant['variant_url'], variant['variant_title']
    return extract_specs_links(url, title)

def process_spec_entry(entry, details):
    """Expand one spec entry into one output row per year.

    Args:
        entry: dict with ``'brand'``, ``'years'``, ``'spec_title'`` and
            ``'spec_url'`` keys.
        details: dict of spec fields; must contain ``'model'``. Its items are
            merged last, so they override same-named keys of the row.

    Returns:
        list[dict]: one row per element of ``entry['years']``.
    """
    return [
        {
            'brand': entry['brand'],
            'model': details['model'],
            'year': model_year,
            'spec_title': entry['spec_title'],
            'spec_url': entry['spec_url'],
            **details,
        }
        for model_year in entry['years']
    ]

def main():
    """Build the final spec table from the saved variant links.

    Steps:
      1. Read ``data/auto-data-variants-links-table.csv``.
      2. Collect the spec links of every variant (5 worker threads).
      3. Fetch and parse every spec page (10 worker threads).
      4. Write one row per spec and year to ``data/auto-data-table.csv``.

    Failures of individual variants or spec pages are printed and skipped.
    """
    variant_df = pd.read_csv('data/auto-data-variants-links-table.csv')

    # Every row of the CSV is processed; no limit is applied here.
    variant_links = variant_df.to_dict(orient='records')

    # Report the real workload; the message used to claim "5 cars".
    print(f"Extracting all spec links for {len(variant_links)} variants...")
    all_spec_entries = []

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_variant, variant) for variant in variant_links]
        for future in as_completed(futures):
            try:
                all_spec_entries.extend(future.result())
            except Exception as e:
                print(f"Error processing variant: {e}")

    print(f"Fetching {len(all_spec_entries)} spec details in parallel...")
    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its entry so details can be joined to it.
        future_to_entry = {
            executor.submit(get_spec_details, entry['spec_url']): entry
            for entry in all_spec_entries
        }

        for future in as_completed(future_to_entry):
            entry = future_to_entry[future]
            try:
                details = future.result()
                results.extend(process_spec_entry(entry, details))
            except Exception as e:
                print(f"Error retrieving details: {e}")

    spec_df = pd.DataFrame(results)
    spec_df.to_csv('data/auto-data-table.csv', index=False)
    print(f"Saved: data/auto-data-table.csv with {len(spec_df)} records")

# Run the full pipeline and report wall-clock time. The guard is true when
# the cell runs in a notebook kernel, so this executes on every cell run.
if __name__ == '__main__':
    start_time = time.time()
    main()
    print(f"Total execution time: {time.time() - start_time:.2f} seconds")


Extracting all spec links for 5 cars...
Extracting spec links: 2025 Acura ADX
Extracting spec links: 2001 Acura CL II
Extracting spec links: 1997 Acura CL
Extracting spec links: 2010 Acura CSX (facelift, 2009)
Extracting spec links: 2006 Acura CSX
Extracting spec links: 1997 Acura ELExtracting spec links: 2019 Acura ILX (facelift 2019)

Extracting spec links: 2016 Acura ILX (facelift 2016)
Extracting spec links: 2013 Acura ILX
Extracting spec links: 2023 Acura Integra V
Extracting spec links: 1994 Acura Integra III Sedan
Extracting spec links: 1994 Acura Integra III Coupe
Extracting spec links: 1990 Acura Integra II Sedan
Extracting spec links: 1990 Acura Integra II HatchbackExtracting spec links: 1986 Acura Integra I

Extracting spec links: 2025 Acura MDX IV (facelift 2024)
Extracting spec links: 2022 Acura MDX IV
Extracting spec links: 2017 Acura MDX III (facelift 2017)Extracting spec links: 2014 Acura MDX III
Extracting spec links: 2007 Acura MDX II

Extracting spec links: 2001 Acur