In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Custom headers with a User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

main_url = 'https://katalogus.hasznaltauto.hu/'

# Implement a simple exponential backoff strategy for the main URL request
for i in range(5):
    try:
        response = requests.get(main_url, headers=headers)
        response.raise_for_status()
        break  # Success! Exit the retry loop.
    except requests.exceptions.HTTPError as e:
        if response.status_code == 429:
            wait = 2 ** i  # Exponential backoff
            print(f"Rate limited. Waiting {wait} seconds to retry...")
            time.sleep(wait)
        else:
            raise  # Re-raise the exception for other non-429 errors.

soup = BeautifulSoup(response.content, 'html.parser')

# Get brand URLs
brand_list = soup.find('ul', class_='cimkefelho')
brand_links = brand_list.find_all('a', href=True)
brands = {brand.get_text(strip=True): brand['href'] for brand in brand_links}

# Function to scrape brand pages for car links
def scrape_brand_page(brand_url, max_links_per_brand):
    car_links = []
    response = requests.get(brand_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    listings = soup.find_all('tr', class_='paros')
    for listing in listings:
        if len(car_links) < max_links_per_brand:
            link_tag = listing.find('a', href=True)
            if link_tag:
                car_links.append(link_tag['href'])
        else:
            break
    return car_links

# Function to scrape car details
def scrape_car_details(car_detail_url):
    response = requests.get(car_detail_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    car_details = {}

    # Extracting 'General Information'
    try:
        general_info_table = soup.find('h3', text='Általános információk').find_next('table')
        for row in general_info_table.find_all('tr'):
            cells = row.find_all('td')
            car_details[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    except AttributeError:
        car_details['General Information'] = 'Not available'

    # Extracting 'Equipment'
    try:
        equipment_info = soup.find('h3', text='Felszereltség').find_next('td')
        car_details['Equipment'] = equipment_info.get_text(strip=True, separator=', ')
    except AttributeError:
        car_details['Equipment'] = 'Not available'

    # Extracting 'Body Type'
    try:
        body_type_info = soup.find('h3', text='Karosszéria').find_next('table')
        for row in body_type_info.find_all('tr'):
            cells = row.find_all('td')
            car_details[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    except AttributeError:
        car_details['Body Type'] = 'Not available'

    # Extracting 'Dimensions and Weight'
    try:
        dimensions_table = soup.find('h3', text='Méretek és tömeg').find_next('table')
        for row in dimensions_table.find_all('tr'):
            cells = row.find_all('td')
            car_details[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    except AttributeError:
        car_details['Dimensions and Weight'] = 'Not available'

    # Extracting 'Engine and Transmission'
    try:
        engine_table = soup.find('h3', text='Motor és sebességváltó').find_next('table')
        for row in engine_table.find_all('tr'):
            cells = row.find_all('td')
            car_details[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    except AttributeError:
        car_details['Engine and Transmission'] = 'Not available'

    # Extracting 'Consumption, Acceleration, and Speed Data'
    try:
        performance_table = soup.find('h3', text='Fogyasztás, gyorsulás- és sebességadatok').find_next('table')
        for row in performance_table.find_all('tr'):
            cells = row.find_all('td')
            car_details[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    except AttributeError:
        car_details['Performance'] = 'Not available'

    return car_details


# Scrape links for each brand
all_car_links = []
links_per_brand = 4  # Adjust this to ensure 200 listings

for brand_name, brand_url in brands.items():
    print(f"Scraping car links from {brand_name}...")
    car_links = scrape_brand_page(brand_url, links_per_brand)
    all_car_links.extend(car_links)
    
# Place to store all car details and headers
car_details_list = []
all_headers = set()

# Scrape details for each car link (limit the number of links as needed)
for link in all_car_links[:200]:  # Adjust the slice as per your requirement
    print(f"Scraping details from {link}")
    details = scrape_car_details(link)
    car_details_list.append(details)
    all_headers.update(details.keys())

# Convert the set of headers to a list and sort it
headers = list(all_headers)
headers.sort()

# Writing to CSV
with open('car_details.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()
    for car in car_details_list:
        # Fill missing fields with a placeholder
        row = {header: car.get(header, 'Not available') for header in headers}
        writer.writerow(row)

print("Scraping completed. Car details saved to car_details.csv.")

Scraping car links from ALFA ROMEO...
Scraping car links from AUDI...
Scraping car links from BENTLEY...
Scraping car links from BMW...
Scraping car links from CADILLAC...
Scraping car links from CHEVROLET...
Scraping car links from CHRYSLER...
Scraping car links from CITROEN...
Scraping car links from DACIA...
Scraping car links from DAEWOO...
Scraping car links from DAIHATSU...
Scraping car links from DODGE...
Scraping car links from FERRARI...
Scraping car links from FIAT...
Scraping car links from FORD...
Scraping car links from HONDA...
Scraping car links from HYUNDAI...
Scraping car links from INFINITI...
Scraping car links from JAGUAR...
Scraping car links from JEEP...
Scraping car links from KIA...
Scraping car links from LADA...
Scraping car links from LANCIA...
Scraping car links from LAND ROVER...
Scraping car links from LEXUS...
Scraping car links from MASERATI...
Scraping car links from MAZDA...
Scraping car links from MERCEDES-AMG...
Scraping car links from MERCEDES-BENZ.

  general_info_table = soup.find('h3', text='Általános információk').find_next('table')
  equipment_info = soup.find('h3', text='Felszereltség').find_next('td')
  body_type_info = soup.find('h3', text='Karosszéria').find_next('table')
  dimensions_table = soup.find('h3', text='Méretek és tömeg').find_next('table')
  engine_table = soup.find('h3', text='Motor és sebességváltó').find_next('table')
  performance_table = soup.find('h3', text='Fogyasztás, gyorsulás- és sebességadatok').find_next('table')


Scraping details from https://katalogus.hasznaltauto.hu/alfa_romeo/alfa_145_1.4_ts_l-76
Scraping details from https://katalogus.hasznaltauto.hu/alfa_romeo/alfa_145_1.6_l-78
Scraping details from https://katalogus.hasznaltauto.hu/alfa_romeo/alfa_145_1.6_ts_l-80
Scraping details from https://katalogus.hasznaltauto.hu/audi/100_2.2_l-2603
Scraping details from https://katalogus.hasznaltauto.hu/audi/100_1.9_c-2605
Scraping details from https://katalogus.hasznaltauto.hu/audi/100_2.2_gl_5_e-2607
Scraping details from https://katalogus.hasznaltauto.hu/audi/100_2.0_d_cs_5_d-2609
Scraping details from https://katalogus.hasznaltauto.hu/bentley/6.8_v8-67489
Scraping details from https://katalogus.hasznaltauto.hu/bentley/6.7-67491
Scraping details from https://katalogus.hasznaltauto.hu/bentley/6.8-67493
Scraping details from https://katalogus.hasznaltauto.hu/bentley/4.4_32v-67495
Scraping details from https://katalogus.hasznaltauto.hu/bmw/116i_advantage-3395
Scraping details from https://katalogus.

Scraping details from https://katalogus.hasznaltauto.hu/maserati/2.8_bi_turbo-76978
Scraping details from https://katalogus.hasznaltauto.hu/maserati/2.0-76980
Scraping details from https://katalogus.hasznaltauto.hu/maserati/a6gcm-71881
Scraping details from https://katalogus.hasznaltauto.hu/mazda/121_1.3_lx-15540
Scraping details from https://katalogus.hasznaltauto.hu/mazda/121_1.3_baby_s-15542
Scraping details from https://katalogus.hasznaltauto.hu/mazda/121_1.3_lx_canvas_top-15544
Scraping details from https://katalogus.hasznaltauto.hu/mazda/121_1.3_lx-15546
Scraping details from https://katalogus.hasznaltauto.hu/mercedes-amg/amg_gt_roadster_4.0_automata-103880
Scraping details from https://katalogus.hasznaltauto.hu/mercedes-amg/amg_gt_coupe_4.0_c_automata-112475
Scraping details from https://katalogus.hasznaltauto.hu/mercedes-amg/amg_gt_coupe_63_4matic_plusz_9g-tronic-115694
Scraping details from https://katalogus.hasznaltauto.hu/mercedes-amg/amg_gt_coupe_43_9g-t_eq_boost-117578
Scr