In [1]:
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin, urlparse
import re
from bs4 import BeautifulSoup
import json

class ArticleLinkParser:
    """Selenium-driven helper that collects item links and parses item pages.

    A Chrome instance is started on construction. Listing pages are scanned
    for anchors whose URL contains ``/item/`` and ends with ``#content``;
    those are rewritten to ``.../details`` URLs. Detail pages are fetched
    through the same browser and handed to BeautifulSoup for parsing.

    The instance can be used as a context manager so that the browser is
    always closed::

        with ArticleLinkParser(headless=True) as parser:
            links = parser.scrape_all_links(base_url, 3)
    """

    # Markers used to recognise item links and turn them into detail URLs.
    _ITEM_MARKER = "/item/"
    _CONTENT_FRAGMENT = "#content"
    _DETAILS_SUFFIX = "/details"

    # Overrides for the navigator properties that reveal browser automation.
    _STEALTH_SCRIPT = """
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
            window.chrome = {runtime: {}};
        """

    def __init__(self, headless=False, delay_range=(1, 3)):
        """
        Initialize the parser with browser settings

        Args:
            headless (bool): Whether to run browser in headless mode
            delay_range (tuple): Min and max delay in seconds between actions
        """
        self.delay_range = delay_range
        self.driver = None
        self.setup_driver(headless)

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser when the ``with`` block ends; never suppress errors."""
        self.close()
        return False

    def setup_driver(self, headless=False):
        """
        Create the Chrome driver and store it on ``self.driver``

        Args:
            headless (bool): Whether to run browser in headless mode
        """
        chrome_options = Options()

        if headless:
            chrome_options.add_argument("--headless")

        # Enhanced anti-detection measures
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-plugins")
        chrome_options.add_argument("--disable-images")  # Faster loading
        # NOTE(review): the scripts run below and the scraped pages may need
        # JavaScript; confirm this switch is really wanted.
        chrome_options.add_argument("--disable-javascript")  # Remove if JS is needed
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-web-security")
        chrome_options.add_argument("--allow-running-insecure-content")
        chrome_options.add_argument("--disable-features=VizDisplayCompositor")

        # Remove automation indicators
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # More realistic user agent (latest Chrome)
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

        # Additional prefs to avoid detection
        prefs = {
            "profile.default_content_setting_values": {
                "notifications": 2,
                "media_stream": 2,
                "geolocation": 2
            },
            "profile.managed_default_content_settings": {
                "images": 2  # Block images for faster loading
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)

        self.driver = webdriver.Chrome(options=chrome_options)

        # execute_script() only patches the document that is loaded right now
        # and the patch is lost on the next navigation. Registering the script
        # through CDP makes Chrome run it in every new document before the
        # page's own scripts.
        self.driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": self._STEALTH_SCRIPT},
        )
        # Keep patching the initial blank page as before.
        self.driver.execute_script(self._STEALTH_SCRIPT)

        # Set window size to common resolution
        self.driver.set_window_size(1366, 768)

    def human_delay(self, min_delay=None, max_delay=None):
        """
        Sleep for a random duration to mimic human behavior

        Args:
            min_delay (float): Lower bound in seconds; defaults to delay_range[0]
            max_delay (float): Upper bound in seconds; defaults to delay_range[1]
        """
        if min_delay is None:
            min_delay = self.delay_range[0]
        if max_delay is None:
            max_delay = self.delay_range[1]

        time.sleep(random.uniform(min_delay, max_delay))

    def scroll_page(self, scrolls=3):
        """
        Scroll the current page down in random steps, then back to the top

        Args:
            scrolls (int): Number of downward scroll steps
        """
        for _ in range(scrolls):
            # Scroll down by a random amount of pixels
            scroll_height = random.randint(300, 800)
            self.driver.execute_script(f"window.scrollBy(0, {scroll_height});")
            self.human_delay(0.5, 1.5)

        # Scroll back to top
        self.driver.execute_script("window.scrollTo(0, 0);")
        self.human_delay(1, 2)

    def get_all_links(self, url):
        """
        Navigate to URL and get all links on the page

        Args:
            url (str): The URL to parse

        Returns:
            list: One dict per anchor that has an href, with the keys
                'url' (absolute URL), 'text' (stripped link text) and
                'element' (the Selenium element)
        """
        print(f"Navigating to: {url}")
        self.driver.get(url)

        # Wait for page to load
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Human-like behavior: scroll through page
        self.scroll_page()

        links = []
        for element in self.driver.find_elements(By.TAG_NAME, "a"):
            href = element.get_attribute("href")
            text = element.text.strip()

            if href:
                links.append({
                    # Convert relative URLs to absolute
                    'url': urljoin(url, href),
                    'text': text,
                    'element': element
                })

        print(f"Found {len(links)} total links")
        return links

    def filter_item_links(self, links):
        """
        Filter links to get only item links with #content and convert to /details

        Only the 'url' key of each link dict is read.

        Args:
            links (list): List of link dicts as returned by get_all_links

        Returns:
            list: Unique detail URLs in first-seen order, with the trailing
                #content replaced by /details
        """
        item_links = []
        seen_urls = set()  # To avoid duplicates

        for link in links:
            url = link['url']

            if self._ITEM_MARKER not in url or not url.endswith(self._CONTENT_FRAGMENT):
                continue

            # Swap only the trailing fragment; an earlier occurrence of the
            # same text inside the URL must stay untouched.
            details_url = url[:-len(self._CONTENT_FRAGMENT)] + self._DETAILS_SUFFIX

            # Avoid duplicates (same item might have multiple links)
            if details_url not in seen_urls:
                seen_urls.add(details_url)
                item_links.append(details_url)

        return item_links

    def get_item_links(self, url):
        """
        Get all item links from the page

        Any error while loading or reading the page is printed and an
        empty list is returned instead.

        Args:
            url (str): The URL to parse

        Returns:
            list: Filtered item links
        """
        try:
            return self.filter_item_links(self.get_all_links(url))
        except Exception as e:
            print(f"Error getting item links: {str(e)}")
            return []

    def get_vehicle_soup(self, vehicle_url):
        """
        Load a detail page and return its HTML parsed with BeautifulSoup

        Args:
            vehicle_url (str): URL of the detail page

        Returns:
            BeautifulSoup: Parsed page source
        """
        self.human_delay()
        self.driver.get(vehicle_url)

        # Navigate first and wait for the body, so that the scrolling and
        # the captured source belong to the requested page rather than to
        # whatever page was loaded before.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        self.scroll_page()

        return BeautifulSoup(self.driver.page_source, 'html.parser')

    def parse_vehicle_details(self, soup):
        """
        Extract the extras list and the following text block from a page

        Looks for a <header> containing "Vehicle extras, add-ons and
        accessories", takes the next <ul> and then the next <div> after
        that list.

        Args:
            soup (BeautifulSoup): Parsed detail page

        Returns:
            tuple: (list of <li> texts, text of the <div>); missing parts
                are returned as an empty list / empty string
        """
        header = soup.find(lambda tag: tag.name == "header" and "Vehicle extras, add-ons and accessories" in tag.get_text())
        if header is None:
            print("Header not found.")
            return [], ""

        extras_list = header.find_next("ul")
        if extras_list is None:
            # Without the list there is no anchor for the free-text lookup.
            print("List not found after header.")
            print("Freetext div not found.")
            return [], ""

        items = [li.get_text(strip=True) for li in extras_list.find_all("li")]

        # The next <div> after the list might be the free text
        freetext_div = extras_list.find_next("div")
        if freetext_div is None:
            print("Freetext div not found.")
            return items, ""

        return items, freetext_div.get_text(strip=True)

    def parse_table_after_header(self, soup, header_text):
        """
        Read the first table that follows a <header> containing header_text

        Args:
            soup (BeautifulSoup): Parsed page
            header_text (str): Text the <header> element must contain

        Returns:
            list: One list of cell texts per <tr>; rows may have any number
                of cells. Empty if the header or the table is missing.
        """
        header = soup.find(lambda tag: tag.name == "header" and header_text in tag.get_text())
        if header is None:
            print(f"Header '{header_text}' not found.")
            return []

        table = header.find_next("table")
        if table is None:
            print(f"Table not found after header '{header_text}'.")
            return []

        # Both <td> and <th> are collected in case the table has headers
        return [
            [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            for row in table.find_all("tr")
        ]

    def scrape_all_links(self, base_url, max_pages):
        """
        Collect item links from pages 1..max_pages of a listing

        Args:
            base_url (str): Listing URL; paging parameters are appended
            max_pages (int): Number of pages to visit

        Returns:
            list: Unique detail URLs in the order they were first seen
        """
        # Start the query string when base_url does not have one yet.
        separator = "&" if "?" in base_url else "?"

        # A dict keeps insertion order, so the result is deterministic
        # while duplicates across pages are still dropped.
        collected = {}

        for page in range(1, max_pages + 1):
            page_url = f"{base_url}{separator}currentPage={page}&pageType=next"
            print(f"Scraping page {page} -> {page_url}")
            collected.update(dict.fromkeys(self.get_item_links(page_url)))

        return list(collected)

    def close(self):
        """Close the browser; calling it again afterwards does nothing"""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference so a second close() is a no-op.
                self.driver = None


# Driver script: collect item links from the listing pages, parse every
# item page and dump the results to vehicles_data.json.

# Initialize parser
parser = ArticleLinkParser(headless=False, delay_range=(1, 3))

website_url = "https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING"

all_data = {}

try:
    # Get item links
    item_links = parser.scrape_all_links(website_url, 130)

    for item_url in item_links:
        print(f"\n--- Extracting content from: {item_url} ---")

        try:
            vehicle_soup = parser.get_vehicle_soup(item_url)

            # Parse sections
            information_list = parser.parse_table_after_header(vehicle_soup, "Information")
            details_list, details_text = parser.parse_vehicle_details(vehicle_soup)
        except Exception as e:
            # One broken page must not discard everything scraped so far.
            print(f"Error extracting content from {item_url}: {str(e)}")
            continue

        # Convert to dict; rows with fewer than two cells carry no
        # key/value pair and are skipped.
        information_dict = {
            row[0].rstrip(':'): row[1]
            for row in information_list
            if len(row) >= 2
        }

        # Store in the result dict
        all_data[item_url] = {
            "information_dict": information_dict,
            "details_list": details_list,
            "details_text": details_text
        }
finally:
    # Always release the browser, even when scraping fails midway.
    parser.close()

# Save to JSON file
with open("vehicles_data.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=4)

print("\n✅ Data saved to vehicles_data.json")


Scraping page 1 -> https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=1&pageType=next
Navigating to: https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=1&pageType=next
Found 173 total links
Scraping page 2 -> https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=2&pageType=next
Navigating to: https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=2&pageType=next
Found 173 total links
Scraping page 3 -> https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=3&pageType=next
Navigating to: https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=3&pageType=next
Found 173 total links
Scraping page 4 -> https://autobid.de/en/search-results?e367=1&sortingType=auctionStartDate-ASCENDING&currentPage=4&pageType=next
Navigating to: https://autobid.de/en