# Get all the URLs for each car dealer in Berlin

In [4]:
import asyncio
from playwright.async_api import async_playwright

async def collect_page_urls(page, max_retries=3):
    """Collect the dealer links shown on the currently loaded results page.

    Args:
        page: Playwright page that already displays a dealer result list.
        max_retries: Maximum number of collection attempts.

    Returns:
        The list of ``href`` values found, or ``[]`` when every attempt
        completed without error but found no links.

    Raises:
        Exception: Whatever the final attempt raised (for example a selector
            timeout), re-raised unchanged with its original traceback.
    """
    item_selector = 'div[data-react-component="DealerListItem"]'

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            await page.wait_for_timeout(2000)  # Wait for page to stabilize

            # Wait for dealer items and collect URLs
            await page.wait_for_selector(item_selector)
            dealer_urls = await page.evaluate('''
                () => {
                    const dealers = document.querySelectorAll('div[data-react-component="DealerListItem"] .name a');
                    return Array.from(dealers).map(a => a.href);
                }
            ''')

            if dealer_urls:
                return dealer_urls

            print(f"Attempt {attempt + 1}: Found 0 URLs, retrying...")
            # Only pause when another attempt follows; sleeping after the
            # last one just delays returning the empty result.
            if not is_last_attempt:
                await page.wait_for_timeout(2000)

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if is_last_attempt:
                # Bare raise keeps the original traceback intact.
                raise
            await page.wait_for_timeout(2000)  # Wait before retry

    return []

async def visit_autoscout24_dealers(headless=False, max_stale_retries=5):
    """Page through the AutoScout24 dealer search for Berlin and gather dealer links.

    Args:
        headless: Passed to ``chromium.launch``; defaults to a visible
            browser window.
        max_stale_retries: How many times in a row a page may yield no new
            unique links before the crawl stops. Without this bound, a
            result list that never changes would be re-collected forever.

    Returns:
        A sorted list of the unique dealer URLs collected. If collection or
        navigation fails part-way, the URLs gathered up to that point are
        still returned.
    """
    start_url = "https://www.autoscout24.de/haendler/#?city=Berlin&pageIndex=1&sortBy=distanceAscending"
    all_dealer_urls = set()
    page_number = 1
    stale_retries = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()

            # Initial page load
            await page.goto(start_url)
            await page.wait_for_selector("h1", timeout=10000)

            # Accept cookies (best effort: a missing banner must not abort the crawl)
            try:
                accept_button = await page.wait_for_selector('button[data-testid="as24-cmp-accept-all-button"]')
                await accept_button.click()
            except Exception as e:
                print(f"Cookie banner not dismissed: {str(e)}")

            while True:
                # Collect URLs with retry mechanism; keep what we already
                # have if this page cannot be read at all.
                try:
                    dealer_urls = await collect_page_urls(page)
                except Exception as e:
                    print(f"Failed to collect dealers on page {page_number}: {str(e)}")
                    break

                if not dealer_urls:
                    print(f"No dealers found on page {page_number} after all retries")
                    break

                # Add new URLs to set
                previous_count = len(all_dealer_urls)
                all_dealer_urls.update(dealer_urls)
                new_urls = len(all_dealer_urls) - previous_count

                print(f"Page {page_number}: Found {len(dealer_urls)} dealers, {new_urls} new unique dealers")
                print(f"Total unique dealers so far: {len(all_dealer_urls)}")

                # If we didn't get any new unique URLs, the list may not have
                # refreshed yet: collect again, but only a bounded number of times.
                if new_urls == 0:
                    stale_retries += 1
                    if stale_retries > max_stale_retries:
                        print(f"No new unique URLs after {max_stale_retries} retries, stopping")
                        break
                    print("No new unique URLs found, retrying page...")
                    continue
                stale_retries = 0

                # Try to find and click the "Weiter" button
                try:
                    next_button = await page.wait_for_selector('button[aria-label="Go to next page"]', timeout=5000)
                    if not await next_button.is_visible():
                        print("Next button not visible")
                        break

                    await page.wait_for_timeout(1000)
                    await next_button.click()

                    # Wait for navigation
                    await page.wait_for_load_state('networkidle')
                    page_number += 1

                except Exception as e:
                    print(f"Navigation error: {str(e)}")
                    break
        finally:
            # Always release the browser, including on unexpected errors.
            await browser.close()

    print(f"Total unique dealers found: {len(all_dealer_urls)}")
    # Sorted so repeated runs produce the same ordering.
    return sorted(all_dealer_urls)

# For Jupyter notebook: top-level `await` is used here instead of asyncio.run().
# Runs the crawl and keeps the collected dealer URLs in `dealer_urls`.
dealer_urls = await visit_autoscout24_dealers()

Page 1: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 10
Page 2: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 20
Page 3: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 30
Page 4: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 40
Page 5: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 50
Page 6: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 60
Page 7: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 70
Page 8: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 80
Page 9: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 90
Page 10: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 100
Page 11: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 110
Page 12: Found 10 dealers, 10 new unique dealers
Total unique dealers so far: 120
Page 13: Found 10 dealers, 10 new 

In [5]:
# Show how many dealer URLs are currently held in `dealer_urls`
print(len(dealer_urls))

814


In [6]:
# Write URLs to a file, one URL per line, adding "/ueber-uns" to each URL.
# `dealer_urls` is later rebound to the already-suffixed URLs loaded from this
# file, so the suffix is only appended when it is missing; re-running this
# cell therefore never produces ".../ueber-uns/ueber-uns".
about_suffix = '/ueber-uns'
about_urls = set()
for url in dealer_urls:
    # Remove surrounding whitespace and any trailing slash before suffixing
    base_url = url.strip().rstrip('/')
    if not base_url:
        continue  # skip blank entries
    if not base_url.endswith(about_suffix):
        base_url += about_suffix
    about_urls.add(base_url)

# Explicit encoding and sorted order make the file identical across runs and platforms
with open('dealer_urls.txt', 'w', encoding='utf-8') as f:
    for modified_url in sorted(about_urls):
        f.write(f"{modified_url}\n")

print(f"Saved {len(about_urls)} URLs to dealer_urls.txt")

Saved 814 URLs to dealer_urls.txt


In [2]:
# Read the saved dealer URLs back in, stripping whitespace from every line
with open('dealer_urls.txt', 'r') as url_file:
    dealer_urls = [raw_line.strip() for raw_line in url_file]

print(f"Loaded {len(dealer_urls)} URLs from dealer_urls.txt")



Loaded 814 URLs from dealer_urls.txt


In [1]:
# Next: collect each dealer's name, phone number, contact person, email, opening hours, and page URL
import asyncio
import json
from playwright.async_api import async_playwright
from typing import List

async def process_single_url(page, url: str, retry_count=3) -> dict:
    """Open one dealer page and extract its contact details.

    Loading the page is retried up to ``retry_count`` times. Once the page
    has loaded, each field is extracted independently; a field that cannot
    be read keeps its default instead of failing the whole page.

    Args:
        page: Playwright page used to load ``url``.
        url: Address of the dealer page to scrape.
        retry_count: Maximum number of page-load attempts.

    Returns:
        A dict with the keys ``scrape_url``, ``actual_url``, ``name``,
        ``phone``, ``contact_person``, ``email``, ``opening_hours`` and
        ``error``. ``error`` is ``None`` when the page loaded, otherwise it
        holds the message of the last failed attempt. A dict is always
        returned, even when ``retry_count`` is zero or negative.
    """
    base_result = {
        'scrape_url': url,
        'actual_url': None,
        'name': "N/A",
        'phone': "N/A",
        'contact_person': "N/A",
        'email': "N/A",
        'opening_hours': {},
        'error': None
    }

    for attempt in range(retry_count):
        try:
            # Longer timeout for initial page load
            await page.goto(url, timeout=60000)  # 60 seconds timeout
            await page.wait_for_load_state('networkidle', timeout=30000)
            await page.wait_for_timeout(5000)  # Wait additional 5 seconds for dynamic content

            # Get the current URL (in case of redirects)
            current_url = page.url
            base_result['actual_url'] = current_url

            try:
                # Get the dealer name
                name_element = await page.wait_for_selector('h1.sc-font-xl', timeout=20000)
                base_result['name'] = await name_element.inner_text() if name_element else "N/A"
            except Exception as e:
                print(f"Error getting dealer name: {str(e)}")

            try:
                # Get the phone number
                phone_element = await page.wait_for_selector('div.dp-top__phone a[data-testid="contact-person-link"]', timeout=20000)
                base_result['phone'] = await phone_element.inner_text() if phone_element else "N/A"
            except Exception as e:
                print(f"Error getting phone number: {str(e)}")

            try:
                # Get the contact person name and email
                contact_info = await page.evaluate('''
                    () => {
                        const contactWrapper = document.querySelector('div[data-testid="contact-person-wrapper"]');
                        if (contactWrapper) {
                            const paragraphs = contactWrapper.getElementsByTagName('p');
                            const emailLink = contactWrapper.querySelector('a[data-testid="email-link"]');
                            return {
                                name: paragraphs[1]?.textContent || "N/A",
                                email: emailLink ? emailLink.getAttribute('href').replace('mailto:', '') : "N/A"
                            };
                        }
                        return { name: "N/A", email: "N/A" };
                    }
                ''')
                base_result['contact_person'] = contact_info['name']
                base_result['email'] = contact_info['email']
            except Exception as e:
                print(f"Error getting contact info: {str(e)}")

            try:
                # Get opening hours
                # NOTE(review): the JS `replace('Uhr', '')` below removes only the
                # first occurrence of "Uhr" in the joined string — confirm intended.
                base_result['opening_hours'] = await page.evaluate('''
                    () => {
                        const hours = {};
                        const container = document.querySelector('.dp-opening-hours-block__container');
                        if (container) {
                            const dts = container.querySelectorAll('dt.dp-opening-hours__label');
                            const dds = container.querySelectorAll('dd.dp-opening-hours__list');
                            
                            for (let i = 0; i < dts.length; i++) {
                                const day = dts[i].textContent.trim();
                                const timeElement = dds[i];
                                
                                if (timeElement.querySelector('.sc-font-silent')) {
                                    hours[day] = "Geschlossen";
                                } else {
                                    const times = Array.from(timeElement.querySelectorAll('span[data-testid="dp-opening-hours-opened-day"]'))
                                        .map(span => span.textContent.trim());
                                    hours[day] = times.join('–').replace('Uhr', '').trim();
                                }
                            }
                        }
                        return hours;
                    }
                ''')
            except Exception as e:
                print(f"Error getting opening hours: {str(e)}")

            # This attempt loaded the page, so drop any error left over from an
            # earlier failed attempt; otherwise the result is counted as failed.
            base_result['error'] = None

            print(f"Successfully collected from {current_url}: {base_result['name']}")
            return base_result

        except Exception as e:
            error_msg = f"Attempt {attempt + 1} failed: {str(e)}"
            print(error_msg)
            base_result['error'] = error_msg

            if attempt < retry_count - 1:
                print(f"Retrying... ({attempt + 2}/{retry_count})")
                await page.wait_for_timeout(5000)  # Wait 5 seconds before retry
            else:
                print(f"All {retry_count} attempts failed for {url}")
                return base_result  # Return partial data with error message

    # Only reached when the loop never ran (retry_count <= 0): still return a
    # dict so callers can rely on `.get('error')`.
    base_result['error'] = f"No attempts made (retry_count={retry_count})"
    return base_result

async def process_url_chunk(urls: List[str], browser):
    """Scrape a batch of dealer URLs concurrently inside one browser context.

    Args:
        urls: Dealer page URLs to process together.
        browser: Playwright browser used to create the context.

    Returns:
        One result dict per URL, in the same order as ``urls``, including
        the results of failed pages.
    """
    context = await browser.new_context()
    try:
        # One page per URL so every URL in the batch loads in parallel.
        pages = await asyncio.gather(*(context.new_page() for _ in urls))
        return await asyncio.gather(
            *(process_single_url(page, url) for page, url in zip(pages, urls))
        )
    finally:
        # Close the context (and its pages) even if the batch raised.
        await context.close()

async def collect_dealer_info(urls_path='dealer_urls.txt', chunk_size=5, headless=False):
    """Scrape contact details for every dealer URL listed in a text file.

    Args:
        urls_path: Text file with one dealer URL per line; blank lines are
            ignored.
        chunk_size: Number of URLs processed concurrently per batch.
        headless: Passed to ``chromium.launch``; defaults to a visible
            browser window.

    Returns:
        A list with one result dict per URL, including failed ones.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Read URLs before launching the browser so a missing file fails fast.
    # Blank lines are dropped: an empty URL would only burn every retry.
    with open(urls_path, 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    total_urls = len(urls)
    total_chunks = (total_urls + chunk_size - 1) // chunk_size
    print(f"Starting to process {total_urls} URLs")

    dealer_info = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            for i in range(0, total_urls, chunk_size):
                url_chunk = urls[i:i + chunk_size]
                chunk_results = await process_url_chunk(url_chunk, browser)
                dealer_info.extend(chunk_results)

                # Count successful and failed requests
                failed = sum(1 for r in chunk_results if r.get('error'))
                successful = len(chunk_results) - failed

                print(f"\nProcessed chunk {i//chunk_size + 1} of {total_chunks}")
                print(f"Progress: {len(dealer_info)}/{total_urls} processed ({successful} successful, {failed} failed in this batch)")

                # Print batch results
                print("\nData collected in this batch:")
                for dealer in chunk_results:
                    print(f"\nDealer: {dealer['name']}")
                    print(f"Phone: {dealer['phone']}")
                    print(f"Contact: {dealer['contact_person']}")
                    print(f"Email: {dealer['email']}")
                    if dealer.get('error'):
                        print(f"Error: {dealer['error']}")
                    print("Opening Hours:")
                    for day, hours in dealer['opening_hours'].items():
                        print(f"  {day}: {hours}")
                    print("-" * 50)
        finally:
            # Always release the browser, including when a batch raises.
            await browser.close()

    return dealer_info

# For Jupyter notebook
dealer_info = await collect_dealer_info()

# Save the results
with open('dealer_info.json', 'w', encoding='utf-8') as f:
    json.dump(dealer_info, f, ensure_ascii=False, indent=2)

print(f"Saved information for {len(dealer_info)} dealers")

Collected from https://www.autoscout24.de/haendler/automobile-zossen/ueber-uns: Automobile Zossen - Marcel Feuser - 033708-527140 - marcel.feuser@automobile-zossen.de
Collected from https://www.autoscout24.de/haendler/j-h-auto-h-aus-europa-bernau-gmbh-bernau-bei-berlin/ueber-uns: J.H.Auto(h)aus Europa Bernau GmbH - Martin Merkwirth - 03338-6016219 - martin.merkwirth@autohauseuropa.de
Collected from https://www.autoscout24.de/haendler/bredlow-gmbh/ueber-uns: Bredlow GmbH - N/A - 030-6090060 - info@bredlow-berlin.de
Collected from https://www.autoscout24.de/haendler/autohaus-erdmann-gmbh-berlin/ueber-uns: Autohaus Erdmann GmbH - Herr M. - 01786266475 - autohaus.erdmann@mail.de
Collected from https://www.autoscout24.de/haendler/smirnov-kfz/ueber-uns: Smirnov KFZ - N/A - 017623974715 - micha91183@web.de
Collected from https://www.autoscout24.de/haendler/autohaus-tabor-gmbh-berlin/ueber-uns: Autohaus Tabor GmbH - Domenik Hamel - +49 30 166362231 - domenik.hamel@autohaus-tabor.de
Collected f

In [2]:
# Reload the saved JSON file and report how many dealer records it contains
with open('dealer_info.json', 'r', encoding='utf-8') as json_file:
    saved_dealers = json.loads(json_file.read())

print(f"Number of dealers in dealer_info.json: {len(saved_dealers)}")

Number of dealers in dealer_info.json: 546
