# Get all the URLs for each car dealer in Berlin

In [4]:
import asyncio
from playwright.async_api import async_playwright

async def visit_autoscout24_dealers():
    """Collect dealer profile URLs from the AutoScout24 dealer search for Berlin.

    Launches a visible (non-headless) Chromium window, opens the dealer
    search, accepts the cookie banner, and then walks the paginated result
    list. On every page the link of each dealer list item is harvested.
    Pagination stops as soon as the "next page" button cannot be found or
    operated.

    Returns:
        list[str]: Dealer URLs in the order they were encountered. No
        de-duplication is performed, so the list may contain repeated URLs.

    Raises:
        playwright.async_api.Error: If the initial page load, the cookie
            banner, or the dealer list itself cannot be handled. The browser
            is closed before the error propagates.
    """
    # Local import keeps the cell self-contained; playwright.async_api is
    # already imported by this notebook. TimeoutError derives from Error, so
    # catching Error covers timeouts as well as detached/blocked elements.
    from playwright.async_api import Error as PlaywrightError

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        all_dealer_urls = []

        try:
            page = await browser.new_page()

            # Initial page load
            url = "https://www.autoscout24.de/haendler/#?city=Berlin&pageIndex=1&sortBy=distanceAscending"
            await page.goto(url)
            await page.wait_for_selector("h1", timeout=10000)

            # Accept cookies
            accept_button = await page.wait_for_selector('button[data-testid="as24-cmp-accept-all-button"]')
            await accept_button.click()

            while True:
                # Wait for dealer list items to load
                await page.wait_for_selector('div[data-react-component="DealerListItem"]')

                # Extract dealer URLs from current page
                dealer_urls = await page.evaluate('''
                () => {
                    const dealers = document.querySelectorAll('div[data-react-component="DealerListItem"] .name a');
                    return Array.from(dealers).map(a => a.href);
                }
            ''')

                all_dealer_urls.extend(dealer_urls)
                print(f"Found {len(dealer_urls)} dealers on current page")

                # Try to find and click the "Weiter" (next page) button
                try:
                    next_button = await page.wait_for_selector('button[aria-label="Go to next page"]', timeout=5000)
                    await next_button.click()
                    # Wait for navigation to complete
                    await page.wait_for_load_state('networkidle')
                    # Small delay between pages
                    await page.wait_for_timeout(1000)
                except PlaywrightError:
                    # Only Playwright failures end pagination; cancellation
                    # and keyboard interrupts are no longer swallowed here.
                    print("No more pages to load")
                    break
        finally:
            # Release the browser even when a step above raised.
            await browser.close()

        print(f"Total dealers found: {len(all_dealer_urls)}")
        return all_dealer_urls

# For Jupyter notebook: the coroutine is awaited directly at the top level of
# the cell; the resulting list of dealer URLs is kept in `dealer_urls` for the
# following cells.
dealer_urls = await visit_autoscout24_dealers()

Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 dealers on current page
Found 10 d

In [11]:
# A set keeps one copy of each URL, so it is smaller than the list exactly
# when the list contains at least one repeated URL.
dealer_urls_set = set(dealer_urls)
print(
    "No duplicates found"
    if len(dealer_urls_set) == len(dealer_urls)
    else "There are duplicates in the list"
)

There are duplicates in the list


In [9]:
# Persist the URLs, one per line. Each URL is rewritten to point at the
# dealer's "/ueber-uns" sub-page: a trailing slash (if any) is stripped first
# so the suffix is never appended after a double slash.
with open('dealer_urls.txt', 'w') as out_file:
    out_file.writelines(
        f"{link.rstrip('/')}/ueber-uns\n" for link in dealer_urls
    )

print(f"Saved {len(dealer_urls)} URLs to dealer_urls.txt")

Saved 1314 URLs to dealer_urls.txt


In [None]:
# Next: for each dealer, collect the phone number, contact person, opening hours, email, and URL
import asyncio
import json
from playwright.async_api import async_playwright
from typing import List

async def process_single_url(page, url: str) -> dict:
    """Scrape one dealer page and return its contact details.

    Navigates ``page`` to ``url`` and reads the dealer name, phone number,
    contact person, email address and opening hours from the DOM.

    Args:
        page: Playwright page used for the navigation and the DOM queries.
        url: Address of the dealer page to load.

    Returns:
        A dict with the keys ``scrape_url``, ``actual_url``, ``name``,
        ``phone``, ``contact_person``, ``email`` and ``opening_hours``
        (a mapping of day label to opening-time text). Fields that are not
        present on the page are reported as ``"N/A"``. Returns ``None`` when
        the page could not be processed at all (the error is printed).
    """
    # Local import keeps the function self-contained; playwright.async_api is
    # already imported by this notebook.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    try:
        await page.goto(url)

        # Get the current URL (in case of redirects)
        current_url = page.url

        # Get the dealer name. This wait also acts as the "page is rendered"
        # gate: if the heading never shows up the whole page is treated as
        # failed by the outer handler.
        name_element = await page.wait_for_selector('h1.sc-font-xl')
        dealer_name = await name_element.inner_text() if name_element else "N/A"

        # Get the phone number. wait_for_selector raises on timeout instead of
        # returning None, so a dealer without a phone link must be handled via
        # the exception; otherwise the whole record would be discarded.
        try:
            phone_element = await page.wait_for_selector('div.dp-top__phone a[data-testid="contact-person-link"]')
            phone_number = await phone_element.inner_text() if phone_element else "N/A"
        except PlaywrightTimeoutError:
            phone_number = "N/A"

        # Get the contact person name and email
        contact_info = await page.evaluate('''
            () => {
                const contactWrapper = document.querySelector('div[data-testid="contact-person-wrapper"]');
                if (contactWrapper) {
                    const paragraphs = contactWrapper.getElementsByTagName('p');
                    const emailLink = contactWrapper.querySelector('a[data-testid="email-link"]');
                    return {
                        name: paragraphs[1]?.textContent || "N/A",
                        email: emailLink ? emailLink.getAttribute('href').replace('mailto:', '') : "N/A"
                    };
                }
                return { name: "N/A", email: "N/A" };
            }
        ''')

        # Get opening hours: pair every day label (dt) with its time list (dd)
        opening_hours = await page.evaluate('''
            () => {
                const hours = {};
                const container = document.querySelector('.dp-opening-hours-block__container');
                if (container) {
                    const dts = container.querySelectorAll('dt.dp-opening-hours__label');
                    const dds = container.querySelectorAll('dd.dp-opening-hours__list');

                    for (let i = 0; i < dts.length; i++) {
                        const day = dts[i].textContent.trim();
                        const timeElement = dds[i];

                        // A label without a matching time list is skipped
                        // instead of throwing and losing the whole record.
                        if (!timeElement) {
                            continue;
                        }

                        if (timeElement.querySelector('.sc-font-silent')) {
                            hours[day] = "Geschlossen";
                        } else {
                            const times = Array.from(timeElement.querySelectorAll('span[data-testid="dp-opening-hours-opened-day"]'))
                                .map(span => span.textContent.trim());
                            hours[day] = times.join('–').replace('Uhr', '').trim();
                        }
                    }
                }
                return hours;
            }
        ''')

        result = {
            'scrape_url': url,  # Original URL we tried to scrape
            'actual_url': current_url,  # URL we ended up on (in case of redirects)
            'name': dealer_name,
            'phone': phone_number,
            'contact_person': contact_info['name'],
            'email': contact_info['email'],
            'opening_hours': opening_hours
        }

        print(f"Collected from {current_url}: {dealer_name} - {contact_info['name']} - {phone_number} - {contact_info['email']}")
        return result

    except Exception as e:
        # Best-effort scraping: one broken page must not abort the batch.
        print(f"Error processing {url}: {str(e)}")
        return None

async def process_url_chunk(urls: List[str], browser):
    """Scrape a batch of dealer URLs concurrently in one browser context.

    A fresh context is created on ``browser``, one page is opened per URL,
    and all pages are processed at the same time.

    Args:
        urls: Dealer page URLs to scrape in this batch.
        browser: Playwright browser on which the context is created.

    Returns:
        The result dicts of the URLs that could be processed; URLs for which
        ``process_single_url`` returned ``None`` are left out.
    """
    context = await browser.new_context()
    try:
        pages = await asyncio.gather(*(context.new_page() for _ in urls))
        results = await asyncio.gather(
            *(process_single_url(page, url) for page, url in zip(pages, urls))
        )
    finally:
        # Closing the context also closes its pages; doing it in `finally`
        # prevents leaking them when page creation or scraping raises.
        await context.close()
    return [r for r in results if r is not None]

async def collect_dealer_info(urls_path='dealer_urls.txt', chunk_size=10):
    """Scrape every dealer URL listed in a text file, in concurrent batches.

    Args:
        urls_path: Text file with one dealer URL per line. Blank lines are
            ignored.
        chunk_size: Number of URLs processed concurrently per batch.

    Returns:
        A list with one result dict per successfully processed URL, in batch
        order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If ``urls_path`` cannot be opened.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Read the URLs before launching the browser so a missing file fails
    # fast. Blank lines (e.g. a trailing newline) are skipped; they would
    # otherwise be passed on as empty URLs that cannot be loaded.
    with open(urls_path, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    total_chunks = (len(urls) + chunk_size - 1) // chunk_size

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        dealer_info = []

        try:
            # Process the URLs batch by batch to bound the number of open pages
            for i in range(0, len(urls), chunk_size):
                url_chunk = urls[i:i + chunk_size]
                chunk_results = await process_url_chunk(url_chunk, browser)
                dealer_info.extend(chunk_results)
                print(f"Processed chunk {i//chunk_size + 1} of {total_chunks}")
        finally:
            # Release the browser even when a batch raised.
            await browser.close()

        return dealer_info

# For Jupyter notebook
dealer_info = await collect_dealer_info()

# Save the results
with open('dealer_info.json', 'w', encoding='utf-8') as f:
    json.dump(dealer_info, f, ensure_ascii=False, indent=2)

print(f"Saved information for {len(dealer_info)} dealers")

Collected from https://www.autoscout24.de/haendler/pkw-online-de-neuwagen/ueber-uns: pkw-online.de Neuwagen - N/A - 0304418314 - pkwde@aol.com
Collected from https://www.autoscout24.de/haendler/taheri-auto-kassel/ueber-uns: Taheri Auto - N/A - 01776732152 - abdultaheri732@yahoo.com
Collected from https://www.autoscout24.de/haendler/suv4you-gmbh/ueber-uns: SUV4YOU GmbH - Fatih Sarayköylüoglu - 0221 98654409 - suv4you@gmx.de
Collected from https://www.autoscout24.de/haendler/duerkop-gmbh-berlin-prenzlauer-berg-berlin/ueber-uns: DÜRKOP GmbH - Berlin Prenzlauer Berg - Ihr Dürkop-Team in Berlin-Prenzlauer Berg - 0531 9718998510 - N/A
Collected from https://www.autoscout24.de/haendler/motorrad-lukas/ueber-uns: Motorrad Lukas - Ronald Lukas - 030/44356535 - Kontakt@motorrad-lukas-berlin.de
Collected from https://www.autoscout24.de/haendler/carpoint-gmbh-berlin/ueber-uns: Carpoint GmbH - Maik Jung - 030-57714957 - info@carpoint-nrw.de
Collected from https://www.autoscout24.de/haendler/duerkop-

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


Collected from https://www.autoscout24.de/haendler/pkw-online-de-neuwagen/ueber-uns: pkw-online.de Neuwagen - N/A - 0304418314 - pkwde@aol.com
Collected from https://www.autoscout24.de/haendler/carpoint-gmbh-berlin/ueber-uns: Carpoint GmbH - Maik Jung - 030-57714957 - info@carpoint-nrw.de
Collected from https://www.autoscout24.de/haendler/amore-automobile-moresco-e-k/ueber-uns: Amore Automobile Moresco e.K. - N/A - 030 6167040 - team.vw@amore-automobile.de
Collected from https://www.autoscout24.de/haendler/suv4you-gmbh/ueber-uns: SUV4YOU GmbH - Fatih Sarayköylüoglu - 0221 98654409 - suv4you@gmx.de
Collected from https://www.autoscout24.de/haendler/duerkop-gmbh-berlin-10409/ueber-uns: DÜRKOP GmbH - Ihr Dürkop-Team in Berlin-Prenzlauer Berg - 0531 9718998510 - N/A
Collected from https://www.autoscout24.de/haendler/duerkop-gmbh-berlin-prenzlauer-berg-berlin/ueber-uns: DÜRKOP GmbH - Berlin Prenzlauer Berg - Ihr Dürkop-Team in Berlin-Prenzlauer Berg - 0531 9718998510 - N/A
Error processing 