In [None]:
import concurrent.futures
import threading
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Module-level list, presumably intended to accumulate scraped car records.
# NOTE(review): nothing in the visible cells reads or writes it (the parameter
# of the same name in excel_writer_batch shadows it) — confirm it is still needed.
all_car_data = []
# Lock presumably meant to guard all_car_data across worker threads; it is not
# acquired anywhere in the visible code — TODO confirm.
data_lock = threading.Lock()

def excel_writer_batch(all_car_data, batch_size=500, checkpoint_every=5):
    """Append scraped car rows to ``car_info.xlsx`` and return the combined frame.

    Rows already stored in the workbook are loaded first, the new rows are
    appended batch by batch, and the workbook is rewritten every
    ``checkpoint_every`` batches plus once at the end.

    Args:
        all_car_data: Sequence of row dicts to append.
        batch_size: Number of rows appended per batch; must be positive.
        checkpoint_every: Rewrite the workbook after this many batches; must
            be positive.

    Returns:
        The DataFrame holding the previously stored rows followed by the new
        ones. When ``all_car_data`` is empty the existing rows are returned
        and the workbook is left untouched.

    Raises:
        ValueError: If ``batch_size`` or ``checkpoint_every`` is not positive.
    """
    # Validate before any I/O: a zero step makes range() fail with an opaque
    # message and a negative one would silently drop every row.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if checkpoint_every <= 0:
        raise ValueError("checkpoint_every must be a positive integer")

    total_rows = len(all_car_data)
    print(f"Writing {total_rows} rows in batches of {batch_size}")

    columns = ['Car mark', 'Car price', 'upload time']
    try:
        df = pd.read_excel("car_info.xlsx")
    except FileNotFoundError:
        # First run: no workbook yet, start from an empty sheet.
        df = pd.DataFrame(columns=columns)
    except Exception as exc:
        # Keep the original best-effort behaviour (start fresh) but say so,
        # and no longer swallow KeyboardInterrupt/SystemExit.
        print(f"⚠️ Could not read car_info.xlsx ({exc}); starting from an empty sheet")
        df = pd.DataFrame(columns=columns)

    if not all_car_data:
        # Nothing to append, so there is no reason to rewrite the workbook.
        return df

    for batch_number, start in enumerate(range(0, total_rows, batch_size), start=1):
        new_df = pd.DataFrame(all_car_data[start:start + batch_size])
        df = pd.concat([df, new_df], ignore_index=True)

        # Checkpoint after every N-th completed batch. The last batch is
        # skipped here because the final write below covers it.
        is_last_batch = start + batch_size >= total_rows
        if batch_number % checkpoint_every == 0 and not is_last_batch:
            df.to_excel("car_info.xlsx", index=False)
            print(f"💾 Saved {len(df)} rows so far...")

    df.to_excel("car_info.xlsx", index=False)
    print(f"✅ Total rows written: {len(df)}")
    return df

# Cached chromedriver path so webdriver_manager is consulted once per session
# instead of once per scraped page.
_chromedriver_path = None
_chromedriver_path_lock = threading.Lock()


def _resolve_chromedriver_path():
    """Return the chromedriver path, resolving it via webdriver_manager only once."""
    global _chromedriver_path
    # The lock keeps worker threads from resolving the driver concurrently on
    # first use.
    with _chromedriver_path_lock:
        if _chromedriver_path is None:
            _chromedriver_path = ChromeDriverManager().install()
        return _chromedriver_path


def get_optimized_driver():
    """Create a Chrome WebDriver configured with the scraper's launch options.

    Returns:
        A new ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless=new")
    # NOTE(review): "--disable-images" and "--disable-javascript" may not be
    # switches Chrome honours — confirm they have the intended effect.
    launch_flags = (
        "--disable-gpu",
        "--disable-images",
        "--disable-javascript",
        "--disable-plugins",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--disable-notifications",
        "--disable-blink-features=AutomationControlled",
    )
    for flag in launch_flags:
        options.add_argument(flag)
    # 'eager': driver.get() returns without waiting for the full page load.
    options.page_load_strategy = 'eager'

    service = Service(_resolve_chromedriver_path())
    return webdriver.Chrome(service=service, options=options)

def scrape_page_fast(page_num):
    """Scrape one listing page and return its adverts as a list of dicts.

    Args:
        page_num: Page number substituted into the listing URL's ``page``
            query parameter.

    Returns:
        A list of dicts with the keys 'Car mark', 'Car price' and
        'upload time'. Adverts missing any of those elements are skipped.
        If the page (or the browser itself) fails, the error is printed and
        whatever was collected so far — possibly an empty list — is returned.
    """
    car_data = []
    driver = None

    try:
        # Created inside the try so a browser start-up failure is reported
        # for this page instead of propagating out of the worker thread.
        driver = get_optimized_driver()

        url = f"https://www.unegui.mn/avto-mashin/-avtomashin-zarna/?page={page_num}"
        driver.get(url)

        # NOTE(review): the XPath matches the class attribute exactly,
        # including its two trailing spaces — confirm this is intentional.
        datatable = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='list-simple__output js-list-simple__output  ']"))
        )

        car_elements = datatable.find_elements(By.XPATH, ".//div[@data-event-name='advert_click']")

        for car in car_elements:
            try:
                car_price = car.find_element(By.XPATH, ".//a[@class='advert__content-price _not-title']")
                car_title = car.find_element(By.XPATH, ".//a[@class='advert__content-title']")
                time_info = car.find_element(By.XPATH, ".//div[@class='advert__content-date']")

                car_data.append({
                    'Car mark': car_title.text,
                    'Car price': car_price.text,
                    'upload time': time_info.text
                })
            except WebDriverException:
                # Best effort: skip adverts with a missing or stale element.
                # Non-Selenium errors fall through to the page-level handler.
                continue

    except Exception as e:
        print(f"❌ Page {page_num}: {e}")
    finally:
        if driver is not None:
            driver.quit()

    return car_data

def fast_scraper(max_pages=100, num_threads=8):
    """Scrape pages 1..max_pages concurrently and append the rows to the workbook.

    Each page is handed to ``scrape_page_fast`` on a thread pool. Results are
    gathered as they complete and, if any rows were collected, written out
    with ``excel_writer_batch``.

    Args:
        max_pages: Highest page number to scrape (inclusive).
        num_threads: Maximum number of worker threads.

    Returns:
        The list of row dicts collected from all pages, in completion order.
    """
    print(f"🚀 Starting fast scraper: {max_pages} pages, {num_threads} threads")
    start_time = time.perf_counter()

    all_data = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which page each future belongs to so failures can be reported.
        future_to_page = {
            executor.submit(scrape_page_fast, page): page
            for page in range(1, max_pages + 1)
        }

        completed = concurrent.futures.as_completed(future_to_page)
        for done_count, future in enumerate(completed, start=1):
            try:
                page_data = future.result()
            except Exception as exc:
                # One failing page must not discard everything already collected.
                print(f"❌ Page {future_to_page[future]}: {exc}")
                page_data = []

            if page_data:
                all_data.extend(page_data)

            if done_count % 10 == 0:
                print(f"📊 Processed {done_count}/{max_pages} pages, collected {len(all_data)} cars")

    if all_data:
        excel_writer_batch(all_data)

    elapsed = time.perf_counter() - start_time
    # Guard against a zero interval (e.g. max_pages == 0 on a coarse clock).
    rate = len(all_data) / elapsed if elapsed > 0 else 0.0
    print(f"⏱️ Total time: {elapsed:.2f} seconds")
    print(f"📈 Speed: {rate:.2f} cars/second")
    return all_data


In [2]:
# First page of the listing; its pagination control is read to find out how
# many pages there are to scrape.
SCRAP_URL = "https://www.unegui.mn/avto-mashin/-avtomashin-zarna/"

last_page = None
dataframe = pd.DataFrame()

driver = get_optimized_driver()
try:
    driver.get(SCRAP_URL)

    ul_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//ul[@class='number-list']"))
    )

    # The text of the last <li> of the pagination list is taken as the page count.
    li_elements = ul_element.find_elements(By.XPATH, ".//li[last()]")
    last_page = int(li_elements[0].text)
except Exception as e:
    print('last element not found: ', e)
finally:
    # Always close the bootstrap browser; the workers open their own.
    driver.quit()

# Without a page count there is nothing to scrape; calling fast_scraper with
# max_pages=None would fail with a TypeError.
if last_page is not None:
    fast_scraper(max_pages=last_page, num_threads=6)

🚀 Starting fast scraper: 237 pages, 6 threads
📊 Processed 10/237 pages, collected 600 cars
📊 Processed 20/237 pages, collected 1200 cars
📊 Processed 30/237 pages, collected 1800 cars
📊 Processed 40/237 pages, collected 2400 cars
📊 Processed 50/237 pages, collected 3000 cars
📊 Processed 60/237 pages, collected 3600 cars
📊 Processed 70/237 pages, collected 4200 cars
📊 Processed 80/237 pages, collected 4800 cars
📊 Processed 90/237 pages, collected 5400 cars
📊 Processed 100/237 pages, collected 6000 cars
📊 Processed 110/237 pages, collected 6600 cars
📊 Processed 120/237 pages, collected 7200 cars
📊 Processed 130/237 pages, collected 7800 cars
📊 Processed 140/237 pages, collected 8400 cars
📊 Processed 150/237 pages, collected 9000 cars
📊 Processed 160/237 pages, collected 9600 cars
📊 Processed 170/237 pages, collected 10200 cars
📊 Processed 180/237 pages, collected 10800 cars
📊 Processed 190/237 pages, collected 11400 cars
📊 Processed 200/237 pages, collected 12000 cars
