## Библиотеки

In [None]:
import os
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np
import warnings
import random
from datetime import datetime

from lxml import html
import requests

import aiohttp
import asyncio
from aiohttp import ClientTimeout
from aiohttp_socks import ProxyConnector
from aiohttp import ClientSession
from stem import Signal
from stem.control import Controller

from tqdm.asyncio import tqdm
from charset_normalizer import from_bytes

# Silence warnings for the whole session. The second call ignores every
# warning category, so it already covers the UserWarning-only filter above it.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore")

## Функции

In [None]:
# City -> region lookup table. process_page() reads its "city" and "region"
# columns to fill in the region when a listing only names the city.
dff_cities = pd.read_excel(r"03_all_cities.xlsx")
print("Датасет с городами и регионами РФ собран.")

In [None]:
# Number of worker tasks started by main() and the size of the request semaphore.
REQUEST_AMOUNT = 10 # more than 10 is a bad idea

# Limits how many fetch() calls may be inside their request section at once.
semaphore = asyncio.Semaphore(REQUEST_AMOUNT)

# Pool of User-Agent strings; fetch() picks one at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 Edg/93.0.961.52",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
]

# Pool of Accept-Language values; fetch() picks one at random for every request.
LANGUAGES = [
    "en-US,en;q=0.9",
    "en-GB,en;q=0.9",
    "fr-FR,fr;q=0.9",
    "de-DE,de;q=0.9",
    "es-ES,es;q=0.9",
    "ru-RU,ru;q=0.9",
    "zh-CN,zh;q=0.9",
    "ja-JP,ja;q=0.9",
    "it-IT,it;q=0.9",
    "pt-BR,pt;q=0.9",
]

TOR_PROXY = "socks5://127.0.0.1:9150"  # Tor Browser port (SOCKS5 proxy URL used by main())
CONTROL_PORT = 9051  # Tor ControlPort, used by renew_tor_ip()
# Placeholder: replace with the password configured for the Tor control port.
CONTROL_PASSWORD = "ENTER_YOUR_OWN_PASSWORD"

# Serialises IP changes so that only one task at a time calls renew_tor_ip().
ip_change_lock = asyncio.Lock()

### Уже было
def renew_tor_ip():
    """Send the NEWNYM signal to the local Tor control port.

    Connects to ``CONTROL_PORT``, authenticates with ``CONTROL_PASSWORD`` and
    signals ``Signal.NEWNYM``. The call is synchronous; the controller
    connection is closed when the ``with`` block exits.
    """
    with Controller.from_port(port=CONTROL_PORT) as tor:
        tor.authenticate(password=CONTROL_PASSWORD)
        tor.signal(Signal.NEWNYM)

### Уже было
def _retry_after_seconds(retry_after):
    """Convert a ``Retry-After`` header value into a non-negative delay in seconds.

    The value is tried first as an integer number of seconds and then as an
    HTTP-date in the ``"%a, %d %b %Y %H:%M:%S GMT"`` form. Returns ``None``
    when neither form parses.
    """
    # Local import: the file only imports the `datetime` class at module level.
    from datetime import timezone

    try:
        return max(0, int(retry_after))
    except ValueError:
        pass
    try:
        retry_time = datetime.strptime(retry_after, "%a, %d %b %Y %H:%M:%S GMT")
    except ValueError:
        return None
    # The header date is expressed in GMT, so compare it with the current UTC
    # time rather than the machine's local time.
    retry_time = retry_time.replace(tzinfo=timezone.utc)
    return max(0.0, (retry_time - datetime.now(timezone.utc)).total_seconds())


async def fetch(session, url, max_retries=10):
    """Download *url* through *session* and return the page as decoded text.

    Every attempt runs under the shared ``semaphore``, waits one second and
    sends randomly chosen ``User-Agent`` / ``Accept-Language`` headers.

    * HTTP 200: the body is decoded with the encoding detected by
      ``charset_normalizer`` (UTF-8 with replacement characters when nothing
      is detected) and returned.
    * HTTP 429: waits for the period given by ``Retry-After``; when the header
      is missing or unparsable, changes the Tor IP and waits 15 seconds.
    * Any other status, ``aiohttp.ClientError`` or ``asyncio.TimeoutError``:
      changes the Tor IP and waits 15 seconds.

    Args:
        session: an aiohttp client session (anything with a compatible ``get``).
        url: address of the page to download.
        max_retries: maximum number of attempts before giving up.

    Returns:
        The decoded page text, or ``None`` when all attempts failed.
    """
    retries = 0
    while retries < max_retries:
        try:
            async with semaphore:
                await asyncio.sleep(1)  # delay between requests
                headers = {
                    "User-Agent": random.choice(USER_AGENTS),
                    "Accept-Language": random.choice(LANGUAGES),
                    "Referer": "https://example.com",
                }
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        content = await response.content.read()
                        best_match = from_bytes(content).best()
                        if best_match is None:
                            # No encoding could be detected: fall back to a
                            # lossy UTF-8 decode instead of failing on None.
                            return content.decode("utf-8", errors="replace")
                        return content.decode(best_match.encoding)
                    elif response.status == 429:  # Too Many Requests
                        retry_after = response.headers.get("Retry-After")
                        wait_time = _retry_after_seconds(retry_after) if retry_after else None
                        if wait_time is not None:
                            await asyncio.sleep(wait_time)
                        else:
                            # No usable Retry-After hint: switch to a new IP instead.
                            async with ip_change_lock:
                                renew_tor_ip()
                                await asyncio.sleep(15)
                    else:
                        async with ip_change_lock:
                            print(f"Server returned {response.status} for {url}. Changing IP...")
                            renew_tor_ip()
                            await asyncio.sleep(15)
                    retries += 1
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            print(f"Error fetching {url}: {e}")
            retries += 1
            if retries < max_retries:
                async with ip_change_lock:
                    print("Changing IP due to error...")
                    renew_tor_ip()
                    await asyncio.sleep(15)
    return None

### Nothing clever here - just site-specific processing of the
### HTML page, which may already be out of date
### (as of 29.01.2025 everything works)
async def process_page(url, session):
    """Download one listing page and extract its fields into a flat dict.

    Args:
        url: address of the listing page; stored under the "Ссылка" key.
        session: client session passed straight through to ``fetch``.

    Returns:
        * ``None`` when ``fetch`` returned nothing (or an empty string).
        * A dict with "Ссылка" set and every other field ``np.nan`` when the
          page has no ``div`` with class ``css-pxeubi evnwjo70``.
        * Otherwise a dict with the link, price, view count, withdrawn flag,
          the parameters parsed from the first ``<table>`` on the page, the
          owner (when detected), the city and the region.

    Parsing errors (e.g. ``IndexError`` / ``ValueError`` on unexpected markup)
    are not handled here and propagate to the caller.
    """
    content = await fetch(session, url)
    if content:
        tree = html.fromstring(content)

        # Pages without this container get a placeholder row so that the URL
        # still appears in the output.
        if len(tree.xpath('//div[@class="css-pxeubi evnwjo70"]')) == 0:
            bad_table = {'Ссылка': url,
                        'Цена': np.nan,
                        #    (disabled) listing publication date: 'Дата размещения объявления': np.nan,
                        'Кол-во просмотров': np.nan,
                        'Скрыто': np.nan,
                        'Объем двигателя': np.nan,
                        'Тип двигателя': np.nan,
                        'Мощность': np.nan,
                        'Коробка передач': np.nan,
                        'Привод': np.nan,
                        'Цвет': np.nan,
                        'Пробег': np.nan,
                        'Руль': np.nan,
                        'Поколение': np.nan,
                        'Рестайлинг': np.nan,
                        'Комплектация': np.nan,
                        'Владелец': np.nan,
                        "Город": np.nan,
                        "Регион": np.nan}
            return bad_table
        # Price: serialise the first matching div, keep the text between the
        # first ">" and the next "<", then drop the non-breaking-space (&#160;)
        # and rouble-sign (&#8381;) entities before converting to int.
        divs = tree.xpath('//div[@class="wb9m8q0"]')
        price = html.tostring(divs[0]).decode("utf-8")
        price = price[price.find(">") + 1:]
        price = price[:price.find("<")].replace("&#160;", "").replace("&#8381;", "")
        price = int(price)

        # View counter: the whole text of the first matching div must be an integer.
        divs = tree.xpath('//div[@class="css-14wh0pm e1lm3vns0"]')[0]
        views = int(divs.text_content().strip())

        # True only when the banner div exists and says the car was withdrawn from sale.
        divs = tree.xpath('//div[@class="css-h1pukl edsrp6u2"]')
        is_canceled = (len(divs) != 0 and divs[0].text_content() == 'Автомобиль снят с продажи')

        # Location: take the text that follows "Город: " in the first div containing it.
        # NOTE(review): if there are no such divs, `city_district` is never
        # assigned and the split below raises UnboundLocalError; if no div
        # contains "Город: ", the text of the last div is used as-is.
        divs = tree.xpath('//div[@class="css-inmjwf e162wx9x0"]')
        for div in divs:
            city_district = div.text_content()
            if city_district.find("Город: ") != -1:
                city_district = city_district[city_district.find("Город: ") + len("Город: "):]
                break
        city_district = city_district.split(", ")
        if len(city_district) == 2:
            # "<city>, <region>" form.
            city, district = city_district[0], city_district[1]
        else:
            if city_district[0] in ["Москва", "Санкт-Петербург", "Севастополь"]:
                # These cities are used as their own region.
                city, district = city_district[0], city_district[0]
            elif len(tree.xpath('//div[@class="css-hp5vyz ejipaoe0"]')) != 0:
                # Pages containing this div get no location at all.
                city, district = np.nan, np.nan # <div class="css-hp5vyz ejipaoe0">
            else:
                # Look the region up by city name in the dff_cities table.
                if len(list(dff_cities[dff_cities["city"] == city_district[0]]["region"])) == 0:
                    district = np.nan
                else:
                    district =  list(dff_cities[dff_cities["city"] == city_district[0]]["region"])[0]
                city = city_district[0]

        # Parameter table: the first <table> of the page, parsed row by row
        # with plain string slicing on the serialised <tr>.
        # divs = tree.xpath('//table[@class="css-xalqz7 eppj3wm0"]')[0]
        table = bs(content, "html.parser").find_all("table")[0]
        # table = bs(html.tostring(divs).decode("utf-8"), "html.parser").find_all("tr")
        table = table.find_all("tr")
        params = {}
        for tr in table:
            row = str(tr)
            # Y: everything before "</th>" (parameter name side);
            # X: everything after the first "<td" (value side).
            Y, X = row[:row.find("</th>")], row[row.find("<td") + len("<td"):]
            # Keep only the text after the last ">" as the parameter name.
            Y = Y[Y.rfind(">") + 1:]
            # A name that still contains markup is discarded (row is skipped below).
            if Y.find("<") != -1:
                Y = ""
            # print(Y)
            X = X.replace("><", "").replace("<!-- -->", "")
            # Cut everything from the last closing tag onwards.
            X = X[:X.rfind("</")]
            # Strip every remaining "<...>" tag from the value.
            while X.find("<") != -1:
                X = X[:X.find("<")] + X[X.find(">", X.find("<") + 1) + 1:]

            # Drop what is left of the opening <td ...> tag (up to its ">").
            X = X[X.find(">") + 1:]
            # Drop everything up to the last ";}" - presumably inline CSS
            # embedded in the cell; TODO confirm against a live page.
            if X.rfind(";}") != -1:
                X = X[X.rfind(";}") + 2:]
            if Y != "":
                params[Y] = X.replace("\xa0", " ")

        table = {"Ссылка": url, "Цена": price, "Кол-во просмотров": views, "Скрыто": is_canceled}

        # Normalise selected parameters; everything else is copied verbatim.
        for par in params.keys():
            if par == "Двигатель":
                # Engine: comma-separated parts holding fuel type and displacement.
                tup = params[par].split(", ")
                if tup[0] == "электро":
                    # Electric cars get a displacement of 0.0.
                    table["Объем двигателя"], table["Тип двигателя"] = 0.0, tup[0]
                else:
                    if len(tup) == 1:
                        # Single part: collect its leading digits/dots as the displacement.
                        st = ""
                        for symb in tup[0]:
                            # print(symb)
                            if symb.isnumeric() or symb == ".":
                                st = st + symb
                            else:
                                break
                        if st == "":
                            # No number: the part is treated as the fuel type.
                            table["Объем двигателя"], table["Тип двигателя"] = np.nan, tup[0]
                        else:
                            # Only a number: the fuel type defaults to petrol ("бензин").
                            table["Объем двигателя"], table["Тип двигателя"] = float(st), "бензин"
                    else:
                        # Two or more parts: whichever of the first two starts
                        # with a digit is the displacement, the other the fuel type.
                        if tup[1][0].isnumeric():
                            table["Объем двигателя"], table["Тип двигателя"] = float(tup[1].split()[0]), tup[0]
                        elif tup[0][0].isnumeric():
                            table["Объем двигателя"], table["Тип двигателя"] = float(tup[0].split()[0]), tup[1]
                        else:
                            table["Объем двигателя"], table["Тип двигателя"] = np.nan, np.nan
            elif par == "Мощность":
                # Power: the first whitespace-separated token as an integer.
                tup = params[par].split()[0]
                table["Мощность"] = int(tup)
            elif par == "Пробег":
                # Mileage: keep the part before the first ", ", drop the last
                # token (presumably the unit) and join the remaining digit groups.
                params[par] = params[par].split(", ")[0]
                tup = "".join(params[par].split()[:-1])
                # NOTE(review): when the value is a single token, `tup` is ""
                # and `tup[0]` raises IndexError.
                if tup == "новый" or tup[0] == "б":
                    table["Пробег"] = 0
                else:
                    table["Пробег"] = int(tup)
            elif par == "Поколение":
                # Generation: leading digits of each comma-separated part;
                # the first becomes the generation, the second the restyling
                # number (0 when a part has no leading digits or is missing).
                tup = params[par].split(", ")
                gen_sty = []
                for x in tup:
                    st = ""
                    # print(x)
                    for symb in x:
                        # print(symb)
                        if symb.isnumeric():
                            st = st + symb
                        else:
                            break
                    if st == "": st = "0"
                    gen_sty.append(int(st))
                if len(gen_sty) < 1:
                    gen_sty = [0, 0]
                if len(gen_sty) < 2:
                    gen_sty.append(0)
                table["Поколение"], table["Рестайлинг"] = gen_sty[0], gen_sty[1]
            else:
                table[par] = params[par]

        # Owner: text of the private-seller div when present, otherwise
        # "Фирма" when a dealer link exists; otherwise the key is left unset.
        try_private = tree.xpath('//div[@class="_1n15liu6"]')
        try_firm = tree.xpath('//a[@data-ga-stats-name="dealer_name"]')
        if len(try_private) == 0:
            if len(try_firm) != 0:
                table["Владелец"] = "Фирма"
        else:
            table["Владелец"] = try_private[0].text_content()

        table["Город"] = city
        table["Регион"] = district

        return table
    return None

async def worker(url_queue, session, results, pbar):
    """Consume URLs from *url_queue* until a ``None`` sentinel arrives.

    Each URL is handed to ``process_page``; a truthy result is appended to
    *results*. Any exception raised while processing is printed and does not
    stop the worker. After every URL the queue item is marked done and the
    progress bar is advanced by one, whether processing succeeded or not.
    """
    url = await url_queue.get()
    while url is not None:
        try:
            row = await process_page(url, session)
            if row:
                results.append(row)
        except Exception as exc:
            print(f"Error processing URL {url}: {exc}")
        finally:
            url_queue.task_done()
            pbar.update(1)
        url = await url_queue.get()

async def main(urls, text):
    """Process every URL in *urls* with a pool of workers and return the rows.

    All URLs are queued up front, then ``REQUEST_AMOUNT`` ``worker`` tasks
    share one client session that goes through ``TOR_PROXY``. *text* is only
    used in the progress-bar caption. Once the queue is drained, each worker
    receives a ``None`` sentinel so that it exits, and the tasks are awaited.

    Returns:
        The list of dicts collected by the workers (completion order).
    """
    results = []
    url_queue = asyncio.Queue()
    for link in urls:
        url_queue.put_nowait(link)

    async with ClientSession(connector=ProxyConnector.from_url(TOR_PROXY)) as session:
        with tqdm(total=len(urls), desc=f"Обработка файла {text}") as pbar:
            workers = []
            for _ in range(REQUEST_AMOUNT):
                task = asyncio.create_task(worker(url_queue, session, results, pbar))
                workers.append(task)

            # Wait until every queued URL has been marked done.
            await url_queue.join()

            # One sentinel per worker task so each of them leaves its loop.
            for _ in workers:
                await url_queue.put(None)

            await asyncio.gather(*workers)

    return results


## Код

In [None]:
# Progress markers; progress_car is the index the processing loop starts from.
progress_for = 0
progress_car = 0
# schet = 0

# Input workbooks: names of the .xlsx files in ./second_href/ without the extension.
files_sorted = []
for filename in os.listdir(r"./second_href/"):
    if not filename.endswith(".xlsx"):
        continue
    files_sorted.append(filename[:filename.find(".xlsx")])
print(f"Список файлов для парсинга собран. Всего {len(files_sorted)} файлов.")
# Order by the integer that precedes the first "s" in each name.
files_sorted.sort(key=lambda stem: int(stem[:stem.find("s")]))

# Workbooks already written to ./all_cars/ are treated as finished.
files_ready = []
for filename in os.listdir(r"./all_cars/"):
    if not filename.endswith(".xlsx"):
        continue
    files_ready.append(filename[:filename.find(".xlsx")])

# Keep only the inputs that do not have an output file yet.
files_sorted = [stem for stem in files_sorted if stem not in files_ready]
print(f"Список файлов для парсинга собран. Всего {len(files_sorted)} файлов.")

In [None]:
# Process the remaining input workbooks one by one, starting at progress_car.
# Uses top-level `await`, so this cell has to run where that is supported
# (e.g. a notebook).
for i in range(progress_car, len(files_sorted)):
    # Remember the current index; presumably so that re-running the cell
    # after an interruption resumes from this file - TODO confirm.
    progress_car = i
    car_name = files_sorted[i]
    df = pd.read_excel(f"./second_href/{car_name}.xlsx")
    urls = list(df["Ссылка"])

    # Scrape every link of this workbook.
    res = await main(urls, car_name)

    # Join the scraped fields onto the original rows by link; the outer join
    # keeps links for which nothing was scraped.
    table = pd.DataFrame(res)
    final_table = pd.merge(df, table, on=["Ссылка"], how="outer")
    final_table.to_excel(f"./all_cars/{car_name}.xlsx", index = False)

    print(f"Спаршено {car_name}:\t", len(res))
    print(("="*80 + "\n"*1))