In [1]:
import json
import logging
from urllib.parse import parse_qs, urlparse

import scrapy
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, PyMongoError
from scrapy.crawler import CrawlerProcess

In [2]:
class JuralParser:
    """Small wrapper around the MongoDB collection that stores scraped products.

    On construction it connects to ``mongoUrl``, selects the ``JuralDetails``
    database and its ``AllDetails`` collection, and creates a unique index on
    ``title`` so that a second document with the same title is rejected.
    """

    def __init__(self, mongoUrl):
        """Open the MongoDB connection and make sure the unique index exists.

        Args:
            mongoUrl: MongoDB connection string passed to ``MongoClient``.
        """
        self.mongoUrl = mongoUrl
        self.databaseName = "JuralDetails"
        self.collectionName = "AllDetails"
        # Keep a reference to the client so the connection can be closed later.
        self.client = MongoClient(self.mongoUrl)
        self.collection = self.client[self.databaseName][self.collectionName]
        self.collection.create_index("title", unique=True)

    def close(self):
        """Close the MongoDB client if this instance created one."""
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def add_data(self, product_data):
        """Insert one product document; duplicates (by ``title``) are skipped.

        Args:
            product_data: dict describing the product. ``insert_one`` is called
                with it directly.

        A duplicate title is logged at INFO level. Any other PyMongo failure
        is logged with its traceback instead of being reported as a duplicate;
        the call still returns normally so one failed insert does not abort
        the caller. Non-database exceptions propagate.
        """
        # Read the title up front so the handlers below cannot raise KeyError.
        title = product_data.get("title")
        try:
            self.collection.insert_one(product_data)
        except DuplicateKeyError:
            logging.info(f"{title} уже существует")
        except PyMongoError:
            logging.exception("Failed to store %r in MongoDB", title)

    def get_details_with_greater_price(self, price: int = 0):
        """Find stored products whose ``price`` is strictly greater than ``price``.

        Args:
            price: numeric threshold used with the ``$gt`` operator. The spider
                in this notebook stores prices as ``int``; a string threshold
                would presumably not match numeric prices -- verify before
                passing anything but a number.

        Returns:
            Whatever ``collection.find`` returns for the query (an iterable of
            matching documents).
        """
        query = {"price": {"$gt": price}}
        return self.collection.find(query)

    def print_data(self, data):
        """Print every element of ``data`` on its own line."""
        for d in data:
            print(d)

In [3]:
class JuralSpider(scrapy.Spider):
    """Spider that walks the paginated catalog listing and stores each product.

    Every ``div.product-card`` on a listing page is turned into a dict with
    ``title``, ``article``, ``manufacturer`` and ``price`` and handed to
    ``juralParser.add_data``. Pagination is followed via ``?page=N`` up to
    ``max_pages``.
    """

    name = 'jural_spider'
    start_urls = ['https://agroteh26.ru/catalog/19-zapchasti_dlya_selhoztehniki/13-katalog_zapchastej_mtz_/dvigateli/']
    # Last listing page that will be requested.
    max_pages = 5
    # NOTE: created when the class body executes, i.e. the MongoDB client is
    # set up as soon as this cell runs, not when the crawl starts.
    juralParser = JuralParser(mongoUrl="mongodb://localhost:27017/PSU")
    custom_settings = {
        "LOG_FILE": "scrapy.log",
    }

    @staticmethod
    def _parse_price(raw_price):
        """Convert the scraped price text to an ``int``; return 0 if unusable.

        Spaces and non-breaking spaces are dropped. A comma is treated as a
        decimal separator (assumption for this site -- confirm), and any
        fractional part is truncated because prices are stored as integers.
        """
        cleaned = (raw_price or '').replace('\xa0', '').replace(' ', '').replace(',', '.')
        if not cleaned:
            return 0
        try:
            return int(cleaned)
        except ValueError:
            pass
        try:
            return int(float(cleaned))
        except (ValueError, OverflowError):
            logging.warning("Could not parse price %r; storing 0", raw_price)
            return 0

    @staticmethod
    def _page_number(url):
        """Return the positive ``page`` query parameter of ``url``, or 1."""
        values = parse_qs(urlparse(url).query).get('page')
        if not values:
            # The first listing page has no ``page`` parameter.
            return 1
        try:
            page = int(values[-1])
        except ValueError:
            return 1
        return page if page > 0 else 1

    def parse(self, response):
        """Store every product on the page, then request the next page.

        Yields:
            A ``scrapy.Request`` for the following listing page while the
            current page number is below ``max_pages``.
        """
        # Extract the data from the current page.
        for item in response.css('div.product-card'):
            title = item.css('div.product-card__name a::text').get(default='Нет названия').strip()
            article = item.css('div.product-card__info span[itemprop="model"]::text').get(default='Нет артикула').strip()
            manufacturer = item.css('div.product-card__info span[itemprop="brand"]::text').get(default='Нет производителя').strip()
            price = item.css('div.product-card__prices meta[itemprop="price"]::attr(content)').get(default='0')

            product_data = {
                "title": title,
                "article": article,
                "manufacturer": manufacturer,
                "price": self._parse_price(price),
            }

            self.juralParser.add_data(product_data)

        page_number = self._page_number(response.url)

        if page_number < self.max_pages:
            # Pagination URLs are always built from the first start URL.
            next_page_url = f'{self.start_urls[0]}?page={page_number + 1}'
            yield scrapy.Request(next_page_url, callback=self.parse)


In [4]:
# Build a crawler process with default settings; the spider's own
# ``custom_settings`` still apply to its crawl.
process = CrawlerProcess(settings=None)

# Schedule the spider, then start the crawl; ``start`` blocks until the crawl
# has finished.
process.crawl(JuralSpider)
process.start(stop_after_crawl=True)

2024-11-27 17:02:49 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2024-11-27 17:02:49 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.10.0, Python 3.13.0 (tags/v3.13.0:60403a5, Oct  7 2024, 09:38:07) [MSC v.1941 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Windows-10-10.0.19045-SP0
