In [1]:
from typing import Tuple, Any

import pandas as pd
import numpy as np
import plotly.express as px
import requests
import bs4
from lxml import etree
import re

In [2]:
def get_sitemats_list(url: str='https://goldapple.ru/sitemap.xml') -> list[str]:
    """
    Get sitemaps list from goldapple or another url.

    :param url: url to sitemap.xml
    :return: list of sitemaps urls (text of every <loc> element)
    :raises requests.exceptions.ConnectTimeout: the connection timed out
    :raises requests.exceptions.ConnectionError: the host could not be reached
    :raises requests.exceptions.HTTPError: the server answered with a 4xx/5xx status
    :raises AttributeError: a parsed element had no usable attribute
    :raises ValueError: any other failure while parsing the xml
    """
    try:
        response = requests.get(url, timeout=30)
        # Without this call requests never raises HTTPError and an error page
        # would be parsed as if it were a sitemap. The HTTPError propagates
        # unchanged so callers keep access to its response.
        response.raise_for_status()
    except requests.exceptions.ConnectTimeout as err:
        raise requests.exceptions.ConnectTimeout(f'connection timeout, {err}') from err
    except requests.exceptions.ConnectionError as err:
        raise requests.exceptions.ConnectionError(f'check url: {url}') from err

    try:
        soup = bs4.BeautifulSoup(response.text, 'xml')
        return [loc.text for loc in soup.find_all('loc')]
    except AttributeError as err:
        raise AttributeError(f'could not find attribute "loc", {err}') from err
    except Exception as err:
        # Raising a bare string is itself a TypeError; wrap the message in a
        # real exception and keep the original one as its cause.
        raise ValueError(f'Error parsing xml data: {err}') from err

In [3]:
def get_product_urls(sitemaps: list[str]) -> list[str]:
    """
    Get products urls from sitemaps.

    Each sitemap is downloaded and every <loc> url containing a run of at
    least 10 consecutive digits is kept as a product link.

    :param sitemaps: list of sitemaps urls
    :return: list of product links, in sitemap order
    """
    pattern = re.compile(r'\d{10,12}')
    res = []
    for sitemap in sitemaps:
        # Same timeout as get_sitemats_list: requests waits forever by default,
        # so one stalled sitemap would otherwise hang the whole crawl.
        xml_products = requests.get(sitemap, timeout=30).text
        soup_products = bs4.BeautifulSoup(xml_products, 'xml')
        for loc in soup_products.find_all('loc'):
            link = loc.text
            if pattern.search(link):
                res.append(link)
    return res

In [None]:
def get_sku_and_product_id(url: str) -> tuple[Any, Any]:
    """
    Extract the sku id and product id embedded in a product URL.

    Runs of 4 to 20 digits are collected from the url. With a single run it is
    taken as the sku id; with two or more, the first is the product id and the
    second the sku id; with none, both are None.

    :param url: url to product
    :return: (sku_id, prod_id)
    """
    digit_runs = re.compile(r'\d{4,20}').findall(url)

    if not digit_runs:
        return None, None
    if len(digit_runs) == 1:
        return digit_runs[0], None
    return digit_runs[1], digit_runs[0]

In [4]:
# Fetch the sitemap index, then collect the product page urls from every
# sitemap it lists. Both helpers perform network requests.
sitemaps = get_sitemats_list()
product_urls = get_product_urls(sitemaps)

In [55]:
class Product:
    """
    Product card parsed from the HTML of a single product page.

    All scraped attributes are looked up by absolute XPath and are ``None``
    when the node is missing from the page or carries no text.

    :param product_page: raw HTML of the product page (handed to ``etree.HTML``)
    :param product_url: url of the product page
    :param product_id: product id
    :param sku_id: sku id

    Scraped attributes:
        title, first_subcategory .. fourth_subcategory (str or None, stripped),
        price, loyalty_price (int or None),
        description, product_usage, product_composition (str or None,
        whitespace collapsed), product_brand (str or None, as found).
    """

    # Shared XPath prefixes; the full paths are assembled in __init__.
    _PAGE = '/html/body/div[1]/main/div/div/section'
    _CRUMB = _PAGE + '/header/section[1]/nav/ul/li[{}]/a/span'
    _BUY = _PAGE + '/section[1]/section[3]/div'
    _TAB = _PAGE + '/section[3]/section[2]/section/section[1]/div/ul/li[{}]/article/div'

    def __init__(self, product_page, product_url:str=None, product_id:int=None, sku_id:int=None):
        self.product_page: etree._Element = etree.HTML(product_page)
        self.product_url = product_url
        self.product_id = product_id
        self.sku_id = sku_id
        self.title = self._stripped_text(self._BUY + '/header/p')
        self.first_subcategory = self._stripped_text(self._CRUMB.format(2))
        self.second_subcategory = self._stripped_text(self._CRUMB.format(3))
        self.third_subcategory = self._stripped_text(self._CRUMB.format(4))
        self.fourth_subcategory = self._stripped_text(self._CRUMB.format(5))
        self.price = self._price(self._BUY + '/form/div[3]/div/div/span/span/span/span')
        self.loyalty_price = self._price(self._BUY + '/form/div[3]/div/span/span/span/span')
        self.description = self._collapsed_text(self._TAB.format(1) + '/section[1]')
        self.product_usage = self._collapsed_text(self._TAB.format(2) + '/section')
        self.product_composition = self._collapsed_text(self._TAB.format(3) + '/section')
        self.product_brand = self._first_text(self._TAB.format(3) + '/header/h4/span')

    def _first_text(self, path):
        """Return ``.text`` of the first node matching ``path``, or None if nothing matches."""
        nodes = self.product_page.xpath(path)
        return nodes[0].text if nodes else None

    def _stripped_text(self, path):
        """Return the first match's text without surrounding whitespace, or None."""
        text = self._first_text(path)
        # .text is None for an element without a leading text node.
        return None if text is None else text.strip()

    def _price(self, path):
        """Return the first match's text as an int (currency sign and spaces dropped), or None."""
        text = self._first_text(path)
        if text is None:
            return None
        # split() treats every unicode space (including the non-breaking one
        # used as thousands separator) as a delimiter.
        digits = ''.join(text.replace('₽', '').split())
        return int(digits) if digits else None

    def _collapsed_text(self, path):
        """Return the first match's text with whitespace runs collapsed to one space, or None."""
        text = self._first_text(path)
        if text is None:
            return None
        return ' '.join(text.replace('\n\n', '').split())


In [6]:
# Number of product urls collected from the sitemaps.
len(product_urls)

71948

In [326]:
# Download one sample product page and decode its body as UTF-8 text.
# The timeout keeps a stalled connection from blocking the kernel forever.
res = requests.get(product_urls[30000], timeout=30).content.decode('utf-8')

In [299]:
# Echo the decoded page text held in `res`.
res



In [330]:
# Parse the downloaded page twice: BeautifulSoup for class-based lookups,
# lxml for XPath queries. Both read `res`, the decoded page text; the
# previous `res_text` name is not defined anywhere in the notebook.
res_soup = bs4.BeautifulSoup(res, 'html.parser')
tree = etree.HTML(res)

In [57]:
# Build a Product from the sample page; url and ids keep their None defaults.
prod = Product(res)

In [107]:
# Show every attribute scraped for the sample product.
vars(prod)

{'product_page': <Element html at 0x15759f2acc0>,
 'product_id': None,
 'sku_id': None,
 'title': 'Ёршики межзубные M d 3.0 мм',
 'first_subcategory': None,
 'second_subcategory': None,
 'third_subcategory': None,
 'fourth_subcategory': None,
 'price': 504,
 'loyalty_price': 453,
 'description': 'Межзубные щетки (ершики) необходимы каждому человеку для тщательной очистки межзубных пространств. Также ими можно пользоваться для лучшей гигиены при ношении ортодонтических конструкций. Цилиндрические ершики Lacalut имеют высокопрочные нейлоновые щетинки и удобную цветную ручку. В упаковке комплект из 5 штук. Для подбора нужного диаметра ершиков необходимо обратиться за консультацией к стоматологу.',
 'product_usage': 'Перед первым применением рекомендуется промыть межзубной ершик в теплой воде. После чистки каждой зоны и в конце чистки необходимо тщательно промыть межзубной ершик теплой водой. При ежедневном использовании рекомендуется менять межзубной ершик каждую неделю. Средство индивиду

In [356]:
# Print the text of every breadcrumb <li>, looked up by its class string.
breadcrumb_class = "pdp-breadcrumbs__crumb-item pdp-breadcrumbs__item"
for crumb in res_soup.find_all('li', {'class': breadcrumb_class}):
    print(crumb.get_text())

In [349]:
# Evaluate the XPath that Product uses for `first_subcategory` directly
# against the lxml tree of the sample page.
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

[]

In [139]:
# Row labels of the characteristics list mapped to the keys they are stored
# under (product type, for whom, brand country, area of use, purpose).
property_keys = {
    'Тип продукта': 'product_type',
    'Для кого': 'who_use',
    'Страна бренда': 'brand_country',
    'Область применения': 'use_scope',
    'Назначение': 'purpose',
}

product_properties = {}
# Each child of the first matching <dl> is one row: row[0][0] holds the label
# text, row[1][0] the value text.
for row in tree.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[2]/dl')[0]:
    print(row[0][0].text)
    for label, key in property_keys.items():
        if label in row[0][0].text:
            product_properties[key] = row[1][0].text

product_properties

Тип продукта
Для кого
Назначение
Область применения
Страна бренда


{'product_type': 'межзубный ершик',
 'who_use': 'унисекс',
 'purpose': 'личная гигиена',
 'use_scope': 'для полости рта',
 'brand_country': 'Германия'}

In [242]:
# Build a class dynamically whose class attributes are the scraped properties.
# NOTE(review): the SyntaxError recorded under this cell reports "line 2",
# but this cell is a single valid line; the output looks stale, re-run to confirm.
MyClass = type("MyClass", (object, ), product_properties)

SyntaxError: invalid syntax (2261796374.py, line 2)

In [106]:
# Scratch check of the id extraction on a url that carries two numeric ids.
link = 'https://goldapple.ru/14503-4551247598-rastusevka-bez-granicdsf'
pattern = re.compile(r'\d{4,20}')
res_sku = pattern.findall(link)

if not res_sku:
    prod_id = sku_id = None
elif len(res_sku) == 1:
    # A lone digit run is the sku id.
    prod_id, sku_id = None, res_sku[0]
else:
    # First run is the product id, second the sku id; extra runs are ignored.
    prod_id, sku_id = res_sku[:2]

print(res_sku)
print(prod_id, sku_id)

['14503', '4551247598']
14503 4551247598


In [280]:
# Show the url of the sample product, the same index that was downloaded into `res`.
product_urls[30000]

'https://goldapple.ru/19760343449-platinum-blonde-toning-shampoo'