In [4]:
from typing import Tuple, Any

import pandas as pd
import numpy as np
import plotly.express as px
import requests
import bs4
from lxml import etree
import re
import fake_useragent
import threading
from queue import Queue
from time import sleep
import random
import configer

In [5]:
def get_category_data(type: str, cat_name: str = None) -> int | list:

    """
    Returns the requested data by category

    :param type: Type returned data. Should be one of: id, ru_name or cat_names
    :param cat_name: category name
    :return: returns the requested data
    """

    cat = {
        'makijazh':              {'id': 3, 'name': 'макияж'},
        'uhod':                  {'id': 4, 'name': 'уход'},
        'volosy':                {'id': 6, 'name': 'волосы'},
        'parfjumerija':          {'id': 7, 'name': 'парфюмерия'},
        'zdorov-e-i-apteka':     {'id': 3747, 'name': 'здоровье и аптека'},
        'sexual-wellness':       {'id': 5962, 'name': 'sexual wellness'},
        'azija':                 {'id': 10, 'name': 'азия'},
        'organika':              {'id': 12, 'name': 'органика'},
        'dlja-muzhchin':         {'id': 3887, 'name': 'для мужчин'},
        'dlja-detej':            {'id': 4357, 'name': 'для детей'},
        'tehnika':               {'id': 3870, 'name': 'техника'},
        'dlja-doma':             {'id': 8202, 'name': 'для дома'},
        'odezhda-i-aksessuary':  {'id': 8529, 'name': 'одежда и аксессуары'},
        'nizhnee-bel-jo':        {'id': 8563, 'name': 'нижнее бельё'},
        'ukrashenija':           {'id': 5746, 'name': 'украшения'},
        'lajfstajl':             {'id': 8579, 'name': 'лайфстайл'},
        'ini-formaty':           {'id': 5159, 'name': 'тревел-форматы'},
        'tovary-dlja-zhivotnyh': {'id': 7638, 'name': 'товары для животных'}
    }
    
    if type not in ['cat_names', 'id', 'ru_name']:
        raise ValueError('Parameter type should by "cat_names","id" or "ru_name"')
    
    if type == 'cat_names':
        return [cat for cat in cat.keys()]
    
    if cat_name is None:
        raise ValueError('Value cat_name should be not None')
    
    try:
        cat[cat_name]
    except KeyError:
        raise ValueError(f'Wrong category name: {cat_name}. \n'
                         f'Possible cat_names: {", ".join([name for name in cat.keys()])}')

    if type == 'id':
        return cat[cat_name]['id']
    elif type == 'ru_name':
        return cat[cat_name]['name']
    else:
        raise ValueError('type should be "cat_names", "id" or "ru_name"')

In [6]:
def get_product_data_by_url(url: str) -> tuple[Any, Any, Any]:
    """
    Get additional data by product URL.
    Additional data uncludes:
        - description
        - product usage
        - product composition

    """
    user_agent = fake_useragent.UserAgent().random
    try:
        res = requests.get(url, timeout=5, headers={'User-Agent': user_agent}).text
        res = etree.HTML(res)
    except:
        return None, None, None

    try:
        description: str or None = ' '.join(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text.replace('\n\n', '').split()) \
        if len(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')) > 0 \
           and res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text is not None\
        else None
    except:
        description = None

    try:
        product_usage: str or None = ' '.join(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')[0].text.replace('\n\n', '').split()) \
        if len(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')) > 0 \
        else None
    except:
        product_usage = None

    try:
        product_composition: str or None = ' '.join(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')[0].text.replace('\n\n', '').split()) \
        if len(res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')) > 0 \
        else None
    except:
        product_composition = None

    return description, product_usage, product_composition

In [7]:
def parse_category(queue: Queue, cat_name: str):
    category_id = get_category_data(cat_name=cat_name, type='id')
    category_ru_name = get_category_data(cat_name=cat_name, type='ru_name')
    fields = [
        'id',
        'sku',
        'name',
        'brand',
        'brand_type',
        'dimension17',
        'dimension18',
        'dimension19',
        'dimension20',
        'country',
        'price',
        'currency',
        'old_price',
        'category_type',
        'url',
        'images',
        'type',
        'volume',
        'main_product_sku',
        'main_product_id',
        'best_loyalty_price',
        'dimension29',
        'dimension28',
        'description',
        'product_usage',
        'product_composition',
        'category',
        'category_ru'
        ]

    for page in range(1, 10000):
        try:
            # ждем рандомный момент времени
            sleep(np.round(np.random.uniform(10, 20), 1))
            # генерим рандомный юезерагент
            user_agent = fake_useragent.UserAgent().random
            # пытаемся получить данные по странице с товарами
            res = requests.get(url, params={'cat': category_id, 'page': page}, timeout=5, headers={'User-Agent': user_agent}).json()['products']
        except:
            break
        # в случае количества страниц, кратным 20 можем получить страницу с 0 товарами
        if len(res) == 0:
            break
        # если добрались до последней страницы, собираем с нее остатки и возвращаем результат
        elif len(res) < 20:
            for product in res:
                # фильтруем только необходимые поля, если поля нет, то None
                product_new = {your_key: product[your_key] if your_key in product.keys() else None  for your_key in fields}
                # получаем данные из html странички, которые отсутствуют в запросе (исключив это можно значительно сократить время парсинга)
                description, product_usage, product_composition = get_product_data_by_url(product['url'])
                product_new['description'] = description
                product_new['product_usage'] = product_usage
                product_new['product_composition'] = product_composition
                product_new['category'] = cat_name
                product_new['category_ru'] = category_ru_name
                # кладем объект в очередь
                queue.put(product_new)
                queue.task_done()
            break
        else:
            for product in res:
                # фильтруем только необходимые поля, если поля нет, то None
                product_new = {your_key: product[your_key] if your_key in product.keys() else None  for your_key in fields}
                # получаем данные из html странички, которые отсутствуют в запросе (исключив это можно значительно сократить время парсинга)
                description, product_usage, product_composition = get_product_data_by_url(product['url'])
                product_new['description'] = description
                product_new['product_usage'] = product_usage
                product_new['product_composition'] = product_composition
                product_new['category'] = cat_name
                product_new['category_ru'] = category_ru_name
                # кладем объект в очередь
                queue.put(product_new)
                queue.task_done()


In [8]:
def save_to_pd_dataframe(queue: Queue, df: pd.DataFrame):
    while True:
        try:
            # sleep + timeout в requests не должны превышать данный timeout иначе парсинг может закончится раньше времени
            product_data = queue.get(timeout=30)
            df = pd.concat([
                df,
                pd.DataFrame([product_data])
            ])
            print(f'Len data: {len(df)}', end='\r')
        except:
            # если за 30 секунд в очереди не появилось данных, полагаем что парсинг завершен
            df.to_csv('products.csv', index=False)
            break


In [9]:
agent = fake_useragent.UserAgent().random
queue = Queue()
df = pd.DataFrame()
url = 'https://goldapple.ru/web_scripts/discover/category/products/'

In [10]:
categories_name = get_category_data(type='cat_names')

# создаем поток для каждой категории
threads = [threading.Thread(target=parse_category, args=(queue, category_name)) for category_name in categories_name]
# создаем поток для сохранения результатов парсинга в df
threads.append(threading.Thread(target=save_to_pd_dataframe, args=(queue, df)))
# стартуем потоки
[t.start() for t in threads]
# ждем завершения парсинга и сохранения результатов
[t.join() for t in threads]

Len data: 40580

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
data = pd.read_csv('products.csv')
data.info()

In [None]:
data.head(40)

In [None]:
data['brand_type'].value_counts()

In [None]:
res

In [None]:
res = requests.get('https://goldapple.ru/19000032959-detangling-comb-detangling-dog-grooming-brush-navy-orange').text
res = etree.HTML(res)
res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text is not None

In [None]:
def get_sitemats_list(url: str='https://goldapple.ru/sitemap.xml') -> list[str]:
    """
    Get sitemaps list from goldapple or another url.
    :param url: url to sitemap.xml
    :return: list of sitemaps urls
    """
    try:
        res = requests.get(url, timeout=30).text
    except requests.exceptions.ConnectTimeout as err:
        raise requests.exceptions.ConnectTimeout(f'connection timeout, {err}')
    except requests.exceptions.ConnectionError:
        raise requests.exceptions.ConnectionError(f'check url: {url}')
    except requests.exceptions.HTTPError as err:
        raise requests.exceptions.HTTPError(err)

    try:
        soup = bs4.BeautifulSoup(res, 'xml')
        res = [x.text for x in soup.find_all('loc')]
    except AttributeError as err:
        raise AttributeError(f'could not find attribute "loc", {err}')
    except Exception as err:
        raise f'Error parsing xml data: {err}'

    return res

In [None]:
def get_product_urls(sitemaps: list[str]) -> list[str]:
    """
    Get products urls from sitemaps.

    :param sitemaps: list of sitemaps urls
    :return: list of product links
    """
    cat_urls = []
    prod_urls = []
    pattern = re.compile(r'\d{10,12}')
    for sitemap in sitemaps:
        xml_products = requests.get(sitemap).text
        soup_products = bs4.BeautifulSoup(xml_products, 'xml')
        product_urls = [x.text for x in soup_products.find_all('loc')]
        for index, x in enumerate(product_urls):
            product = re.findall(pattern, x)
            if product:
                prod_urls.append(product_urls[index])
            else:
                cat_urls.append(product_urls[index])
    return cat_urls, prod_urls

In [None]:
def get_sku_and_product_id(url: str) -> tuple[Any, Any]:
    """
    Return sku_id and product_id from product URL
    :param url: url to product
    :return: [sku_id, prod_id]
    """
    pattern = re.compile(r'\d{4,20}')
    res_sku = re.findall(pattern, url)
    if len(res_sku) == 1:
        prod_id = None
        sku_id = res_sku[0]
    elif len(res_sku) > 1:
        prod_id = res_sku[0]
        sku_id = res_sku[1]
    else:
        prod_id = sku_id = None

    return sku_id, prod_id

In [None]:
sitemaps = get_sitemats_list()
cat_urls, product_urls = get_product_urls(sitemaps)

In [None]:
requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3}, ).json()['products']

In [None]:
old = {'id': '173372',
  'sku': '19000016941',
  'name': 'К122',
  'brand': 'Manly PRO',
  'brand_type': 'standard',
  'dimension17': 'Кисти для лица',
  'dimension18': 'Женский',
  'dimension19': None,
  'dimension20': None,
  'country': 'Россия',
  'price': 1630,
  'price_object': {'amount': '1630', 'currency': 'RUB'},
  'old_price': 1630,
  'old_price_object': {'amount': '1630', 'currency': 'RUB'},
       'category_type': 'Большая плоская кисть для пудры ',
  'url': 'https://goldapple.ru/19000016941-k122',
  'image_url': 'https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
  'webp_image_url': 'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
  'images': ['https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg'],
  'webp_images': ['https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
   'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg?quality=83',
   'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg?quality=83'],
  'is_saleable': True,
  'type': 'simple',
  'volume': '1',
       'best_loyalty_price': 1467,
  'best_loyalty_price_object': {'amount': '1467', 'currency': 'RUB'},
  'dimension29': False,
  'dimension28': False,
  'discover_tracking_click_url': 'http://recs.richrelevance.com/rrserver/apiclick?a=80fdecab677dbf85&cak=a850db30e0e760ca&channelId=a850db30e0e760ca&vg=8c855a7d-b5fe-418c-a4db-6060711c8360&stid=165&mrrid=110081820&pti=4&pa=47968&pn=-1&pos=0&p=19000016941&rid=msk&mvtId=55814&mvtTs=1656858237280',
       'resized_gallery_images': [],
  'need_render_old_price': True,
  'main_product_sku': '19000016941',
  'main_product_id': '173372',}

fields = [
        'id',
        'sku',
        'name',
        'brand',
        'brand_type',
        'dimension17',
        'dimension18',
        'dimension19',
        'dimension20',
        'country',
        'price',
        'old_price',
        'category_type',
        'url',
        'images',
        'type',
        'volume',
        'main_product_sku',
        'main_product_id',
        'best_loyality_price',
        'dimension29',
        'dimension28',
        'description',
        'product_usage',
        'product_composition',
        'category',
        'category_ru'
        ]

dict_you_want = {your_key: old[your_key] if your_key in old.keys() else None  for your_key in fields }
dict_you_want

In [None]:
product_template = {
    'id': None,
    'sku': None,
    'name': None,
    'brand': None,
    'brand_type': None,
    'dimension17': None,
    'dimension18': None,
    'dimension19': None,
    'dimension20': None,
    'country': None,
    'price': None,
    'currency': None,
    'old_price': None,
    'category_type': None,
    'url': None,
    'images': None,
    'type': None,
    'volume': None,
    'main_product_sku': None,
    'main_product_id': None,
    'best_loyality_price': None,
    'dimension29': None,
    'dimension28': None,
    'description': None,
    'product_usage': None,
    'product_composition': None,
    'category': None,
    'category_ru': None
}

In [None]:
len(cat_urls)

In [None]:
counter = 0
for url in cat_urls:
    url = url.split('https://goldapple.ru/')[1].split('/')
    print(url)
    counter += 1
    # if counter > 20:
    #     break

In [None]:
class Product:
    def __init__(self, product_page, product_url:str=None, product_id:int=None, sku_id:int=None):
        self.product_page: etree._Element = etree.HTML(product_page)
        self.product_url = product_url
        self.product_id: int or None = product_id
        self.sku_id: int or None = sku_id
        self.title: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/header/p')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/header/p')) > 0 \
            else None
        self.first_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')) > 0 \
            else None
        self.second_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[3]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[3]/a/span')) > 0 \
            else None
        self.third_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[4]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[4]/a/span')) > 0 \
            else None
        self.fourth_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[5]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[5]/a/span')) > 0 \
            else None
        self.price: int or None = int(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/div/span/span/span/span')[0].text.replace('₽', '').replace(u'\xa0', u' ').replace(u' ', u'').strip()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/div/span/span/span/span')) > 0 \
            else None
        self.loyalty_price: int or None = int(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/span/span/span/span')[0].text.replace('₽', '').replace(u'\xa0', u' ').replace(u' ', u'').strip()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/span/span/span/span')) > 0 \
            else None
        self.description: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')) > 0 \
            else None
        self.product_usage: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')) > 0 \
            else None
        self.product_composition: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')) > 0 \
            else None
        self.product_brand: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/header/h4/span')[0].text \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/header/h4/span')) > 0 \
            else None or self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/header/h4/span')[0].text \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/header/h4/span')) > 0 \
            else None


In [None]:
len(product_urls)

In [None]:
url = product_urls[4000]
res = requests.get(url, timeout=(3, 5)).content.decode('utf-8')

In [None]:
url

In [None]:
res = requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3, 'page': 1}).json()['products']
len(res)
res[0]

In [None]:
res = requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3, 'page': 743}).json()['products']
len(res)

In [None]:
from selenium import webdriver

browser = webdriver.Edge(executable_path='msedgedriver.exe')
browser.get(url)
html = browser.page_source
tree = etree.HTML(html)
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

In [None]:
res

In [None]:
res_soup = bs4.BeautifulSoup(res, 'html.parser')
tree = etree.HTML(res)

In [None]:
res_soup.find('div', {'class': 'product-card'})

In [None]:
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

In [None]:
prod = Product(res)

In [None]:
prod.__dict__

In [None]:
for x in res_soup.find_all('li', {'class': "pdp-breadcrumbs__crumb-item pdp-breadcrumbs__item"}):
    print(x.get_text())

In [None]:
str(etree.tostring(tree.xpath('/html/body')[0])).find('')

In [None]:
product_properties = {}
for x in tree.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[2]/dl')[0]:
    print(x[0][0].text)
    if 'Тип продукта' in x[0][0].text:
        product_properties['product_type'] = x[1][0].text
    if 'Для кого' in x[0][0].text:
        product_properties['who_use'] = x[1][0].text
    if 'Страна бренда' in x[0][0].text:
        product_properties['brand_country'] = x[1][0].text
    if 'Область применения' in x[0][0].text:
        product_properties['use_scope'] = x[1][0].text
    if 'Назначение' in x[0][0].text:
        product_properties['purpose'] = x[1][0].text

product_properties

In [None]:
MyClass = type("MyClass", (object, ), product_properties)

In [None]:
link = 'https://goldapple.ru/14503-4551247598-rastusevka-bez-granicdsf'
pattern = re.compile(r'\d{4,20}')
res_sku = re.findall(pattern, link)
if len(res_sku) == 1:
    prod_id = None
    sku_id = res_sku[0]
elif len(res_sku) > 1:
    prod_id = res_sku[0]
    sku_id = res_sku[1]
else:
    prod_id = sku_id = None
print(res_sku)
print(prod_id, sku_id)

In [8]:
data.head(40)

Unnamed: 0,id,sku,name,brand,brand_type,dimension17,dimension18,dimension19,dimension20,country,...,main_product_sku,main_product_id,best_loyality_price,dimension29,dimension28,description,product_usage,product_composition,category,category_ru
0,201225,19000049706,темно-синий,Tkano,standard,Постельное белье,Унисекс,,,Россия,...,19000049706,201225,,False,False,Комплект постельного белья темно-синего цвета ...,100% органический хлопок,,dlja-doma,для дома
1,214715,19000067648,Prestige Santorini Blue,ILUXE,standard,н/д,Унисекс,,,Россия,...,19000067648,214715,,False,False,Набор сомелье iLUXE Prestige разработан специа...,1. Установите резак поверх горлышка горизонтал...,Материал корпуса – алюминий и пластик.,dlja-doma,для дома
2,173622,19000019748,STEAMPOD 3.0 x BARBIE,L'Oreal Professionnel,standard,Стайлер,Унисекс,,,Франция,...,19000019748,173622,,False,False,Лимитированный профессиональный паровой стайле...,Предварительно нанесите на влажные волосы крем...,Марка L’Oréal Professionnel стала известной в ...,tehnika,техника
3,224665,19000077743,Blue Valentine,MORФEUS,standard,Постельное белье,Унисекс,,,Россия,...,19000077743,224665,,False,False,Евро-комплект постельного белья Blue Valentine...,Перед первым использованием рекомендуем постир...,"100% хлопок, сатин.",dlja-doma,для дома
4,222155,19000070441,Female Multiple,Solgar,standard,н/д,Женский,Женское здоровье,,США,...,19000070441,222155,,False,False,Данный комплекс создан специально для женщин и...,Женщинам по 1 таблетке 3 раза в день во время ...,Кальций 133 мг,zdorov-e-i-apteka,здоровье и аптека
5,155448,19760318176,TIME CONTROL +,TALIKA,standard,Массажер,Женский,,,Франция,...,19760318176,155448,,False,True,Косметический прибор,"включите прибор, нажав кнопку, и подождите, по...",АБС-терполимер Термопластический эластомер USB...,tehnika,техника
6,169605,12241-19000013502,лазурно-голубой,Belle YOU,standard,Комбинезон,Женский,,,Россия,...,19000013502,169604,,False,False,Базовый комбинезон с шортами выполнен из мягко...,"нательное, для домаших тренировок","96% полиамид, 4% эластан",nizhnee-bel-jo,нижнее бельё
7,148215,22470300026,из серебра с алмазной гранью,SOKOLOV,standard,Серьги,Женский,,,,...,22470300026,148215,,False,False,,,,ukrashenija,украшения
8,148216,22470300027,из серебра с алмазной гранью,SOKOLOV,standard,Серьги,Женский,,,,...,22470300027,148216,,False,False,,,,ukrashenija,украшения
9,200485,19000046442,Catnip Chaser,Petstages,standard,игрушка для животных,,,,США,...,19000046442,200485,,False,False,"Игрушка трек с пластиковым мячиком, тубом коша...",Подбирайте игрушки в соответствии с весом и дв...,Пластик,tovary-dlja-zhivotnyh,товары для животных


In [9]:
data['brand_type'].value_counts()

standard    1094
middle        23
special        3
Name: brand_type, dtype: int64

In [369]:
res

<coroutine object parse_category at 0x000001E81DE645F0>

In [233]:
res = requests.get('https://goldapple.ru/19000032959-detangling-comb-detangling-dog-grooming-brush-navy-orange').text
res = etree.HTML(res)
res.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text is not None

False

In [1]:
def get_sitemats_list(url: str='https://goldapple.ru/sitemap.xml') -> list[str]:
    """
    Get sitemaps list from goldapple or another url.
    :param url: url to sitemap.xml
    :return: list of sitemaps urls
    """
    try:
        res = requests.get(url, timeout=30).text
    except requests.exceptions.ConnectTimeout as err:
        raise requests.exceptions.ConnectTimeout(f'connection timeout, {err}')
    except requests.exceptions.ConnectionError:
        raise requests.exceptions.ConnectionError(f'check url: {url}')
    except requests.exceptions.HTTPError as err:
        raise requests.exceptions.HTTPError(err)

    try:
        soup = bs4.BeautifulSoup(res, 'xml')
        res = [x.text for x in soup.find_all('loc')]
    except AttributeError as err:
        raise AttributeError(f'could not find attribute "loc", {err}')
    except Exception as err:
        raise f'Error parsing xml data: {err}'

    return res

In [4]:
def get_product_urls(sitemaps: list[str]) -> list[str]:
    """
    Get products urls from sitemaps.

    :param sitemaps: list of sitemaps urls
    :return: list of product links
    """
    cat_urls = []
    prod_urls = []
    pattern = re.compile(r'\d{10,12}')
    for sitemap in sitemaps:
        xml_products = requests.get(sitemap).text
        soup_products = bs4.BeautifulSoup(xml_products, 'xml')
        product_urls = [x.text for x in soup_products.find_all('loc')]
        for index, x in enumerate(product_urls):
            product = re.findall(pattern, x)
            if product:
                prod_urls.append(product_urls[index])
            else:
                cat_urls.append(product_urls[index])
    return cat_urls, prod_urls

In [7]:
def get_sku_and_product_id(url: str) -> tuple[Any, Any]:
    """
    Return sku_id and product_id from product URL
    :param url: url to product
    :return: [sku_id, prod_id]
    """
    pattern = re.compile(r'\d{4,20}')
    res_sku = re.findall(pattern, url)
    if len(res_sku) == 1:
        prod_id = None
        sku_id = res_sku[0]
    elif len(res_sku) > 1:
        prod_id = res_sku[0]
        sku_id = res_sku[1]
    else:
        prod_id = sku_id = None

    return sku_id, prod_id

In [8]:
sitemaps = get_sitemats_list()
cat_urls, product_urls = get_product_urls(sitemaps)

In [32]:
requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3}, ).json()['products']

[{'id': '173372',
  'sku': '19000016941',
  'name': 'К122',
  'brand': 'Manly PRO',
  'brand_type': 'standard',
  'dimension17': 'Кисти для лица',
  'dimension18': 'Женский',
  'dimension19': None,
  'dimension20': None,
  'country': 'Россия',
  'price': 1630,
  'price_object': {'amount': '1630', 'currency': 'RUB'},
  'old_price': 1630,
  'old_price_object': {'amount': '1630', 'currency': 'RUB'},
  'category_type': 'Большая плоская кисть для пудры ',
  'url': 'https://goldapple.ru/19000016941-k122',
  'image_url': 'https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
  'webp_image_url': 'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
  'images': ['https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_3_3apxe5h

In [14]:
old = {'id': '173372',
  'sku': '19000016941',
  'name': 'К122',
  'brand': 'Manly PRO',
  'brand_type': 'standard',
  'dimension17': 'Кисти для лица',
  'dimension18': 'Женский',
  'dimension19': None,
  'dimension20': None,
  'country': 'Россия',
  'price': 1630,
  'price_object': {'amount': '1630', 'currency': 'RUB'},
  'old_price': 1630,
  'old_price_object': {'amount': '1630', 'currency': 'RUB'},
       'category_type': 'Большая плоская кисть для пудры ',
  'url': 'https://goldapple.ru/19000016941-k122',
  'image_url': 'https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
  'webp_image_url': 'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
  'images': ['https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg',
   'https://goldapple.ru/media/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg'],
  'webp_images': ['https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
   'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg?quality=83',
   'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg?quality=83'],
  'is_saleable': True,
  'type': 'simple',
  'volume': '1',
       'best_loyalty_price': 1467,
  'best_loyalty_price_object': {'amount': '1467', 'currency': 'RUB'},
  'dimension29': False,
  'dimension28': False,
  'discover_tracking_click_url': 'http://recs.richrelevance.com/rrserver/apiclick?a=80fdecab677dbf85&cak=a850db30e0e760ca&channelId=a850db30e0e760ca&vg=8c855a7d-b5fe-418c-a4db-6060711c8360&stid=165&mrrid=110081820&pti=4&pa=47968&pn=-1&pos=0&p=19000016941&rid=msk&mvtId=55814&mvtTs=1656858237280',
       'resized_gallery_images': [],
  'need_render_old_price': True,
  'main_product_sku': '19000016941',
  'main_product_id': '173372',}

fields = [
        'id',
        'sku',
        'name',
        'brand',
        'brand_type',
        'dimension17',
        'dimension18',
        'dimension19',
        'dimension20',
        'country',
        'price',
        'old_price',
        'category_type',
        'url',
        'images',
        'type',
        'volume',
        'main_product_sku',
        'main_product_id',
        'best_loyality_price',
        'dimension29',
        'dimension28',
        'description',
        'product_usage',
        'product_composition',
        'category',
        'category_ru'
        ]

dict_you_want = {your_key: old[your_key] if your_key in old.keys() else None  for your_key in fields }
dict_you_want

{'id': '173372',
 'sku': '19000016941',
 'name': 'К122',
 'brand': 'Manly PRO',
 'brand_type': 'standard',
 'dimension17': 'Кисти для лица',
 'dimension18': 'Женский',
 'dimension19': None,
 'dimension20': None,
 'country': 'Россия',
 'price': 1630,
 'old_price': 1630,
 'category_type': 'Большая плоская кисть для пудры ',
 'url': 'https://goldapple.ru/19000016941-k122',
 'images': ['https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
  'https://goldapple.ru/media/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg',
  'https://goldapple.ru/media/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg'],
 'type': 'simple',
 'volume': '1',
 'main_product_sku': '19000016941',
 'main_product_id': '173372',
 'best_loyality_price': None,
 'dimension29': False,
 'dimension28': False,
 'description': None,
 'product_usage': None,
 'product_composition': None,
 'category': None,
 'category_ru': None}

In [None]:
product_template = {
    'id': None,
    'sku': None,
    'name': None,
    'brand': None,
    'brand_type': None,
    'dimension17': None,
    'dimension18': None,
    'dimension19': None,
    'dimension20': None,
    'country': None,
    'price': None,
    'currency': None,
    'old_price': None,
    'category_type': None,
    'url': None,
    'images': None,
    'type': None,
    'volume': None,
    'main_product_sku': None,
    'main_product_id': None,
    'best_loyality_price': None,
    'dimension29': None,
    'dimension28': None,
    'description': None,
    'product_usage': None,
    'product_composition': None,
    'category': None,
    'category_ru': None
}

In [9]:
len(cat_urls)

10359

In [27]:
counter = 0
for url in cat_urls:
    url = url.split('https://goldapple.ru/')[1].split('/')
    print(url)
    counter += 1
    # if counter > 20:
    #     break

['uhod']
['volosy']
['parfjumerija']
['aksessuary']
['azija']
['organika']
['makijazh', 'lico']
['makijazh', 'glaza']
['makijazh', 'guby']
['makijazh', 'brovi']
['makijazh', 'nogti']
['makijazh', 'kisti']
['makijazh', 'paletki']
['makijazh', 'nabory']
['makijazh', 'aksessuary']
['makijazh', 'detjam']
['uhod', 'uhod-za-licom']
['uhod', 'uhod-za-telom']
['uhod', 'nabory']
['volosy', 'shampun']
['volosy', 'kondicioner']
['volosy', 'suhoj-shampun']
['volosy', 'uhod-za-volosami']
['volosy', 'stajling']
['volosy', 'aksessuary']
['volosy', 'okrashivanie']
['volosy', 'dlja-stilistov']
['parfjumerija', 'zhenskie-aromaty-parfume']
['parfjumerija', 'muzhskie-aromaty']
['parfjumerija', 'nishevaja-parfjumerija']
['parfjumerija', 'aromaty-dlja-doma']
['parfjumerija', 'nabory']
['parfjumerija', 'aksessuary']
['aksessuary', 'dlja-vizazhistov']
['azija', 'makijazh']
['catalog', 'category', 'view', 'id', '82']
['organika', 'volosy']
['makijazh', 'lico', 'prajmery']
['makijazh', 'lico', 'tonal-nye-sredst

In [35]:
class Product:
    def __init__(self, product_page, product_url:str=None, product_id:int=None, sku_id:int=None):
        self.product_page: etree._Element = etree.HTML(product_page)
        self.product_url = product_url
        self.product_id: int or None = product_id
        self.sku_id: int or None = sku_id
        self.title: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/header/p')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/header/p')) > 0 \
            else None
        self.first_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')) > 0 \
            else None
        self.second_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[3]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[3]/a/span')) > 0 \
            else None
        self.third_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[4]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[4]/a/span')) > 0 \
            else None
        self.fourth_subcategory: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[5]/a/span')[0].text.strip() \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[5]/a/span')) > 0 \
            else None
        self.price: int or None = int(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/div/span/span/span/span')[0].text.replace('₽', '').replace(u'\xa0', u' ').replace(u' ', u'').strip()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/div/span/span/span/span')) > 0 \
            else None
        self.loyalty_price: int or None = int(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/span/span/span/span')[0].text.replace('₽', '').replace(u'\xa0', u' ').replace(u' ', u'').strip()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[1]/section[3]/div/form/div[3]/div/span/span/span/span')) > 0 \
            else None
        self.description: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[1]')) > 0 \
            else None
        self.product_usage: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/section')) > 0 \
            else None
        self.product_composition: str or None = ' '.join(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')[0].text.replace('\n\n', '').split()) \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/section')) > 0 \
            else None
        self.product_brand: str or None = self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/header/h4/span')[0].text \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[3]/article/div/header/h4/span')) > 0 \
            else None or self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/header/h4/span')[0].text \
            if len(self.product_page.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[2]/article/div/header/h4/span')) > 0 \
            else None


In [8]:
len(product_urls)

72182

In [58]:
url = product_urls[4000]
res = requests.get(url, timeout=(3, 5)).content.decode('utf-8')

In [59]:
url

'https://goldapple.ru/24461400018-nejlonovye-volokna-10-1682'

In [95]:
res = requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3, 'page': 1}).json()['products']
len(res)
res[0]

{'id': '173372',
 'sku': '19000016941',
 'name': 'К122',
 'brand': 'Manly PRO',
 'brand_type': 'standard',
 'dimension17': 'Кисти для лица',
 'dimension18': 'Женский',
 'dimension19': None,
 'dimension20': None,
 'country': 'Россия',
 'price': 1630,
 'price_object': {'amount': '1630', 'currency': 'RUB'},
 'old_price': 1630,
 'old_price_object': {'amount': '1630', 'currency': 'RUB'},
 'category_type': 'Большая плоская кисть для пудры ',
 'url': 'https://goldapple.ru/19000016941-k122',
 'image_url': 'https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
 'webp_image_url': 'https://goldapple.ru/web_scripts/webp/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg?quality=83',
 'images': ['https://goldapple.ru/media/catalog/product/6/9/6921166812419_1_mixjzksioyjod7da.jpg',
  'https://goldapple.ru/media/catalog/product/6/9/6921166812419_2_chfmwll08kkefjw8.jpg',
  'https://goldapple.ru/media/catalog/product/6/9/6921166812419_3_3apxe5hspdnd5it6.jpg'],
 'we

In [112]:
res = requests.get('https://goldapple.ru/web_scripts/discover/category/products/', params={'cat': 3, 'page': 743}).json()['products']
len(res)

15

In [81]:
from selenium import webdriver

browser = webdriver.Edge(executable_path='msedgedriver.exe')
browser.get(url)
html = browser.page_source
tree = etree.HTML(html)
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

  browser = webdriver.Edge(executable_path='msedgedriver.exe')


SessionNotCreatedException: Message: session not created: This version of Microsoft Edge WebDriver only supports Microsoft Edge version 105
Current browser version is 103.0.1264.44 with binary path C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe
Stacktrace:
Backtrace:
	Microsoft::Applications::Events::EventProperties::unpack [0x00007FF68415B4A2+27426]
	Microsoft::Applications::Events::GUID_t::GUID_t [0x00007FF6840AF0C2+339266]
	Ordinal0 [0x00007FF683C47801+620545]
	Ordinal0 [0x00007FF683C73AC7+801479]
	Ordinal0 [0x00007FF683C6F6EC+784108]
	Ordinal0 [0x00007FF683C6A888+764040]
	Ordinal0 [0x00007FF683CA4ABF+1002175]
	Ordinal0 [0x00007FF683C9F1D3+979411]
	Ordinal0 [0x00007FF683C76020+811040]
	Ordinal0 [0x00007FF683C77618+816664]
	Microsoft::Applications::Events::EventProperty::EventProperty [0x00007FF683F08A78+127864]
	Microsoft::Applications::Events::EventProperty::EventProperty [0x00007FF683EF24BB+36283]
	Microsoft::Applications::Events::EventProperty::EventProperty [0x00007FF683EF53FC+48380]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF683D50E23+22563]
	Microsoft::Applications::Events::GUID_t::GUID_t [0x00007FF6840B699B+370203]
	Microsoft::Applications::Events::GUID_t::GUID_t [0x00007FF6840BBED4+392020]
	Microsoft::Applications::Events::GUID_t::GUID_t [0x00007FF6840BC02D+392365]
	Microsoft::Applications::Events::time_ticks_t::time_ticks_t [0x00007FF6840C6C89+41193]
	BaseThreadInitThunk [0x00007FFAC6E554E0+16]
	RtlUserThreadStart [0x00007FFAC898485B+43]


In [None]:
res

In [60]:
res_soup = bs4.BeautifulSoup(res, 'html.parser')
tree = etree.HTML(res)

In [67]:
res_soup.find('div', {'class': 'product-card'})

<div class="product-card"><div class="product-card__i"><div class="product-card__images _skeletoned"></div><div class="product-card__details"><div class="product-card__type"><span class="_skeletoned"></span><span class="_skeletoned"></span></div><div class="product-card__full-name"><span class="_skeletoned"></span><span class="_skeletoned"></span></div><div class="product-card__pricing"><span class="_skeletoned"></span></div></div></div></div>

In [61]:
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

[]

In [40]:
prod = Product(res)

In [41]:
prod.__dict__

{'product_page': <Element html at 0x1c85ba275c0>,
 'product_url': None,
 'product_id': None,
 'sku_id': None,
 'title': 'Кисть для тональной основы',
 'first_subcategory': None,
 'second_subcategory': None,
 'third_subcategory': None,
 'fourth_subcategory': None,
 'price': 390,
 'loyalty_price': 0,
 'description': None,
 'product_usage': None,
 'product_composition': None,
 'product_brand': None}

In [None]:
for x in res_soup.find_all('li', {'class': "pdp-breadcrumbs__crumb-item pdp-breadcrumbs__item"}):
    print(x.get_text())

In [52]:
str(etree.tostring(tree.xpath('/html/body')[0])).find('')

-1

In [None]:
product_properties = {}
for x in tree.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[2]/dl')[0]:
    print(x[0][0].text)
    if 'Тип продукта' in x[0][0].text:
        product_properties['product_type'] = x[1][0].text
    if 'Для кого' in x[0][0].text:
        product_properties['who_use'] = x[1][0].text
    if 'Страна бренда' in x[0][0].text:
        product_properties['brand_country'] = x[1][0].text
    if 'Область применения' in x[0][0].text:
        product_properties['use_scope'] = x[1][0].text
    if 'Назначение' in x[0][0].text:
        product_properties['purpose'] = x[1][0].text

product_properties

In [None]:
MyClass = type("MyClass", (object, ), product_properties)

In [None]:
link = 'https://goldapple.ru/14503-4551247598-rastusevka-bez-granicdsf'
pattern = re.compile(r'\d{4,20}')
res_sku = re.findall(pattern, link)
if len(res_sku) == 1:
    prod_id = None
    sku_id = res_sku[0]
elif len(res_sku) > 1:
    prod_id = res_sku[0]
    sku_id = res_sku[1]
else:
    prod_id = sku_id = None
print(res_sku)
print(prod_id, sku_id)

In [299]:
res



In [330]:
res_soup = bs4.BeautifulSoup(res_text, 'html.parser')
tree = etree.HTML(res)

In [57]:
prod = Product(res, )

In [107]:
prod.__dict__

{'product_page': <Element html at 0x15759f2acc0>,
 'product_id': None,
 'sku_id': None,
 'title': 'Ёршики межзубные M d 3.0 мм',
 'first_subcategory': None,
 'second_subcategory': None,
 'third_subcategory': None,
 'fourth_subcategory': None,
 'price': 504,
 'loyalty_price': 453,
 'description': 'Межзубные щетки (ершики) необходимы каждому человеку для тщательной очистки межзубных пространств. Также ими можно пользоваться для лучшей гигиены при ношении ортодонтических конструкций. Цилиндрические ершики Lacalut имеют высокопрочные нейлоновые щетинки и удобную цветную ручку. В упаковке комплект из 5 штук. Для подбора нужного диаметра ершиков необходимо обратиться за консультацией к стоматологу.',
 'product_usage': 'Перед первым применением рекомендуется промыть межзубной ершик в теплой воде. После чистки каждой зоны и в конце чистки необходимо тщательно промыть межзубной ершик теплой водой. При ежедневном использовании рекомендуется менять межзубной ершик каждую неделю. Средство индивиду

In [356]:
for x in res_soup.find_all('li', {'class': "pdp-breadcrumbs__crumb-item pdp-breadcrumbs__item"}):
    print(x.get_text())

In [349]:
tree.xpath('/html/body/div[1]/main/div/div/section/header/section[1]/nav/ul/li[2]/a/span')

[]

In [139]:
product_properties = {}
for x in tree.xpath('/html/body/div[1]/main/div/div/section/section[3]/section[2]/section/section[1]/div/ul/li[1]/article/div/section[2]/dl')[0]:
    print(x[0][0].text)
    if 'Тип продукта' in x[0][0].text:
        product_properties['product_type'] = x[1][0].text
    if 'Для кого' in x[0][0].text:
        product_properties['who_use'] = x[1][0].text
    if 'Страна бренда' in x[0][0].text:
        product_properties['brand_country'] = x[1][0].text
    if 'Область применения' in x[0][0].text:
        product_properties['use_scope'] = x[1][0].text
    if 'Назначение' in x[0][0].text:
        product_properties['purpose'] = x[1][0].text

product_properties

Тип продукта
Для кого
Назначение
Область применения
Страна бренда


{'product_type': 'межзубный ершик',
 'who_use': 'унисекс',
 'purpose': 'личная гигиена',
 'use_scope': 'для полости рта',
 'brand_country': 'Германия'}

In [242]:
MyClass = type("MyClass", (object, ), product_properties)

SyntaxError: invalid syntax (2261796374.py, line 2)

In [106]:
link = 'https://goldapple.ru/14503-4551247598-rastusevka-bez-granicdsf'
pattern = re.compile(r'\d{4,20}')
res_sku = re.findall(pattern, link)
if len(res_sku) == 1:
    prod_id = None
    sku_id = res_sku[0]
elif len(res_sku) > 1:
    prod_id = res_sku[0]
    sku_id = res_sku[1]
else:
    prod_id = sku_id = None
print(res_sku)
print(prod_id, sku_id)

['14503', '4551247598']
14503 4551247598


In [280]:
product_urls[30000]

'https://goldapple.ru/19760343449-platinum-blonde-toning-shampoo'