In [1]:
import os
import re
from pathlib import Path
from random import random
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from tqdm.notebook import tqdm

In [2]:
# Output directories, one per export format; all are created up front so the
# final export cell can write without further checks.
p_excel, p_csv, p_parquet = (Path('data', fmt) for fmt in ('excel', 'csv', 'parquet'))
for _out_dir in (p_excel, p_csv, p_parquet):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def get_queue(cookies: dict = None):
    """
    Build the crawl queue: one row per subcategory link of pack24.ru.

    The home page is downloaded once.  Top-level categories are read from the
    ``<li>`` items of the ``<div>`` that follows ``<header>``; subcategories
    are read from the ``<ul>`` lists inside ``div.catalog-submenu``.  The two
    tables are joined on ``tk1_id`` with a right merge, so every subcategory
    is kept.

    Args:
        cookies: optional cookies forwarded to ``requests.get``.

    Returns:
        DataFrame with columns ``tk1_id`` (Int16), ``tk1_name``, ``tk1_link``,
        ``tk2_name`` and ``tk2_link``; or an ``Exception`` *instance* (returned,
        not raised) when the response status is not 200.
    """
    response = requests.get('https://pack24.ru/', cookies=cookies, timeout=10)
    if response.status_code != 200:
        # Returned rather than raised: the calling notebook cell tests the
        # result with isinstance(queue, Exception).
        return Exception('status_code not 200')

    soup = BS(response.text, 'html.parser')

    # Top-level categories: one anchor per <li> of the block after <header>.
    top_anchors = [
        li.find('a')
        for li in soup.find('header').find_next_sibling('div').find_all('li')
    ]
    categories = pd.DataFrame({
        'tk1_id': [anchor.get('data-id') for anchor in top_anchors],
        'tk1_name': [anchor.get_text(strip=True) for anchor in top_anchors],
        'tk1_link': [anchor.get('href') for anchor in top_anchors],
    })

    # Subcategories: each <ul> carries its parent id in the ``id`` attribute.
    # The first six characters are dropped -- presumably a fixed prefix in
    # front of the numeric category id; verify against the page markup.
    sub_rows = [
        (ul.get('id')[6:], li.find('a'))
        for ul in soup.find('div', class_="catalog-submenu").find_all('ul')
        for li in ul.find_all('li')
    ]
    subcategories = pd.DataFrame({
        'tk1_id': [parent_id for parent_id, _ in sub_rows],
        'tk2_name': [anchor.get_text(strip=True) for _, anchor in sub_rows],
        'tk2_link': [anchor.get('href') for _, anchor in sub_rows],
    })

    merged = categories.merge(right=subcategories, how='right')
    return merged.convert_dtypes().astype({'tk1_id': 'Int16'})

In [4]:
def worker_01(ss: BS) -> pd.DataFrame:
    """
    Parse SKU cards from the first page layout.

    Every ``div.item__body`` inside ``div.catalog`` produces one row.

    Args:
        ss: parsed category page.

    Returns:
        DataFrame with one row per card and the columns ``sku_id``,
        ``sku_name``, ``sku_href``, ``sku_desc`` (dict of property label ->
        value; empty when the card lists no properties), ``price_old`` and
        ``price_old_sale`` (``None`` when the card has no usable discount
        block), ``price``, ``price_opt`` and ``nds``.  Numeric fields are
        strings with whitespace removed.

    Raises:
        AttributeError: a mandatory element is missing (``get_sku`` treats
            this as "page is not of this layout" and tries the other parser).
        IndexError: a mandatory price element contains no number.
    """
    columns = [
        'sku_id', 'sku_name', 'sku_href', 'sku_desc',
        'price_old', 'price_old_sale', 'price', 'price_opt', 'nds',
    ]
    # NOTE: the "." is intentionally left unescaped (as in the original
    # pattern); it is what lets prices without a decimal part match.
    number_re = re.compile(r'(\d{,3}\s?\d+.\d+)')

    def first_number(text):
        # First price-like token of ``text`` with inner whitespace removed.
        return re.sub(r'\s', '', number_re.findall(text)[0])

    rows = []
    for item in ss.find('div', class_='catalog').find_all('div', class_="item__body"):
        name_link = item.find('a', class_='item__name')
        sku_name = name_link.get_text(strip=True)
        sku_id = item.find('input', type="hidden").get('value')
        sku_href = name_link.get('href')

        # Properties: "<span>label</span>value" pairs.  A dict is recorded for
        # every card -- even an empty one -- so all columns stay aligned.
        item_desc = {}
        for desc in item.find('ul', class_='item__desc').find_all('li'):
            try:
                label = desc.find('span')
                item_desc[label.get_text(strip=True)] = re.sub(
                    r'\n*\s+', ' ', label.next_sibling.get_text(strip=True))
            except AttributeError:
                # Malformed property line (no <span> or no value): skip it.
                continue

        info = item.find('div', class_='item__info')

        # The discount block is optional: a missing block or a block without
        # a number both mean "no old price".
        discount = info.find('div', class_="item__price--with-discount")
        try:
            price_old = first_number(discount.find('div').get_text())
        except (AttributeError, IndexError):
            price_old = None
        try:
            price_old_sale = discount.find('div').find_next_sibling('div').get_text(strip=True)
        except AttributeError:
            price_old_sale = None

        rows.append({
            'sku_id': sku_id,
            'sku_name': sku_name,
            'sku_href': sku_href,
            'sku_desc': item_desc,
            'price_old': price_old,
            'price_old_sale': price_old_sale,
            'price': first_number(
                item.find('div', class_='item__price').find('span').previous_element.get_text(strip=True)),
            'price_opt': first_number(
                info.find('div', class_="inline-block").get_text(strip=True)),
            'nds': first_number(
                item.find('div', class_='flex').find('p').find_next_sibling('p').get_text(strip=True)),
        })

    # ``columns`` keeps the schema stable even when the page has no cards.
    return pd.DataFrame(rows, columns=columns)
        

In [5]:
def worket_02(ss: BS) -> pd.DataFrame:
    """
    Parse SKU cards from the second page layout.

    Every ``li.category-item`` inside ``div.category.js_category`` produces
    one row.  This layout exposes only a name, a link, a list of free-text
    description lines and a single price, so ``price_old``,
    ``price_old_sale``, ``price_opt`` and ``nds`` are always ``None`` and no
    ``sku_id`` column is produced.

    Args:
        ss: parsed category page.

    Returns:
        DataFrame with the columns ``sku_name``, ``sku_href``, ``sku_desc``
        (dict ``'Описание N'`` -> text; empty when the card has no lines),
        ``price_old``, ``price_old_sale``, ``price``, ``price_opt``, ``nds``.
        ``price`` is a string with whitespace removed and "," replaced by ".".

    Raises:
        AttributeError: a mandatory element is missing from the page.
        IndexError: the price element contains no number.
    """
    columns = [
        'sku_name', 'sku_href', 'sku_desc',
        'price_old', 'price_old_sale', 'price', 'price_opt', 'nds',
    ]

    rows = []
    for item in ss.find('div', class_='category js_category').find_all('li', class_='category-item'):
        # A dict is recorded for every card -- even an empty one -- so all
        # columns stay the same length.
        item_desc = {
            f'Описание {index+1}': desc.get_text(strip=True)
            for index, desc in enumerate(item.find('ul').find_all('li'))
        }
        price_text = item.find('span', class_="category-item__pricy").get_text(strip=True)
        rows.append({
            'sku_name': item.find('meta').get('content').strip(),
            'sku_href': item.find('a').get('href'),
            'sku_desc': item_desc,
            'price_old': None,
            'price_old_sale': None,
            'price': re.sub(
                r'\s', '',
                re.findall(r'(\d{,3}\s?\d+.\d+)', price_text)[0].replace(',', '.')),
            'price_opt': None,
            'nds': None,
        })

    # ``columns`` keeps the schema stable even when the page has no cards.
    return pd.DataFrame(rows, columns=columns)

In [6]:
def get_sku(queue_df: pd.DataFrame, cookies: dict = None):
    """
    Download every queued subcategory page and parse its SKUs.

    Each page is tried with ``worker_01`` first and ``worket_02`` second,
    because the site uses two page layouts.  A handful of known pages that
    carry no SKUs are skipped.

    The site also has IP-based protection.  To get past it, follow the link
    the site shows, then look at the request for the page document, find the
    cookie named ``ihead_captcha_#############`` and pass its name and value
    through ``cookies``.

    Args:
        queue_df: queue with the columns ``tk1_id``, ``tk1_name``,
            ``tk1_link``, ``tk2_name`` and ``tk2_link``.
        cookies: optional cookies forwarded to every request.

    Returns:
        List of DataFrames, one per page that yielded at least one row, each
        extended with the five queue columns of its page.  Pages that cannot
        be downloaded or parsed are reported with ``print`` and skipped.
    """
    # Pages known to contain no SKUs.
    list_isnot = frozenset({
        'https://pack24.ru/polipropilenovye-pakety-bopp/paket-polipropilenovyj-donnaya-skladka',
        'https://pack24.ru/upakovochnaya-bumaga/bumaga-ofisnaya',
        'https://pack24.ru/termopakety-termosumki/termosumki',
        'https://pack24.ru/plomby/silovye-plomby',
        'https://pack24.ru/polipropilenovye-pakety-bopp/polipropilenovye-pakety-pod-zakaz',
    })
    queue_columns = ('tk1_id', 'tk1_name', 'tk1_link', 'tk2_name', 'tk2_link')
    # itertuples (rather than to_dict) keeps the original scalar types of the
    # queue values, e.g. the narrow integer type of tk1_id.
    queue_list = [dict(tup._asdict()) for tup in queue_df.itertuples(index=False)]

    df_list = []
    for row in tqdm(queue_list, desc='Downloads'):
        link = row.get('tk2_link')
        if link in list_isnot:
            continue

        # Up to three attempts, each preceded by a short random pause.
        response = None
        last_error = None
        for _ in range(3):
            sleep(random() + 0.1)
            try:
                response = requests.get(link, cookies=cookies, timeout=10)
                # An HTTP error page is not a category page; treat a 4xx/5xx
                # status as a failed attempt instead of parsing its body.
                response.raise_for_status()
            except requests.RequestException as exc:
                last_error = exc
                response = None
            else:
                break
        if response is None:
            print(('Error', link, repr(last_error)))
            continue

        soup = BS(response.text, 'html.parser')

        # Try both layouts.  Besides AttributeError (element not found), an
        # IndexError (no number in a price) or ValueError (inconsistent
        # column lengths) also just means "this parser does not fit" and
        # must not abort the whole crawl.
        df_tmp = pd.DataFrame()
        for parse_page in (worker_01, worket_02):
            try:
                df_tmp = parse_page(soup)
            except (AttributeError, IndexError, ValueError):
                continue
            else:
                break
        else:
            print(link)

        if df_tmp.shape[0]:
            df_list.append(
                df_tmp.assign(**{column: row.get(column) for column in queue_columns})
            )
    return df_list

In [7]:
# Optional anti-bot cookie for pack24.ru.  Supply it through the environment
# as "<cookie name>=<cookie value>", e.g.
#   PACK24_CAPTCHA_COOKIE="ihead_captcha_XXXXXXXXXXXX=<value>"
# instead of pasting a live token into the notebook.  When the variable is
# unset (or has no "="), no cookies are sent.
_captcha_cookie = os.environ.get('PACK24_CAPTCHA_COOKIE', '')
cookies = dict([_captcha_cookie.split('=', 1)]) if '=' in _captcha_cookie else None

In [8]:
"""ПОЛУЧЕНИЕ ОЧЕРЕДИ"""
queue = get_queue(cookies)
if isinstance(queue, Exception):
    # get_queue returns (rather than raises) its error.  Stop here so the
    # following cells do not fail obscurely on a queue that is not a DataFrame.
    raise queue

In [9]:
"""ПОЛУЧЕНИЕ ДАННЫХ"""
# Download and parse every queued page, stack the per-page frames into one
# table and cast the price-like text columns to nullable floats.
df_list = get_sku(queue, cookies)
df = (
    pd.concat(df_list, ignore_index=True)
    .convert_dtypes()
    .astype(dict.fromkeys(['price_old', 'price', 'price_opt', 'nds'], 'Float64'))
)


Downloads:   0%|          | 0/107 [00:00<?, ?it/s]

In [10]:
# Summarize the combined frame: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1577 entries, 0 to 1576
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sku_id          1552 non-null   string 
 1   sku_name        1577 non-null   string 
 2   sku_href        1577 non-null   string 
 3   sku_desc        1577 non-null   object 
 4   price_old       101 non-null    Float64
 5   price_old_sale  101 non-null    string 
 6   price           1577 non-null   Float64
 7   price_opt       1552 non-null   Float64
 8   nds             1552 non-null   Float64
 9   tk1_id          1577 non-null   Int16  
 10  tk1_name        1577 non-null   string 
 11  tk1_link        1577 non-null   string 
 12  tk2_name        1577 non-null   string 
 13  tk2_link        1577 non-null   string 
dtypes: Float64(4), Int16(1), object(1), string(8)
memory usage: 171.1+ KB


In [11]:
# Subcategory names (values of ``tk2_name``) kept as an optional row filter.
# To list every available name: df['tk2_name'].unique().tolist()
tk2_filter_list = [
    'Четырехклапанные гофрокороба',
    'Самосборные коробки',
    'Почтовые коробки',
    'Архивные коробки',
    'Подарочные коробки',
    'ЭКО-коробки (продукты, сувениры)',
    'Коробки для пиццы',
    'Обувные коробки',
]

In [12]:
# Flatten the per-SKU property dicts into one column per property label.
sku_props = pd.json_normalize(df.sku_desc)

# From 'Размер' take the first run of three integers separated by "-" or "*",
# split it into three unsigned columns (presumably length / width / height in
# mm, going by the column names) and add their product as the volume.
# NOTE(review): the product of three uint16 values can exceed the uint32
# range -- confirm the largest expected box dimensions.
box_dims = (
    sku_props['Размер']
    .str.extract(r'(\d+[-\*]\d+[-\*]\d+)')[0]
    .str.split(r'[-\*]', regex=True)
    .apply(pd.Series)
    .astype('uint16[pyarrow]')
    .rename(columns={0: 'Д, мм', 1: 'Ш, мм', 2: 'В, мм'})
    .assign(**{'Объем, мм3': lambda x: x.prod(axis=1, skipna=False).astype('uint32[pyarrow]')})
)

# Side-by-side join on the shared row index: SKU columns, selected
# properties, parsed dimensions.
df2 = pd.concat(
    [
        df.drop(columns=['sku_desc']),
        sku_props[['Материал', 'Плотность', 'Мин. покупка от']],
        box_dims,
    ],
    axis=1,
)
# Optional row filter, currently disabled:
#   df2.query('tk2_name.isin(@tk2_filter_list)')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1577 entries, 0 to 1576
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   sku_id           1552 non-null   string         
 1   sku_name         1577 non-null   string         
 2   sku_href         1577 non-null   string         
 3   price_old        101 non-null    Float64        
 4   price_old_sale   101 non-null    string         
 5   price            1577 non-null   Float64        
 6   price_opt        1552 non-null   Float64        
 7   nds              1552 non-null   Float64        
 8   tk1_id           1577 non-null   Int16          
 9   tk1_name         1577 non-null   string         
 10  tk1_link         1577 non-null   string         
 11  tk2_name         1577 non-null   string         
 12  tk2_link         1577 non-null   string         
 13  Материал         1250 non-null   object         
 14  Плотность        530 non

In [13]:
# Persist the flattened table in all three formats.
df2.to_excel(p_excel / 'result.xlsx', index=False)
df2.to_csv(p_csv / 'result.csv', index=False)
df2.to_parquet(p_parquet / 'result.parquet', engine='pyarrow', compression='zstd')