In [1]:
import time
import re
import json
import requests
import urllib.parse
import pandas as pd
from lxml import html 
from bs4 import BeautifulSoup

In [2]:
def convert_to_json(text):
    """Parse JSON text whose double quotes were escaped as ``&q;``.

    Every ``&q;`` sequence in ``text`` is turned back into a literal ``"``
    before the result is handed to ``json.loads``. No other escape sequence
    is touched.
    """
    unescaped = text.replace('&q;', '"')
    return json.loads(unescaped)

In [3]:
def extract_json_from_html(link, timeout=30):
    """Download a page and decode the client-state JSON embedded in it.

    The page is fetched with ``requests`` and searched for the element
    ``<script id="rz-client-state">``. Its text is decoded with
    ``convert_to_json``.

    Args:
        link: URL of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without it a stalled connection would block the
            whole scrape indefinitely.

    Returns:
        The decoded JSON of the first matching script element, or an empty
        dict when the page has no such element.
    """
    # Named `page_source` rather than `html` so the `html` module imported
    # from lxml at the top of the notebook is not shadowed.
    page_source = requests.get(link, timeout=timeout).text
    soup = BeautifulSoup(page_source, 'lxml')
    state_script = soup.select_one('script#rz-client-state')

    if state_script is None:
        return {}

    return convert_to_json(state_script.text)


def get_subcategories(url):
    """Return the subcategory entries listed under the selected menu sections.

    The client-state JSON of the page at ``url`` is loaded with
    ``extract_json_from_html``. From the hard-coded super-portal API entry,
    the ``content`` of the second block is scanned, and the ``items`` of every
    section whose ``title`` is one of the wanted titles are collected.

    Args:
        url: Address of the portal page to read.

    Returns:
        A flat list with the ``items`` of all matching sections, in the order
        they are encountered.
    """
    state = extract_json_from_html(url)

    # NOTE(review): the key is tied to category_id=4626923 regardless of `url`.
    api_key = "G.https://xl-catalog-api.rozetka.com.ua/v2/super-portals/get?front-type=xl&a;category_id=4626923&a;lang=ua?"
    sections_by_group = state[api_key]['body']['data']['blocks'][1]['content']
    wanted_titles = {'Міцні напої', "Вино\t", "Пиво та сидр"}

    return [
        entry
        for sections in sections_by_group.values()
        for section in sections
        if section['title'] in wanted_titles
        for entry in section['items']
    ]

In [4]:
def parse_comments(comments_json):
    """Flatten raw comment records into plain dictionaries.

    Args:
        comments_json: Iterable of comment mappings as found in the API
            response.

    Returns:
        A list with one dict per comment holding the id, text, mark,
        dignity/shortcomings fields, the number of attachments and replies,
        the buyer flag and the ISO creation timestamp.
    """
    def flatten(raw):
        # Keys are inserted in a fixed order so every row has the same layout.
        row = {"comment_id": raw['id']}
        for field in ("text", "mark", "percent_dignity", "dignity", "shortcomings"):
            row[field] = raw[field]
        row["attachments_number"] = len(raw['attachments'])
        row["replies_number"] = len(raw['replies'])
        row["is_from_buyer"] = raw['from_buyer']
        row["created_at"] = raw['commentdatajson']["created_iso"]
        return row

    return [flatten(raw) for raw in comments_json]
        
def _fetch_comments_page(link_template, qs_dict, request_timeout):
    """Request one page of the comments API and return the decoded JSON body."""
    qs = urllib.parse.urlencode(qs_dict, True)
    response_text = requests.get(link_template + qs, timeout=request_timeout).text
    return json.loads(response_text)


def get_product_comments(product_id, timeout=1, request_timeout=30):
    """Download every comment of one product from the comments API.

    The first page is requested to learn the number of pages and the product
    title; the remaining pages are then requested one by one.

    Args:
        product_id: Identifier sent as the ``goods`` query parameter.
        timeout: Pause in seconds after *each* API request. Despite the name
            this is a throttling delay, not an HTTP timeout; the name is kept
            for backward compatibility.
        request_timeout: Seconds ``requests`` waits for the server before
            raising a timeout error.

    Returns:
        A list of dicts as produced by ``parse_comments``, each extended with
        an ``item_name`` key holding the product's ``fulltitle``.
    """
    link_template = 'https://product-api.rozetka.com.ua/v3/comments/get?'

    qs_dict = {
        "front-type": ["xl"],
        "goods": [product_id],
        "page": ["1"],
        "sort": ["date"],
        "limit": ["10"],
        "lang": ["ua"],
    }

    response_json = _fetch_comments_page(link_template, qs_dict, request_timeout)
    time.sleep(timeout)

    pages_count = response_json["data"]["pages"]["count"]
    item_name = response_json['data']["record"]["fulltitle"]

    # Comments from the first page.
    comments = parse_comments(response_json['data']['comments'])

    for page in range(2, pages_count + 1):
        # Keep the same list-of-str shape as the other query values.
        qs_dict['page'] = [str(page)]
        response_json = _fetch_comments_page(link_template, qs_dict, request_timeout)
        # Throttle after every request; previously only the first request was
        # followed by a pause, so multi-page products hit the API back-to-back.
        time.sleep(timeout)
        comments.extend(parse_comments(response_json['data']['comments']))

    for comment in comments:
        comment["item_name"] = item_name

    return comments

In [5]:
def _iter_state_data(state_json):
    """Yield each ``entry['body']['data']`` dict found in a client-state JSON."""
    for entry in state_json.values():
        # Non-dict entries are skipped: on a string, `'body' in entry` would be
        # a substring test and the following lookup would raise TypeError.
        if not isinstance(entry, dict):
            continue
        body = entry.get('body')
        if not isinstance(body, dict):
            continue
        data = body.get('data')
        if isinstance(data, dict):
            yield data


def get_products_ids(category_link):
    """Collect the product ids listed on every page of a category.

    The first page is read with ``extract_json_from_html``; ids are gathered
    from every state entry that carries ``body.data.ids``, and the page count
    is taken from the first truthy ``body.data.total_pages``. Pages
    ``2..total_pages`` are then read from ``<category_link>page=<n>/``.

    Args:
        category_link: URL of the first page of the category.

    Returns:
        A list of product ids in the order they were encountered.
    """
    product_ids = []
    pages_count = 0

    for data in _iter_state_data(extract_json_from_html(category_link)):
        if 'ids' in data:
            product_ids.extend(data['ids'])
        # Number of pages in the category; the first truthy value wins.
        if not pages_count and 'total_pages' in data:
            pages_count = data['total_pages']

    # The page suffix is appended as a path segment, so make sure the base link
    # ends with a slash instead of gluing "page=N/" onto the last segment.
    base_link = category_link if category_link.endswith('/') else category_link + '/'

    for page in range(2, pages_count + 1):
        next_page_link = base_link + f'page={page}/'
        for data in _iter_state_data(extract_json_from_html(next_page_link)):
            if 'ids' in data:
                product_ids.extend(data['ids'])

    return product_ids

In [6]:
def get_all_comments(category):
    """Gather the comments of every product in one category.

    Prints the category title and link, looks up the category's product ids
    with ``get_products_ids``, downloads the comments of each product with
    ``get_product_comments`` and finally prints how many comments were found.

    Args:
        category: Mapping with ``"title"`` and ``"link"`` keys.

    Returns:
        A list of comment dicts, each tagged with a ``category`` key holding
        the category title.
    """
    category_title = category["title"]
    category_link = category["link"]
    print(category_title)
    print(category_link)

    result = []
    for product_id in get_products_ids(category_link):
        product_comments = get_product_comments(product_id)
        for comment in product_comments:
            comment['category'] = category_title
        result.extend(product_comments)

    print(len(result))
    return result

In [7]:
# Read the subcategory list (title + link entries) from the alcohol portal page.
categories = get_subcategories('https://rozetka.com.ua/ua/alkoholnie-napitki-i-produkty/c4626923/')

In [8]:
# Display the scraped subcategories to check them before the long scrape.
categories

[{'title': 'Віскі', 'link': 'https://rozetka.com.ua/viski/c4649130/'},
 {'title': 'Коньяк',
  'link': 'https://rozetka.com.ua/konyak-i-brendi/c4649136/'},
 {'title': 'Горілка', 'link': 'https://rozetka.com.ua/vodka/c4649154/'},
 {'title': 'Ром', 'link': 'https://rozetka.com.ua/rom/c4649142/'},
 {'title': 'Джин', 'link': 'https://rozetka.com.ua/dgin/c4649166/'},
 {'title': 'Текіла',
  'link': 'https://rozetka.com.ua/tekila-i-meskal/c4649148/'},
 {'title': 'Грапа та кальвадос',
  'link': 'https://rozetka.com.ua/grappa-i-kalvados/c4649178/'},
 {'title': 'Лікери та аперитиви',
  'link': 'https://rozetka.com.ua/liquor-vermouth-syrup/c4625409/'},
 {'title': 'Тихе вино', 'link': 'https://rozetka.com.ua/tihoe-vino/c4649052/'},
 {'title': 'Вермут', 'link': 'https://rozetka.com.ua/vermut/c4649064/'},
 {'title': 'Ігристе вино',
  'link': 'https://rozetka.com.ua/shampanskoe-i-igristoe-vino/c4649058/vid175512=asti-asti,fragolino-fragolino,franchakorta-franciacorta,frizzante-frizzante,igristoe,kava-

In [9]:
# Scrape each category in turn and append its comments to comments.csv, so
# results already written survive if a later category fails.
# NOTE(review): header=False means the file never receives a column-name row,
# and mode='a' means re-running this cell appends the same rows again —
# delete comments.csv before a fresh run.
for category in categories:
    comments = get_all_comments(category)
    df = pd.DataFrame(comments)
    df.to_csv('comments.csv', mode='a', header=False)

        

Пиво
https://rozetka.com.ua/pivo/c4626589/
2022
Сидр
https://rozetka.com.ua/sidr/c4649196/
115
Слабоалкогольні напої
https://rozetka.com.ua/slaboalkogoljnye-napitki/c4628313/
40
