### Imports

In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import datetime
from deep_translator import GoogleTranslator

### Utils

In [2]:
def translate(text, source='pl', target='en'):
    """Translate ``text`` from ``source`` to ``target`` using GoogleTranslator.

    Parameters
    ----------
    text : str or None
        Text to translate.
    source : str
        Source language code (Polish by default).
    target : str
        Target language code (English by default).

    Returns
    -------
    str or None
        The translated text. ``None`` and the empty string are returned
        unchanged without contacting the translation service.
    """
    if not text:
        # extract() returns None for elements that are absent from a review,
        # so a missing review body can reach this function; there is nothing
        # to translate, hand the value back as-is.
        return text
    return GoogleTranslator(source=source, target=target).translate(text)

In [10]:
def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Parameters
    ----------
    ancestor : tag-like object
        Element to read from. It must provide ``select``, ``select_one``,
        ``text`` and item access by attribute name (in this notebook it is a
        BeautifulSoup tag).
    selector : str or None
        CSS selector evaluated relative to ``ancestor``. When omitted, the
        value is read from ``ancestor`` itself.
    attribute : str or None
        Attribute to read. When omitted, the element's text is used.
    multiple : bool
        If true (and ``selector`` is given), return one value per matching
        element instead of only the first match.

    Returns
    -------
    str, list or None
        The stripped value, a list of values when ``multiple`` is set, or
        ``None`` when the element or the requested attribute is missing.
        With ``multiple``, an element lacking the attribute contributes
        ``None`` so list positions still correspond to matched elements.
    """
    def read(tag):
        # Single place that turns one element into a value, so every code
        # path treats "missing" and whitespace the same way.
        if tag is None:
            return None
        if attribute:
            try:
                value = tag[attribute]
            except KeyError:
                # Element exists but does not carry the attribute.
                return None
        else:
            value = tag.text
        # Only strings are stripped; any other value is passed through.
        return value.strip() if isinstance(value, str) else value

    if not selector:
        return read(ancestor)
    if multiple:
        return [read(tag) for tag in ancestor.select(selector)]
    return read(ancestor.select_one(selector))

In [11]:
            
# Field name -> positional arguments for extract(): (selector[, attribute[, multiple]]).
# A selector of None means the value is read from the opinion element itself.
selectors = {
    # identification
    'opinion_id': (None, "data-entry-id"),
    'author': ("span.user-post__author-name",),
    # rating
    'recomend': ("span.user-post__author-recomendation > em",),
    'stars': ("span.user-post__score-count",),
    # review body and feature lists (the lists use multiple=True)
    'content_pl': ("div.user-post__text",),
    'pros_pl': ("div.review-feature__item.review-feature__item--positive", None, True),
    'cons_pl': ("div.review-feature__item.review-feature__item--negative", None, True),
    # votes
    'up_votes': ("button.vote-yes", "data-total-vote"),
    'down_votes': ("button.vote-no", "data-total-vote"),
    # dates, read from the `datetime` attribute
    'published': ("span.user-post__published > time:nth-child(1)", "datetime"),
    'purchased': ("span.user-post__published > span > time:nth-child(2)", "datetime"),
}

### Extraction of opinions

In [12]:
# HTTP request headers are kept in a JSON file next to the notebook and
# loaded here; they are passed to every requests.get() call further down.
with open("./cookie.json", "r", encoding="UTF-8") as cookie_file:
    raw_headers = cookie_file.read()
headers = json.loads(raw_headers)

In [13]:
# Maps the Polish recommendation label to a boolean; any other value
# (including a missing label) becomes None via dict.get().
RECOMMENDATION_LABELS = {"Polecam": True, "Nie polecam": False}

# Strip the code so stray whitespace from typing/pasting does not end up in the URL.
product_id = input("Please enter product code: ").strip()
url = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_opinions = []

# Walk the review pages one by one; `url` is replaced with the "next page"
# link after each page and set to None when there is no further page.
while url:
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        # Without leaving the loop here, `url` would never change and the
        # same failing request would be repeated forever.
        print(f"Stopping: {url} returned HTTP {response.status_code}")
        break
    print(url)
    page_dom = BeautifulSoup(response.text, "html.parser")
    for opinion in page_dom.select("div.js_product-review:not(.user-post--highlight)"):
        single_opinion = {
            key: extract(opinion, *values)
            for key, values in selectors.items()
        }

        # extract() returns None for elements missing from a review, so each
        # conversion below is guarded instead of assuming a string.
        content_pl = single_opinion["content_pl"]
        single_opinion["content_en"] = translate(content_pl) if content_pl else content_pl
        single_opinion["pros_en"] = [translate(pros) for pros in single_opinion["pros_pl"]]
        single_opinion["cons_en"] = [translate(cons) for cons in single_opinion["cons_pl"]]
        single_opinion["recomendation"] = RECOMMENDATION_LABELS.get(single_opinion["recomend"])

        # The score is text such as "4,5/5": keep the part before the slash
        # and switch the decimal comma to a dot before converting.
        stars = single_opinion["stars"]
        single_opinion["stars"] = float(stars.split("/")[0].replace(",", ".")) if stars else None

        for votes_key in ("up_votes", "down_votes"):
            votes = single_opinion[votes_key]
            single_opinion[votes_key] = int(votes) if votes else None

        # "published" / "purchased" are left as the raw strings read from the
        # `datetime` attribute: datetime objects are not serialisable by the
        # plain json.dump() call that stores these opinions.
        all_opinions.append(single_opinion)

    next_link = page_dom.select_one("a.pagination__next")
    url = ("https://www.ceneo.pl" + next_link["href"]) if next_link is not None else None
print(all_opinions)


https://www.ceneo.pl/167976636#tab=reviews
https://www.ceneo.pl/167976636/opinie-2
[{'opinion_id': '19535811', 'author': 'a...h', 'recomend': 'Polecam', 'stars': 5.0, 'content_pl': 'Świetna suszarka, spełnia wszystkie moje oczekiwania. Szybko suszy długie włosy. Mała, lekka i poręczna. Testuję dodatkowe końcówki do stylizacji. Jest też cicha suszarka więc nie mam już wyrzutów sumienia że budzę rodzinę nad ranem.', 'pros_pl': ['bardzo estetyczny wy', 'głośność pracy', 'mały gabaryt', 'moc', 'szybkość nagrzewania', 'świetne dodatki', 'wygląd'], 'cons_pl': [], 'up_votes': 0, 'down_votes': 0, 'published': '2025-03-19 16:39:41', 'purchased': '2025-03-03 07:48:53', 'content_en': 'A great dryer meets all my expectations. Dry long hair quickly. Small, light and handy. I test additional styling tips. There is also a quiet dryer so I have no remorse that I wake my family in the morning.', 'pros_en': ['very aesthetic', 'volume of work', 'small size', 'power', 'heating speed', 'great accessories',

### Creation of database

In [14]:
# Make sure the output directory exists. exist_ok=True keeps the cell safe to
# re-run and avoids the gap between a separate "exists?" check and the create.
os.makedirs("./opinions", exist_ok=True)

In [15]:
# Store the collected opinions in one JSON file named after the product code.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
output_path = f"./opinions/{product_id}.json"
with open(output_path, "w", encoding="UTF-8") as jf:
    json.dump(all_opinions, jf, indent=4, ensure_ascii=False)