In [1]:
import asyncio
import os
import random
import time
from datetime import datetime
from urllib.parse import urljoin

import nest_asyncio
import numpy as np
import openpyxl
import pandas as pd
from playwright.async_api import async_playwright, expect, Keyboard
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tqdm import tqdm

nest_asyncio.apply()

In [2]:
# Make sure the output folder exists before anything is written to it;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs('data/', exist_ok=True)

In [3]:
# Browser identification string used as the default `user_agent` argument of
# open_browser(). Split into its three product tokens for readability; the
# adjacent literals are concatenated into one string.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/138.0.0.0 Safari/537.36'
)

In [4]:
async def open_browser(headless=True, user_agent=user_agent):
    """
    Start Playwright, launch Chromium and open a page that sends `user_agent`.

    Args:
        headless: Passed to `chromium.launch`; False shows the browser window.
        user_agent: User-Agent string applied to the browser context the
            returned page belongs to.

    Returns:
        A `(browser, page)` tuple. Closing the browser is the caller's
        responsibility.

    Raises:
        Whatever Playwright raises while launching the browser or creating the
        context/page; the Playwright driver is stopped before re-raising.
    """
    playwright = await async_playwright().start()

    try:
        browser = await playwright.chromium.launch(headless=headless)

        context = await browser.new_context(user_agent=user_agent)

        # The page has to be created from `context`. `browser.new_page()` would
        # silently create a second, default context and the custom user agent
        # would never be sent.
        page = await context.new_page()
    except Exception:
        # Do not leave the driver process running when setup fails half-way.
        await playwright.stop()
        raise

    return browser, page

In [5]:
# Launch the browser with the defaults (headless, module-level user agent).
# `driver` is the browser object returned by open_browser(); `page` is the tab
# used for all navigation below.
driver, page = await open_browser()

In [6]:
# Load the site's home page once as a smoke test that the browser session works;
# the cell output is the Response object returned by goto().
url = 'https://cenoteka.rs/'
await page.goto(url)

<Response url='https://cenoteka.rs/' request=<Request url='https://cenoteka.rs/' method='GET'>>

In [7]:
# Category slug -> category listing URL. Every listing lives at
# https://cenoteka.rs/<slug>/, so the mapping is derived from the slug list.
# Insertion order follows the grouping below.
urls = {
    slug: f'https://cenoteka.rs/{slug}/'
    for slug in (
        # groceries
        'brasno',
        'jaja',
        'pirinac',
        'secer',
        'testenine',
        'ulje',
        'pekara',

        # dairy
        'jogurt',
        'kiselo-mleko',
        'mleko',
        'pavlaka',
        'sirevi',

        # healthy food
        'bezglutenski-proizvodi',
        'organski-proizvodi',

        # fruits and vegetables
        'voce',
        'smrznuto-voce',
        'povrce',
        'smrznuto-povrce',

        # meat and fish
        'sveze-meso',
        'sveza-riba',
        'kobasice',
        'slajs-pakovanja',
        'pecenica-prsuta-slanina',
        'smrznuta-riba-i-meso',
        'cvarci',

        # drinks
        'cajevi',
        'gazirani-sokovi',
        'kafa',
        'sokovi',
        'voda',

        # candy
        'sladoledi',
        'bombone',
        'cokolade',
        'cokoladice-i-stanglice',

        # hygiene and beauty
        'sapun',
        'paste-za-zube',
        'tamponi-i-ulosci',
        'toalet-papir',

        # cleaning supplies
        'praskasti-deterdzenti-za-ves',
        'tecni-deterdzenti-za-ves',
        'deterdzent-za-posude',
        'sredstva-za-ciscenje-domacinstva',
    )
}

In [None]:
all_data = []

for category, url in urls.items():
    await page.goto(url)
    await page.wait_for_selector('d-flex pb-2 px-1 col-md-4 col-lg-4 col-6')
    data = []

    while True:
        product_tiles = await page.locator('d-flex pb-2 px-1 col-md-4 col-lg-4 col-6').all()

        for product in product_tiles:
            try:
                name_el = product.locator('div.product_info a')
                if await name_el.is_visible():
                    name = await name_el.text_content()
                else:
                    continue

                try:
                    discount_el = product.locator('div.product_savings span')
                    discount = await discount_el.text_content()
                except:
                    discount = None

                price_rows = await product.locator('div.product_info_wrap div.row').all()

                for row in price_rows:
                    try:
                        brand = await row.locator('div.col-5 img').get_attribute('alt')
                        price = await row.locator('div.col-7').text_content()

                        row_data = {
                            'brand': brand,
                            'price': price.strip(),
                            'product_name': name.strip(),
                            'discount': discount,
                            'timestamp': datetime.now().isoformat(sep=' ', timespec='seconds'),
                            'category': category
                        }

                        data.append(row_data)
                    except:
                        continue

            except:
                continue

        await page.wait_for_timeout(1000)
        next_btn = page.locator('a.page-link.next')

        if await next_btn.is_visible():
            next_url = await next_btn.get_attribute('href')
            if next_url:
                await page.goto(next_url)
            else:
                break
        else:
            break

    all_data.extend(data)

In [None]:
# Turn the collected row dicts into a table (one row per scraped price entry)
# and preview the first rows.
prices = pd.DataFrame(all_data)
prices.head()

In [None]:
# (rows, columns) of the scraped table - quick sanity check of the run's size.
prices.shape

In [None]:
# Export the scraped table to a CSV whose name carries the time of the export,
# so repeated runs never overwrite each other. The row index is not written.
# NOTE(review): the "baby_prices" prefix does not obviously match the grocery /
# household categories scraped here - confirm the intended file name.
timestamp = format(datetime.now(), '%Y-%m-%d_%H-%M-%S')
filename = f"data/baby_prices_{timestamp}.csv"
prices.to_csv(path_or_buf=filename, index=False)