In [1]:
import asyncio
import os
import random
import time
from datetime import datetime
from urllib.parse import urljoin

import nest_asyncio
import numpy as np
import openpyxl
import pandas as pd
from playwright.async_api import async_playwright, expect, Keyboard
from tqdm import tqdm

# Patch asyncio via nest_asyncio before any Playwright call is made.
# NOTE(review): presumably needed because the notebook kernel already runs its
# own event loop; the cells below use top-level `await`, so confirm this is
# still required.
nest_asyncio.apply()

In [2]:
# Create the output directory up front (no error if it already exists);
# the final cell writes its CSV export under data/.
os.makedirs('data/', exist_ok=True)

In [3]:
# User-agent string used as the default `user_agent` argument of open_browser().
# NOTE(review): the string identifies as Chrome on macOS while open_browser()
# launches Firefox — confirm the mismatch is intentional.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'

In [None]:
async def open_browser(headless=True, user_agent=user_agent):
    """
    Start Playwright, launch Firefox and open a page that sends ``user_agent``.

    Parameters
    ----------
    headless : bool
        Passed to ``firefox.launch``; ``True`` runs without a visible window.
    user_agent : str
        User-agent string applied to the browser context the page belongs to.

    Returns
    -------
    tuple
        ``(browser, page)``. Call ``await browser.close()`` when finished.

    Raises
    ------
    Exception
        Any error raised while launching the browser or creating the
        context/page is re-raised after the Playwright driver is stopped.
    """
    playwright = await async_playwright().start()

    try:
        browser = await playwright.firefox.launch(headless=headless)

        context = await browser.new_context(user_agent=user_agent)

        # The page has to come from `context`: browser.new_page() opens the
        # page in a separate, freshly created context, which would silently
        # discard the custom user agent configured above.
        page = await context.new_page()
    except Exception:
        # Do not leave the Playwright driver running if setup fails half-way.
        await playwright.stop()
        raise

    return browser, page

In [5]:
# open_browser() returns (browser, page), so `driver` holds the browser object
# and `page` is the single page reused by every cell below. Called with its
# defaults: headless mode and the module-level `user_agent`.
driver, page = await open_browser()

In [6]:
# Open the site's landing page before scraping. The awaited goto() call is the
# cell's last expression, so its return value is shown as the cell output.
# Note: `url` is rebound later by the scraping loop (`for category, url in ...`).
url = 'https://cenoteka.rs/'
await page.goto(url)

<Response url='https://cenoteka.rs/' request=<Request url='https://cenoteka.rs/' method='GET'>>

In [None]:
# Category slug -> listing URL. Every listing lives at
# https://cenoteka.rs/<slug>/, so the mapping is derived from the slugs alone;
# insertion order is the order the categories are scraped in.
urls = {
    slug: f'https://cenoteka.rs/{slug}/'
    for slug in (
        # candy
        'sladoledi',
        'bombone',
        'bombonjere',
        'cokolade',
        'cokoladice-i-stanglice',
        'kakao-napitak-i-topla-cokolada',
        'grickalice',
        'keks-vafl-biskvit',
        'kremovi',
        'kroasani',
        'napolitanke',
        'zvake',
        'novogodisnji-paketici',
        'uskrsnji-slatikisi',
        'deserti',
    )
}

In [None]:
all_data = []

for category, url in urls.items():
    await page.goto(url)
    await page.wait_for_selector('div.d-flex.pb-2.px-1.col-md-4.col-lg-4.col-6')
    data = []

    while True:
        product_tiles = await page.locator('div.d-flex.pb-2.px-1.col-md-4.col-lg-4.col-6').all()

        for product in product_tiles:
            try:
                name_el = product.locator('div.product_info a')
                if await name_el.is_visible():
                    name = await name_el.text_content()
                else:
                    continue

                try:
                    discount_el = product.locator('div.product_savings span')
                    discount = await discount_el.text_content()
                except:
                    discount = None

                price_rows = await product.locator('div.product_info_wrap div.row').all()

                for row in price_rows:
                    try:
                        brand = await row.locator('div.col-5 img').get_attribute('alt')
                        price = await row.locator('div.col-7').text_content()

                        row_data = {
                            'brand': brand,
                            'price': price.strip(),
                            'product_name': name.strip(),
                            'discount': discount,
                            'timestamp': datetime.now().isoformat(sep=' ', timespec='seconds'),
                            'category': category
                        }

                        data.append(row_data)
                    except:
                        continue

            except:
                continue

        await page.wait_for_timeout(1000)
        next_btn = page.locator('a.page-link.next')

        if await next_btn.is_visible():
            next_url = await next_btn.get_attribute('href')
            if next_url:
                await page.goto(next_url)
            else:
                break
        else:
            break

    all_data.extend(data)

In [None]:
# Build one DataFrame from the scraped records; each record carries the keys
# brand, price, product_name, discount, timestamp and category.
# The trailing expression previews the first rows as the cell output.
prices = pd.DataFrame(all_data)
prices.head()

In [None]:
# Sanity check: (number of scraped price rows, number of columns).
prices.shape

In [None]:
# Export the scraped prices to data/, embedding the export time in the file
# name so that each run writes a new file. The DataFrame index is not written.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f"data/candy_prices_{timestamp}.csv"
prices.to_csv(filename, index=False)