## Nederland

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

import openpyxl
from openpyxl import Workbook, load_workbook

import pandas as pd
import csv
import os
import re
import time
import random
from datetime import datetime


### Aldi

In [76]:

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to loop through
urls =  ["https://www.aldi.nl/zoeken.html?query=noten",
    "https://www.aldi.nl/producten/chips--noten/noten--zaden-en-pitten.html",
    "https://www.aldi.nl/producten/chips--noten/zoutjes.html",
    "https://www.aldi.nl/zoeken.html?query=pitten",
    "https://www.aldi.nl/zoeken.html?query=cashew",
    "https://www.aldi.nl/zoeken.html?query=amandelen",
    "https://www.aldi.nl/zoeken.html?query=trader+joe+pinda",
    "https://www.aldi.nl/zoeken.html?query=dry+roasted",
    "https://www.aldi.nl/zoeken.html?query=rozijn",
    "https://www.aldi.nl/zoeken.html?query=macadamia",
    "https://www.aldi.nl/zoeken.html?query=time4choco",
    "https://www.aldi.nl/zoeken.html?query=rijstzoutjes",
    "https://www.aldi.nl/zoeken.html?query=chocoladepinda"]


# Define the file name
file_name = "Berrie.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)  # Load existing file
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Country", "Store", "Timestamp"])  # Headers

# Loop through the URLs and scrape data
for url in urls:
    driver.get(url)  # Navigate to the page first

    # Wait for the products to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "product-tile__content"))
        )
    except:
        print(f"Warning: No products found for {url}")

    # Get the page source after JavaScript renders it
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # List to hold product data
    products = []

    # Loop through all product tiles
    for product in soup.find_all('div', class_='product-tile__content'):
        # Extract product title
        title_element = product.find('h2', class_='product-tile__content__upper__product-name')
        title = title_element.get_text(strip=True) if title_element else 'Title not found'

        # Extract current price
        current_price_element = product.find('div', class_='product-tile__content__lower__wrapper__price-section__amount')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

        # Extract promo price (only the number, exclude percentage discount)
        promo_price_element = product.find('p', class_='text product-tile__content__lower__wrapper__price-section__discount__striked')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'No promo price'

        # Extract weight
        weight_element = product.find('p', class_='product-tile__content__lower__wrapper__legal__text')
        weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

        # Append product data
        products.append((title, current_price, promo_price, weight, "NL", "Aldi"))

    # Add timestamp
    timestamp = datetime.now().strftime('%Y-%m-%d')

    # Write product data to Excel
    for product in products:
        ws.append((*product, timestamp))

# Save the workbook
wb.save(file_name)

print(f"Data successfully saved to {file_name}")

# Close the driver
driver.quit()


Data successfully saved to Berrie.xlsx


## Dirk

In [90]:


# Initialize Chrome driver with options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://www.dirk.nl/boodschappen/snacks-snoep/chocolade"
driver.get(url)
time.sleep(5)  # Initial load wait

# Function to scroll down
def scroll_to_load_more(driver, wait_time=2, scroll_increment=1200, scroll_limit=2):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(scroll_limit):
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        time.sleep(wait_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

scroll_to_load_more(driver)

# Function to safely click an element
def safe_click(xpath):
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(1)
        element.click()
    except Exception as e:
        print(f"Error clicking {xpath}: {e}")
        try:
            driver.execute_script("arguments[0].click();", element)  # JavaScript fallback
        except:
            print(f"JavaScript click failed for {xpath}")

# Close pop-ups or overlays if present
try:
    close_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
    )
    close_button.click()
    time.sleep(2)
except:
    print("No pop-ups found.")

# Click filters
safe_click("//label[contains(text(), 'Overige chocolade & bonbons')]")
time.sleep(3)
safe_click("//label[contains(text(), '1 de Beste')]")
time.sleep(5)

# Wait for products to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//article[@data-product-id]"))
)

# Parse page content
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# Extract product information
products = []

for article in soup.find_all('article', attrs={'data-product-id': True}):
    title = article.find('p', class_='title').get_text(strip=True) if article.find('p', class_='title') else 'Title not found'
    
    price_integer = article.find('span', class_='price-large')
    price_decimals = article.find('span', class_='price-small')
    price = f"{price_integer.get_text(strip=True)},{price_decimals.get_text(strip=True)}" if price_integer and price_decimals else 'Price not found'

    promo_price_span = article.find('div', class_='label price-label')
    promo_price = promo_price_span.find('span', class_='regular-price').find('span').get_text(strip=True) if promo_price_span else 'Promo price not found'

    weight_span = article.find('span', class_='subtitle')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    products.append((title, price, promo_price, weight, "Non_Branded", "Dirk"))

# Save to Excel
timestamp = datetime.now().strftime('%Y-%m-%d')
file_name = "Berrie.xlsx"

if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Branded", "Retailer", "Timestamp"])

for product in products:
    ws.append((*product, timestamp))

wb.save(file_name)
print(f"Data has been successfully saved to {file_name}")

driver.quit()


No pop-ups found.
Data has been successfully saved to Berrie.xlsx


## Vomar

In [78]:
# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.vomar.nl/zoeken?search=g%27woon%20choco",
    "https://www.vomar.nl/zoeken?search=noten",
    "https://www.vomar.nl/zoeken?search=pitten",
    "https://www.vomar.nl/zoeken?search=rijstzoutjes"
]

products = []

for url in urls:
    driver.get(url)
    time.sleep(5)  # Allow time for page to load
    
    # Click the "Weigeren" button to reject cookies on the Vomar site
    try:
        deny_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyButtonDecline")))
        deny_button.click()
    except:
        print("No accept cookies button found.")

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Loop through all product articles
    for article in soup.find_all('div', class_='col-xs-12 col-md-3 product'):
        title = article.find('p', class_='description').get_text(strip=True) if article.find('p', class_='description') else 'Title not found'

        price_integer = article.find('span', class_='large')
        price_decimals = article.find('span', class_='small')
        
        if price_integer and price_decimals:
            price = f"{price_integer.get_text(strip=True)}{price_decimals.get_text(strip=True)}"
        else:
            price = 'Price not found'

        promo_price = 'Promo price not found'
        weight = 'Weight not found'
        
        products.append((title, price, promo_price, weight, "NL", "Vomar"))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')

# Define the file name
file_name = "Berrie.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"

# Write product data to Excel
for product in products:
    ws.append((*product, timestamp))

# Save the workbook to an Excel file
wb.save(file_name)

print(f"Data has been successfully saved to {file_name}")

driver.quit()


No accept cookies button found.
No accept cookies button found.
No accept cookies button found.
Data has been successfully saved to Berrie.xlsx


## Duitsland

### Aldi

In [79]:
# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi-nord.de/suchergebnisse.html?query=asiatisce%20snack&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/suchergebnisse.html?query=kerne&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/sortiment/snacks-suessigkeiten/nuesse-trockenfruechte.html",
    "https://www.aldi-nord.de/suchergebnisse.html?query=trader%20joe%20n%C3%BCsse&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/suchergebnisse.html?query=trader%20joe%20mix&searchCategory=Submitted%20Search",
    "https://www.aldi-nord.de/suchergebnisse.html?query=schoko%20rosinen&searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi-nord.de/suchergebnisse.html?searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&query=choceur%20peanuts"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        all_products.append({
            "Product Title": title,
            "Price": current_price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "DE",
            "Store": "Aldi"
        })

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Convert the list of products to a pandas DataFrame
df = pd.DataFrame(all_products)

# Add the timestamp column to the DataFrame
df["Timestamp"] = timestamp

# Save to Excel (Append if file exists)
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # Load the existing Excel file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}.")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx.


### Globus

In [80]:


# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
url_list = [
    "https://produkte.globus.de/bobenheim-roxheim/search?query=jeden+tag+kern",
    "https://produkte.globus.de/bobenheim-roxheim/search?p=1&query=jeden%20tag%20n%C3%BCsse",
    "https://produkte.globus.de/bobenheim-roxheim/search?p=2&query=jeden%20tag%20n%C3%BCsse"
]

# Get the current timestamp for CSV file
timestamp = datetime.now().strftime('%Y-%m-%d')

# Create an empty list to store all product details
all_products = []

# Loop over each URL
for page_url in url_list:
    print(f"Scraping URL: {page_url}...")

    # Open the current URL
    driver.get(page_url)
    time.sleep(5)

    # Get the page source and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Loop through all product cards and extract data
    for product_card in soup.find_all("div", class_="product-info"):
        # Extract product title
        title_tag = product_card.find("a", class_="product-image-link product-name")
        title = title_tag.get("title").strip() if title_tag else "Title not found"

        # Extract price
        price_div = product_card.find("div", class_="unit-price js-unit-price")
        price = price_div.get("data-value") if price_div and price_div.has_attr("data-value") else "Price not found"

        # Extract weight
        weight_div = product_card.find("div", class_="price-unit-content")
        if weight_div:
            # Extract only the weight part before the first parenthesis
            weight = weight_div.text.split("(")[0].strip()
        else:
            weight = "Weight not found"


        # Extract promo price
        promo_price = "Promo price not found"  # Default value in case promo price is not found
        promo_price_div = product_card.find("div", class_="product-price-globus-discount")
        if promo_price_div:
            promo_price_element = promo_price_div.find("div", class_="unit-price js-unit-price discount-price")
            if promo_price_element:
                promo_price = promo_price_element.text.strip()

        # Append the product data to the list
        all_products.append({
            "Product Title": title,
            "Price": price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "DE",
            "Store": "Globus",
            "Timestamp": timestamp
        })

# Convert the list of products to a pandas DataFrame
df = pd.DataFrame(all_products)

# Append to Excel file
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # If the file exists, load the existing file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver after extracting data
driver.quit()

print("Scraping process completed successfully!")


Scraping URL: https://produkte.globus.de/bobenheim-roxheim/search?query=jeden+tag+kern...
Scraping URL: https://produkte.globus.de/bobenheim-roxheim/search?p=1&query=jeden%20tag%20n%C3%BCsse...
Scraping URL: https://produkte.globus.de/bobenheim-roxheim/search?p=2&query=jeden%20tag%20n%C3%BCsse...
Data has been successfully saved to Berrie.xlsx
Scraping process completed successfully!


### Edeka

In [2]:
# Initialize Chrome driver with options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Disable for debugging
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.edeka24.de/Lebensmittel/Suess-Salzig/Chips-Knabbereien/#search:query=n%C3%BCsse+edeka&skipQueryLogging=true&returnResultsForLandingpages=true&first=0",
    "https://www.edeka24.de/Lebensmittel/Suess-Salzig/Chips-Knabbereien/#search:query=kerne+edeka&skipQueryLogging=true&returnResultsForLandingpages=true&first=0",
    "https://www.edeka24.de/Lebensmittel/Suess-Salzig/Schoko-Leckereien/#search:query=alpia&skipQueryLogging=true&returnResultsForLandingpages=true&first=0",
    "https://www.edeka24.de/Lebensmittel/Suess-Salzig/Nuesse-getrocknete-Fruechte/#search:query=edeka+fruchtige+mi&skipQueryLogging=true&returnResultsForLandingpages=true&first=0"
]

# List to store all product data
all_products = []

# Function to extract weight from title
def extract_weight(title):
    match = re.search(r'(\d+)\s*(G|KG|ML|L)\b', title, re.IGNORECASE)
    return f"{match.group(1)} {match.group(2).upper()}" if match else "Not found"

# Loop through all URLs
for url in urls:
    print(f"Scraping data from {url}")
    driver.get(url)

    # Wait for product elements to appear
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "product-details")))
    except:
        print(f"Products did not load properly on {url}")
        continue

    # Keep track of previous product count to detect if more products are loading
    previous_product_count = 0

    # Click "Load More" button until it's no longer available
    while True:
        # Count the current number of products
        product_elements = driver.find_elements(By.CLASS_NAME, "product-details")
        current_product_count = len(product_elements)

        # Stop clicking if no new products have loaded
        if current_product_count == previous_product_count:
            print("No more new products loaded. Stopping.")
            break

        previous_product_count = current_product_count  # Update the count

        try:
            # Locate all buttons with ID 'loader-btn'
            load_more_buttons = driver.find_elements(By.ID, "loader-btn")

            button_clicked = False
            for button in load_more_buttons:
                button_classes = button.get_attribute("class")

                # Click only if it's the correct button (not 'endlist')
                if "button-primary" in button_classes and "endlist" not in button_classes:
                    ActionChains(driver).move_to_element(button).click().perform()
                    print("Clicked 'Load More' button")
                    button_clicked = True
                    break  # Exit loop after clicking

            if not button_clicked:
                print("Reached end of product list.")
                break  # Stop loop when no valid button is found

            time.sleep(2)  # Wait for new products to load

        except:
            print("No more 'Load More' button or could not click.")
            break  # Exit loop if there's an exception

    # Get page source after all products are loaded
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # List to store extracted product information for this URL
    products = []

    # Loop through all product elements
    for product_card in soup.find_all('div', class_='product-details'):
        # Extract product title
        title_tag = product_card.find('a', class_='title')
        title = title_tag.get_text(strip=True) if title_tag else 'Title not found'

        # Extract weight from title
        weight = extract_weight(title)

        # Extract price and promo price
        price_div = product_card.find('div', class_='price salesprice')  # For promo products
        normal_price_div = product_card.find('div', class_='price')  # For normal products

        promo_price = ""
        price = "0.00"

        if price_div:
            # Extracting prices for promo products
            price_texts = [text.strip().replace('€', '').strip() for text in price_div.stripped_strings]

            if len(price_texts) == 2:
                promo_price, price = price_texts  # First price = promo, second price = original
            elif len(price_texts) == 1:
                price = price_texts[0]  # Only one price found (no promo)
        
        elif normal_price_div:
            # Extracting price for non-promo products
            price = normal_price_div.get_text(strip=True).replace('€', '').strip()

        # Store product details
        products.append({
            'Product Title': title,
            'Price': price,
            'Promo Price': promo_price,
            'Weight': weight,
            'Country': 'DE',
            'Store': "Edeka"
        })

    # Add timestamp to each product
    timestamp = datetime.now().strftime('%Y-%m-%d')
    for product in products:
        product['Timestamp'] = timestamp

    # Append data to global list
    all_products.extend(products)

# Convert to DataFrame
df = pd.DataFrame(all_products)

# Save to Excel file
excel_filename = 'Berrie.xlsx'

# Append to Excel file if it exists
if os.path.exists(excel_filename):
    try:
        existing_df = pd.read_excel(excel_filename, engine='openpyxl')
        updated_df = pd.concat([existing_df, df], ignore_index=True)
        updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
        print(f"Appended data to existing file: {excel_filename}")
    except Exception as e:
        print(f"Error appending to {excel_filename}: {e}")
else:
    try:
        df.to_excel(excel_filename, index=False, engine='openpyxl')
        print(f"Created new file and saved data: {excel_filename}")
    except Exception as e:
        print(f"Error saving to {excel_filename}: {e}")

# Close the driver only once after all URLs are processed
driver.quit()


Scraping data from https://www.edeka24.de/Lebensmittel/Suess-Salzig/Chips-Knabbereien/#search:query=n%C3%BCsse+edeka&skipQueryLogging=true&returnResultsForLandingpages=true&first=0
Clicked 'Load More' button
Reached end of product list.
Scraping data from https://www.edeka24.de/Lebensmittel/Suess-Salzig/Chips-Knabbereien/#search:query=kerne+edeka&skipQueryLogging=true&returnResultsForLandingpages=true&first=0
Reached end of product list.
Scraping data from https://www.edeka24.de/Lebensmittel/Suess-Salzig/Schoko-Leckereien/#search:query=alpia&skipQueryLogging=true&returnResultsForLandingpages=true&first=0
Reached end of product list.
Scraping data from https://www.edeka24.de/Lebensmittel/Suess-Salzig/Nuesse-getrocknete-Fruechte/#search:query=edeka+fruchtige+mi&skipQueryLogging=true&returnResultsForLandingpages=true&first=0
Reached end of product list.
Appended data to existing file: Berrie.xlsx


## Frankrijk

### Aldi

In [3]:


# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.fr/produits/epicerie-salee/biscuit-aperitif-chips.html",
    "https://www.aldi.fr/recherche.html?query=trader%20joe&searchCategory=Submitted%20Search",
    "https://www.aldi.fr/recherche.html?query=Pignons&searchCategory=Submitted%20Search",
    "https://www.aldi.fr/recherche.html?query=isaura%20choco%20peanut&searchCategory=Submitted%20Search"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        
        current_price_element = soup.find('span', class_='price__wrapper')
        if current_price_element:
            # Remove euro sign and convert to correct format
            current_price = re.sub(r'[^\d,]', '', current_price_element.get_text(strip=True))
            current_price = current_price.replace(',', '.')  # Convert `2,99` to `2.99`
        else:
            current_price = 'Price not found'
        
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        all_products.append({
            "Product Title": title,
            "Price": current_price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "FR",
            "Store": "Aldi"
        })

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Create a DataFrame
df = pd.DataFrame(all_products)

# Add a timestamp column
df["Timestamp"] = timestamp

# Convert price column to float if possible
df["Price"] = pd.to_numeric(df["Price"], errors='coerce')

# Append to Excel file
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # If the file exists, load the existing file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()

Data has been successfully saved to Berrie.xlsx


### Carrefour

In [4]:

# Initialize Chrome driver with Service
options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# List of URLs to scrape
urls = [
    "https://www.carrefour.fr/s?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&q=melange&noRedirect=1&userIsPro=0&page=1",
    "https://www.carrefour.fr/r/epicerie-sucree/sucres-farines-coulis-et-preparation-gateaux/aide-a-la-patisserie/fruits-secs-fruits-confits?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&noRedirect=0&userIsPro=0",
    "https://www.carrefour.fr/r/epicerie-sucree/chocolats-et-bonbons/confiseries-chocolatees/billes-et-bonbons-au-chocolat?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&noRedirect=0&userIsPro=0"
]

# List to store product information
all_products = []

for url in urls:
    # Open the URL
    time.sleep(2)
    driver.get(url)

    # Handle cookie consent
    try:
        # Wait for the cookie settings button to appear
        param_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
        param_button.click()

        # Wait for and click the "refuse all" button
        confirm_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
        confirm_button.click()
    except Exception as e:
        print(f"Cookie consent handling failed")

    # Parse page source with BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product details
    for product_pod in soup.find_all("div", class_="main-layout__info-zone"):
        # Extract title
        title_tag = product_pod.find("a", class_="product-card-title")
        title = title_tag.text.strip() if title_tag else "Title not found"

        # Extract weight
        weight_tag = product_pod.find("p", class_="pl-text--size-m")
        weight = weight_tag.text.strip() if weight_tag else "Weight not found"

        # Extract current price (main price)
        price_main_tag = product_pod.find("div", class_="product-price__amount--main")
        if price_main_tag:
            price_main_parts = price_main_tag.find_all("p", class_="product-price__content")
            if len(price_main_parts) >= 2:
                current_price = f"{price_main_parts[0].text.strip()}{price_main_parts[1].text.strip()} €"
            else:
                current_price = "Price not found"
        else:
            current_price = "Price not found"

        # Extract promotional price
        promo_price_tag = product_pod.find("div", class_="product-price__amount--old")
        if promo_price_tag:
            promo_price_parts = promo_price_tag.find_all("p", class_="product-price__content")
            if len(promo_price_parts) >= 2:
                promo_price = f"{promo_price_parts[0].text.strip()},{promo_price_parts[1].text.strip()} €"
            else:
                promo_price = "Promo price not found"
        else:
            promo_price = "Promo price not found"

        # Add static values
        Country = "FR"
        Store = "Carrefour"

        # Append extracted information to the list
        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare the data for saving
df = pd.DataFrame(all_products, columns=["Product Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Save the combined data back to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False)
else:
    # If the file doesn't exist, save the new data as a new Excel file
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()

Cookie consent handling failed
Cookie consent handling failed
Data has been successfully saved to Berrie.xlsx


## Polen

### Aldi

In [5]:


# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.pl/szukaj.html?query=orzechy%20trader&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=asia&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/nasze-produkty/przekaski/pestki--nasiona--ziarna.html",
    "https://www.aldi.pl/szukaj.html?query=trader%20joe%27s%20&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=orzeszki%20trader&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=rodzynki&searchCategory=Suggested%20Search",
    "https://www.aldi.pl/szukaj.html?query=Orzechy%20laskowe%2FMigda%C5%82y%20w%20czekoladzie%20mlecznej&searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        Country = "PL"
        Store = "Aldi"

        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare data for saving to CSV and Excel
df = pd.DataFrame(all_products, columns=["Product Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Write back the combined data to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False, sheet_name='Sheet1')
else:
    # If the file doesn't exist, create a new file with the data
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx


### Biedronka

In [6]:


# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/orzeszki/",
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/",
    "https://zakupy.biedronka.pl/search?q=Magnetic+w+czekoladzie",
    "https://zakupy.biedronka.pl/search?q=Wawel+%C5%9Aliwki+w+czekoladzie+180g",
    "https://zakupy.biedronka.pl/search?q=Baitz+Milk+Cookie+Balls+Koekjes+in+Melkchocolade+75+g"
]

# List to store all product information across multiple pages
all_products = []

# Loop over each URL
for url in urls:
    driver.get(url)

    try:
        # Wait for the cookie consent button to be clickable (increased timeout)
        param_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
        param_button.click()

        # Wait for and click the button to confirm cookie consent
        confirm_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
        confirm_button.click()
    except TimeoutException:
        print(f"Cookie consent not found for URL: {url} or took too long to load")

    # Parse page source with BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product information for the current page
    for product_pod in soup.find_all("div", class_="product-tile js-product-tile"):
        # Extract title
        title_tag = product_pod.find("div", class_="product-tile__name product-tile__name--overflow")
        title = title_tag.text.strip() if title_tag else "Title not found"

        # Extract weight (only the weight value, e.g., "0.2kg")
        weight_tag = product_pod.find("div", class_="packaging-details")
        if weight_tag:
            weight = weight_tag.contents[0].strip()  # Get the first part before the <span> tag
        else:
            weight = "Weight not found"
        
        # Extract current price (main price)
        price_main_tag = product_pod.find("div", class_="price-tile__sales")
        if price_main_tag:
            # Extract the integer part of the price
            integer_part = price_main_tag.find(text=True, recursive=False).strip() if price_main_tag else None
            decimal_part = price_main_tag.find("span", class_="price-tile__decimal")
            if integer_part and decimal_part:
                # Combine integer and decimal parts into one properly formatted price
                raw_price = f"{integer_part.strip()}{decimal_part.text.strip()}"  # Combine without formatting
                if len(raw_price) > 2:
                    current_price = f"{raw_price[:-2]}.{raw_price[-2:]}"  # Insert decimal point two digits from the end
                else:
                    current_price = f"0,{raw_price}"  # Handle cases where price is less than 1 zł
            else:
                current_price = "Price not found"
        else:
            current_price = "Price not found"

        # Remove any extra spaces (just in case)
        current_price = current_price.replace(" ", "").strip()

        # Extract promo price if available
        promo_price_tag = product_pod.find("div", class_="product-tile-prices__regular")
        if promo_price_tag:
            promo_price = promo_price_tag.find("span", class_="product-tile-prices__amount")
            if promo_price:
                promo_price = promo_price.text.strip()
            else:
                promo_price = "Promo Price not found"
        else:
            promo_price = "Promo Price not found"

        Country = "PL"
        Store = "Biedronka"

        # Append extracted information to the list
        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare the data for saving
df = pd.DataFrame(all_products, columns=["Product Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Save the combined data back to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False)
else:
    # If the file doesn't exist, save the new data as a new Excel file
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Quit the driver
driver.quit()

  integer_part = price_main_tag.find(text=True, recursive=False).strip() if price_main_tag else None


Cookie consent not found for URL: https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/ or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Magnetic+w+czekoladzie or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Wawel+%C5%9Aliwki+w+czekoladzie+180g or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Baitz+Milk+Cookie+Balls+Koekjes+in+Melkchocolade+75+g or took too long to load
Data has been successfully saved to Berrie.xlsx


### Action

In [7]:


# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.action.com/nl-nl/search/?q=choco+moment",
    "https://www.action.com/nl-nl/search/?q=snacks+of+the+world",
    "https://www.action.com/nl-nl/search/?q=natural+happiness",
    "https://www.action.com/fr-fr/search/?q=choco+moment",
    "https://www.action.com/fr-fr/search/?q=snacks+of+the+world",
    "https://www.action.com/fr-fr/search/?q=natural+happiness",
    "https://www.action.com/de-de/search/?q=choco+moment",
    "https://www.action.com/de-de/search/?q=snacks+of+the+world",
    "https://www.action.com/de-de/search/?q=natural+happiness",    
    "https://www.action.com/pl-pl/search/?q=choco+moment",
    "https://www.action.com/pl-pl/search/?q=snacks+of+the+world",
    "https://www.action.com/pl-pl/search/?q=natural+happiness"
]

# List to store all product data
all_products = []

# Loop through all URLs
for url in urls:
    print(f"Scraping data from {url}")
    
    # Extract the country code from the URL
    country = url.split("https://www.action.com/")[1].split("/")[0][:2]

    driver.get(url)
    time.sleep(5)  # Allow time for page load

    # Accept cookies if the popup appears
    try:
        accept_button = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll"))
        )
        accept_button.click()
    except Exception:
        print("No cookies popup found.")
    
    time.sleep(5)
    
    # Parse the page source with BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # List to store extracted product information for this URL
    products = []

    # Loop through all product elements
    for product_card in soup.find_all('a', {'data-testid': 'product-card-link'}):
        # Extract product title
        title_tag = product_card.find('span', {'data-testid': 'product-card-title'})
        title = title_tag.get_text(strip=True) if title_tag else 'Title not found'

        # Extract product description
        description_tag = product_card.find('span', {'data-testid': 'product-card-description'})
        description = description_tag.get_text(strip=True) if description_tag else 'Description not found'

        # Extract price whole part
        price_whole_tag = product_card.find('span', {'data-testid': 'product-card-price-whole'})
        price_whole = price_whole_tag.get_text(strip=True) if price_whole_tag else '0'

        # Extract price fractional part
        price_fractional_tag = product_card.find('span', {'data-testid': 'product-card-price-fractional'})
        price_fractional = price_fractional_tag.get_text(strip=True) if price_fractional_tag else '00'

        # Combine whole and fractional prices
        price = f"{price_whole}.{price_fractional}"

        # Extract price per kilo (if available)
        priceperkilo_tag = product_card.find('span', {'data-testid': 'product-card-price-description'})
        priceperkilo = priceperkilo_tag.get_text(strip=True) if priceperkilo_tag else 'Weight not found'

        # Extract product code from image URL
        image_tag = product_card.find('img', {'data-testid': 'product-card-image'})
        product_code = 'Code not found'
        if image_tag and 'src' in image_tag.attrs:
            image_url = image_tag['src']
            match = re.search(r"/(\d+)_", image_url)
            if match:
                product_code = match.group(1)

        # Store product details
        products.append({
            'Product Title': title,
            'Price': price,
            'Promo Price': "",  # Placeholder, since no promo price is extracted here
            'Weight': description,  # Reusing description for weight
            'Country': country,
            'Store': "Action"
        })

    # Add the current URL and timestamp to each product
    timestamp = datetime.now().strftime('%Y-%m-%d')
    for product in products:
        product['Timestamp'] = timestamp

    # Add the products for this URL to the overall list
    all_products.extend(products)

# Convert to DataFrame
df = pd.DataFrame(all_products)

# Save to Excel file
excel_filename = 'Berrie.xlsx'

# Append to Excel file if it exists
if os.path.exists(excel_filename):
    try:
        existing_df = pd.read_excel(excel_filename, engine='openpyxl')
        updated_df = pd.concat([existing_df, df], ignore_index=True)
        updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
        print(f"Appended data to existing file: {excel_filename}")
    except Exception as e:
        print(f"Error appending to {excel_filename}: {e}")
else:
    try:
        df.to_excel(excel_filename, index=False, engine='openpyxl')
        print(f"Created new file and saved data: {excel_filename}")
    except Exception as e:
        print(f"Error saving to {excel_filename}: {e}")

# Close the driver
driver.quit()

Scraping data from https://www.action.com/nl-nl/search/?q=choco+moment
Scraping data from https://www.action.com/nl-nl/search/?q=snacks+of+the+world
No cookies popup found.
Scraping data from https://www.action.com/nl-nl/search/?q=natural+happiness
No cookies popup found.
Scraping data from https://www.action.com/fr-fr/search/?q=choco+moment
No cookies popup found.
Scraping data from https://www.action.com/fr-fr/search/?q=snacks+of+the+world
No cookies popup found.
Scraping data from https://www.action.com/fr-fr/search/?q=natural+happiness
No cookies popup found.
Scraping data from https://www.action.com/de-de/search/?q=choco+moment
No cookies popup found.
Scraping data from https://www.action.com/de-de/search/?q=snacks+of+the+world
No cookies popup found.
Scraping data from https://www.action.com/de-de/search/?q=natural+happiness
No cookies popup found.
Scraping data from https://www.action.com/pl-pl/search/?q=choco+moment
No cookies popup found.
Scraping data from https://www.action.

### Albert Heijn

In [8]:


# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/producten/chips-noten-toast-popcorn/noten?merk=AH&page=6"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "NL", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'Berrie.xlsx'
sheet_name = 'AH'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Product Title', 'Price', 'Promo Price', 'Weight', 'Country', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar Berrie.xlsx in blad 'AH'.


In [9]:


# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/producten/snoep-chocolade-koek/chocolade/chocoladesnoepjes?merk=AH&kenmerk=prijsfavoriet"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "NL", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'Berrie.xlsx'
sheet_name = 'AH'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Product Title', 'Price', 'Promo Price', 'Weight', 'Country', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar Berrie.xlsx in blad 'AH'.


In [10]:

# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/producten/chips-noten-toast-popcorn/zoutjes/rijstzoutjes"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "NL", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'Berrie.xlsx'
sheet_name = 'AH'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Product Title', 'Price', 'Promo Price', 'Weight', 'Country', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar Berrie.xlsx in blad 'AH'.
