## Nederland

### Aldi

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
from datetime import datetime

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.nl/zoeken.html?query=noten&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/producten/chips-noten/noten-zaden-en-pitten.html",
    "https://www.aldi.nl/producten/chips-noten/zoutjes.html",
    "https://www.aldi.nl/zoeken.html?query=pitten&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=cashew&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=trader%20joe%20amandelen%20walnoten&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=trader%20joe%20pinda&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=dry%20roasted&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=rozijnen%20&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=macadamia&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi.nl/zoeken.html?query=time4choco&searchCategory=Submitted%20Search&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_nl_nl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true"
    
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        all_products.append({
            "Title": title,
            "Price": current_price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "NL",
        "Store": "Aldi"
        })

# Create a DataFrame
df = pd.DataFrame(all_products)

# Add a timestamp column
df["Timestamp"] = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Save to Excel file
excel_filename = 'Berrie.xlsx'
df.to_excel(excel_filename, index=False, engine='openpyxl')
print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx


## Duitsland

### Aldi

In [9]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import time

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi-nord.de/suchergebnisse.html?query=asiatisce%20snack&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/suchergebnisse.html?query=kerne&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/sortiment/snacks-suessigkeiten/nuesse-trockenfruechte.html",
    "https://www.aldi-nord.de/suchergebnisse.html?query=trader%20joe%20n%C3%BCsse&searchCategory=Submitted%20Search&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&configure%5BclickAnalytics%5D=true",
    "https://www.aldi-nord.de/suchergebnisse.html?query=trader%20joe%20mix&searchCategory=Submitted%20Search",
    "https://www.aldi-nord.de/suchergebnisse.html?query=schoko%20rosinen&searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi-nord.de/suchergebnisse.html?searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_de_de_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&query=choceur%20peanuts"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        all_products.append({
            "Title": title,
            "Price": current_price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "DE",
        "Store": "Aldi"
        })

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Convert the list of products to a pandas DataFrame
df = pd.DataFrame(all_products)

# Add the timestamp column to the DataFrame
df["Timestamp"] = timestamp

# Save to Excel (Append if file exists)
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # Load the existing Excel file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}.")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx.


In [None]:
# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

, options=chrome_options

### Globus

In [4]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
from datetime import datetime  # Importing datetime module
import pandas as pd  # Importing pandas for Excel file saving
import os  # Importing os to check if file exists

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
url_list = [
    "https://produkte.globus.de/bobenheim-roxheim/search?query=jeden+tag+kern",
    "https://produkte.globus.de/bobenheim-roxheim/search?p=1&query=jeden%20tag%20n%C3%BCsse",
    "https://produkte.globus.de/bobenheim-roxheim/search?p=2&query=jeden%20tag%20n%C3%BCsse",
    "https://produkte.globus.de/bobenheim-roxheim/search?query=jeden+tag+schokolierte"
]

# Get the current timestamp for CSV file
timestamp = datetime.now().strftime('%Y-%m-%d')

# Create an empty list to store all product details
all_products = []

# Loop over each URL
for page_url in url_list:
    print(f"Scraping URL: {page_url}...")

    # Open the current URL
    driver.get(page_url)
    time.sleep(5)

    # Get the page source and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Loop through all product cards and extract data
    for product_card in soup.find_all("div", class_="product-info"):
        # Extract product title
        title_tag = product_card.find("a", class_="product-image-link product-name")
        title = title_tag.get("title").strip() if title_tag else "Title not found"

        # Extract price
        price_div = product_card.find("div", class_="unit-price js-unit-price")
        price = price_div.get("data-value") if price_div and price_div.has_attr("data-value") else "Price not found"

        # Extract weight
        weight_div = product_card.find("div", class_="price-unit-content")
        weight = weight_div.text.strip() if weight_div else "Weight not found"

        # Extract promo price
        promo_price = "Promo price not found"  # Default value in case promo price is not found
        promo_price_div = product_card.find("div", class_="product-price-globus-discount")
        if promo_price_div:
            promo_price_element = promo_price_div.find("div", class_="unit-price js-unit-price discount-price")
            if promo_price_element:
                promo_price = promo_price_element.text.strip()

        # Append the product data to the list
        all_products.append({
            "Title": title,
            "Price": price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "DE",
            "Store": "Globus",
            "Timestamp": timestamp
        })

# Convert the list of products to a pandas DataFrame
df = pd.DataFrame(all_products)

# Append to Excel file
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # If the file exists, load the existing file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver after extracting data
driver.quit()

print("Scraping process completed successfully!")


Total Pages: 7
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Data has been successfully saved to Berrie.xlsx
Scraping process completed successfully!


## Frankrijk

### Aldi

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import os

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.fr/produits/epicerie-salee/biscuit-aperitif-chips.html",
    "https://www.aldi.fr/recherche.html?query=trader%20joe&searchCategory=Submitted%20Search",
    "https://www.aldi.fr/recherche.html?query=Pignons&searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_fr_fr_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_fr_fr_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_fr_fr_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_fr_fr_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_fr_fr_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_fr_fr_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_fr_fr_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_fr_fr_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.fr/recherche.html?query=isaura%20choco%20peanut&searchCategory=Submitted%20Search"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        all_products.append({
            "Title": title,
            "Price": current_price,
            "Promo Price": promo_price,
            "Weight": weight,
            "Country": "FR",
        "Store": "Aldi"
        })

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Create a DataFrame
df = pd.DataFrame(all_products)

# Add a timestamp column
df["Timestamp"] = timestamp


# Append to Excel file
excel_filename = 'Berrie.xlsx'

if os.path.exists(excel_filename):
    # If the file exists, load the existing file and append the new data
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_filename, index=False, engine='openpyxl')
else:
    # If the file doesn't exist, create a new one
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx


### Carrefour

In [31]:
import csv
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import time
import os
import pandas as pd

# Initialize Chrome driver with Service
options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# List of URLs to scrape
urls = [
    "https://www.carrefour.fr/s?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&q=melange&noRedirect=1&userIsPro=0&page=1",
    "https://www.carrefour.fr/r/epicerie-sucree/sucres-farines-coulis-et-preparation-gateaux/aide-a-la-patisserie/fruits-secs-fruits-confits?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&noRedirect=0&userIsPro=0",
    "https://www.carrefour.fr/r/epicerie-sucree/chocolats-et-bonbons/confiseries-chocolatees/billes-et-bonbons-au-chocolat?filters%5Bfacet_marque%5D%5B0%5D=CARREFOUR&noRedirect=0&userIsPro=0"
]

# List to store product information
all_products = []

for url in urls:
    # Open the URL
    time.sleep(2)
    driver.get(url)

    # Handle cookie consent
    try:
        # Wait for the cookie settings button to appear
        param_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
        param_button.click()

        # Wait for and click the "refuse all" button
        confirm_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
        confirm_button.click()
    except Exception as e:
        print(f"Cookie consent handling failed")

    # Parse page source with BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product details
    for product_pod in soup.find_all("div", class_="main-layout__info-zone"):
        # Extract title
        title_tag = product_pod.find("a", class_="product-card-title")
        title = title_tag.text.strip() if title_tag else "Title not found"

        # Extract weight
        weight_tag = product_pod.find("p", class_="pl-text--size-m")
        weight = weight_tag.text.strip() if weight_tag else "Weight not found"

        # Extract current price (main price)
        price_main_tag = product_pod.find("div", class_="product-price__amount--main")
        if price_main_tag:
            price_main_parts = price_main_tag.find_all("p", class_="product-price__content")
            if len(price_main_parts) >= 2:
                current_price = f"{price_main_parts[0].text.strip()}{price_main_parts[1].text.strip()} €"
            else:
                current_price = "Price not found"
        else:
            current_price = "Price not found"

        # Extract promotional price
        promo_price_tag = product_pod.find("div", class_="product-price__amount--old")
        if promo_price_tag:
            promo_price_parts = promo_price_tag.find_all("p", class_="product-price__content")
            if len(promo_price_parts) >= 2:
                promo_price = f"{promo_price_parts[0].text.strip()},{promo_price_parts[1].text.strip()} €"
            else:
                promo_price = "Promo price not found"
        else:
            promo_price = "Promo price not found"

        # Add static values
        Country = "FR"
        Store = "Carrefour"

        # Append extracted information to the list
        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare the data for saving
df = pd.DataFrame(all_products, columns=["Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Save the combined data back to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False)
else:
    # If the file doesn't exist, save the new data as a new Excel file
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.205)
Stacktrace:
	GetHandleVerifier [0x003FFD53+23747]
	(No symbol) [0x00387D54]
	(No symbol) [0x0025BE53]
	(No symbol) [0x0023D91B]
	(No symbol) [0x002C7EFF]
	(No symbol) [0x002DAD49]
	(No symbol) [0x002C1B96]
	(No symbol) [0x00293F3C]
	(No symbol) [0x00294EBD]
	GetHandleVerifier [0x006DAC73+3017699]
	GetHandleVerifier [0x006EB93B+3086507]
	GetHandleVerifier [0x006E40F2+3055714]
	GetHandleVerifier [0x00495AF0+637536]
	(No symbol) [0x00390A5D]
	(No symbol) [0x0038DA28]
	(No symbol) [0x0038DBC5]
	(No symbol) [0x003807F0]
	BaseThreadInitThunk [0x755A5D49+25]
	RtlInitializeExceptionChain [0x7731CEBB+107]
	RtlGetAppContainerNamedObjectPath [0x7731CE41+561]


## Polen

### Aldi

In [27]:
import csv
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.aldi.pl/szukaj.html?query=orzechy%20trader&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=asia&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/nasze-produkty/przekaski/pestki--nasiona--ziarna.html",
    "https://www.aldi.pl/szukaj.html?query=trader%20joe%27s%20&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=orzeszki%20trader&searchCategory=Suggested%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12",
    "https://www.aldi.pl/szukaj.html?query=rodzynki&searchCategory=Suggested%20Search",
    "https://www.aldi.pl/szukaj.html?query=Orzechy%20laskowe%2FMigda%C5%82y%20w%20czekoladzie%20mlecznej&searchCategory=Submitted%20Search&configure%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_offers%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_assortment%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_recipes%5D%5Bconfigure%5D%5BhitsPerPage%5D=12&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_pl_pl_content%5D%5Bconfigure%5D%5BhitsPerPage%5D=12"
]

# Create an empty list to store all product details
all_products = []

# Loop over the list of URLs
for url in urls:
    driver.get(url)

    # Wait for the articles to load
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mod-article-tile--default")))

    # Retrieve the elements after the wait
    articles = driver.find_elements(By.CLASS_NAME, "mod-article-tile--default")

    # Extract details for each article on the page
    for article in articles:
        # Use BeautifulSoup to parse the individual article's HTML
        soup = BeautifulSoup(article.get_attribute('outerHTML'), "html.parser")

        title = soup.find('span', class_='mod-article-tile__title').get_text(strip=True) if soup.find('span', class_='mod-article-tile__title') else 'Title not found'
        promo_price_element = soup.find('s', class_='price__previous')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'Promo price not found'
        current_price_element = soup.find('span', class_='price__wrapper')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'
        weight = soup.find('span', class_='price__unit').get_text(strip=True) if soup.find('span', class_='price__unit') else 'Weight not found'

        Country = "PL"
        Store = "Aldi"

        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare data for saving to CSV and Excel
df = pd.DataFrame(all_products, columns=["Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Write back the combined data to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False, sheet_name='Sheet1')
else:
    # If the file doesn't exist, create a new file with the data
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Close the driver
driver.quit()


Data has been successfully saved to Berrie.xlsx


### Biedronka

In [19]:
import pandas as pd
from selenium.common.exceptions import TimeoutException
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import time
import os

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/orzeszki/",
    "https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/",
    "https://zakupy.biedronka.pl/search?q=Magnetic+w+czekoladzie",
    "https://zakupy.biedronka.pl/search?q=Wawel+%C5%9Aliwki+w+czekoladzie+180g",
    "https://zakupy.biedronka.pl/search?q=Baitz+Milk+Cookie+Balls+Koekjes+in+Melkchocolade+75+g"
]

# List to store all product information across multiple pages
all_products = []

# Loop over each URL
for url in urls:
    driver.get(url)

    try:
        # Wait for the cookie consent button to be clickable (increased timeout)
        param_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.ID, "onetrust-pc-btn-handler")))
        param_button.click()

        # Wait for and click the button to confirm cookie consent
        confirm_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.CLASS_NAME, "ot-pc-refuse-all-handler")))
        confirm_button.click()
    except TimeoutException:
        print(f"Cookie consent not found for URL: {url} or took too long to load")

    # Parse page source with BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product information for the current page
    for product_pod in soup.find_all("div", class_="product-tile js-product-tile"):
        # Extract title
        title_tag = product_pod.find("div", class_="product-tile__name product-tile__name--overflow")
        title = title_tag.text.strip() if title_tag else "Title not found"

        # Extract weight (only the weight value, e.g., "0.2kg")
        weight_tag = product_pod.find("div", class_="packaging-details")
        if weight_tag:
            weight = weight_tag.contents[0].strip()  # Get the first part before the <span> tag
        else:
            weight = "Weight not found"
        
        # Extract current price (main price)
        price_main_tag = product_pod.find("div", class_="price-tile__sales")
        if price_main_tag:
            # Extract the integer part of the price
            integer_part = price_main_tag.find(text=True, recursive=False).strip() if price_main_tag else None
            decimal_part = price_main_tag.find("span", class_="price-tile__decimal")
            if integer_part and decimal_part:
                # Combine integer and decimal parts into one properly formatted price
                raw_price = f"{integer_part.strip()}{decimal_part.text.strip()}"  # Combine without formatting
                if len(raw_price) > 2:
                    current_price = f"{raw_price[:-2]}.{raw_price[-2:]}"  # Insert decimal point two digits from the end
                else:
                    current_price = f"0,{raw_price}"  # Handle cases where price is less than 1 zł
            else:
                current_price = "Price not found"
        else:
            current_price = "Price not found"

        # Remove any extra spaces (just in case)
        current_price = current_price.replace(" ", "").strip()

        # Extract promo price if available
        promo_price_tag = product_pod.find("div", class_="product-tile-prices__regular")
        if promo_price_tag:
            promo_price = promo_price_tag.find("span", class_="product-tile-prices__amount")
            if promo_price:
                promo_price = promo_price.text.strip()
            else:
                promo_price = "Promo Price not found"
        else:
            promo_price = "Promo Price not found"

        Country = "PL"
        Store = "Biedronka"

        # Append extracted information to the list
        all_products.append((title, current_price, promo_price, weight, Country, Store))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Prepare the data for saving
df = pd.DataFrame(all_products, columns=["Title", "Price", "Promo Price", "Weight", "Country", "Store"])

# Add timestamp to the DataFrame
df["Timestamp"] = timestamp

# Excel file name
excel_filename = 'Berrie.xlsx'

# Check if the Excel file exists
if os.path.exists(excel_filename):
    # Read the existing data from the Excel file
    existing_df = pd.read_excel(excel_filename, engine='openpyxl')

    # Append the new data to the existing data
    combined_df = pd.concat([existing_df, df], ignore_index=True)

    # Save the combined data back to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False)
else:
    # If the file doesn't exist, save the new data as a new Excel file
    df.to_excel(excel_filename, index=False, engine='openpyxl')

print(f"Data has been successfully saved to {excel_filename}")

# Quit the driver
driver.quit()

  integer_part = price_main_tag.find(text=True, recursive=False).strip() if price_main_tag else None


Cookie consent not found for URL: https://zakupy.biedronka.pl/artykuly-spozywcze/przekaski/bakalie/ or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Magnetic+w+czekoladzie or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Wawel+%C5%9Aliwki+w+czekoladzie+180g or took too long to load
Cookie consent not found for URL: https://zakupy.biedronka.pl/search?q=Baitz+Milk+Cookie+Balls+Koekjes+in+Melkchocolade+75+g or took too long to load
Data has been successfully saved to Berrie.xlsx


### Albert Heijn

In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re  # For regular expressions
from datetime import datetime  # For timestamps
from selenium.webdriver.chrome.service import Service
import time
from openpyxl import load_workbook  # For appending to existing Excel file

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Target URL
url = "https://www.ah.nl/producten/chips-noten-toast-popcorn/noten?merk=AH&page=6"
driver.get(url)
time.sleep(5)

# Accept cookies (specific to the website)
accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "decline-cookies")))
accept_button.click()

# Get page source and parse with BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# List to store the extracted product information
products = []

# Loop through all product articles
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract the price from the aria-label of the sr-only span
    price_span = article.find('span', class_='sr-only')
    if price_span:
        # Use regular expression to extract the numeric price (e.g., 1.99)
        match = re.search(r'[\d]+[.,][\d]+', price_span.get('aria-label'))
        price = match.group() if match else 'Price not found'
    else:
        price = 'Price not found'
        
    # Extract the promo price (if available)
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            match_promo_price = re.search(r'[\d]+[.,][\d]+', promo_price_span_inner.get('aria-label'))
            promo_price = match_promo_price.group() if match_promo_price else 'Promo price not found'
        else:
            promo_price = 'Promo price not found'
    else:
        promo_price = 'Promo price not found'

    # Extract the product title
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'Title not found'
    
    # Extract the weight
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    # Append data to products list
    products.append({
        "Title": title,
        "Price": price,
        "Promo Price": promo_price,
        "Weight": weight,
        "Country": "NL",
        "Store": "AH"
    })

# Create a DataFrame from the products list
df = pd.DataFrame(products)

# Add a timestamp column
df["Timestamp"] = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Define Excel filename
excel_filename = 'Berrie.xlsx'

# Check if the file already exists
try:
    # Try to open the existing workbook and append the new data to a new sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, index=False, sheet_name=f"AH")
    print("Data has been successfully saved to a new sheet in the existing workbook.")
except FileNotFoundError:
    # If the file doesn't exist, create a new workbook and save the data
    with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    print("New Excel file created and data saved.")

# Close the driver
driver.quit()


Data has been successfully saved to a new sheet in the existing workbook.


In [30]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re  # For regular expressions
from datetime import datetime  # For timestamps
from selenium.webdriver.chrome.service import Service
import time
from openpyxl import load_workbook  # For handling existing Excel files

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Target URL
url = "https://www.ah.nl/producten/snoep-chocolade-koek/chocolade/chocoladesnoepjes?merk=AH&kenmerk=prijsfavoriet"
driver.get(url)
time.sleep(5)

# Accept cookies (specific to the website)
accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "decline-cookies")))
accept_button.click()

# Get page source and parse with BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# List to store the extracted product information
products = []

# Loop through all product articles
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract the price from the aria-label of the sr-only span
    price_span = article.find('span', class_='sr-only')
    if price_span:
        # Use regular expression to extract the numeric price (e.g., 1.99)
        match = re.search(r'[\d]+[.,][\d]+', price_span.get('aria-label'))
        price = match.group() if match else 'Price not found'
    else:
        price = 'Price not found'
        
    # Extract the promo price (if available)
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            match_promo_price = re.search(r'[\d]+[.,][\d]+', promo_price_span_inner.get('aria-label'))
            promo_price = match_promo_price.group() if match_promo_price else 'Promo price not found'
        else:
            promo_price = 'Promo price not found'
    else:
        promo_price = 'Promo price not found'

    # Extract the product title
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'Title not found'
    
    # Extract the weight
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    # Append data to products list
    products.append({
        "Title": title,
        "Price": price,
        "Promo Price": promo_price,
        "Weight": weight,
        "Country": "NL",
        "Store": "AH"
    })

# Create a DataFrame from the products list
df = pd.DataFrame(products)

# Add a timestamp column
df["Timestamp"] = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Define Excel filename
excel_filename = 'Berrie.xlsx'

try:
    # Load existing workbook
    book = load_workbook(excel_filename)
    
    # Check if sheet exists
    if "AH" in book.sheetnames:
        # Load existing sheet into a DataFrame
        existing_data = pd.read_excel(excel_filename, sheet_name="AH")
        
        # Concatenate existing and new data
        combined_data = pd.concat([existing_data, df], ignore_index=True)
    else:
        # If sheet doesn't exist, just use new data
        combined_data = df
    
    # Write the updated data back to the same sheet
    with pd.ExcelWriter(excel_filename, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        combined_data.to_excel(writer, index=False, sheet_name="AH")
    
    print("Data successfully appended to the existing sheet.")
except FileNotFoundError:
    # If the file doesn't exist, create a new workbook and save the data
    with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name="AH")
    print("New Excel file created and data saved.")

# Close the driver
driver.quit()


Data successfully appended to the existing sheet.
