# Jumbo

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import datetime
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import openpyxl
from openpyxl import load_workbook

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of Jumbo product search URLs
urls = [
    "https://www.jumbo.com/producten/koek,-snoep,-chocolade-en-chips/chocolade/chocoladepindas,-snoepjes/jumbo/?offSet=0",
    "https://www.jumbo.com/producten/?searchType=keyword&searchTerms=melkchocolade%20pinda%20zoet"
]

# Prepare Excel file
timestamp = datetime.now().strftime('%Y-%m-%d')
file_name = "choco.xlsx"

try:
    # Try loading the existing workbook
    workbook = load_workbook(file_name)
    sheet = workbook.active
except FileNotFoundError:
    # If file does not exist, create a new workbook and sheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Write header row
    sheet.append(["Title", "Promo Price", "Price", "Weight", "Brand", "Store", "Timestamp"])

# Loop through each URL
total_products = 0

for url in urls:
    print(f"Scraping: {url}")
    driver.get(url)

    # Accept cookies if present
    try:
        accept_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
        )
        accept_button.click()
    except:
        print("No accept cookies button found.")

    # Wait for products to load
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "jum-card"))
    )

    # Load page source into BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product data
    products = []

    for product_card in soup.find_all("div", class_="jum-card"):
        # Extract product title
        title_tag = product_card.find("a", class_="title-link")
        title = title_tag.text.strip() if title_tag else "Title not found"

        # Extract promo price
        promo_price_div = product_card.find("div", class_="promo-price")
        promo_price = (
            re.search(r"[\d]+[.,][\d]+", promo_price_div.text.strip()).group()
            if promo_price_div and promo_price_div.text
            else "Promo price not found"
        )

        # Extract price
        price_whole = product_card.find("span", class_="whole")
        price_fractional = product_card.find("span", class_="fractional")
        price = (
            f"{price_whole.text.strip()},{price_fractional.text.strip()}"
            if price_whole and price_fractional
            else "Price not found"
        )

        # Extract weight
        subtitle_div = product_card.find("div", class_="subtitle")
        weight_span = subtitle_div.find("span", class_="text") if subtitle_div else None
        weight = weight_span.text.strip() if weight_span else "Weight not found"

        # Append to products list
        products.append((title, promo_price, price, weight, "Non_Branded", "Jumbo"))

    # Write data to the Excel sheet
    for product in products:
        sheet.append((*product, timestamp))

    total_products += len(products)
    print(f"Extracted {len(products)} products from this page.")

# Save the workbook
workbook.save(file_name)

print(f"Extracted a total of {total_products} products and saved to {file_name}.")
driver.quit()


Scraping: https://www.jumbo.com/producten/koek,-snoep,-chocolade-en-chips/chocolade/chocoladepindas,-snoepjes/jumbo/?offSet=0
Extracted 13 products from this page.
Scraping: https://www.jumbo.com/producten/koek,-snoep,-chocolade-en-chips/chocolade/chocoladepindas,-snoepjes/jumbo/?offSet=24
No accept cookies button found.


TimeoutException: Message: 


# Plus

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import openpyxl  # Importing openpyxl for Excel file handling
import os  # For checking if file exists

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.plus.nl/producten/snoep-koek-chocolade-chips-noten/chocolade/chocoladesnoepjes?merk=PLUS",
    "https://www.plus.nl/zoekresultaten?SearchTerm=rotsjes&merk=PLUS",
    "https://www.plus.nl/zoekresultaten?SearchTerm=chocolade%20pinda%27s"
]

# Define the file name
file_name = "choco.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    # If the file exists, load it
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    # If the file does not exist, create a new workbook and worksheet
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    # Write the headers
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Branded", "Retailer", "Timestamp", "URL"])

# Loop over each URL
for url in urls:
    driver.get(url)
    time.sleep(5)

    # Click the "Weigeren" button to reject cookies if present
    try:
        accept_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'btn-cookies-refuse')]"))
        )
        accept_button.click()
    except:
        pass  # If the button is not found, continue execution

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    time.sleep(5)

    # Loop through all product articles
    for article in soup.find_all('a', id=re.compile(".*-produt_item_link")):
        title = article.get('title', 'Title not found')
        price_integer = article.find('div', class_='font-bold product-header-price-integer')
        price_decimals = article.find('div', class_='font-black product-header-price-decimals')
        
        if price_integer and price_decimals:
            price = f"{price_integer.get_text(strip=True)}{price_decimals.get_text(strip=True)}"
        else:
            price = 'Price not found'
        
        previous_price_span = article.find('div', class_='product-header-price-previous')
        promo_price = previous_price_span.get_text(strip=True) if previous_price_span else 'Promo price not found'
        
        weight_span = article.find('span', class_='OSFillParent')
        weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'
        
        # Get current timestamp
        timestamp = datetime.now().strftime('%Y-%m-%d')
        
        # Write product data to Excel
        ws.append([title, price, promo_price, weight, "Non_Branded", "Plus", timestamp])

# Save the workbook to an Excel file
wb.save(file_name)

print(f"Data has been successfully saved to {file_name}")

# Close the driver
driver.quit()

Data has been successfully saved to choco.xlsx


# Dirk

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
import openpyxl
import os
import time
from datetime import datetime

# Initialize Chrome driver with options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

url = "https://www.dirk.nl/boodschappen/snacks-snoep/chocolade"
driver.get(url)
time.sleep(5)  # Initial load wait

# Function to scroll down
def scroll_to_load_more(driver, wait_time=2, scroll_increment=1200, scroll_limit=2):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(scroll_limit):
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        time.sleep(wait_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

scroll_to_load_more(driver)

# Function to safely click an element
def safe_click(xpath):
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(1)
        element.click()
    except Exception as e:
        print(f"Error clicking {xpath}: {e}")
        try:
            driver.execute_script("arguments[0].click();", element)  # JavaScript fallback
        except:
            print(f"JavaScript click failed for {xpath}")

# Close pop-ups or overlays if present
try:
    close_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
    )
    close_button.click()
    time.sleep(2)
except:
    print("No pop-ups found.")

# Click filters
safe_click("//label[contains(text(), 'Overige chocolade & bonbons')]")
time.sleep(3)
safe_click("//label[contains(text(), '1 de Beste')]")
time.sleep(5)

# Wait for products to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//article[@data-product-id]"))
)

# Parse page content
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# Extract product information
products = []

for article in soup.find_all('article', attrs={'data-product-id': True}):
    title = article.find('p', class_='title').get_text(strip=True) if article.find('p', class_='title') else 'Title not found'
    
    price_integer = article.find('span', class_='price-large')
    price_decimals = article.find('span', class_='price-small')
    price = f"{price_integer.get_text(strip=True)},{price_decimals.get_text(strip=True)}" if price_integer and price_decimals else 'Price not found'

    promo_price_span = article.find('div', class_='label price-label')
    promo_price = promo_price_span.find('span', class_='regular-price').find('span').get_text(strip=True) if promo_price_span else 'Promo price not found'

    weight_span = article.find('span', class_='subtitle')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    products.append((title, price, promo_price, weight, "Non_Branded", "Dirk"))

# Save to Excel
timestamp = datetime.now().strftime('%Y-%m-%d')
file_name = "choco.xlsx"

if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Branded", "Retailer", "Timestamp"])

for product in products:
    ws.append((*product, timestamp))

wb.save(file_name)
print(f"Data has been successfully saved to {file_name}")

driver.quit()


No pop-ups found.
Data has been successfully saved to choco.xlsx


#### Dirk Rotsjes & Pinda


In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import openpyxl  # For Excel file handling
import os  # For checking if file exists
import time
from datetime import datetime

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.dirk.nl/zoeken/producten/1%20de%20beste%20chocolade%20pinda",
    "https://www.dirk.nl/zoeken/producten/chocolade%20rotsjes"
]

# Define the file name
file_name = "choco.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Branded", "Retailer", "Timestamp"])

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Loop over each URL
for url in urls:
    driver.get(url)
    time.sleep(10)  # Wait for the page to load

    # Scrape the page source
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Extract product information
    for article in soup.find_all('article', attrs={'data-product-id': True}):
        title = article.find('p', class_='title').get_text(strip=True) if article.find('p', class_='title') else 'Title not found'
        price_integer = article.find('span', class_='price-large')
        price_decimals = article.find('span', class_='price-small')
        price = f"{price_integer.get_text(strip=True)},{price_decimals.get_text(strip=True)}" if price_integer and price_decimals else 'Price not found'
        promo_price_span = article.find('div', class_='label price-label')
        promo_price = promo_price_span.find('span', class_='regular-price').find('span').get_text(strip=True) if promo_price_span else 'Promo price not found'
        weight_span = article.find('span', class_='subtitle')
        weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'
        
        # Write product data to Excel
        ws.append([title, price, promo_price, weight, "Non_Branded", "Dirk", timestamp])

# Save the workbook
wb.save(file_name)
print(f"Data has been successfully saved to {file_name}")

# Close the driver
driver.quit()

Data has been successfully saved to choco.xlsx


# Vomar

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import openpyxl  # For Excel file handling
import os  # For checking if file exists

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://www.vomar.nl/zoeken?search=g%27woon%20choco"
driver.get(url)
time.sleep(5)

# Click the "Weigeren" button to reject cookies on the Vomar site
try:
    deny_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyButtonDecline")))
    deny_button.click()
except:
    print("No accept cookies button found.")

html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

products = []

# Loop through all product articles
for article in soup.find_all('div', class_='col-xs-12 col-md-3 product'):
    # Extract the product title from the 'description' class
    title = article.find('p', class_='description').get_text(strip=True) if article.find('p', class_='description') else 'Title not found'

    # Extract the price from the 'price right' class
    price_integer = article.find('span', class_='large')
    price_decimals = article.find('span', class_='small')

    if price_integer and price_decimals:
        price = f"{price_integer.get_text(strip=True)}{price_decimals.get_text(strip=True)}"
    else:
        price = 'Price not found'

    # Extract the promotional price (if applicable, based on previous logic)
    promo_price = 'Promo price not found'  # Placeholder since no promo price was in the provided HTML

    # Weight extraction can be omitted as there is no weight data in the provided HTML
    weight = 'Weight not found'  # Placeholder since no weight was provided

    # Store the extracted information as a tuple
    products.append((title, price, promo_price, weight, "Non_Branded", "Vomar"))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD

# Define the file name
file_name = "choco.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    # If the file exists, load it
    wb = openpyxl.load_workbook(file_name)
    ws = wb.active
else:
    # If the file does not exist, create a new workbook and worksheet
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    # Write the headers
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Branded", "Retailer", "Timestamp"])

# Write product data to Excel
for product in products:
    ws.append((*product, timestamp))  # Write product data with timestamp

# Save the workbook to an Excel file
wb.save(file_name)

print(f"Data has been successfully saved to {file_name}")

# Close the driver
driver.quit()


Data has been successfully saved to choco.xlsx


# Aldi

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
from selenium.webdriver.chrome.service import Service
import openpyxl  # For Excel file handling
import os  # For checking if file exists

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize Chrome driver with Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of URLs to loop through
urls = [
    "https://www.aldi.nl/zoeken.html?query=rotsjes",
    "https://www.aldi.nl/zoeken.html?query=time+4+choco",
    "https://www.aldi.nl/zoeken.html?query=chocolade+pinda"
]

# Define the file name
file_name = "choco.xlsx"

# Check if the Excel file already exists
if os.path.exists(file_name):
    wb = openpyxl.load_workbook(file_name)  # Load existing file
    ws = wb.active
else:
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Products"
    ws.append(["Product Title", "Price", "Promo Price", "Weight", "Non_Branded", "Retailer", "Timestamp"])  # Headers

# Loop through the URLs and scrape data
for url in urls:
    driver.get(url)  # Navigate to the page first

    # Wait for the products to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "product-tile__content"))
        )
    except:
        print(f"Warning: No products found for {url}")

    # Get the page source after JavaScript renders it
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # List to hold product data
    products = []

    # Loop through all product tiles
    for product in soup.find_all('div', class_='product-tile__content'):
        # Extract product title
        title_element = product.find('h2', class_='product-tile__content__upper__product-name')
        title = title_element.get_text(strip=True) if title_element else 'Title not found'

        # Extract current price
        current_price_element = product.find('div', class_='tag__label tag__label--price')
        current_price = current_price_element.get_text(strip=True) if current_price_element else 'Price not found'

        # Extract promo price (only the number, exclude percentage discount)
        promo_price_element = product.find('p', class_='text product-tile__content__lower__wrapper__price-section__discount__striked')
        promo_price = promo_price_element.get_text(strip=True) if promo_price_element else 'No promo price'

        # Extract weight
        weight_element = product.find('p', class_='product-tile__content__lower__wrapper__legal__text')
        weight = weight_element.get_text(strip=True) if weight_element else 'Weight not found'

        # Append product data
        products.append((title, current_price, promo_price, weight, "Non_Branded", "Aldi"))

    # Add timestamp
    timestamp = datetime.now().strftime('%Y-%m-%d')

    # Write product data to Excel
    for product in products:
        ws.append((*product, timestamp))

# Save the workbook
wb.save(file_name)

print(f"Data successfully saved to {file_name}")

# Close the driver
driver.quit()


Data successfully saved to choco.xlsx


# Albert Heijn

In [1]:
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import time
import random
from datetime import datetime
from openpyxl import Workbook, load_workbook
import os
from bs4 import BeautifulSoup
import re  # Importing the regex module

# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/producten/1854/chocolade-bites?merk=AH"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "Non_Branded", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'choco.xlsx'
sheet_name = 'AH_Choco'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Title', 'Price', 'Promo Price', 'Weight', 'Category', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar choco.xlsx in blad 'AH_Choco'.


In [2]:
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import time
import random
from datetime import datetime
from openpyxl import Workbook, load_workbook
import os
from bs4 import BeautifulSoup
import re  # Importing the regex module

# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/zoeken?query=ah%20choco%20pinda%27s%20zoet"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "Non_Branded", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'choco.xlsx'
sheet_name = 'AH_Choco'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Title', 'Price', 'Promo Price', 'Weight', 'Category', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar choco.xlsx in blad 'AH_Choco'.


## M&M

### AH

In [3]:
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import time
import random
from datetime import datetime
from openpyxl import Workbook, load_workbook
import os
from bs4 import BeautifulSoup
import re  # Importing the regex module

# Instellen van User-Agent
ua = UserAgent()
user_agent = ua.random

# Chrome opties configureren
options = uc.options.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=VizDisplayCompositor")
options.add_argument(f"user-agent={user_agent}")


# Start de WebDriver met undetected_chromedriver
driver = uc.Chrome(options=options)

# URL die je wilt scrapen
url = "https://www.ah.nl/producten/1854/chocolade-bites?merk=M%26M%27S"

# Ga naar de pagina
driver.get(url)

# Wacht een paar seconden zodat de pagina volledig laadt
time.sleep(random.randint(3, 5))

# Verkrijg de HTML van de pagina
html = driver.page_source

# Parse de HTML met BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Lijst om productdata op te slaan
products = []

# Loop door alle productartikelen op de pagina
for article in soup.find_all('article', class_='product-card-portrait_root__ZiRpZ'):
    # Extract prijs
    price_span = article.find('span', class_='sr-only')
    price = price_span.get('aria-label') if price_span else 'N/A'
    
    # Verwijder "Prijs: €" en extra spaties
    if price != 'N/A':
        price = re.sub(r'Prijs:\s*€\s*', '', price)  # Verwijder "Prijs: €"
        price = price.strip()  # Verwijder extra spaties rondom de prijs

    # Extract promo prijs
    promo_price_span = article.find('div', class_='price-amount_highlight__ekL92')
    promo_price = "N/A"
    if promo_price_span:
        promo_price_span_inner = promo_price_span.find('span', class_='sr-only')
        if promo_price_span_inner:
            promo_price = promo_price_span_inner.get('aria-label')

    # Verwijder promo prijs "Prijs: €" en extra spaties indien nodig
    if promo_price != "N/A":
        promo_price = re.sub(r'Prijs:\s*€\s*', '', promo_price)
        promo_price = promo_price.strip()  # Verwijder extra spaties rondom de promo prijs

    # Extract titel
    title_tag = article.find('a', class_='link_root__EqRHd')
    title = title_tag.get('title') if title_tag else 'N/A'

    # Extract gewicht
    weight_span = article.find('span', class_='price_unitSize__Hk6E4')
    weight = weight_span.get_text(strip=True) if weight_span else 'N/A'

    # Voeg de verkregen data toe aan de lijst
    products.append((title, price, promo_price, weight, "Branded", "AH"))

# Verkrijg de huidige timestamp voor wanneer de data werd gescrapet
timestamp = datetime.now().strftime('%Y-%m-%d')

# Bestandsnaam en sheetnaam
file_name = 'choco.xlsx'
sheet_name = 'AH_Choco'

# Laad of maak een nieuw werkboek aan
if os.path.exists(file_name):
    workbook = load_workbook(file_name)
    sheet = workbook[sheet_name] if sheet_name in workbook.sheetnames else workbook.create_sheet(sheet_name)
else:
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

# Schrijf de header als het een nieuw blad is
if sheet.max_row == 1:
    sheet.append(['Title', 'Price', 'Promo Price', 'Weight', 'Category', 'Store', 'Timestamp'])

# Voeg de productdata toe
for product in products:
    sheet.append([*product, timestamp])

# Sla het Excel-bestand op
workbook.save(file_name)
print(f"✅ Data succesvol opgeslagen naar {file_name} in blad '{sheet_name}'.")

# Sluit de browser na het scrapen
driver.quit()


✅ Data succesvol opgeslagen naar choco.xlsx in blad 'AH_Choco'.


### Jumbo

In [4]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import datetime
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import openpyxl
from openpyxl import load_workbook

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Navigate to the Jumbo products page
url = "https://www.jumbo.com/producten/menms/?searchType=keyword&searchTerms=m%26m"
driver.get(url)

# Wait for the page to load and accept cookies
try:
    accept_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
    )
    accept_button.click()
except:
    print("No accept cookies button found.")

# Wait for products to load
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "jum-card"))
)

# Load page source into BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# Extract product data
products = []

for product_card in soup.find_all("div", class_="jum-card"):
    # Extract product title
    title_tag = product_card.find("a", class_="title-link")
    title = title_tag.text.strip() if title_tag else "Title not found"

    # Extract promo price
    promo_price_div = product_card.find("div", class_="promo-price")
    promo_price = (
        re.search(r"[\d]+[.,][\d]+", promo_price_div.text.strip()).group()
        if promo_price_div and promo_price_div.text
        else "Promo price not found"
    )

    # Extract price
    price_whole = product_card.find("span", class_="whole")
    price_fractional = product_card.find("span", class_="fractional")
    price = (
        f"{price_whole.text.strip()},{price_fractional.text.strip()}"
        if price_whole and price_fractional
        else "Price not found"
    )

    # Extract weight
    subtitle_div = product_card.find("div", class_="subtitle")
    weight_span = subtitle_div.find("span", class_="text") if subtitle_div else None
    weight = weight_span.text.strip() if weight_span else "Weight not found"

    # Append to products list
    products.append((title, promo_price, price, weight, "branded", "Jumbo"))

# Write to Excel
timestamp = datetime.now().strftime('%Y-%m-%d')
file_name = "choco.xlsx"

try:
    # Try loading the existing workbook
    workbook = load_workbook(file_name)
    sheet = workbook.active
except FileNotFoundError:
    # If the file does not exist, create a new workbook and sheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Write the header row
    sheet.append(["Title", "Promo Price", "Price", "Weight", "Brand", "Store", "Timestamp"])

# Write data to the Excel sheet
for product in products:
    sheet.append((*product, timestamp))

# Save the workbook
workbook.save(file_name)

print(f"Extracted {len(products)} products and saved to {file_name}.")
driver.quit()


Extracted 19 products and saved to choco.xlsx.


### Plus

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re  # Importing the regular expression module
from datetime import datetime  # Importing datetime for timestamp
from selenium.webdriver.chrome.service import Service
import openpyxl
from openpyxl import load_workbook

# Initialize Chrome driver with Service
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://www.plus.nl/producten/snoep-koek-chocolade-chips-noten/chocolade/chocoladesnoepjes?merk=M%26M%27S"
driver.get(url)

# Click the "Weigeren" button to reject cookies on the Plus site

# Wait for the "Weigeren" button to be clickable
accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'btn-cookies-refuse')]")))
accept_button.click()

html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

time.sleep(3)

# List to store the extracted product information
products = []

# Loop through all product articles
for article in soup.find_all('a', id=re.compile(".*-produt_item_link")):
    # Extract the product title from the title attribute of the anchor tag
    title = article.get('title', 'Title not found')

    # Extract the price from the price integers and decimals
    price_integer = article.find('div', class_='font-bold product-header-price-integer')
    price_decimals = article.find('div', class_='font-black product-header-price-decimals')

    if price_integer and price_decimals:
        price = f"{price_integer.get_text(strip=True)}{price_decimals.get_text(strip=True)}"
    else:
        price = 'Price not found'

    # Extract the previous (old) price from the price-previous div
    previous_price_span = article.find('div', class_='product-header-price-previous')
    if previous_price_span:
        # Extract the old price as text
        promo_price = previous_price_span.get_text(strip=True)
    else:
        promo_price = 'Promo price not found'

    # Extract the weight from the 'Per 250 g' span
    weight_span = article.find('span', class_='OSFillParent')
    weight = weight_span.get_text(strip=True) if weight_span else 'Weight not found'

    # Store the extracted information as a tuple, including promo price
    products.append((title, price, promo_price, weight, "branded", "Plus"))

# Get current timestamp for the data
timestamp = datetime.now().strftime('%Y-%m-%d')  # Format: YYYY-MM-DD HH:MM:SS

# Write the data to an Excel file
file_name = "choco.xlsx"

try:
    # Try loading the existing workbook
    workbook = load_workbook(file_name)
    sheet = workbook.active
except FileNotFoundError:
    # If the file does not exist, create a new workbook and sheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Write the header row
    sheet.append(["Title", "Price", "Promo Price", "Weight", "Brand", "Store", "Timestamp"])

# Write data to the Excel sheet
for product in products:
    sheet.append((*product, timestamp))

# Save the workbook
workbook.save(file_name)

print(f"Data has been successfully saved to {file_name}")

# Close the driver
driver.quit()

Data has been successfully saved to choco.xlsx
